<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JPH</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Public Health Surveill</journal-id>
      <journal-title>JMIR Public Health and Surveillance</journal-title>
      <issn pub-type="epub">2369-2960</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
    <article-id pub-id-type="publisher-id">v4i1e16</article-id>
    <article-id pub-id-type="pmid">29426815</article-id>
    <article-id pub-id-type="doi">10.2196/publichealth.8186</article-id>
    <article-categories>
      <subj-group subj-group-type="heading">
        <subject>Original Paper</subject>
      </subj-group>
      <subj-group subj-group-type="article-type">
        <subject>Original Paper</subject>
      </subj-group>
    </article-categories>
    <title-group>
      <article-title>Associations of Topics of Discussion on Twitter With Survey Measures of Attitudes, Knowledge, and Behaviors Related to Zika: Probabilistic Study in the United States</article-title>
    </title-group>
    <contrib-group>
      <contrib contrib-type="editor">
        <name>
          <surname>Eysenbach</surname>
          <given-names>Gunther</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Goodin</surname>
          <given-names>Amie J</given-names>
        </name>
      </contrib>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Akram</surname>
          <given-names>Hammad</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="author" id="contrib1" corresp="yes">
      <name name-style="western">
        <surname>Farhadloo</surname>
        <given-names>Mohsen</given-names>
      </name>
      <degrees>PhD</degrees>
      <xref rid="aff1" ref-type="aff">1</xref>
      <address>
        <institution>University of Illinois at Urbana-Champaign</institution>
        <addr-line>603 E Daniel St</addr-line>
        <addr-line>Champaign, IL,</addr-line>
        <country>United States</country>
        <phone>1 209 761 5350</phone>
        <email>mfarhad@illinois.edu</email>
      </address>  
      <xref rid="aff2" ref-type="aff">2</xref>
      <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-7104-4380</ext-link></contrib>
      <contrib contrib-type="author" id="contrib2">
        <name name-style="western">
          <surname>Winneg</surname>
          <given-names>Kenneth</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff2" ref-type="aff">2</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-1871-3570</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib3">
        <name name-style="western">
          <surname>Chan</surname>
          <given-names>Man-Pui Sally</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff1" ref-type="aff">1</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-2984-0487</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib4">
        <name name-style="western">
          <surname>Hall Jamieson</surname>
          <given-names>Kathleen</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff2" ref-type="aff">2</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-4167-3688</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib5">
        <name name-style="western">
          <surname>Albarracin</surname>
          <given-names>Dolores</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff1" ref-type="aff">1</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-9878-942X</ext-link>
      </contrib>
    </contrib-group>
    <aff id="aff1">
      <sup>1</sup>
      <institution>University of Illinois at Urbana-Champaign</institution>
      <addr-line>Champaign, IL</addr-line>
      <country>United States</country>
    </aff>
    <aff id="aff2">
    <sup>2</sup>
    <institution>Annenberg Public Policy Center</institution>
    <institution>University of Pennsylvania</institution>  
    <addr-line>Philadelphia, PA</addr-line>
    <country>United States</country></aff>
    <author-notes>
      <corresp>Corresponding Author: Mohsen Farhadloo 
      <email>mfarhad@illinois.edu</email></corresp>
    </author-notes>
    <pub-date pub-type="collection"><season>Jan-Mar</season><year>2018</year></pub-date>
    <pub-date pub-type="epub">
      <day>09</day>
      <month>02</month>
      <year>2018</year>
    </pub-date>
    <volume>4</volume>
    <issue>1</issue>
    <elocation-id>e16</elocation-id>
    <!--history from ojs - api-xml-->
    <history>
      <date date-type="received">
        <day>12</day>
        <month>6</month>
        <year>2017</year>
      </date>
      <date date-type="rev-request">
        <day>9</day>
        <month>9</month>
        <year>2017</year>
      </date>
      <date date-type="rev-recd">
        <day>8</day>
        <month>11</month>
        <year>2017</year>
      </date>
      <date date-type="accepted">
        <day>22</day>
        <month>11</month>
        <year>2017</year>
      </date>
    </history>
    <!--(c) the authors - correct author names and publication date here if necessary. Date in form ', dd.mm.yyyy' after jmir.org-->
    <copyright-statement>©Mohsen Farhadloo, Kenneth Winneg, Man-Pui Sally Chan, Kathleen Hall Jamieson, Dolores Albarracin. Originally published in JMIR Public Health and Surveillance (http://publichealth.jmir.org), 09.02.2018.</copyright-statement>
    <copyright-year>2018</copyright-year>
    <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
      <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Public Health and Surveillance, is properly cited. The complete bibliographic information, a link to the original publication on http://publichealth.jmir.org, as well as this copyright and license information must be included.</p>
    </license>  
    <self-uri xlink:href="http://publichealth.jmir.org/2018/1/e16/" xlink:type="simple"/>
    <abstract>
      <sec sec-type="background">
        <title>Background</title>
        <p>Recent outbreaks of Zika virus around the world led to increased discussions about this issue on social media platforms such as Twitter. These discussions may provide useful information about attitudes, knowledge, and behaviors of the population regarding issues that are important for public policy.</p>
      </sec>
      <sec sec-type="objective">
        <title>Objective</title>
        <p>We sought to identify the associations of the topics of discussions on Twitter and survey measures of Zika-related attitudes, knowledge, and behaviors, not solely based upon the volume of such discussions but by analyzing the content of conversations using probabilistic techniques.</p>
      </sec>
      <sec sec-type="methods">
        <title>Methods</title>
        <p>Using probabilistic topic modeling with US county and week as the units of analysis, we analyzed the content of Twitter online communications to identify topics related to the reported attitudes, knowledge, and behaviors captured in a nationally representative survey (N=33,193) of the US adult population over 33 weeks.</p>
      </sec>
      <sec sec-type="results">
        <title>Results</title>
        <p>Our analyses revealed that topics related to “congress funding for Zika,” “microcephaly,” “Zika-related travel discussions,” “insect repellent,” “blood transfusion technology,” and “Zika in Miami” were associated with our survey measures of attitudes, knowledge, and behaviors observed over the period of the study.</p>
      </sec>
      <sec sec-type="conclusions">
        <title>Conclusions</title>
        <p>Our results demonstrated that it is possible to uncover topics of discussions from Twitter communications that are associated with the Zika-related attitudes, knowledge, and behaviors of populations over time. Social media data can be used as a complementary source of information alongside traditional data sources to gauge the patterns of attitudes, knowledge, and behaviors in a population.</p>
      </sec>
    </abstract>
    <kwd-group>
      <kwd>Zika</kwd>
      <kwd>Twitter</kwd>
      <kwd>topic modeling</kwd>
      <kwd>public policy</kwd>
      <kwd>public health</kwd>
    </kwd-group></article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Outbreaks of Zika virus in 2016 in various areas of the world [<xref ref-type="bibr" rid="ref1">1</xref>] led to increased communications about this issue on Twitter and other social media platforms. These communications may provide digital markers of attitudes, behaviors, and knowledge in a population, thus supplying an easily accessible thermometer of variations in psychological responses that are important for public policy. In this paper, our objective was to identify these markers by correlating Twitter data with survey measures of attitudes, behaviors, and knowledge in a representative sample of the US adult population.</p>
      <p>The study of attitudes, knowledge, and behaviors has long been of theoretical interest. People’s attitudes and knowledge about the world allow them to make behavioral decisions and avoid public health threats throughout the course of their lives. An attitude is regarded as an evaluation of an object as positive or negative [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. For example, an attitude relevant to Zika may entail favoring policies that can reduce infection, such as spraying. These attitudes may be linked to knowledge—factual information about the life cycle of the <italic>Aedes</italic> mosquitoes and types of transmission of the Zika virus, for example. Attitudes may be assessed with semantic differential, Likert, or other items tapping evaluations, whereas knowledge measures involve true/false determinations about factual statements about Zika. Attitudes and knowledge as well as behaviors such as repellent use or calls on Congress for action are classic psychological responses to public health information [<xref ref-type="bibr" rid="ref3">3</xref>]. In the study of decision making and behavior, theories of reasoned action [<xref ref-type="bibr" rid="ref4">4</xref>] and planned behavior [<xref ref-type="bibr" rid="ref5">5</xref>] specify a limited number of psychological variables that influence a behavior: (1) intention, (2) attitude toward behavior, (3) subjective norms, and (4) perceived behavioral control. Also, there are studies that have investigated the relationships among knowledge, belief, and behavior [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref7">7</xref>] and studies that involve social media along with other psychological variables [<xref ref-type="bibr" rid="ref8">8</xref>].</p>
      <p>Designing a health communication strategy often requires an understanding of how messages may be linked to the attitudes, knowledge, and behaviors of an audience. With the advent of online communication technologies, this understanding can be derived from the analysis of social media data. Twitter, for example, has been used to predict the age, gender, and political orientation [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref13">13</xref>], level of depression [<xref ref-type="bibr" rid="ref12">12</xref>], and emotions and attitudes [<xref ref-type="bibr" rid="ref9">9</xref>] of social media users. The work of De Choudhury et al [<xref ref-type="bibr" rid="ref12">12</xref>] has discovered that those with major depressive disorder have less Twitter activity and higher self-attentional focus and express greater negative emotion, relational and medical concerns, and a greater number of religious thoughts. Also, an analysis of Facebook postings in relation to personality found variations in language with respect to personality, age, and gender [<xref ref-type="bibr" rid="ref14">14</xref>]. This research suggests that Twitter may be useful in identifying discussion topics in the Zika domain as well.</p>
      <p>In this study, we investigated the possibility of discovering topics of discussion on Twitter that are related to the ongoing public health challenge of Zika virus and whether their variations reveal important information regarding changes of Zika-related attitudes, knowledge, and behaviors of populations over time. To discover the topics of discussion, we used probabilistic topic modeling techniques and examined different weighting schemes (binary, term occurrence, and term frequency–inverse document frequency [tfidf]) on the learned models.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>We examined Twitter data to identify topics in the content of the online communications that were about Zika. Employing latent Dirichlet allocation (LDA), we analyzed tweets aggregated based on location information and used the learned models to infer the probability of occurrence of each topic over time using weekly aggregated test tweets. To train the topic models, different weighting schemes (binary, term occurrence, and tfidf) were compared to find the model with the best weighting scheme. We then explored the associations of topics that showed variability over time with weekly aggregated measures of Zika attitudes, knowledge, and behaviors obtained from a survey representing the US adult population. The resulting correlations were used to describe the topics of discussion associated with the psychological measures of our samples.</p>
      </sec>
      <sec>
        <title>Twitter Data</title>
        <p>Our Zika corpus was collected from the Twitter network by searching for a set of Zika-related keywords (“Zika,” “dengue,” “yellow fever,” “Zika virus,” “Zika fever,” “flaviviridae,” “brain shrink,” “fetal brain disruption sequence,” “mosquitoes,” “birth defects,” “insect bites,” “mosquito bites,” “insect-borne virus,” “mosquito-borne virus,” “microcephaly,” and “Guillain-Barre syndrome”) using the Twitter streaming application programming interface. The resulting dataset contains 3.8 million tweets from February 1, 2016, to August 30, 2016. Using location information, we were able to map about 10% of all the tweets in our corpus into 2695 different US counties. The rest of the tweets (the other 90%) were aggregated using the timestamp information of each tweet, assigning each to weekly documents. We aggregated tweets for the weeks of February 16, 2016, to August 18, 2016, to match the survey data described next.</p>
      </sec>
      <sec>
        <title>Survey Data</title>
        <p>The Annenberg Public Policy Center of the University of Pennsylvania designed and carried out a survey of attitudes, knowledge, and behaviors relevant to Zika virus over 33 weeks (N=33,193). <xref ref-type="table" rid="table1">Table 1</xref> summarizes the attitude, knowledge, and behavior questions that were asked of the participants each week. Each week, a dual-frame sample was designed to represent the adult US population (including Hawaii and Alaska). A fully replicated, single-stage, random-digit-dialing sample of landline telephone households, along with randomly generated cell phone numbers, was employed. Each weekly wave consisted of 1000 interviews of which at least 600 were obtained from cell phone respondents. Within each landline household, a single respondent (the youngest adult) was selected. Because the interview could take place outside the respondent’s home, cell phone respondents were considered separately from landlines. Surveys were conducted in 5-day intervals, in English and Spanish, typically from Wednesday through Sunday to include both weekdays and weekends.</p>
        <p>Each weekly wave was weighted to provide nationally representative and projectable estimates of the adult population 18 years of age and older. The weighting process took into account the disproportionate probability of household and respondent selection due to the number of separate telephone landlines and cell phones answered by respondents and their households, as well as the probability associated with the random selection of an individual household member. Following application of the weights, the sample was post-stratified and balanced by the key demographics of age, race, sex, region, and education. The sample was also weighted to reflect the distribution of phone usage in the general population, meaning the proportion of those who are cell phone only, landline only, and mixed users (see <xref ref-type="app" rid="app1">Multimedia Appendix 1</xref> for more details regarding the survey design and data collection method). Our data included 51.61% females, 39.58% college-educated participants (14.88% with some college and 45.54% with high school education or less), 37.21% living in regions at risk for Zika virus, 45.99% aged 18 to 44 years (17.68% aged 45 to 54 years and 36.33% aged 55 years or older), and 5.57% with current/intended pregnancy. The average response rate over the weeks was 7.50%, a figure comparable to that of other national surveys conducted in the United States [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref16">16</xref>].</p>
      </sec>
      <sec>
        <title>Data Analysis</title>
        <p>We analyzed the content of a sample of communications from Twitter (within a 10-month time span) following the process depicted in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p>
        <p>We analyzed the sampled tweets using the topic modeling of LDA [<xref ref-type="bibr" rid="ref17">17</xref>] to uncover topics that users addressed in their communications. This method is a probabilistic way to discover salient patterns (topics) in a collection of text documents; along with its multiple variations, this approach has been used to analyze long news articles, blogs, and scientific papers in various domains [<xref ref-type="bibr" rid="ref18">18</xref>]. However, optimal applications of LDA to Twitter data deserve further attention because a single tweet is short (140 characters), uses informal language, and contains misspellings, emoticons, acronyms, and nonstandard abbreviations, as well as Twitter names, hashtags, and URLs.</p>
        <p>When LDA-based methods are applied directly to posts from microblogging platforms (considering each single tweet as a document), which are usually short and often noisy, these methods result in topics that are uninformative and hard to interpret [<xref ref-type="bibr" rid="ref19">19</xref>]. To improve the performance of topic modeling of tweets, we incorporated 2 aggregation techniques (<xref ref-type="fig" rid="figure1">Figure 1</xref>, phase 2). In order to amass a sufficient number of documents in our training set and generate longer documents, before learning the topic models we created a document for each county in the United States by aggregating all tweets of a county. Not all tweets and Twitter accounts are associated with location information. Typically, 1% of Twitter users have enabled the geocoordinate mobile device service, which tags each tweet with their current geocoordinates [<xref ref-type="bibr" rid="ref20">20</xref>]. Additionally, some Twitter users have completed the free response location field in their Twitter account profile. For the tweets that contain geolocation coordinates, we found the county corresponding to the coordinates. In addition to using the precise coordinates to locate counties, we geotagged tweets based on the location field when information about a city/state pair or a city name was included. These methods have been described elsewhere [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref21">21</xref>] and were used to generate the training set of tweets. The testing set used in analyses (<xref ref-type="fig" rid="figure1">Figure 1</xref>, phase 2) included the tweets that did not have any location information, which comprised 90% of our corpus. Using the timestamp information of each tweet, all tweets created in a week were merged into a single document.</p>
        <p>We first applied LDA to the tweets pooled by location that constituted our training data to discover topics from the online communications (<xref ref-type="fig" rid="figure1">Figure 1</xref>, phase 3). Within each topic, some terms have high probabilities, whereas others have low ones. After discovering the topics, the proportion of each topic for a particular week document was calculated using Bayesian inferences from the learned models (<xref ref-type="fig" rid="figure1">Figure 1</xref>, phase 4). To accomplish this, we used the documents in our test set that were pooled weekly based on their timestamp information. This modeling resulted in a signal indicating the variation of each topic over time. We then correlated the extracted topics with our survey items (<xref ref-type="fig" rid="figure1">Figure 1</xref>, phase 5). Each item measuring attitudes, knowledge, and behaviors was averaged for a particular week, and these averages were then correlated with the variation of topics over weeks.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Attitudes, knowledge, and behavior questions.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="170"/>
            <col width="800"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Category and survey item</td>
                <td>Survey question</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="2"><bold>Attitude</bold></td>
                <td><break/></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>ZI-22</td>
                <td>If there were cases of people getting infected with Zika virus in your city or town, would you approve or disapprove of special spraying at the ground level against mosquitoes to prevent the spread of the Zika virus (on a scale 1=strongly disapprove to 5=strongly approve)?</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>ZI-23</td>
                <td>If there were cases of people getting infected with the Zika virus in your city or town, would you approve or disapprove of special spraying from the air against mosquitoes to prevent the spread of the Zika virus (on a scale 1=strongly disapprove to 5=strongly approve)?</td>
              </tr>
              <tr valign="top">
                <td colspan="2"><bold>Knowledge</bold></td>
                <td><break/></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>ZG-03b</td>
                <td>How do scientists think someone can get the Zika virus? By sitting next to someone who has the Zika virus (on a scale 1=not likely at all to 4=very likely).</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>ZG-03c</td>
                <td>How do scientists think someone can get the Zika virus? By being bitten by a mosquito that has already bitten someone who has the Zika virus (on a scale 1=not likely at all to 4=very likely).</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>ZG-05</td>
                <td>How accurate is it to say that a pregnant woman who is infected with the Zika virus is more likely to have a baby with an unusually small head and brain (on a scale 1=not accurate at all to 4=very accurate)?</td>
              </tr>
              <tr valign="top">
                <td colspan="2"><bold>Behavior</bold></td>
                <td><break/></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>ZG-47</td>
                <td>If there were a vaccine that protected you from getting Zika how likely, if at all, is it that you would get the vaccine (on a scale 1=not likely at all to 4=very likely)?</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>ZG-54</td>
                <td>In the past 3 months, have you done anything to protect yourself from getting Zika (on a scale 0 to 1)?</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td>GM-20</td>
                <td>In the past week, how many days, if any, did you discuss the effects of the Zika virus with family or friends (on a scale 0 to 7)?</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Flow of data processing and analyses.</p>
          </caption>
          <graphic xlink:href="publichealth_v4i1e16_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>The perplexity formula used to compare the probability models. The log-likelihood of a set of held-out documents can be calculated and used for comparing the models.</p>
          </caption>
          <graphic xlink:href="publichealth_v4i1e16_fig2.jpg" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>LDA uses the bag-of-words approach and represents each document using a vector with the dimension of the considered vocabulary size. To examine the impact of various weighting schemes on the topic models, we compared 3 popular weighting schemes: binary, term occurrence, and tfidf representations. Topic models are probability models for a collection of documents. One approach to evaluate probability models involves comparing how well they model a held-out test set. A trained topic model is described by topic matrix Ψ and hyperparameter α for topic distribution of documents. Given those parameters, the log-likelihood of a set of held-out documents can be calculated and used for comparing the models. Traditionally, the perplexity of a model is calculated as illustrated in <xref ref-type="fig" rid="figure2">Figure 2</xref>; perplexity is a decreasing function of the log-likelihood of the held-out test set—the lower the perplexity, the better the model.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Identification of Optimal Modeling Parameters</title>
        <p>For different numbers of topics (k=5, 10, 15, 20, 30, 40, 50, 100, 150, and 200) and the 3 different weighting schemes (binary, term occurrence, and tfidf), we trained topic models and calculated the perplexity of the held-out test set using Bayesian inference. As mentioned before, perplexity is a common measure used to compare different probability models. The lower the perplexity on the test set, the better the model. As <xref ref-type="fig" rid="figure3">Figure 3</xref> shows, the perplexity of the models trained with the term occurrence weighting scheme is lower than that of the models trained with the binary and tfidf weighting schemes. Thus, we decided to use term occurrence representation as the weighting scheme for the rest of the study.</p>
      </sec>
      <sec>
        <title>Probabilistic Topic Discovery</title>
        <p>To qualitatively demonstrate the discovered topics using different topic models with k=100, 150, and 200, <xref ref-type="table" rid="table2">Table 2</xref> represents each topic with its top 10 most probable terms. The topics in <xref ref-type="table" rid="table2">Table 2</xref> reveal that many of the Zika-related issues such as “mosquito,” “pregnancy,” “microcephaly,” “dengue,” “Congress act for Zika funding,” “insect repellent,” “Florida,” “Miami,” “Brazil,” and “Puerto Rico” are discoverable through analysis of the content of the communications using topic modeling.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Comparison of weighting schemes (binary, term occurrence, and term frequency–inverse document frequency [tfidf]) for a vocabulary size of 8200. Perplexity of the held-out test set is the lowest for the term occurrence weighting scheme.</p>
          </caption>
          <graphic xlink:href="publichealth_v4i1e16_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Top 10 words of some of the topics of the trained latent Dirichlet allocation (LDA) models used to examine the association with the survey items. Terms that could be used to label a topic are italicized.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="125"/>
            <col width="865"/>
            <thead>
              <tr valign="top">
                <td>Topic number</td>
                <td>Top 10 terms</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>4-LDA100</td>
                <td><italic>zika</italic>, <italic>virus</italic>, <italic>mosquito</italic>, health, amp, zika, <italic>pregnant</italic>, new, zikavirus, first</td>
              </tr>
              <tr valign="top">
                <td>57-LDA100</td>
                <td><italic>mosquito</italic>, many, <italic>zika</italic>, amp, <italic>virus</italic>, wish, <italic>summer</italic>, full, look, leg</td>
              </tr>
              <tr valign="top">
                <td>63-LDA100</td>
                <td><italic>congress</italic>, <italic>funding</italic>, <italic>act</italic>, <italic>emergency</italic>, tell, via2026, <italic>approve</italic>, add, lirikoph, zika2014fast</td>
              </tr>
              <tr valign="top">
                <td>96-LDA100</td>
                <td>mosquito, virus, zika, <italic>microcephaly</italic>, bill, <italic>repellent</italic>, <italic>mosquito</italic>, <italic>summer</italic>, amp, natural</td>
              </tr>
              <tr valign="top">
                <td>2-LDA150</td>
                <td>virus, zika, mosquito, <italic>birth</italic>, like, first, <italic>health</italic>, buy, <italic>bill</italic>, <italic>dengue</italic></td>
              </tr>
              <tr valign="top">
                <td>12-LDA150</td>
                <td>virus, cdcgov, zika, via, test, <italic>pregnant</italic>, mosquito, <italic>cdc</italic>, <italic>prevention</italic>, amp, primarily, <italic>contraception</italic></td>
              </tr>
              <tr valign="top">
                <td>15-LDA150</td>
                <td>virus, <italic>zika</italic>, mosquito, thing, hot, <italic>congress</italic>, pregnant, amp, <italic>funding</italic>, <italic>emergency</italic></td>
              </tr>
              <tr valign="top">
                <td>73-LDA150</td>
                <td>zika, virus, <italic>tech</italic>, <italic>blood</italic>, mosquito, outbreak, <italic>threat</italic>, <italic>pregnancy</italic>, <italic>government</italic>, brazil</td>
              </tr>
              <tr valign="top">
                <td>120-LDA150</td>
                <td>zika, virus, mosquito, <italic>zika</italic>, health, amp, <italic>funding</italic>, <italic>congress</italic>, via, <italic>pregnant</italic></td>
              </tr>
              <tr valign="top">
                <td>149-LDA150</td>
                <td>virus, <italic>zika</italic>, mosquito, around, like, <italic>protect</italic>, money, suspected, <italic>health</italic>, <italic>travel</italic></td>
              </tr>
              <tr valign="top">
                <td>10-LDA200</td>
                <td>zika, virus, amp, <italic>zika</italic>, <italic>funding</italic>, <italic>mosquito</italic>, <italic>congress</italic>, new, like, house</td>
              </tr>
              <tr valign="top">
                <td>37-LDA200</td>
                <td>zika, virus, <italic>insect</italic>, <italic>abortion</italic>, virus, <italic>repellent</italic>, <italic>prevent</italic>, rubio, mosquito, pregnant</td>
              </tr>
              <tr valign="top">
                <td>39-LDA200</td>
                <td>mcilroy, rory, zika, virus, <italic>pesticide</italic>, crisis, rio, <italic>florida</italic>, <italic>zika</italic>, <italic>vaccine</italic></td>
              </tr>
              <tr valign="top">
                <td>87-LDA200</td>
                <td>mosquito, virus, <italic>zika</italic>, <italic>miami</italic>, state, <italic>dengue</italic>, national, need, <italic>fever</italic>, month</td>
              </tr>
              <tr valign="top">
                <td>104-LDA200</td>
                <td>zika, virus, <italic>medical</italic>, <italic>rio</italic>, <italic>brazil</italic>, new, everything, <italic>mosquito</italic>, <italic>baby</italic>, amp</td>
              </tr>
              <tr valign="top">
                <td>135-LDA200</td>
                <td>virus, zika, <italic>mosquito</italic>, <italic>central</italic>, light, <italic>zika</italic>, <italic>health</italic>, clean, video, amp</td>
              </tr>
              <tr valign="top">
                <td>165-LDA200</td>
                <td><italic>virus</italic>, zika, mosquito, <italic>cdc</italic>, breaking, <italic>zika</italic>, <italic>pregnant</italic>, amp, u.s., rather</td>
              </tr>
              <tr valign="top">
                <td>197-LDA200</td>
                <td>virus, zika, <italic>rico</italic>, <italic>puerto</italic>, <italic>congress</italic>, <italic>funding</italic>, obamacare, zika, emergency, scott</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Associations of Topics With Weekly Measures of Attitudes, Knowledge, and Behaviors</title>
        <p>The associations of topics with the weekly averages of attitudes, knowledge, and behavior items appear in <xref ref-type="table" rid="table1">Table 1</xref>. As one can surmise, the content of the online communications of each week focuses on a handful of the discovered topics. In other words, there are topics that only appear in some weeks and there are a limited number of topics that appear in many. For the correlation analysis presented here, the topics that appeared in just a few (2 to 3) weeks were discarded; we only considered those topics that appeared in almost all weeks. We calculated correlations using our trained topic models with a vocabulary size of 8200, term occurrence weighting scheme, and with k=100, 150, and 200 topics. We were interested in discovering topics whose variations over time mimic the variations of the Zika survey items. Therefore, we calculated the posterior probability of all topics in each week and looked into the variation of the topics over time (weeks).</p>
        <p><xref ref-type="fig" rid="figure4">Figure 4</xref> shows the variations of topics and survey items over time. Because some of the survey items were not asked of the respondents in all of the weeks, there are missing data in the survey. Since the variation of topics over time generated from our analyses can be used to monitor/predict the variation of survey measures, Twitter data can be used as a good measure to gauge attitudes, knowledge, and behaviors.</p>
        <p><xref ref-type="table" rid="table3">Table 3</xref> shows topics with significant correlation with attitude, knowledge, or behavior questions, and <xref ref-type="fig" rid="figure5">Figure 5</xref> depicts these topics with their word clouds. We have reported the <italic>P</italic> values for the <italic>t</italic> tests of the Pearson correlations to determine statistically significant correlations in <xref ref-type="table" rid="table3">Table 3</xref>.</p>
        <p>Items ZI-22 and ZI-23 of the Zika survey measured attitudes toward ground and air spraying against mosquitoes; these correlated positively with topics about “Congress funding,” “Zika protection and travel,” and “Zika in Miami” (<xref ref-type="table" rid="table3">Table 3</xref>). “Congress funding” and “Zika protection and travel” also correlated with the microcephaly knowledge item, and this item also positively correlated with a topic that specifically captures discussions about “microcephaly” online.</p>
        <p>According to the results (<xref ref-type="table" rid="table3">Table 3</xref>), the behavioral item GM-20, which measures the amount of Zika-related discussion that respondents had with family/friends, correlates with the “Zika,” “Zika protection and travel,” “Congress funding,” and “insect repellent” topics. Thus, our content analyses of the online communications not only discovered topics that can serve as a gauge of the amount of Zika-related discussion but also revealed the most probable topics that construct the body of their communications (in this case, “travel plan/change” and “insect repellent”). The other behavioral item of the survey, which asks about recent preventive action to avoid Zika infection, correlates with the topic “insect repellent.” Some of the discovered topics do not correlate significantly with any of the Zika items. For instance, topic 104/LDA200 can be interpreted as capturing those communications regarding the “rio2016 olympics.” Because none of the analyzed survey items were directly related to this topic, there is not a significant correlation between this topic and any of the survey items. This observation indicates the possibility of discovering topics by analyzing the content of the communications that may not be measured by conducted surveys but were captured by our model.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Probability of topics (circle markers) and survey items (square markers) over time. Using the trained model, the probability of each topic can be calculated in each week. The survey items at each week are the average of the participants' responses. Survey items missing in some weeks were not asked of the respondents in those weeks. Left: Attitude toward ground spraying (survey) compared with congress funding (Twitter) (197/LDA200). Right: Knowledge about microcephaly (survey) compared with Zika protection and travel (Twitter) (149/LDA150). LDA: latent Dirichlet allocation.</p>
          </caption>
          <graphic xlink:href="publichealth_v4i1e16_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Summary of topic correlates for survey items. LDA: latent Dirichlet allocation.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="30"/>
            <col width="740"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Category, survey item, and topic</td>
                <td>Correlation (<italic>P</italic> value)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="3"><bold>Attitude</bold></td>
                <td><break/></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td colspan="2"><bold>Ground spraying (ZI-22)</bold></td>
                <td><break/></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td><break/></td>
                <td>“Congress funding” (197/LDA200)</td>
                <td>.88 (&#60;.001)</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td><break/></td>
                <td>“Zika protection and travel” (149/LDA150)</td>
                <td>.68 (&#60;.001)</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td colspan="2"><bold>Aerial spraying (ZI-23)</bold></td>
                <td><break/></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td><break/></td>
                <td>“Congress funding” (197/LDA200)</td>
                <td>.92 (&#60;.001)</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td><break/></td>
                <td>“Zika in Miami” (87/LDA200)</td>
                <td>.67 (&#60;.001)</td>
              </tr>
              <tr valign="top">
                <td colspan="3"><bold>Knowledge</bold></td>
                <td><break/></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td colspan="2"><bold>Microcephaly (ZG-05)</bold></td>
                <td><break/></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td><break/></td>
                <td>“Zika protection and travel” (149/LDA150)</td>
                <td>.52 (&#60;.001)</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td><break/></td>
                <td>“Congress funding” (99/LDA100)</td>
                <td>.51 (&#60;.001)</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td><break/></td>
                <td>“Microcephaly” (96/LDA100)</td>
                <td>.43 (&#60;.001)</td>
              </tr>
              <tr valign="top">
                <td colspan="3"><bold>Behavior</bold></td>
                <td><break/></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td colspan="2"><bold>Getting Zika vaccine (ZG-47)</bold></td>
                <td><break/></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td><break/></td>
                <td>“Blood transfusion tech” (73/LDA150)</td>
                <td>–.68 (&#60;.001)</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td colspan="2"><bold>Practicing any preventive behavior to avoid Zika (ZG-54)</bold></td>
                <td><break/></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td><break/></td>
                <td>“Zika” (57/LDA100)</td>
                <td>.65 (&#60;.001)</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td><break/></td>
                <td>“Insect repellent” (37/LDA200)</td>
                <td>.59 (&#60;.001)</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td colspan="2"><bold>Discussing Zika with family/friends (GM-20)</bold></td>
                <td><break/></td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td><break/></td>
                <td>“Zika” (57/LDA100)</td>
                <td>.47 (&#60;.001)</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td><break/></td>
                <td>“Insect repellent” (37/LDA200)</td>
                <td>.45 (&#60;.001)</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td><break/></td>
                <td>“Congress funding” (197/LDA200)</td>
                <td>.44 (&#60;.001)</td>
              </tr>
              <tr valign="top">
                <td><break/></td>
                <td><break/></td>
                <td>“Zika protection and travel” (149/LDA150)</td>
                <td>.30 (&#60;.001)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Word cloud of topics that showed significant correlation with survey items. LDA: latent Dirichlet allocation.</p>
          </caption>
          <graphic xlink:href="publichealth_v4i1e16_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>In this paper, we used topic modeling to analyze the content of online communications on the Twitter microblogging service. Instead of relying simply on the volume of the online communications, we analyzed their content to identify topics whose variations over time could be associated with the variations in attitudes, knowledge, and behaviors measured with survey methods. After collecting a corpus of tweets related to Zika virus, we aggregated them into longer documents by either using location or timestamp information. To parse out topics of discussion, we used LDA probabilistic topic modeling and calculated the variation of topics over time using Bayesian inference. Our results demonstrated the possibility of discovering evidence from social media that enables us to identify community attitudes, knowledge, and behaviors in a timely manner at low cost. Our methodology can be applied to collections of tweets from other domains of interest, from business to politics to other public health areas.</p>
        <p>We went beyond the frequency-based measures by analyzing the content of online discussions on Twitter. Tweets are in-the-moment updates and contain useful observations about the larger world. Analyzing the actual content of Twitter messages provides a finer granularity and enables us to identify topics of the discussions and associate particular topics with particular measures of interest. For instance, our analysis revealed not only that community members have discussed Zika with their family and friends but also that their discussions were primarily about insect repellent, Congress funding, and Zika-related travel.</p>
        <p>Topic modeling techniques can discover patterns from a collection of text documents and automatically extract topics in the form of multinomial distributions over words. A challenge in applying topic models to any text mining problem is labeling and interpreting a multinomial topic model accurately. Interpreting the topics or labeling them is a step that is usually done manually after topic discovery. In our analyses, the correlations with attitude, knowledge, and behavior items helped to avoid the challenge of topic interpretation.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>One of the limitations of our approach is that we are unable to discover on Twitter all of the constructs that are being measured in a survey because not all of the items of interest may appear in online communications content. However, this approach allows researchers to identify topics that have not been measured in a survey but have appeared in online discussions. These topics can then directly be measured using the proposed methodology or can be included in future surveys.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In this paper, we investigated the associations between the online communications on Twitter and attitudes, knowledge, and behaviors regarding the public health challenge of the Zika virus. Our results demonstrated that it is possible to uncover topics of discussions from Twitter communications that are associated with Zika-related attitudes, knowledge, and behaviors of populations over time. Our analyses showed that the discovered topics of US congressional funding for Zika, microcephaly, Zika-related travel discussions, insect repellent, blood transfusion technology, and Zika in Miami can be used to monitor and predict the population’s attitudes toward ground and aerial special spraying, knowledge about microcephaly, and various preventive behaviors such as travel change or getting a Zika vaccine. Our work demonstrated that social media data can be used as a complementary source of information alongside traditional data sources to gauge patterns of attitudes, knowledge, and behaviors of a population and that further developing and improving text mining tools has practical applications in the public health domain.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <app id="app1">
        <title>Multimedia Appendix 1</title>
        <p>Nationally representative telephone samples: SSRS omnibus and custom studies.</p>
        <media xlink:href="publichealth_v4i1e16_app1.pdf" xlink:title="PDF File (Adobe PDF File), 100KB"/>
      </app>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">LDA</term>
          <def>
            <p>latent Dirichlet allocation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">tfidf</term>
          <def>
            <p>term frequency–inverse document frequency</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This study was funded by the Science of Science Communication endowment of the Annenberg Public Policy Center of the University of Pennsylvania and the National Institutes of Health.</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>MF prepared the first draft of the paper and did the data analysis and contributed to data interpretation and study design. MSC helped with Twitter data collection. KW provided the survey data. DA and KHJ were the principal investigators of the study. All authors contributed to editing and revising the paper.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
        <source>Centers for Disease Control and Prevention</source>  
        <year>2017</year>  
        <comment>Areas with risk of Zika 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="https://www.cdc.gov/zika/geo/index.html">https://www.cdc.gov/zika/geo/index.html</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6qsoikiDb"/></comment> </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Ajzen</surname>
            <given-names>I</given-names>
          </name>
        </person-group>
        <person-group person-group-type="editor">
          <name name-style="western">
            <surname>Deaux</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Snyder</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <source>Attitudes and Persuasion: The Oxford Handbook of Personality and Social Psychology</source>  
        <year>2012</year>  
        <publisher-loc>New York</publisher-loc>
        <publisher-name>Oxford University Press</publisher-name></nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Maio</surname>
            <given-names>GR</given-names>
          </name>
          <name name-style="western">
            <surname>Haddock</surname>
            <given-names>G</given-names>
          </name>
        </person-group>
        <source>The Psychology of Attitudes and Attitude Change</source>  
        <year>2010</year>  
        <publisher-loc>Thousand Oaks</publisher-loc>
        <publisher-name>Sage Publications</publisher-name></nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Fishbein</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Ajzen</surname>
            <given-names>I</given-names>
          </name>
        </person-group>
        <source>Belief, Attitude, Intention, and Behavior: An Introduction to Theory and Research</source>  
        <year>1975</year>  
        <publisher-loc>Reading</publisher-loc>
        <publisher-name>Addison-Wesley</publisher-name></nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Ajzen</surname>
            <given-names>I</given-names>
          </name>
        </person-group>
        <person-group person-group-type="editor">
          <name name-style="western">
            <surname>Kuhl</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Beckmann</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>From intentions to actions: a theory of planned behavior</article-title>
        <source>Action Control: From Cognition to Behavior</source>  
        <year>1985</year>  
        <publisher-loc>New York</publisher-loc>
        <publisher-name>Springer-Verlag</publisher-name>
        <fpage>11</fpage>  
        <lpage>39</lpage> </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Wyer</surname>
            <given-names>RS</given-names>
          </name>
          <name name-style="western">
            <surname>Albarracín</surname>
            <given-names>D</given-names>
          </name>
        </person-group>
        <person-group person-group-type="editor">
          <name name-style="western">
            <surname>Albarracín</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Johnson</surname>
            <given-names>BT</given-names>
          </name>
          <name name-style="western">
            <surname>Zanna</surname>
            <given-names>MP</given-names>
          </name>
        </person-group>
        <article-title>Belief formation, organization, and change: cognitive and motivational influences</article-title>
        <source>Handbook of Attitudes</source>  
        <year>2005</year>  
        <publisher-loc>Hillsdale</publisher-loc>
        <publisher-name>Lawrence Erlbaum</publisher-name>
        <fpage>273</fpage>  
        <lpage>322</lpage> </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Albarracín</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Fishbein</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Middlestadt</surname>
            <given-names>S</given-names>
          </name>
        </person-group>
        <article-title>Generalizing behavioral findings across times, samples and measures: a replication and extension in St Vincent and the Grenadines</article-title>
        <source>J App Soc Psychol</source>  
        <year>1998</year>  
        <volume>28</volume>  
        <fpage>657</fpage>  
        <lpage>674</lpage> </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Chan</surname>
            <given-names>MS</given-names>
          </name>
          <name name-style="western">
            <surname>Morales</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Farhadloo</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Palmer</surname>
            <given-names>RP</given-names>
          </name>
          <name name-style="western">
            <surname>Albarracín</surname>
            <given-names>D</given-names>
          </name>
        </person-group>
        <person-group person-group-type="editor">
          <name name-style="western">
            <surname>Blanton</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Webster</surname>
            <given-names>GD</given-names>
          </name>
        </person-group>
        <article-title>Harvesting and harnessing social media data for psychological research</article-title>
        <source>Social Psychological Research Methods: Social Psychological Measurement</source>  
        <year>2018</year>  
        <publisher-loc>[in press]</publisher-loc>
        <publisher-name>[in press]</publisher-name></nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zamal</surname>
            <given-names>FA</given-names>
          </name>
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>W</given-names>
          </name>
        </person-group>
        <article-title>Homophily and latent attribute inference: inferring latent attributes of Twitter users from neighbors</article-title>
        <year>2012</year>  
        <conf-name>Sixth International AAAI Conference on Weblogs and Social Media</conf-name>
        <conf-date>2012</conf-date>
        <conf-loc>Dublin</conf-loc></nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Van Durme</surname>
            <given-names>B</given-names>
          </name>
        </person-group>
        <article-title>Streaming analysis of discourse participants</article-title>
        <year>2012</year>  
        <conf-name>Proceedings of the Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning</conf-name>
        <conf-date>2012</conf-date>
        <conf-loc>Jeju Island</conf-loc>
        <fpage>48</fpage>  
        <lpage>58</lpage> </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Fang</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Ounis</surname>
            <given-names>I</given-names>
          </name>
          <name name-style="western">
            <surname>Habel</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Macdonald</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Limsopatham</surname>
            <given-names>N</given-names>
          </name>
        </person-group>
        <article-title>Topic-centric classification of twitter user's political orientation</article-title>
        <year>2015</year>  
        <conf-name>Proceedings of the 38th International ACM SIGIR Conference on Research and Development in Information Retrieval</conf-name>
        <conf-date>2015</conf-date>
        <conf-loc>Santiago</conf-loc>
        <fpage>791</fpage>  
        <lpage>794</lpage> </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>De Choudhury</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Gamon</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Counts</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Horvitz</surname>
            <given-names>E</given-names>
          </name>
        </person-group>
        <article-title>Predicting depression via social media</article-title>
        <year>2013</year>  
        <conf-name>Proceedings of the Seventh International Conference of Web and Social Media</conf-name>
        <conf-date>2013</conf-date>
        <conf-loc>Cambridge</conf-loc>
        <fpage>1</fpage>  
        <lpage>10</lpage> </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Volkova</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Bachrach</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Armstrong</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Sharma</surname>
            <given-names>V</given-names>
          </name>
        </person-group>
        <article-title>Inferring latent user properties from texts published in social media</article-title>
        <year>2015</year>  
        <conf-name>Twenty-Ninth AAAI Conference on Artificial Intelligence</conf-name>
        <conf-date>2015</conf-date>
        <conf-loc>Austin</conf-loc>
        <fpage>4296</fpage>  
        <lpage>4297</lpage> </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Schwartz</surname>
            <given-names>HA</given-names>
          </name>
          <name name-style="western">
            <surname>Eichstaedt</surname>
            <given-names>JC</given-names>
          </name>
          <name name-style="western">
            <surname>Kern</surname>
            <given-names>ML</given-names>
          </name>
          <name name-style="western">
            <surname>Dziurzynski</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Ramones</surname>
            <given-names>SM</given-names>
          </name>
          <name name-style="western">
            <surname>Agrawal</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Shah</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Kosinski</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Stillwell</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Seligman</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Ungar</surname>
            <given-names>LH</given-names>
          </name>
        </person-group>
        <article-title>Personality, gender, and age in the language of social media: the open-vocabulary approach</article-title>
        <source>PLoS One</source>  
        <year>2013</year>  
        <volume>8</volume>  
        <issue>9</issue>  
        <fpage>e73791</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0073791"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1371/journal.pone.0073791</pub-id>
        <pub-id pub-id-type="medline">24086296</pub-id>
        <pub-id pub-id-type="pii">PONE-D-13-03858</pub-id>
        <pub-id pub-id-type="pmcid">PMC3783449</pub-id></nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Keeter</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Kennedy</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Dimock</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Best</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Gauging the impact of growing nonresponse on estimates from a national RDD telephone survey</article-title>
        <source>Public Opin Q</source>  
        <year>2006</year>  
        <volume>70</volume>  
        <issue>5</issue>  
        <fpage>759</fpage>  
        <lpage>779</lpage> <pub-id pub-id-type="doi">10.1093/poq/nfl035</pub-id></nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="web">
        <source>Assessing the representativeness of public opinion surveys</source>  
        <year>2017</year>  
        <access-date>2018-01-15</access-date>
        <publisher-loc>Washington</publisher-loc>
        <publisher-name>Pew Internet and American Life Project</publisher-name>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://assets.pewresearch.org/wp-content/uploads/sites/5/legacy-pdf/Assessing%20the%20Representativeness%20of%20Public%20Opinion%20Surveys.pdf">http://assets.pewresearch.org/wp-content/uploads/sites/5/legacy-pdf/Assessing%20the%20Representativeness%20of%20Public%20Opinion%20Surveys.pdf</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6wVF2cd70"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Blei</surname>
            <given-names>DM</given-names>
          </name>
          <name name-style="western">
            <surname>Ng</surname>
            <given-names>AY</given-names>
          </name>
          <name name-style="western">
            <surname>Jordan</surname>
            <given-names>MI</given-names>
          </name>
        </person-group>
        <article-title>Latent Dirichlet allocation</article-title>
        <source>J Mach Learn Res</source>  
        <year>2003</year>  
        <volume>3</volume>  
        <fpage>993</fpage>  
        <lpage>1022</lpage> </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Farhadloo</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Rolland</surname>
            <given-names>E</given-names>
          </name>
        </person-group>
        <article-title>Fundamentals of sentiment analysis and its applications</article-title>
        <source>Sentiment Analysis and Ontology Engineering</source>  
        <year>2016</year>  
        <publisher-loc>New York</publisher-loc>
        <publisher-name>Springer International</publisher-name>
        <fpage>1</fpage>  
        <lpage>24</lpage> </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Zhao</surname>
            <given-names>WX</given-names>
          </name>
          <name name-style="western">
            <surname>Jiang</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Weng</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>He</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Lim</surname>
            <given-names>EP</given-names>
          </name>
          <name name-style="western">
            <surname>Yan</surname>
            <given-names>H</given-names>
          </name>
        </person-group>
        <article-title>Comparing Twitter and traditional media using topic models</article-title>
        <year>2011</year>  
        <conf-name>Proceedings of the 33rd European Conference on Advances in Information Retrieval</conf-name>
        <conf-date>2011</conf-date>
        <conf-loc>Heidelberg</conf-loc>
        <fpage>338</fpage>  
        <lpage>349</lpage> </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Kumar</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Morstatter</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>H</given-names>
          </name>
        </person-group>
        <source>Twitter Data Analytics</source>  
        <year>2013</year>  
        <publisher-loc>New York</publisher-loc>
        <publisher-name>Springer Science &#38; Business Media</publisher-name></nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Ireland</surname>
            <given-names>ME</given-names>
          </name>
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>Q</given-names>
          </name>
          <name name-style="western">
            <surname>Schwartz</surname>
            <given-names>HA</given-names>
          </name>
          <name name-style="western">
            <surname>Ungar</surname>
            <given-names>LH</given-names>
          </name>
        </person-group>
        <article-title>Action tweets linked to reduced county-level HIV prevalence in the United States: online messages and structural determinants</article-title>
        <source>AIDS Behav</source>  
        <year>2016</year>  
        <volume>20</volume>  
        <issue>6</issue>  
        <fpage>1256</fpage>  
        <lpage>1264</lpage> <pub-id pub-id-type="doi">10.1007/s10461-015-1252-2</pub-id></nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
