<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JPH</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Public Health Surveill</journal-id>
      <journal-title>JMIR Public Health and Surveillance</journal-title>
      <issn pub-type="epub">2369-2960</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
    <article-id pub-id-type="publisher-id">v4i4e10834</article-id>
    <article-id pub-id-type="pmid">30522989</article-id>
    <article-id pub-id-type="doi">10.2196/10834</article-id>
    <article-categories>
      <subj-group subj-group-type="heading">
        <subject>Original Paper</subject>
      </subj-group>
      <subj-group subj-group-type="article-type">
        <subject>Original Paper</subject>
      </subj-group>
    </article-categories>
    <title-group>
      <article-title>Characterizing Tweet Volume and Content About Common Health Conditions Across Pennsylvania: Retrospective Analysis</article-title>
    </title-group>
    <contrib-group>
      <contrib contrib-type="editor">
        <name>
          <surname>Eysenbach</surname>
          <given-names>Gunther</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Hoffman</surname>
          <given-names>Beth</given-names>
        </name>
      </contrib>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Liang</surname>
          <given-names>Hai</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="author" id="contrib1">
        <name name-style="western">
          <surname>Tufts</surname>
          <given-names>Christopher</given-names>
        </name>
        <degrees>MSc</degrees>
        <xref rid="aff1" ref-type="aff">1</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0001-7332-9358</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib2">
        <name name-style="western">
          <surname>Polsky</surname>
          <given-names>Daniel</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff2" ref-type="aff">2</xref>
        <xref rid="aff3" ref-type="aff">3</xref>
        <xref rid="aff4" ref-type="aff">4</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0001-8403-9612</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib3">
        <name name-style="western">
          <surname>Volpp</surname>
          <given-names>Kevin G</given-names>
        </name>
        <degrees>MD, PhD</degrees>
        <xref rid="aff2" ref-type="aff">2</xref>
        <xref rid="aff5" ref-type="aff">5</xref>
        <xref rid="aff6" ref-type="aff">6</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-1423-4599</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib4">
        <name name-style="western">
          <surname>Groeneveld</surname>
          <given-names>Peter W</given-names>
        </name>
        <degrees>MD, MS</degrees>
        <xref rid="aff2" ref-type="aff">2</xref>
        <xref rid="aff6" ref-type="aff">6</xref>
        <xref rid="aff7" ref-type="aff">7</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-7374-4292</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib5">
        <name name-style="western">
          <surname>Ungar</surname>
          <given-names>Lyle</given-names>
        </name>
        <degrees>PhD</degrees>
        <xref rid="aff1" ref-type="aff">1</xref>
        <xref rid="aff8" ref-type="aff">8</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0003-2047-1443</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib6" corresp="yes">
      <name name-style="western">
        <surname>Merchant</surname>
        <given-names>Raina M</given-names>
      </name>
      <degrees>MD, MSHP</degrees>
      <xref rid="aff1" ref-type="aff">1</xref>
      <address>
        <institution>Center for Digital Health</institution>
        <institution>Penn Medicine</institution>
        <addr-line>3400 Civic Center Boulevard</addr-line>
        <addr-line>Philadelphia, PA</addr-line>
        <country>United States</country>
        <phone>1 215 615 0890</phone>
        <email>Raina.Merchant@uphs.upenn.edu</email>
      </address>  
      <xref rid="aff9" ref-type="aff">9</xref>
      <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-9801-6881</ext-link></contrib>
      <contrib contrib-type="author" id="contrib7">
        <name name-style="western">
          <surname>Pelullo</surname>
          <given-names>Arthur P</given-names>
        </name>
        <degrees>MS</degrees>
        <xref rid="aff1" ref-type="aff">1</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-3667-4301</ext-link>
      </contrib>
    </contrib-group>
    <aff id="aff1">
    <label>1</label>
    <institution>Center for Digital Health</institution>
    <institution>Penn Medicine</institution>  
    <addr-line>Philadelphia, PA</addr-line>
    <country>United States</country></aff>
    <aff id="aff2">
    <label>2</label>
    <institution>Leonard Davis Institute of Health Economics</institution>
    <institution>University of Pennsylvania</institution>  
    <addr-line>Philadelphia, PA</addr-line>
    <country>United States</country></aff>
    <aff id="aff3">
    <label>3</label>
    <institution>Center for Health Equity Research and Promotion</institution>
    <institution>University of Pennsylvania</institution>  
    <addr-line>Philadelphia, PA</addr-line>
    <country>United States</country></aff>
    <aff id="aff4">
    <label>4</label>
    <institution>Population Studies Center</institution>
    <institution>School of Arts and Sciences</institution>
    <institution>University of Pennsylvania</institution>  
    <addr-line>Philadelphia, PA</addr-line>
    <country>United States</country></aff>
    <aff id="aff5">
    <label>5</label>
    <institution>Department of Medical Ethics and Policy</institution>
    <institution>University of Pennsylvania</institution>  
    <addr-line>Philadelphia, PA</addr-line>
    <country>United States</country></aff>
    <aff id="aff6">
      <label>6</label>
      <institution>Philadelphia VA Medical Center</institution>
      <addr-line>Philadelphia, PA</addr-line>
      <country>United States</country>
    </aff>
    <aff id="aff7">
    <label>7</label>
    <institution>Department of General Internal Medicine</institution>
    <institution>University of Pennsylvania</institution>  
    <addr-line>Philadelphia, PA</addr-line>
    <country>United States</country></aff>
    <aff id="aff8">
    <label>8</label>
    <institution>Department of Computer and Information Science</institution>
    <institution>School of Engineering and Applied Science</institution>  
    <institution>University of Pennsylvania</institution>  
    <addr-line>Philadelphia, PA</addr-line>
    <country>United States</country></aff>
    <aff id="aff9">
    <label>9</label>
    <institution>Department of Emergency Medicine</institution>
    <institution>University of Pennsylvania</institution>  
    <addr-line>Philadelphia, PA</addr-line>
    <country>United States</country></aff>
    <author-notes>
      <corresp>Corresponding Author: Raina M Merchant 
      <email>Raina.Merchant@uphs.upenn.edu</email></corresp>
    </author-notes>
    <pub-date pub-type="collection"><season>Oct-Dec</season><year>2018</year></pub-date>
    <pub-date pub-type="epub">
      <day>06</day>
      <month>12</month>
      <year>2018</year>
    </pub-date>
    <volume>4</volume>
    <issue>4</issue>
    <elocation-id>e10834</elocation-id>
    <!--history from ojs - api-xml-->
    <history>
      <date date-type="received">
        <day>20</day>
        <month>4</month>
        <year>2018</year>
      </date>
      <date date-type="rev-request">
        <day>7</day>
        <month>6</month>
        <year>2018</year>
      </date>
      <date date-type="rev-recd">
        <day>18</day>
        <month>7</month>
        <year>2018</year>
      </date>
      <date date-type="accepted">
        <day>23</day>
        <month>7</month>
        <year>2018</year>
      </date>
    </history>
    <!--(c) the authors - correct author names and publication date here if necessary. Date in form ', dd.mm.yyyy' after jmir.org-->
    <copyright-statement>©Christopher Tufts, Daniel Polsky, Kevin G Volpp, Peter W Groeneveld, Lyle Ungar, Raina M Merchant, Arthur P Pelullo. Originally published in JMIR Public Health and Surveillance (http://publichealth.jmir.org), 06.12.2018.</copyright-statement>
    <copyright-year>2018</copyright-year>
    <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
      <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Public Health and Surveillance, is properly cited. The complete bibliographic information, a link to the original publication on http://publichealth.jmir.org, as well as this copyright and license information must be included.</p>
    </license>  
    <self-uri xlink:href="http://publichealth.jmir.org/2018/4/e10834/" xlink:type="simple"/>
    <abstract>
      <sec sec-type="background">
        <title>Background</title>
        <p>Tweets can provide broad, real-time perspectives about health and medical diagnoses that can inform disease surveillance in geographic regions. Less is known, however, about how much individuals post about common health conditions or what they post about.</p>
      </sec>
      <sec sec-type="objective">
        <title>Objective</title>
        <p>We sought to collect and analyze tweets from 1 state about high prevalence health conditions and characterize the tweet volume and content.</p>
      </sec>
      <sec sec-type="methods">
        <title>Methods</title>
        <p>We collected 408,296,620 tweets originating in Pennsylvania from 2012-2015 and compared the prevalence of 14 common diseases to the frequency of disease mentions on Twitter. We identified and corrected bias induced due to variance in disease term specificity and used the machine learning approach of differential language analysis to determine the content (words and themes) most highly correlated with each disease.</p>
      </sec>
      <sec sec-type="results">
        <title>Results</title>
        <p>Common disease terms were included in 226,802 tweets (174,381 tweets after disease term correction). Posts about breast cancer (39,156/174,381 messages, 22.45%; 306,127/12,702,379 prevalence, 2.41%) and diabetes (40,217/174,381 messages, 23.06%; 2,189,890/12,702,379 prevalence, 17.24%) were overrepresented on Twitter relative to disease prevalence, whereas hypertension (17,245/174,381 messages, 9.89%; 4,614,776/12,702,379 prevalence, 36.33%), chronic obstructive pulmonary disease (1648/174,381 messages, 0.95%; 1,083,627/12,702,379 prevalence, 8.53%), and heart disease (13,669/174,381 messages, 7.84%; 2,461,721/12,702,379 prevalence, 19.38%) were underrepresented. The content of messages also varied by disease. Personal experience messages accounted for 12.88% (578/4487) of prostate cancer tweets and 24.17% (4046/16,742) of asthma tweets. Awareness-themed tweets were more often about breast cancer (9139/39,156 messages, 23.34%) than asthma (1040/16,742 messages, 6.21%). Tweets about risk factors were more often about heart disease (1375/13,669 messages, 10.06%) than lymphoma (105/4927 messages, 2.13%).</p>
      </sec>
      <sec sec-type="conclusions">
        <title>Conclusions</title>
        <p>Twitter provides a window into the Web-based visibility of diseases and how the volume of Web-based content about diseases varies by condition. Further, the potential value in tweets is in the rich content they provide about individuals’ perspectives about diseases (eg, personal experiences, awareness, and risk factors) that are not otherwise easily captured through traditional surveys or administrative data.</p>
      </sec>
    </abstract>
    <kwd-group>
      <kwd>Twitter messaging</kwd>
      <kwd>disease</kwd>
      <kwd>prevalence</kwd>
      <kwd>public health surveillance</kwd>
      <kwd>social media</kwd>
    </kwd-group></article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Communities are increasingly identified as a driver of health, yet our ability to track changes in the health of communities has been limited by the nature of community-level data. These data are typically survey-based or derived from administrative health care claims. In both of these cases, delays in data availability can preclude timely interventions. Social media channels, like Twitter, offer a new opportunity to track regional health trends by observing health-related communication generated by the public and for the public [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref7">7</xref>].</p>
      <p>There is an opportunity to determine how emerging digital data sources are complementary (ie, social media data have similar findings to traditional health data sources) and augmentative (ie, social media provides new real-time information about health not available in data collected through traditional means). To better quantify the value added by social media for public health surveillance, an understanding of how much data exist about different health conditions is needed. High prevalence conditions that affect much of a population may be underrepresented on the Web, whereas low prevalence conditions could be discussed more frequently on Twitter. Further, it is likely that there are different drivers (eg, disease morbidity and mortality, celebrity news, acuity, and stigma) that may influence the volume of Web-based health conversations.</p>
      <p>To better characterize health-related tweet volume and content, we compared the volume of Twitter messages about common diseases with the prevalence of the disease determined from inpatient and outpatient claims. We then characterized the public perception of common diseases by identifying the content (words and themes) most frequently associated with each condition.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Context</title>
        <p>This was a retrospective analysis of publicly available data about health conditions posted on Twitter in Pennsylvania. This study was approved by the University of Pennsylvania Institutional Review Board.</p>
        <p>We collected tweets originating from Pennsylvania related to 5 of the top causes of death in the United States. The causes of death were then further divided into subcategories: heart disease (heart disease and hypertension), diabetes, stroke, cancer (breast, skin, lung, lymphoma, leukemia, prostate, pancreatic, and ovarian), and chronic lung disease (asthma and chronic obstructive pulmonary disease [COPD]).</p>
      </sec>
      <sec>
        <title>Data Sources</title>
        <sec>
          <title>Twitter Data</title>
          <p>Twitter is a social media platform that allows users to send and receive short messages called “tweets.” At the time of data collection, tweets were limited to 140 characters; this limit was doubled to 280 characters in 2017. All tweets were collected via the Twitter Application Programming Interface (API) as described in Preotiuc-Pietro et al [<xref ref-type="bibr" rid="ref8">8</xref>]. First, the Twitter Streaming API was used to collect a random 1% sample of public tweets from 2012-2015. This initial dataset was then filtered to contain only geolocated tweets or tweets originating from users with nonempty location fields in their profile. The county of origin of each tweet user was determined, and the dataset was filtered to obtain only tweets for users in Pennsylvania. To increase the sample size of tweets from the state, all unique user IDs were recorded, and the Twitter search API was used to extract timelines (each user’s prior 3200 tweets) filtered by timestamps ranging from 2012-2015.</p>
        </sec>
        <sec>
          <title>Disease Keywords</title>
          <p>The dataset analyzed was filtered for messages containing at least 1 keyword referencing a disease. The lexica of keywords (<xref ref-type="app" rid="app1">Multimedia Appendix 1</xref>) for each disease was derived from the Consumer Health Vocabulary [<xref ref-type="bibr" rid="ref9">9</xref>] and supplemented by the authors of the study. The precision of the keyword filtering was estimated for each disease via a correction factor derived from a manual review of the tweets. The correction factor was then used to calculate corrected message counts.</p>
        </sec>
        <sec>
          <title>Tweet Location</title>
          <p>All tweets used in this analysis were classified as originating from a county in Pennsylvania. The tweets were mapped to a county using a combination of coordinates and the user-provided location field as per the method described in Schwartz et al [<xref ref-type="bibr" rid="ref10">10</xref>]. For county mapping, we identified if coordinates were present with the tweet. If coordinates were present, these were used to identify the county of origin via the Google Maps API. For tweets without coordinates, we used the location field provided in the user’s profile to identify the county. When the field contained only a city or city nickname, it was mapped to a county as long as it met the following criteria: at least 90% of the population in all the cities with that name are in 1 specific city. For example, “Chicago” would get mapped to Chicago, Illinois, because greater than 90% of the population in all cities named “Chicago” in the United States are located in Chicago, Illinois. “Springfield” would not be mapped, as there are approximately 50 different regions named “Springfield” in the United States of similar population density. The same process in the previous step was used if the county name was listed without a specified state. Cities that were among the top 1000 English or Spanish nouns, verbs, and adjectives were not considered.</p>
        </sec>
        <sec>
          <title>Deriving Topics About Individual Diseases</title>
          <p>Utilizing all messages from the dataset, 200 topics (ie, groups of co-occurring words) were generated using the Mallet implementation of latent Dirichlet allocation (LDA). The input data for LDA were filtered to remove all disease keywords along with all words used by less than 5% of tweet authors.</p>
          <p>The topic distribution of each message was then calculated as described in Schwartz et al [<xref ref-type="bibr" rid="ref11">11</xref>]. The Pearson correlation between the topic distribution and a binary label indicating whether or not the tweet contained a mention of the disease was calculated. All correlations were corrected for false discovery rate using the Benjamini-Hochberg procedure.</p>
        </sec>
        <sec>
          <title>Organizing Topics into Themes</title>
          <p>We created 10 themes by clustering the 200 LDA topics using nonnegative matrix factorization of the LDA topics derived from the messages. We identified the resulting clusters of topics as “themes.” The LDA topics specify the probability of each word given each topic. Nonnegative matrix factorization provides a weighted value indicating how much each topic, and hence each word in each topic, contributes to each theme. Theme distributions for each message were then calculated in the same manner as described previously for the topic distributions, using Bayes’ rule to compute p(theme&#124;word). The resulting themes were manually labeled as follows: News, Research, Slang or Popular Culture Reference, Environment, Diagnosis and Survivorship, Treatment, Diet and Prevention, Awareness, Risk Factor, and Personal Experience.</p>
        </sec>
      </sec>
      <sec>
        <title>Statistical Analysis</title>
        <sec>
          <title>Disease Prevalence</title>
          <p>Outpatient and inpatient hospitalization claims were retrieved from 2013 and 2014 claims data from the Pennsylvania Health Care Cost Containment Council. Claims corresponding to each disease were identified using the primary and secondary diagnostic codes that were encoded via the corresponding International Classification of Diseases, 9th edition. The codes pertaining to a specified disease were determined using the grouping provided by the Clinical Classifications Software developed as part of the Healthcare Cost and Utilization Project [<xref ref-type="bibr" rid="ref12">12</xref>]. Disease prevalence is defined as the number of unique patients in each county that have a claim related to a given disease divided by the total population of the county. The average of those county-level prevalences was used as the state prevalence for each disease.</p>
        </sec>
        <sec>
          <title>Adjusted Message Counts and Correction Factors</title>
          <p>Due to ambiguity in some of the disease lexica, the message counts for each disease need to be scaled to reflect that many uses of terms such as “heart attack” or “stroke” are metaphorical or refer to other subjects such as golf “stroke.” The scaling is accomplished via a correction factor based on the manual review of tweets by 2 researchers using the methods outlined in Weeg et al [<xref ref-type="bibr" rid="ref13">13</xref>].</p>
          <p>To calculate the correction factor for a disease, a sample of 30 tweets was drawn for each keyword. Those tweets were then classified as being a reference to a disease or not a reference to a disease. The percentage of tweets from the sample pertaining to a disease was identified as the correction factor for that keyword, <italic>w</italic><sub><italic>k</italic></sub>. To calculate the corrected message count for a disease (<xref ref-type="fig" rid="figure1">Figure 1</xref>), the product of the correction factor, <italic>w</italic><sub><italic>k</italic></sub>, and the number of messages containing that keyword, <italic>n</italic><sub><italic>k</italic></sub>, is summed over all keywords for a single disease.</p>
        </sec>
        <sec>
          <title>Comparing Tweet Volume to Disease Prevalence in Pennsylvania</title>
          <p>We used summary statistics to compare the volume of posts on Twitter with the disease prevalence in Pennsylvania for those conditions.</p>
        </sec>
        <sec>
          <title>Associating Disease with Themes</title>
          <p>The distribution of themes was investigated using 2 different metrics: the probability of the theme given the disease and the pointwise mutual information (PMI) between the disease and theme (<xref ref-type="fig" rid="figure2">Figure 2</xref>). The probability of the theme given the disease provides insight into the most prevalent topics of conversation for the given disease.</p>
          <p>The PMI of a disease and theme provides a measure of how often a disease and theme co-occur relative to how often the 2 would co-occur if independent of one another. This provides insight into theme-disease co-occurrence that may be somewhat rare but is significantly different from random chance.</p>
          <fig id="figure1" position="float">
            <label>Figure 1</label>
            <caption>
              <p>Equation for deriving a disease's corrected message count.</p>
            </caption>
            <graphic xlink:href="publichealth_v4i4e10834_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Equation for deriving the pointwise mutual information between a disease and a theme. PMI: pointwise mutual information.</p>
            </caption>
            <graphic xlink:href="publichealth_v4i4e10834_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Tweet Volume and Disease Prevalence Comparison</title>
        <sec>
          <title>Tweet Volume</title>
          <p>The initial sample of tweets from Pennsylvania consisted of 408,296,620 tweets. The data were filtered for messages containing disease-related language, resulting in a dataset containing 226,802 messages. The estimated size of this dataset was further reduced to 174,381 messages after correction factors were applied to the disease message counts. Breast cancer (n=39,156), stroke (n=53,858), and diabetes (n=41,615) were the most frequent conditions represented in the dataset (<xref ref-type="table" rid="table1">Table 1</xref>).</p>
        </sec>
        <sec>
          <title>Correction Factors and Corrected Message Counts</title>
          <p>Of the 14 diseases, we identified only 2, COPD and stroke, with a correction factor below 90% (<xref ref-type="table" rid="table1">Table 1</xref>). Messages containing terms related to pancreatic and ovarian cancer were always a direct reference to the disease. References to stroke were nonmedical or references to other health topics, such as heat stroke, 84.88% (45,716/53,858 messages) of the time.</p>
        </sec>
        <sec>
          <title>Comparing Tweet Volume to Disease Prevalence in Pennsylvania</title>
          <p>When comparing prevalence to corrected message counts (<xref ref-type="fig" rid="figure3">Figure 3</xref>), we identified that hypertension (17,245/174,381 messages, 9.89%; 4,614,776/12,702,379 prevalence, 36.33%), COPD (1648/174,381 messages, 0.95%; 1,083,627/12,702,379 prevalence, 8.53%), and heart disease (13,669/174,381 messages, 7.84%; 2,461,721/12,702,379 prevalence, 19.38%) were underrepresented on Twitter. Breast cancer was overrepresented when comparing corrected message counts and prevalence (39,156/174,381 messages, 22.45%; 306,127/12,702,379 prevalence, 2.41%).</p>

          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Characteristics of the study sample: tweet data and user data.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="30"/>
              <col width="320"/>
              <col width="150"/>
              <col width="180"/>
              <col width="220"/>
              <col width="100"/>
              <thead>
                <tr valign="top">
                  <td colspan="2">Disease</td>
                  <td>Message count, n</td>
                  <td>Correction factor, %</td>
                  <td>Corrected message count, n</td>
                  <td>Users, n</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="6"><bold>Cancer</bold></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td>Breast cancer</td>
                  <td>39,169</td>
                  <td>100</td>
                  <td>39,156</td>
                  <td>19,960</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td>Leukemia</td>
                  <td>9129</td>
                  <td>95.1</td>
                  <td>8682</td>
                  <td>5855</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td>Lung cancer</td>
                  <td>5745</td>
                  <td>92.6</td>
                  <td>5317</td>
                  <td>3719</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td>Lymphoma</td>
                  <td>5276</td>
                  <td>93.4</td>
                  <td>4927</td>
                  <td>2758</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td>Ovarian cancer</td>
                  <td>3063</td>
                  <td>99.9</td>
                  <td>3060</td>
                  <td>1212</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td>Pancreatic cancer</td>
                  <td>3231</td>
                  <td>100</td>
                  <td>3231</td>
                  <td>1189</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td>Prostate cancer</td>
                  <td>4487</td>
                  <td>100</td>
                  <td>4487</td>
                  <td>2311</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td>Skin cancer</td>
                  <td>7866</td>
                  <td>99.9</td>
                  <td>7859</td>
                  <td>4048</td>
                </tr>
                <tr valign="top">
                  <td colspan="6"><bold>Chronic lung disease</bold></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td>Asthma</td>
                  <td>18,082</td>
                  <td>92.6</td>
                  <td>16,742</td>
                  <td>10,185</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td>Chronic obstructive pulmonary disease</td>
                  <td>2137</td>
                  <td>77.1</td>
                  <td>1648</td>
                  <td>726</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td>Diabetes</td>
                  <td>41,615</td>
                  <td>96.6</td>
                  <td>40,217</td>
                  <td>16,321</td>
                </tr>
                <tr valign="top">
                  <td colspan="6"><bold>Heart disease</bold></td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td>Heart disease</td>
                  <td>14,740</td>
                  <td>92.7</td>
                  <td>13,669</td>
                  <td>7992</td>
                </tr>
                <tr valign="top">
                  <td><break/></td>
                  <td>Hypertension</td>
                  <td>18,404</td>
                  <td>93.7</td>
                  <td>17,245</td>
                  <td>12,203</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Stroke</td>
                  <td>53,858</td>
                  <td>15.1</td>
                  <td>8141</td>
                  <td>34,298</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>Proportion of messages versus prevalence. COPD: chronic obstructive pulmonary disease.</p>
            </caption>
            <graphic xlink:href="publichealth_v4i4e10834_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
      </sec>
      <sec>
        <title>Characterizing Tweet Topics About Individual Diseases</title>
        
       
        <p>For each disease, we identified all statistically significant (<italic>P</italic>&#60;.001) correlations between topics and a binary label indicating whether or not a message contained a reference to the disease. Topics most correlated with asthma were related to first-person accounts of managing the disease (<italic>attack</italic> and <italic>inhaler</italic>), discomfort associated with the disease (<italic>can’t</italic> and <italic>breathe</italic>), or conditions that pose additional risk, such as allergens (<italic>pollution, mold,</italic> and <italic>dust</italic>). The majority of topics associated with cancer referenced some variety of charity campaign (<italic>pink, ribbon,</italic> and <italic>bracelet</italic>) or awareness effort (<italic>support, awareness, October,</italic> and <italic>pink</italic>). Topics related to stroke were rarely related to cerebrovascular accident, but more often related to other definitions of stroke (eg, golf stroke, paint stroke, and heat stroke). Diabetes, heart disease, and hypertension messages were correlated with topics that focused on disease management (<italic>weight loss, insulin,</italic> and <italic>reduce stress</italic>) and lifestyle choices (<italic>diet</italic> and <italic>exercise</italic>). Complete topic word clouds for each disease can be found in <xref ref-type="app" rid="app2">Multimedia Appendix 2</xref>.</p>
      </sec>
      <sec>
        <title>Characterizing Tweet Themes Across Diseases</title>
        <sec>
          <title>Probability of Theme Given Disease</title>
          <p>The probability of a theme given the disease provides insight into the most prevalent topics of conversation for a specific disease (<xref ref-type="fig" rid="figure4">Figure 4</xref>). We identified that messages referencing breast cancer were more likely to be about disease <italic>awareness</italic> (9139/39,156 messages, 23.34%). Heart disease messages mostly focused on <italic>risk factors</italic> such as stress, sleep, and obesity (1375/13,669 messages, 10.06%). In most cases, asthma messages referenced a <italic>personal experience</italic>.</p>
        </sec>
        <sec>
          <title>Pointwise Mutual Information</title>
          <p>PMI provides a measure of association between the theme and the disease (<xref ref-type="fig" rid="figure4">Figure 4</xref>). We found that diagnosis was a small proportion of the theme distribution for each disease. However, if diagnosis or survivorship is mentioned, it is much more likely to be mentioned in conjunction with lymphoma and leukemia than with the other diseases (PMI 0.67-0.96). Similarly, a relationship between the <italic>risk factors</italic> theme and hypertension and heart disease was found (PMI 0.54-0.77).</p>
          <fig id="figure4" position="float">
            <label>Figure 4</label>
            <caption>
              <p>Theme distribution. P(t&#124;d): probability of theme given disease; COPD: chronic obstructive pulmonary disease; PMI: pointwise mutual information.</p>
            </caption>
            <graphic xlink:href="publichealth_v4i4e10834_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>There is increasing focus on the potential for big data from digital sources in health care. There are challenges associated with using these sources, as they are not always collected for the purposes of health tracking.</p>
        <p>We explored the potential for using Twitter to better understand the Web-based conversation about common health conditions. We identified that in some cases, traditional health metrics are associated with the volume of tweets for a given disease. Although traditional methods of determining disease prevalence are robust, they are often delayed in availability because the process for data acquisition and tracking to determine reliable and valid estimates is considerable. Twitter data are available in real-time, much faster than traditional methods, and with significant volume providing a measure of public discourse about health. While tweets would not replace traditional surveillance in the way initially posed by Google Flu Trends [<xref ref-type="bibr" rid="ref14">14</xref>], they do provide something unique that prevalence statistics do not: a narrative about patient and public thoughts, knowledge, and experiences with health. Twitter provides context to the conversation surrounding disease and allows for characterization of public discussion of high prevalence conditions. We identified that individuals are using Twitter to talk about several diseases, although variation exists in the frequency of disease mention and the content.</p>
        <p>We observed that people are using Twitter for talking about the most common health conditions in Pennsylvania. Prior work has demonstrated the use of Twitter to monitor influenza [<xref ref-type="bibr" rid="ref15">15</xref>], postpartum depression [<xref ref-type="bibr" rid="ref16">16</xref>], concussion [<xref ref-type="bibr" rid="ref17">17</xref>], epilepsy [<xref ref-type="bibr" rid="ref18">18</xref>], and migraine [<xref ref-type="bibr" rid="ref19">19</xref>]. The prevalence of disease has been correlated with the frequency of Twitter posting across a variety of diseases [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref20">20</xref>].</p>
        <p>We also identified variability in disease mentions and the specificity of terms. This finding provides us with several insights. First, heart disease and stroke cannot be analyzed without preprocessing owing to the ambiguity of many of the keywords associated with the diseases. To resolve these varying issues, other methods will need to be developed to filter out much of the noise associated with these diseases. However, this finding also assures us that the majority of the language we find associated with other diseases can be analyzed using the open vocabulary methods previously described with minimal preprocessing.</p>
        <p>Although disease prevalence often coincides with disease mention on Twitter, we found significant variability. The frequency of mentions of breast cancer on Twitter was several orders of magnitude higher than lung cancer, although lung cancer has a higher rate of death and relatively similar prevalence. Breast cancer has a large social media presence owing to awareness and charity campaigns in conjunction with a large community base from those affected by the disease. Lung cancer is tweeted about less often, and such tweets are often the result of a pop culture reference from television or a celebrity death.</p>
        <p>Traditional metrics provide detailed information about prevalence but not insights about people’s understanding, concerns, and questions about health and disease. Our analysis identified several underlying themes that are specific to some diseases. Asthma tweets included references to personal experiences for both the person with asthma and parents expressing concern for their children’s asthma issues. Although the largest portion of tweets for the different types of cancer analyzed often referenced charity and awareness, we observed that across diseases in our sample, cancer conditions had the largest portion of tweets about diagnosis.</p>
        <p>Our findings also give insight into potential opportunities for using Twitter to inform public health and health communications practices. Future work could examine temporal relationships between Twitter volume and semantic data and traditional health data over larger timeframes and at varying timescales. Meaningful temporal relationships may indicate that Twitter data have value as an additional signal to augment existing surveillance systems, allowing for more precise health tracking and timely interventions.</p>
        <p>Twitter data could enhance community building and engagement. Prior work by Neiger et al [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>] found that more two-way communication on Twitter between public health entities and individual citizens led to an increase in action and awareness that, in turn, resulted in an improvement in community health. Providing local and state public health entities with more accurate information on the public discourse surrounding health could enhance communication and contribute to the more effective dissemination of pertinent and timely health information to the public.</p>
        <p>Finally, understanding the interaction between social media use and individual health can identify opportunities for targeted interventions. Prior work by Park et al [<xref ref-type="bibr" rid="ref23">23</xref>] showed that interventions targeting the perception of social media interaction have the potential to positively impact individual health. We have shown that it is possible to capture a measure of public perception of individual diseases at the community level via analysis of topics and themes. These methods can be translated to individual subjects, where disease perceptions could be tracked over time and compared with actual measures of health, potentially identifying opportunities for intervention.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>We compared data from Twitter for 2012-2015 with disease prevalence from 2014, so there may be some variability by year in these estimates. We evaluated unadjusted data from 1 state, so this may not be representative of the conversation about health conditions across other states or geographic regions. Twitter data primarily originate from urban areas; hence, data may not be the most representative sample across the state of Pennsylvania. Future work could explore variations in language on Twitter relative to the size of geographic regions, socioeconomic factors (eg, race, income, urban or rural), and variations in news events or other triggers. Although our correction method eliminates nondisease references, it does not account for metaphorical and joking tweets. This impacts diseases such as heart disease, diabetes, and hypertension.</p>
        <p>The precision of the disease keyword filtering, which is the proportion of selected tweets that were relevant, is reasonably estimated by the corrected message count. However, the recall of the disease keyword filtering, which is the proportion of relevant tweets that were selected, is difficult to determine owing to the nature of the data and the subjectivity of relevance in the context of health-related tweets. Hopkins et al [<xref ref-type="bibr" rid="ref24">24</xref>] provide 3 different models for estimating recall: a hand-coding approach similar to the corrected message count presented here, a supervised learning approach for individual document classification, and a supervised learning approach to estimate document category proportions. Evaluating these methods in terms of cost and accuracy is beyond the scope of this study but should be considered for future work to provide more robust measures of keyword-filtered data quality.</p>
        <p>Location identification accuracy is difficult to measure for user-defined locations owing to the relative ambiguity of the data provided. The procedures used to estimate user-defined location provide a “soft” measure of accuracy, but more work is needed to ensure appropriate representation. Additionally, a very small proportion of tweets contains location information; thus, the sample may not be representative of the general Twitter landscape in Pennsylvania. Methods such as those detailed in Liang et al [<xref ref-type="bibr" rid="ref25">25</xref>] should be considered in future studies to correct for sampling bias.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>We identified that the volume of tweets is often related to rates of health conditions across a state. The semantic content available from Twitter provides insight into public perception and awareness of disease beyond what is available through traditional measures of disease prevalence.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <app id="app1">
        <title>Multimedia Appendix 1</title>
        <p>Consumer Health Vocabulary search terms: This study focuses on 14 diseases and each disease is represented by a lexicon of disease-related terms. The appendix contains each of the 14 diseases along with the 274 terms that comprise the lexica.</p>
        <media xlink:href="publichealth_v4i4e10834_app1.xlsx" xlink:title="XLSX File (Microsoft Excel File), 55KB"/>
      </app>
      <app id="app2">
        <title>Multimedia Appendix 2</title>
        <p>Correlation between topic and disease.</p>
        <media xlink:href="publichealth_v4i4e10834_app2.png" xlink:title="PNG File, 940KB"/>
      </app>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">API</term>
          <def>
            <p>application programming interface</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">COPD</term>
          <def>
            <p>chronic obstructive pulmonary disease</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">LDA</term>
          <def>
            <p>latent Dirichlet allocation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">PMI</term>
          <def>
            <p>pointwise mutual information</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This project is funded, in part, under a grant with the Pennsylvania Department of Health. The Department specifically disclaims responsibility for any analyses, interpretations, or conclusions.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Prieto</surname>
            <given-names>VM</given-names>
          </name>
          <name name-style="western">
            <surname>Matos</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Álvarez</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Cacheda</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Oliveira</surname>
            <given-names>JL</given-names>
          </name>
        </person-group>
        <article-title>Twitter: a good place to detect health conditions</article-title>
        <source>PLoS One</source>  
        <year>2014</year>  
        <month>1</month>  
        <volume>9</volume>  
        <issue>1</issue>  
        <fpage>e86191</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0086191"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1371/journal.pone.0086191</pub-id>
        <pub-id pub-id-type="medline">24489699</pub-id>
        <pub-id pub-id-type="pii">PONE-D-13-10567</pub-id>
        <pub-id pub-id-type="pmcid">PMC3906034</pub-id></nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Eysenbach</surname>
            <given-names>G</given-names>
          </name>
        </person-group>
        <article-title>Infodemiology and infoveillance: framework for an emerging set of public health informatics methods to analyze search, communication and publication behavior on the Internet</article-title>
        <source>J Med Internet Res</source>  
        <year>2009</year>  
        <volume>11</volume>  
        <issue>1</issue>  
        <fpage>e11</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jmir.org/2009/1/e11/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.1157</pub-id>
        <pub-id pub-id-type="medline">19329408</pub-id>
        <pub-id pub-id-type="pii">v11i1e11</pub-id>
        <pub-id pub-id-type="pmcid">PMC2762766</pub-id></nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Laranjo</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Arguel</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Neves</surname>
            <given-names>AL</given-names>
          </name>
          <name name-style="western">
            <surname>Gallagher</surname>
            <given-names>AM</given-names>
          </name>
          <name name-style="western">
            <surname>Kaplan</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Mortimer</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Mendes</surname>
            <given-names>GA</given-names>
          </name>
          <name name-style="western">
            <surname>Lau</surname>
            <given-names>AYS</given-names>
          </name>
        </person-group>
        <article-title>The influence of social networking sites on health behavior change: a systematic review and meta-analysis</article-title>
        <source>J Am Med Inform Assoc</source>  
        <year>2015</year>  
        <month>01</month>  
        <volume>22</volume>  
        <issue>1</issue>  
        <fpage>243</fpage>  
        <lpage>56</lpage>  
        <pub-id pub-id-type="doi">10.1136/amiajnl-2014-002841</pub-id>
        <pub-id pub-id-type="medline">25005606</pub-id>
        <pub-id pub-id-type="pii">amiajnl-2014-002841</pub-id></nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Wehner</surname>
            <given-names>Mackenzie R</given-names>
          </name>
          <name name-style="western">
            <surname>Chren</surname>
            <given-names>Mary-Margaret</given-names>
          </name>
          <name name-style="western">
            <surname>Shive</surname>
            <given-names>Melissa L</given-names>
          </name>
          <name name-style="western">
            <surname>Resneck</surname>
            <given-names>Jack S</given-names>
          </name>
          <name name-style="western">
            <surname>Pagoto</surname>
            <given-names>Sherry</given-names>
          </name>
          <name name-style="western">
            <surname>Seidenberg</surname>
            <given-names>Andrew B</given-names>
          </name>
          <name name-style="western">
            <surname>Linos</surname>
            <given-names>Eleni</given-names>
          </name>
        </person-group>
        <article-title>Twitter: an opportunity for public health campaigns</article-title>
        <source>Lancet</source>  
        <year>2014</year>  
        <month>07</month>  
        <day>12</day>  
        <volume>384</volume>  
        <issue>9938</issue>  
        <fpage>131</fpage>  
        <lpage>2</lpage>  
        <pub-id pub-id-type="doi">10.1016/S0140-6736(14)61161-2</pub-id>
        <pub-id pub-id-type="medline">25016994</pub-id>
        <pub-id pub-id-type="pii">S0140-6736(14)61161-2</pub-id></nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Lee</surname>
            <given-names>JL</given-names>
          </name>
          <name name-style="western">
            <surname>DeCamp</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Dredze</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Chisolm</surname>
            <given-names>MS</given-names>
          </name>
          <name name-style="western">
            <surname>Berger</surname>
            <given-names>ZD</given-names>
          </name>
        </person-group>
        <article-title>What are health-related users tweeting? A qualitative content analysis of health-related users and their messages on twitter</article-title>
        <source>J Med Internet Res</source>  
        <year>2014</year>  
        <volume>16</volume>  
        <issue>10</issue>  
        <fpage>e237</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jmir.org/2014/10/e237/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.3765</pub-id>
        <pub-id pub-id-type="medline">25591063</pub-id>
        <pub-id pub-id-type="pii">v16i10e237</pub-id>
        <pub-id pub-id-type="pmcid">PMC4296104</pub-id></nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Hill</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Merchant</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Ungar</surname>
            <given-names>L</given-names>
          </name>
        </person-group>
        <article-title>Lessons Learned About Public Health From Online Crowd Surveillance</article-title>
        <source>Big Data</source>  
        <year>2013</year>  
        <month>09</month>  
        <day>10</day>  
        <volume>1</volume>  
        <issue>3</issue>  
        <fpage>160</fpage>  
        <lpage>167</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/25045598"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1089/big.2013.0020</pub-id>
        <pub-id pub-id-type="medline">25045598</pub-id>
        <pub-id pub-id-type="pmcid">PMC4102381</pub-id></nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Eichstaedt</surname>
            <given-names>JC</given-names>
          </name>
          <name name-style="western">
            <surname>Schwartz</surname>
            <given-names>HA</given-names>
          </name>
          <name name-style="western">
            <surname>Kern</surname>
            <given-names>ML</given-names>
          </name>
          <name name-style="western">
            <surname>Park</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Labarthe</surname>
            <given-names>DR</given-names>
          </name>
          <name name-style="western">
            <surname>Merchant</surname>
            <given-names>RM</given-names>
          </name>
          <name name-style="western">
            <surname>Jha</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Agrawal</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Dziurzynski</surname>
            <given-names>LA</given-names>
          </name>
          <name name-style="western">
            <surname>Sap</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Weeg</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Larson</surname>
            <given-names>EE</given-names>
          </name>
          <name name-style="western">
            <surname>Ungar</surname>
            <given-names>LH</given-names>
          </name>
          <name name-style="western">
            <surname>Seligman</surname>
            <given-names>MEP</given-names>
          </name>
        </person-group>
        <article-title>Psychological language on Twitter predicts county-level heart disease mortality</article-title>
        <source>Psychol Sci</source>  
        <year>2015</year>  
        <month>02</month>  
        <volume>26</volume>  
        <issue>2</issue>  
        <fpage>159</fpage>  
        <lpage>69</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/25605707"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1177/0956797614557867</pub-id>
        <pub-id pub-id-type="medline">25605707</pub-id>
        <pub-id pub-id-type="pii">0956797614557867</pub-id>
        <pub-id pub-id-type="pmcid">PMC4433545</pub-id></nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Preotiuc-Pietro</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Samangooei</surname>
            <given-names>S</given-names>
          </name>
        </person-group>
        <article-title>Trendminer: An architecture for real time analysis of social media text</article-title>
        <year>2012</year>  
        <conf-name>Sixth Int AAAI Conf Weblogs Soc Media</conf-name>
        <conf-date>2012</conf-date>
        <conf-loc>Dublin, IE</conf-loc>
        <fpage>A</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.aaai.org/ocs/index.php/ICWSM/ICWSM12/paper/download/4739/5087"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
        <collab>Collaborative Consumer Health Vocabulary Initiative</collab>  
        <collab>Biomedical Informatics Department University of Utah</collab> </person-group>
        <source>Consumer Health Vocabulary Initiative</source>  
        <year>2018</year>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://consumerhealthvocab.chpc.utah.edu/CHVwiki/">http://consumerhealthvocab.chpc.utah.edu/CHVwiki/</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="6yovtPEFZ"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Schwartz</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Eichstaedt</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Kern</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Dziurzynski</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Lucas</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Agrawal</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Park</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Lakshmikanth</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Jha</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Seligman</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Ungar</surname>
            <given-names>L</given-names>
          </name>
        </person-group>
        <article-title>Characterizing geographic variation in well-being using tweets</article-title>
        <year>2013</year>  
        <conf-name>Seventh Int AAAI Conf Weblogs Soc Media</conf-name>
        <conf-date>2013</conf-date>
        <conf-loc>Boston, MA</conf-loc>
        <fpage>583</fpage>  
        <lpage>591</lpage> </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Schwartz</surname>
            <given-names>HA</given-names>
          </name>
          <name name-style="western">
            <surname>Eichstaedt</surname>
            <given-names>JC</given-names>
          </name>
          <name name-style="western">
            <surname>Kern</surname>
            <given-names>ML</given-names>
          </name>
          <name name-style="western">
            <surname>Dziurzynski</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Ramones</surname>
            <given-names>SM</given-names>
          </name>
          <name name-style="western">
            <surname>Agrawal</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Shah</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Kosinski</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Stillwell</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Seligman</surname>
            <given-names>MEP</given-names>
          </name>
          <name name-style="western">
            <surname>Ungar</surname>
            <given-names>LH</given-names>
          </name>
        </person-group>
        <article-title>Personality, gender, and age in the language of social media: the open-vocabulary approach</article-title>
        <source>PLoS One</source>  
        <year>2013</year>  
        <month>09</month>  
        <volume>8</volume>  
        <issue>9</issue>  
        <fpage>e73791</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0073791"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1371/journal.pone.0073791</pub-id>
        <pub-id pub-id-type="medline">24086296</pub-id>
        <pub-id pub-id-type="pii">PONE-D-13-03858</pub-id>
        <pub-id pub-id-type="pmcid">PMC3783449</pub-id></nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="web">
        <source>Healthcare Cost and Utilization Project (HCUP)</source>  
        <access-date>2018-10-10</access-date>
        <publisher-loc>Rockville, MD</publisher-loc>
        <publisher-name>Agency for Healthcare Research and Quality</publisher-name>
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.ahrq.gov/data/hcup/index.html">https://www.ahrq.gov/data/hcup/index.html</ext-link>
          <ext-link ext-link-type="webcite" xlink:href="734IX2AiT"/>
        </comment> </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Weeg</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Schwartz</surname>
            <given-names>HA</given-names>
          </name>
          <name name-style="western">
            <surname>Hill</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Merchant</surname>
            <given-names>RM</given-names>
          </name>
          <name name-style="western">
            <surname>Arango</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Ungar</surname>
            <given-names>L</given-names>
          </name>
        </person-group>
        <article-title>Using Twitter to Measure Public Discussion of Diseases: A Case Study</article-title>
        <source>JMIR Public Health Surveill</source>  
        <year>2015</year>  
        <month>06</month>  
        <day>26</day>  
        <volume>1</volume>  
        <issue>1</issue>  
        <fpage>e6</fpage>  
        <pub-id pub-id-type="doi">10.2196/publichealth.3953</pub-id></nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Ginsberg</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Mohebbi</surname>
            <given-names>MH</given-names>
          </name>
          <name name-style="western">
            <surname>Patel</surname>
            <given-names>RS</given-names>
          </name>
          <name name-style="western">
            <surname>Brammer</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Smolinski</surname>
            <given-names>MS</given-names>
          </name>
          <name name-style="western">
            <surname>Brilliant</surname>
            <given-names>L</given-names>
          </name>
        </person-group>
        <article-title>Detecting influenza epidemics using search engine query data</article-title>
        <source>Nature</source>  
        <year>2009</year>  
        <month>02</month>  
        <day>19</day>  
        <volume>457</volume>  
        <issue>7232</issue>  
        <fpage>1012</fpage>  
        <lpage>1014</lpage>  
        <pub-id pub-id-type="doi">10.1038/nature07634</pub-id>
        <pub-id pub-id-type="medline">19020500</pub-id>
        <pub-id pub-id-type="pii">nature07634</pub-id></nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Paul</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Dredze</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>You are what you Tweet: Analyzing Twitter for public health</article-title>
        <year>2011</year>  
        <conf-name>Fifth Int AAAI Conf Weblogs Soc Media</conf-name>
        <conf-date>2011</conf-date>
        <conf-loc>Barcelona, Spain</conf-loc></nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>De Choudhury</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Counts</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Horvitz</surname>
            <given-names>E</given-names>
          </name>
        </person-group>
        <article-title>Predicting postpartum changes in emotion and behavior via social media</article-title>
        <year>2013</year>  
        <conf-name>SIGCHI Conf Hum Factors Comput Syst</conf-name>
        <conf-date>2013</conf-date>
        <conf-loc>Paris, France</conf-loc>
        <fpage>3267</fpage> </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Sullivan</surname>
            <given-names>SJ</given-names>
          </name>
          <name name-style="western">
            <surname>Schneiders</surname>
            <given-names>AG</given-names>
          </name>
          <name name-style="western">
            <surname>Cheang</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Kitto</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Lee</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Redhead</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Ward</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Ahmed</surname>
            <given-names>OH</given-names>
          </name>
          <name name-style="western">
            <surname>McCrory</surname>
            <given-names>PR</given-names>
          </name>
        </person-group>
        <article-title>'What's happening?' A content analysis of concussion-related traffic on Twitter</article-title>
        <source>Br J Sports Med</source>  
        <year>2012</year>  
        <month>03</month>  
        <volume>46</volume>  
        <issue>4</issue>  
        <fpage>258</fpage>  
        <lpage>263</lpage>  
        <pub-id pub-id-type="doi">10.1136/bjsm.2010.080341</pub-id>
        <pub-id pub-id-type="medline">21406451</pub-id>
        <pub-id pub-id-type="pii">bjsm.2010.080341</pub-id></nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>McNeil</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Brna</surname>
            <given-names>PM</given-names>
          </name>
          <name name-style="western">
            <surname>Gordon</surname>
            <given-names>KE</given-names>
          </name>
        </person-group>
        <article-title>Epilepsy in the Twitter era: a need to re-tweet the way we think about seizures</article-title>
        <source>Epilepsy Behav</source>  
        <year>2012</year>  
        <month>02</month>  
        <volume>23</volume>  
        <issue>2</issue>  
        <fpage>127</fpage>  
        <lpage>130</lpage>  
        <pub-id pub-id-type="doi">10.1016/j.yebeh.2011.10.020</pub-id>
        <pub-id pub-id-type="medline">22134096</pub-id>
        <pub-id pub-id-type="pii">S1525-5050(11)00618-4</pub-id></nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Nascimento</surname>
            <given-names>TD</given-names>
          </name>
          <name name-style="western">
            <surname>DosSantos</surname>
            <given-names>MF</given-names>
          </name>
          <name name-style="western">
            <surname>Danciu</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>DeBoer</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>van Holsbeeck</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Lucas</surname>
            <given-names>SR</given-names>
          </name>
          <name name-style="western">
            <surname>Aiello</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Khatib</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Bender</surname>
            <given-names>MA</given-names>
          </name>
          <collab>UMSoD (Under)Graduate Class of 2014</collab>
          <name name-style="western">
            <surname>Zubieta</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>DaSilva</surname>
            <given-names>AF</given-names>
          </name>
        </person-group>
        <article-title>Real-time sharing and expression of migraine headache suffering on Twitter: a cross-sectional infodemiology study</article-title>
        <source>J Med Internet Res</source>  
        <year>2014</year>  
        <month>04</month>  
        <volume>16</volume>  
        <issue>4</issue>  
        <fpage>e96</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jmir.org/2014/4/e96/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.3265</pub-id>
        <pub-id pub-id-type="medline">24698747</pub-id>
        <pub-id pub-id-type="pii">v16i4e96</pub-id>
        <pub-id pub-id-type="pmcid">PMC4004155</pub-id></nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Young</surname>
            <given-names>SD</given-names>
          </name>
          <name name-style="western">
            <surname>Rivers</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Lewis</surname>
            <given-names>B</given-names>
          </name>
        </person-group>
        <article-title>Methods of using real-time social media technologies for detection and remote monitoring of HIV outcomes</article-title>
        <source>Prev Med</source>  
        <year>2014</year>  
        <month>06</month>  
        <volume>63</volume>  
        <fpage>112</fpage>  
        <lpage>115</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/24513169"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1016/j.ypmed.2014.01.024</pub-id>
        <pub-id pub-id-type="medline">24513169</pub-id>
        <pub-id pub-id-type="pii">S0091-7435(14)00055-3</pub-id>
        <pub-id pub-id-type="pmcid">PMC4031268</pub-id></nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Neiger</surname>
            <given-names>BL</given-names>
          </name>
          <name name-style="western">
            <surname>Thackeray</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Burton</surname>
            <given-names>SH</given-names>
          </name>
          <name name-style="western">
            <surname>Thackeray</surname>
            <given-names>CR</given-names>
          </name>
          <name name-style="western">
            <surname>Reese</surname>
            <given-names>JH</given-names>
          </name>
        </person-group>
        <article-title>Use of Twitter among local health departments: an analysis of information sharing, engagement, and action</article-title>
        <source>J Med Internet Res</source>  
        <year>2013</year>  
        <volume>15</volume>  
        <issue>8</issue>  
        <fpage>e177</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jmir.org/2013/8/e177/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.2775</pub-id>
        <pub-id pub-id-type="medline">23958635</pub-id>
        <pub-id pub-id-type="pii">v15i8e177</pub-id>
        <pub-id pub-id-type="pmcid">PMC3758023</pub-id></nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Thackeray</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Neiger</surname>
            <given-names>BL</given-names>
          </name>
          <name name-style="western">
            <surname>Burton</surname>
            <given-names>SH</given-names>
          </name>
          <name name-style="western">
            <surname>Thackeray</surname>
            <given-names>CR</given-names>
          </name>
        </person-group>
        <article-title>Analysis of the purpose of state health departments' tweets: information sharing, engagement, and action</article-title>
        <source>J Med Internet Res</source>  
        <year>2013</year>  
        <volume>15</volume>  
        <issue>11</issue>  
        <fpage>e255</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jmir.org/2013/11/e255/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.3002</pub-id>
        <pub-id pub-id-type="medline">24217361</pub-id>
        <pub-id pub-id-type="pii">v15i11e255</pub-id>
        <pub-id pub-id-type="pmcid">PMC3841368</pub-id></nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Park</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Lee</surname>
            <given-names>DS</given-names>
          </name>
          <name name-style="western">
            <surname>Shablack</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Verduyn</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Deldin</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Ybarra</surname>
            <given-names>O</given-names>
          </name>
          <name name-style="western">
            <surname>Jonides</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Kross</surname>
            <given-names>E</given-names>
          </name>
        </person-group>
        <article-title>When perceptions defy reality: The relationships between depression and actual and perceived Facebook social support</article-title>
        <source>J Affect Disord</source>  
        <year>2016</year>  
        <month>08</month>  
        <volume>200</volume>  
        <fpage>37</fpage>  
        <lpage>44</lpage>  
        <pub-id pub-id-type="doi">10.1016/j.jad.2016.01.048</pub-id>
        <pub-id pub-id-type="medline">27126138</pub-id>
        <pub-id pub-id-type="pii">S0165-0327(15)30863-6</pub-id></nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Hopkins</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>King</surname>
            <given-names>G</given-names>
          </name>
        </person-group>
        <article-title>A Method of Automated Nonparametric Content Analysis for Social Science</article-title>
        <source>Am J Political Sci</source>  
        <year>2010</year>  
        <volume>54</volume>  
        <fpage>229</fpage>  
        <lpage>247</lpage> </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Liang</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Shen</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Fu</surname>
            <given-names>K</given-names>
          </name>
        </person-group>
        <article-title>Privacy protection and self-disclosure across societies: A study of global Twitter users</article-title>
        <source>New Media &amp; Society</source>  
        <year>2016</year>  
        <month>05</month>  
        <day>12</day>  
        <volume>19</volume>  
        <issue>9</issue>  
        <fpage>1476</fpage>  
        <lpage>1497</lpage>  
        <pub-id pub-id-type="doi">10.1177/1461444816642210</pub-id></nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
