<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JPH</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Public Health Surveill</journal-id>
      <journal-title>JMIR Public Health and Surveillance</journal-title>
      <issn pub-type="epub">2369-2960</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
    <article-id pub-id-type="publisher-id">v4i2e10150</article-id>
    <article-id pub-id-type="pmid">29959106</article-id>
    <article-id pub-id-type="doi">10.2196/10150</article-id>
    <article-categories>
      <subj-group subj-group-type="heading">
        <subject>Original Paper</subject>
      </subj-group>
      <subj-group subj-group-type="article-type">
        <subject>Original Paper</subject>
      </subj-group>
    </article-categories>
    <title-group>
      <article-title>#Healthy Selfies: Exploration of Health Topics on Instagram</article-title>
    </title-group>
    <contrib-group>
      <contrib contrib-type="editor">
        <name>
          <surname>Sanchez</surname>
          <given-names>Travis</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Allem</surname>
          <given-names>Jon-Patrick</given-names>
        </name>
      </contrib>
      <contrib contrib-type="reviewer">
        <name>
          <surname>O'Kane</surname>
          <given-names>Niamh</given-names>
        </name>
      </contrib>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Mejova</surname>
          <given-names>Yelena</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="author" id="contrib1">
        <name name-style="western">
          <surname>Muralidhara</surname>
          <given-names>Sachin</given-names>
        </name>
        <degrees>MS</degrees>
        <xref rid="aff1" ref-type="aff">1</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-3758-4359</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib2" corresp="yes">
      <name name-style="western">
        <surname>Paul</surname>
        <given-names>Michael J</given-names>
      </name>
      <degrees>PhD</degrees>
      <xref rid="aff2" ref-type="aff">2</xref>
      <address>
        <institution>Department of Information Science</institution>
        <institution>University of Colorado Boulder</institution>
        <addr-line>315 UCB</addr-line>
        <addr-line>Boulder, CO, 80309</addr-line>
        <country>United States</country>
        <phone>1 303 735 7581</phone>
        <email>mpaul@colorado.edu</email>
      </address>  
      <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-9149-7539</ext-link></contrib>
    </contrib-group>
    <aff id="aff1">
    <sup>1</sup>
    <institution>Department of Computer Science</institution>
    <institution>University of Colorado Boulder</institution>  
    <addr-line>Boulder, CO</addr-line>
    <country>United States</country></aff>
    <aff id="aff2">
    <sup>2</sup>
    <institution>Department of Information Science</institution>
    <institution>University of Colorado Boulder</institution>  
    <addr-line>Boulder, CO</addr-line>
    <country>United States</country></aff>
    <author-notes>
      <corresp>Corresponding Author: Michael J. Paul 
      <email>mpaul@colorado.edu</email></corresp>
    </author-notes>
    <pub-date pub-type="collection"><season>Apr-Jun</season><year>2018</year></pub-date>
    <pub-date pub-type="epub">
      <day>29</day>
      <month>06</month>
      <year>2018</year>
    </pub-date>
    <volume>4</volume>
    <issue>2</issue>
    <elocation-id>e10150</elocation-id>
    <!--history from ojs - api-xml-->
    <history>
      <date date-type="received">
        <day>15</day>
        <month>2</month>
        <year>2018</year>
      </date>
      <date date-type="rev-request">
        <day>29</day>
        <month>3</month>
        <year>2018</year>
      </date>
      <date date-type="rev-recd">
        <day>17</day>
        <month>5</month>
        <year>2018</year>
      </date>
      <date date-type="accepted">
        <day>29</day>
        <month>5</month>
        <year>2018</year>
      </date>
    </history>
    <!--(c) the authors - correct author names and publication date here if necessary. Date in form ', dd.mm.yyyy' after jmir.org-->
    <copyright-statement>©Sachin Muralidhara, Michael J. Paul. Originally published in JMIR Public Health and Surveillance (http://publichealth.jmir.org), 29.06.2018.</copyright-statement>
    <copyright-year>2018</copyright-year>
    <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
      <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Public Health and Surveillance, is properly cited. The complete bibliographic information, a link to the original publication on http://publichealth.jmir.org, as well as this copyright and license information must be included.</p>
    </license>  
    <self-uri xlink:href="http://publichealth.jmir.org/2018/2/e10150/" xlink:type="simple"/>
    <abstract>
      <sec sec-type="background">
        <title>Background</title>
        <p>Social media provides a complementary source of information for public health surveillance. The dominant data source for this type of monitoring is the microblogging platform Twitter, which is convenient due to the free availability of public data. Less is known about the utility of other social media platforms, despite their popularity.</p>
      </sec>
      <sec sec-type="objective">
        <title>Objective</title>
        <p>This work aims to characterize the health topics that are prominently discussed in the image-sharing platform Instagram, as a step toward understanding how this data might be used for public health research.</p>
      </sec>
      <sec sec-type="methods">
        <title>Methods</title>
        <p>The study uses a topic modeling approach to discover topics in a dataset of 96,426 Instagram posts containing hashtags related to health. We use a polylingual topic model, initially developed for datasets in different natural languages, to model different modalities of data: hashtags, caption words, and image tags automatically extracted using a computer vision tool.</p>
      </sec>
      <sec sec-type="results">
        <title>Results</title>
        <p>We identified 47 health-related topics in the data (kappa=.77), covering ten broad categories: acute illness, alternative medicine, chronic illness and pain, diet, exercise, health care &#38; medicine, mental health, musculoskeletal health and dermatology, sleep, and substance use. The most prevalent topics were related to diet (8,293/96,426; 8.6% of posts) and exercise (7,328/96,426; 7.6% of posts).</p>
      </sec>
      <sec sec-type="conclusions">
        <title>Conclusions</title>
        <p>A large and diverse set of health topics are discussed in Instagram. The extracted image tags were generally too coarse and noisy to be used for identifying posts but were in some cases accurate for identifying images relevant to studying diet and substance use. Instagram shows potential as a source of public health information, though limitations in data collection and metadata availability may limit its use in comparison to platforms like Twitter.</p>
      </sec>
    </abstract>
    <kwd-group>
      <kwd>social media</kwd>
      <kwd>Instagram</kwd>
      <kwd>image sharing</kwd>
      <kwd>topic modeling</kwd>
      <kwd>computer vision</kwd>
      <kwd>public health</kwd>
    </kwd-group></article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Social media can provide a vast source of insight into a wide variety of applications in public health monitoring and surveillance [<xref ref-type="bibr" rid="ref1">1</xref>]. The bulk of social media-based health monitoring has relied on Twitter, a microblogging platform with over 300 million active users worldwide [<xref ref-type="bibr" rid="ref2">2</xref>]. A wide variety of health topics are openly discussed on Twitter [<xref ref-type="bibr" rid="ref3">3</xref>], providing researchers with a rich source of data for monitoring the spread of disease [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>], dietary patterns [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>], drug abuse [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>], foodborne illness [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>], and depression [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>], among many other applications.</p>
        <p>While Twitter has strengths as a data source, its dominance in research relative to other public platforms has been explained as a matter of convenience: Twitter provides free APIs to obtain large volumes of random or targeted samples of data [<xref ref-type="bibr" rid="ref1">1</xref>]. However, microblogs are only one type of social media. Other social media platforms contain different types of data and are used in different ways, like sharing visual media. Image-sharing platforms, such as Flickr, Tumblr, Pinterest, and Instagram, are very popular; for example, Instagram, the most popular image-sharing platform, is more than twice the size of Twitter, with over 700 million active users [<xref ref-type="bibr" rid="ref14">14</xref>]. Despite their popularity, relatively few public health studies have used these types of platforms as a data source [<xref ref-type="bibr" rid="ref1">1</xref>].</p>
        <p>Most prior health research using image-sharing platforms has focused on lifestyle issues, such as diet and substance use. Mejova et al [<xref ref-type="bibr" rid="ref15">15</xref>] analyzed posts of food on Instagram, focusing on the relationship between food consumption and obesity. De Choudhury et al [<xref ref-type="bibr" rid="ref16">16</xref>] also examined food consumption on Instagram, focusing specifically on dietary patterns in locations classified as “food deserts.” Yom-Tov et al [<xref ref-type="bibr" rid="ref17">17</xref>] and Pless et al [<xref ref-type="bibr" rid="ref18">18</xref>] examined imagery associated with eating disorders on Flickr and Tumblr, respectively. A few studies have looked at substance use on Instagram, including electronic cigarettes [<xref ref-type="bibr" rid="ref19">19</xref>], marijuana [<xref ref-type="bibr" rid="ref20">20</xref>], and opioids [<xref ref-type="bibr" rid="ref21">21</xref>], as well as the marketing of substances on Instagram [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. Garimella et al [<xref ref-type="bibr" rid="ref24">24</xref>] looked more broadly at lifestyle choices in Instagram, including diet, physical activity, and drinking.</p>
        <p>Many of these studies focused on the text features (eg, hashtags and captions) of the image posts, and some conducted a manual content analysis of the images themselves. Two of the studies cited above used computer vision—a type of artificial intelligence that can automatically analyze the content of images—to perform automatic identification of certain types of images. Pless et al [<xref ref-type="bibr" rid="ref18">18</xref>] built image classification models to identify images promoting anorexia, since such content may not be tagged with informative text captions. Garimella et al [<xref ref-type="bibr" rid="ref24">24</xref>] attempted to estimate county-level health statistics from social media content and experimented with both text features and automatically-extracted image tags, finding that both types of information could be correlated with external health metrics.</p>
        <p>This study seeks to characterize the health content shared on Instagram, the most popular image-sharing site, toward the goal of identifying potential areas of research that may benefit from this type of data source. In particular, we consider the following research questions: (1) what health topics are prominently shared in Instagram, and (2) what are the characteristics of those topics, specifically the types of images associated with the topics? This study is related to exploratory topic analyses of other platforms for health research [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref25">25</xref>], with an additional contribution of characterizing the features of images in addition to text. The dataset is made available as a resource to the public health informatics community.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Ethics Statement</title>
        <p>This study was reviewed by the University of Colorado Boulder Institutional Review Board, which determined that it does not constitute human subjects research. However, given that publicly posted images may still be considered sensitive material by the users, we took steps to preserve privacy. We did not download any images as part of our study. Instead, we collected the URL pointers to images hosted on Instagram, which were processed by an external computer vision application programming interface (API). Our data collection only contains the abstract features extracted from the images.</p>
      </sec>
      <sec>
        <title>Data Collection and Preparation</title>
        <p>While Instagram provides APIs for specific applications, Instagram does not provide an API to collect public data [<xref ref-type="bibr" rid="ref26">26</xref>], in contrast to Twitter, which provides widely-used streaming APIs [<xref ref-type="bibr" rid="ref27">27</xref>]. We instead built a “crawler” that queries Instagram’s search engine, which returns the nine most recent posts matching a specified hashtag. The crawler accesses the webpage of the hashtag search engine, analogous to how a person would access the search engine in their browser. The page is downloaded, and the HTML is parsed to extract information such as the set of tags and the caption of the image.</p>
        <p>We iteratively queried the search engine for 269 general health-related keywords that were used in previous work to obtain a general collection of health-related tweets [<xref ref-type="bibr" rid="ref3">3</xref>]. The keywords were obtained from dictionaries of terms related to diseases, symptoms, and treatments, in addition to general words like “sick” and “health” that were added manually. The original keyword set contained over 20,000 terms, which were reduced to 269 words that were most common in Twitter, to conform to API limits on how many keywords can be searched. Although this list was developed for Twitter, we use it for Instagram here, as it contains a broad set of terms that have previously been shown to be useful for collecting health-related social media posts.</p>
        <p>Instagram only allows searching for hashtags rather than free text, so we treated each keyword as a hashtag (eg, “#flu” instead of “flu”). We repeated these 269 queries continuously from September 29, 2016, through October 25, 2016, attempting to simulate a “streaming” collection as with Twitter, and obtained 174,517 posts. We did not download the same post more than once as measured by a unique post identifier. However, if the same content was shared in multiple posts (eg, if multiple users shared the same image), these posts would be considered separate in the dataset.</p>
        <p>Each post includes an image, a set of hashtags, and an optional free text caption. We used langid.py to identify and remove posts containing non-English captions [<xref ref-type="bibr" rid="ref28">28</xref>]. We also removed hyperlinks and nonalphanumeric characters. Stop words were removed using the natural language toolkit (NLTK) [<xref ref-type="bibr" rid="ref29">29</xref>]. After filtering and processing, our dataset contained 96,426 documents posted by 77,327 users with an average of 1.25 posts per user.</p>
        <p>We extracted “tag” features from each image using Microsoft’s Computer Vision API [<xref ref-type="bibr" rid="ref30">30</xref>]. This service returns key phrase descriptors of images, such as “person” or “running”. The API was able to extract at least 1 tag for 79.24% (76,407/96,426) of images in our dataset. We refer to these tags as “image tags” to distinguish them from hashtags.</p>
        <p>Once extracted, we treated the image tags as an additional type of text, along with captions and hashtags. In the final collection, there are 96,426 posts with a nonempty list of hashtags for an average of 15.2 tags per post. There were 95,208 posts (95,208/96,426; 98.74%) with nonempty captions for an average of 21.6 word tokens per caption. There were also 76,407 posts (76,407/96,426; 79.24%) with at least 1 image tag for an average of 3.7 tags per image. The dataset is shown in <xref ref-type="app" rid="app1">Multimedia Appendix 1</xref>.</p>
      </sec>
      <sec>
        <title>Topic Modeling</title>
        <p>We use probabilistic topic models [<xref ref-type="bibr" rid="ref31">31</xref>] to characterize the major themes of health-related discussion in Instagram. Topic models are tools for clustering related words into themes or concepts called “topics” and for identifying the topic composition of documents. Topic models have been used in health research as a method of performing content analyses of large datasets [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref32">32</xref>-<xref ref-type="bibr" rid="ref34">34</xref>].</p>
        <p>A topic model is a statistical model with many latent variables and parameters that can be inferred by fitting the model to data. In this model, each “topic” has a probability distribution over words, estimated from data, and topics are usually represented by presenting the 10-20 most probable words in the topic. Additionally, each document has a probability distribution over topics, which can be used to characterize the topic composition of a document and to identify documents that describe particular topics.</p>
        <p>Topic models take documents, represented as vectors of word counts, as input. The model parameters (ie, the distribution over topics <italic>θ</italic><sub>d</sub> in each document <italic>d</italic>, and the distribution over words <italic>ϕ</italic><sub>k</sub> for each topic <italic>k</italic>) are estimated to fit the observed data (ie, the word counts in each document). The estimated parameters are often interpretable to people, and the words associated with each topic cluster can be used to assign a concept to the topic [<xref ref-type="bibr" rid="ref35">35</xref>].</p>
        <sec>
          <title>Polylingual Topic Model</title>
          <p>The polylingual topic model [<xref ref-type="bibr" rid="ref36">36</xref>] is an extension of a traditional topic model that is applied to multiple languages. This model can be used for datasets in which documents have multiple versions in multiple languages. Examples include translations of a document into other languages, or articles in different languages that are known to be about the same topic, like different versions of Wikipedia articles. In the polylingual topic model, the distribution over topics <italic>θ</italic><sub>d</sub> is shared across all versions of the document, while each topic has a different distribution over words specific to each language <italic>l</italic>, <italic>ϕ</italic><sub>lk</sub><italic>.</italic></p>
          <p>In this work, we treat the different modalities of data—captions, hashtags, and image tag features—as different “languages” and apply the polylingual topic model to these 3 types of data. That is, each topic has a distribution over caption words, a distribution over hashtags, and a distribution over image tags. This will provide different views of each topic, allowing us to leverage multiple types of data and provide a complete understanding of the topics.</p>
        </sec>
        <sec>
          <title>Model Estimation</title>
          <p>We used the Polylingual Topic Model implementation from MALLET [<xref ref-type="bibr" rid="ref37">37</xref>]. The hyperparameter for the topic distribution prior (ie, “alpha”) was set to 1.0, and we used the default algorithm settings. The number of topics was set to 150. The model does not require each document to have a version in all 3 “languages,” and if a document did not contain a caption or image tags, we still included the document but without those data types.</p>
          <p>Because the topic model output in this study is interpreted qualitatively to be used in a content analysis, we also used qualitative judgment in performing model selection [<xref ref-type="bibr" rid="ref1">1</xref>]. To avoid extensive model selection, we relied on default hyperparameters for the model. To choose the number of topics, we compared the output with 50, 100, and 150 topics. We selected 150 topics because this setting provided topics that were qualitatively more coherent.</p>
        </sec>
        <sec>
          <title>Topic Identification</title>
          <p>After running the topic model, we examined the 20 most probable words in each “language” of each topic. The two authors independently annotated each topic, labeling each topic with a phrase that describes the group of words or marking the topic with an “unknown” label if the words do not form a coherent theme. The annotators then discussed the independent labels with each other to determine if the 2 labels described the same concept (eg, the free text labels could be similar but different strings, such as “Running” and “Jogging”), and to decide on a final label.</p>
          <p>When comparing whether the 2 annotators thought a topic was coherent, as opposed to the “unknown” label, the annotators agreed on 124/150 (82.7%) of the topics (Cohen kappa=.62). When comparing whether the 2 annotators thought a topic was related to health, the agreement was 136/150 (90.7%) with Cohen kappa=.77.</p>
          <p>Additionally, we grouped the topics into coarse-grained categories, to make the results easier to summarize. One annotator created a grouping of the topics and then iterated with feedback from the other annotator. Categories are not mutually exclusive; topics could be assigned to more than 1 category.</p>
        </sec>
      </sec>
      <sec>
        <title>Topic Analysis</title>
        <p>The topic model gives the probability of each topic <italic>k</italic> in each document <italic>d</italic>, <italic>θ</italic><sub>dk</sub><italic>.</italic> To get the overall prevalence of each topic, we calculate the average proportion, where <italic>D</italic> is the number of documents: <graphic xlink:href="publichealth_v4i2e10150_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>To estimate the prevalence of the coarser categories, we simply define each category’s prevalence as the sum of its topic proportions, where <italic>C</italic><sub>j</sub> is the set of topics in category <italic>j</italic>: <graphic xlink:href="publichealth_v4i2e10150_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>To summarize the degree to which 2 topics have a tendency to occur together in documents, we calculate the Pearson correlation of the <italic>θ</italic><sub>dk</sub> values for all pairs of topics. Most of the topic pairs with high correlations were similar topics in the same category. To discover less obvious topic co-occurrences, we focus only on topic pairs from different categories.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>Of all the topics, 93/150 (62%) were identified as coherent, with 47/150 (31%) related to health and 46/150 (31%) not related to health. The 47 health topics were grouped into 10 high-level categories: acute illness, alternative medicine, chronic illness and pain, diet, exercise, health care &#38; medicine, mental health, musculoskeletal health and dermatology, sleep, and substance use.</p>
      <p>Examples of health topics are shown in <xref ref-type="table" rid="table1">Table 1</xref>, while the complete set of 47 health topics organized across the 10 categories is provided in <xref ref-type="app" rid="app2">Multimedia Appendix 2</xref>. Examples of nonhealth topics are shown in <xref ref-type="table" rid="table2">Table 2</xref>. As can happen with unsupervised topic models, many clusters have very similar and overlapping content, with similar or identical names given by the annotators.</p>
      <p>Comparing the different modalities, hashtags tend to contain words specific to the topic (eg, “cancer,” “diabetes,” “allergies”), while caption words give indications of the context of the posts (eg, “feel,” “love,” “hope,” “proud”). In some cases, the caption words include first-person (eg, “i’ve,” “i’ll”) and informal (eg, “awesome,” “lol”) language that might be observed in personal conversations. Other topics include caption words consistent with advertising (eg, “product,” “call,” “email,” “consultation”).</p>
      
      
      <p>Examining the image tags, there are often not many tags that are directly related to the health topics, with the exception of topics related to food and beverage, which usually had explicit image descriptors of the corresponding food. For example, the <italic>Meat</italic> topic contains the image tags “meat” and “barbeque,” <italic>Desserts</italic> contains “dessert” and “chocolate,” <italic>Alcohol</italic> contains “alcohol” and “beer,” and <italic>Caffeine</italic> contains “coffee” and “coffee cup.” A small number of other topics are also associated with image tags that are directly related to the topic: <italic>Sleep</italic> includes “pillow” and “bed,” and <italic>Dental health</italic> includes “toothbrush” and “mouth.”</p>
       <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>The top 10 words in each modality (hashtags, caption words, and image tags) for 6 example health topics.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="100"/>
          <col width="150"/>
          <col width="150"/>
          <col width="150"/>
          <col width="150"/>
          <col width="150"/>
          <col width="150"/>
          <thead>
            <tr valign="top">
              <td>Modality</td>
              <td>Bodybuilding</td>
              <td>Cancer</td>
              <td>Caffeine</td>
              <td>Desserts</td>
              <td>Insomnia</td>
              <td>Suicide &#38; self-harm</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Hashtags</td>
              <td>bodybuilding<break/>shredded<break/>muscle<break/>gym<break/>abs<break/>physique<break/>veins<break/>gymlife<break/>bodybuilder<break/>gains</td>
              <td>cancer<break/>cure<break/>chemo<break/>breastcancer<break/>cancersucks<break/>breastcancerawareness<break/>pink<break/>chemotherapy<break/>fuckcancer<break/>hope</td>
              <td>caffeine<break/>coffee<break/>coffeelover<break/>coffeeaddict<break/>coffeetime<break/>coffeeholic<break/>hot<break/>drink<break/>cafe<break/>coffeegram</td>
              <td>food<break/>cravings<break/>foodporn<break/>delicious<break/>chocolate<break/>foodie<break/>yummy<break/>dessert<break/>sweet<break/>yum</td>
              <td>insomnia<break/>bedtime<break/>workout<break/>art<break/>selfie<break/>rest<break/>night<break/>natural<break/>sleepy<break/>amazing</td>
              <td>anxiety<break/>depressed<break/>sad<break/>suicide<break/>suicidal<break/>depression<break/>cutting<break/>sadness<break/>broken<break/>selfharm</td>
            </tr>
            <tr valign="top">
              <td>Captions</td>
              <td>bro<break/>man<break/>nice<break/>work<break/>likes<break/>gym<break/>muscle<break/>hard<break/>follow<break/>training</td>
              <td>cancer<break/>breast<break/>awareness<break/>month<break/>pink<break/>support<break/>women<break/>chemo<break/>fight<break/>family</td>
              <td>coffee<break/>day<break/>tea<break/>today<break/>love<break/>drink<break/>cup<break/>hot<break/>green<break/>feeling</td>
              <td>chocolate<break/>cream<break/>pumpkin<break/>eat<break/>good<break/>made<break/>butter<break/>ice<break/>cake<break/>peanut</td>
              <td>sleep<break/>night<break/>bed<break/>hours<break/>time<break/>back<break/>asleep<break/>sleeping<break/>make<break/>nights</td>
              <td>don’t<break/>feel<break/>talk<break/>people<break/>hate<break/>i’ve<break/>stop<break/>anymore<break/>fucking<break/>cry</td>
            </tr>
            <tr valign="top">
              <td>Image tags</td>
              <td>man<break/>underpants<break/>sport<break/>indoor<break/>barbell<break/>cellphone<break/>phone<break/>holding<break/>exercisedevice<break/>swimsuit</td>
              <td>group<break/>standing<break/>people<break/>beautiful<break/>crowd<break/>little<break/>girl<break/>wearing<break/>white<break/>pink</td>
              <td>beverage<break/>food<break/>coffee<break/>table<break/>drink<break/>coffeecup<break/>breakfast<break/>pastry<break/>dixiecup<break/>doughnut</td>
              <td>dessert<break/>chocolate<break/>slice<break/>food<break/>piece<break/>eaten<break/>cream<break/>plate<break/>fork<break/>pastry</td>
              <td>indoor<break/>lying<break/>bedclothes<break/>smiling<break/>pillow<break/>blanket<break/>sofa<break/>bedroom<break/>glasses<break/>cloth</td>
              <td>close<break/>dark<break/>woman<break/>staring<break/>clouds<break/>hand<break/>cloudy<break/>road<break/>nightsky<break/>mountain</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
      
       
      <table-wrap position="float" id="table2">
        <label>Table 2</label>
        <caption>
          <p>Examples of topics that are not directly about health.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="100"/>
          <col width="150"/>
          <col width="150"/>
          <col width="150"/>
          <col width="150"/>
          <col width="150"/>
          <col width="150"/>
          <thead>
            <tr valign="top">
              <td>Modality</td>
              <td>Inspiration</td>
              <td>Poetry &#38; quotes</td>
              <td>Spirituality</td>
              <td>Politics</td>
              <td>Cats</td>
              <td>Grunge/emo</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Hashtags</td>
              <td>inspiration<break/>selflove<break/>happiness<break/>recovery<break/>positivity<break/>positivevibes<break/>loveyourself<break/>heal<break/>hope<break/>positive</td>
              <td>poetry<break/>quotes<break/>pain<break/>words<break/>quote<break/>writer<break/>love<break/>writersofinstagram<break/>hurt<break/>writing</td>
              <td>heal<break/>healing<break/>energy<break/>meditation<break/>love<break/>spiritual<break/>soul<break/>mind<break/>spirit<break/>light</td>
              <td>trump<break/>vaccines<break/>nature<break/>vegan<break/>wakeup<break/>hillary<break/>blacklivesmatter<break/>usa<break/>clinton<break/>organic</td>
              <td>catsofinstagram<break/>cat<break/>cute<break/>cats<break/>kitty<break/>kitten<break/>pet<break/>meow<break/>fluffy<break/>animal</td>
              <td>grunge<break/>tumblr<break/>emo<break/>alternative<break/>depressed<break/>depression<break/>goth<break/>sad<break/>aesthetic<break/>punk</td>
            </tr>
            <tr valign="top">
              <td>Captions</td>
              <td>life<break/>things<break/>live<break/>past<break/>true<break/>grateful<break/>time<break/>living<break/>mind<break/>people</td>
              <td>love<break/>heart<break/>words<break/>world<break/>soul<break/>make<break/>life<break/>mind<break/>hurt<break/>give</td>
              <td>energy<break/>healing<break/>body<break/>soul<break/>light<break/>life<break/>heart<break/>deep<break/>space<break/>love</td>
              <td>people<break/>world<break/>trump<break/>media<break/>american<break/>america<break/>vote<break/>country<break/>drugs<break/>government</td>
              <td>feel<break/>hope<break/>poor<break/>baby<break/>glad<break/>aww<break/>cat<break/>hear<break/>rest<break/>sick</td>
              <td>don’t<break/>feel<break/>i’ll<break/>back<break/>i’ve<break/>make<break/>yeah<break/>hope<break/>feeling<break/>man</td>
            </tr>
            <tr valign="top">
              <td>Image tags</td>
              <td>indoor<break/>posing<break/>sky<break/>rock<break/>garden<break/>water<break/>mountain<break/>bushes<break/>clouds<break/>can</td>
              <td>wearing<break/>day<break/>mammal<break/>dark<break/>shore<break/>grass<break/>open<break/>plaque<break/>building<break/>abstract</td>
              <td>posing<break/>person<break/>fresh<break/>forest<break/>mountain<break/>mammal<break/>silhouette<break/>sunset<break/>sign<break/>distance</td>
              <td>indoor<break/>screen<break/>display<break/>electronics<break/>flat<break/>suit<break/>sign<break/>text<break/>newspaper<break/>computer</td>
              <td>domesticcat<break/>mammal<break/>laying<break/>animal<break/>sleeping<break/>cat<break/>white<break/>rodent<break/>grey<break/>gray</td>
              <td>indoor<break/>close<break/>person<break/>young<break/>hair<break/>blue<break/>glasses<break/>looking<break/>messy<break/>silhouette</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
      <p>While not explicitly about the health topic, some topics contain image tags that convey other characteristics of the imagery. For example, the image tags of the <italic>Mental health</italic> topic suggest positive imagery (eg, “smiling” and “nature”) while the tags of the <italic>Suicide &#38; self-harm</italic> topic suggest negative imagery (eg, “dark” and “cloudy”). The <italic>Cancer</italic> topic contains imagery associated specifically with breast cancer awareness, with the tag “pink” appearing as an image tag, as well as in the top hashtags and caption words.</p>
      <p>A common theme across topics is that images frequently contain people. The image tag “posing” is the top tag associated with 11 topics (ie, <italic>Chronic illness</italic>, <italic>Diabetes</italic>, <italic>Gym/fitness</italic>, <italic>Health care</italic>, <italic>Nursing</italic>, <italic>Hospitalization</italic>, <italic>Mental health</italic>, <italic>Skin health</italic>, <italic>Tanning</italic>, <italic>Cosmetic surgery</italic>, <italic>Dental health</italic>). Other topics have the top tag of “person” (ie, <italic>Illness</italic>), “group” (ie, <italic>Cancer</italic>), “woman” (ie, <italic>Headaches &#38; body aches</italic>), and “man” (ie, <italic>Bodybuilding</italic>). The image tags “swimsuit” and “underwear” are especially common in many of the exercise and fitness topics. This matches an observation in a previous study of fitness images on Instagram which found that “most images contained posed individuals with some degree of objectification” [<xref ref-type="bibr" rid="ref38">38</xref>].</p>
      <p>Gender associations can also be observed in the image features. There are four topics that contain a male-associated image tag (ie, “man,” “boy,” “male”) but no female tag (ie, <italic>Diabetes</italic>, <italic>Massage</italic>, <italic>Gym/fitness training</italic>, <italic>Bodybuilding</italic>), and 6 topics that contain a female-associated tag (ie, “woman,” “girl,” “female”) but no male tag (ie, <italic>Cancer</italic>, <italic>Musculoskeletal pain</italic>, <italic>Headaches &#38; body aches</italic>, <italic>Exercise</italic>, <italic>Gym/bodybuilding</italic>, <italic>Gym/fitness</italic>). Only 1 topic (ie, <italic>Allergies</italic>) included both genders in the top 10 image tags (ie, “woman” and “boy”).</p>
      <p>In one case, 2 topics with very similar text features had different gender patterns in the images. Consider the topic with top hashtags, “workout,” “fitness,” “gym,” “fit,” “exercise,” and top caption words, “week,” “day,” “workout,” “work,” “good;” and a similar topic with hashtags, “workout,” “abs,” “gym,” “muscle,” “chest,” and caption words, “work,” “back,” “legs,” “leg,” “strong.” These top words do not explicitly contain gendered words, but in the image tags, the former contains “woman” while the latter contains “man” and “male.” <xref ref-type="table" rid="table3">Table 3</xref> shows the 10 topics with the highest average proportions in documents, and <xref ref-type="table" rid="table4">Table 4</xref> shows the cumulative proportions of each of the 10 categories.</p>
    
        
        
        
      <table-wrap position="float" id="table3">
      <label>Table 3</label>
        <caption>
          <p>The 10 most prevalent individual topics in the dataset, ranked by their average topic proportion out of 150 total topics. When multiple topics have the same name, we show the integer index of the topic in parentheses to distinguish them.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="700"/>
          <col width="300"/>
          <thead>
            <tr valign="top">
              <td>Topic</td>
              <td>Average Probability</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Suicide &#38; self-harm</td>
              <td>0.012</td>
            </tr>
            <tr valign="top">
              <td>Bodybuilding (Topic 135)</td>
              <td>0.010</td>
            </tr>
            <tr valign="top">
              <td>Exercise</td>
              <td>0.009</td>
            </tr>
            <tr valign="top">
              <td>Healthy food (Topic 2)</td>
              <td>0.009</td>
            </tr>
            <tr valign="top">
              <td>Gym/bodybuilding (Topic 14)</td>
              <td>0.009</td>
            </tr>
            <tr valign="top">
              <td>Marijuana</td>
              <td>0.008</td>
            </tr>
            <tr valign="top">
              <td>Healthy food (Topic 67)</td>
              <td>0.008</td>
            </tr>
            <tr valign="top">
              <td>Vitamins &#38; supplements</td>
              <td>0.008</td>
            </tr>
            <tr valign="top">
              <td>Skin health</td>
              <td>0.008</td>
            </tr>
            <tr valign="top">
              <td>Gym/fitness</td>
              <td>0.008</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
      <table-wrap position="float" id="table4">
        <label>Table 4</label>
        <caption>
          <p>The topic categories ranked by prevalence, where each category’s prevalence is defined by the sum of the individual topic proportions of the category’s topics.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="700"/>
          <col width="300"/>
          <thead>
            <tr valign="top">
              <td>Topic Category</td>
              <td>Cumulative Probability</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Diet</td>
              <td>.086</td>
            </tr>
            <tr valign="top">
              <td>Exercise</td>
              <td>.076</td>
            </tr>
            <tr valign="top">
              <td>Musculoskeletal health &#38; dermatology</td>
              <td>.046</td>
            </tr>
            <tr valign="top">
              <td>Alternative medicine</td>
              <td>.042</td>
            </tr>
            <tr valign="top">
              <td>Chronic illness &#38; pain</td>
              <td>.039</td>
            </tr>
            <tr valign="top">
              <td>Health care &#38; medicine</td>
              <td>.033</td>
            </tr>
            <tr valign="top">
              <td>Mental health</td>
              <td>.026</td>
            </tr>
            <tr valign="top">
              <td>Substance use</td>
              <td>.021</td>
            </tr>
            <tr valign="top">
              <td>Sleep</td>
              <td>.013</td>
            </tr>
            <tr valign="top">
              <td>Acute illness</td>
              <td>.012</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
      
      <p>Topics about diet and exercise are by far the most prevalent topics, while topics about acute illness and sleep are uncommon. Topics with high probabilities tended to be more coherent than low-probability topics. Of the topics with the highest probabilities, 24/25 (96%) were labeled as coherent by the annotators. Of the topics with the lowest probabilities, only 15/25 (60%) were labeled as coherent. The variability in average probabilities was low; the values ranged from .005 to .012.</p>
      <p><xref ref-type="table" rid="table5">Table 5</xref> shows the 10 pairs of health topics with the highest correlations. Some of the strongest correlations are with the <italic>Vitamins &#38; supplements</italic> topic, which co-occurs with a variety of other health topics. <xref ref-type="table" rid="table6">Table 6</xref> shows the most correlated 10 topic pairs such that one topic is a health topic and the other is a nonhealth topic. Inspirational and supportive topics (ie, <italic>Inspiration</italic> and <italic>Poetry &#38; quotes</italic>) tend to co-occur with mental health and exercise topics, and topics about religion and spirituality tend to co-occur with certain health topics, like <italic>Alternative medicine</italic> and <italic>Yoga</italic>.</p>
      
      <table-wrap position="float" id="table5">
        <label>Table 5</label>
        <caption>
          <p>The 10 most correlated pairs of health topics.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="400"/>
          <col width="300"/>
          <col width="300"/>
          <thead>
            <tr valign="top">
              <td colspan="2">Topic pair</td>
           
              <td><break/></td>
            </tr>
            <tr valign="top">
              <td>Topic A</td>
              <td>Topic B</td>
              <td>Pearson Correlation</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Vitamins &#38; supplements</td>
              <td>Energy &#38; hydration</td>
              <td>.138</td>
            </tr>
            <tr valign="top">
              <td>Vitamins &#38; supplements</td>
              <td>Health science</td>
              <td>.131</td>
            </tr>
            <tr valign="top">
              <td>Vitamins &#38; supplements</td>
              <td>Headaches &#38; body aches</td>
              <td>.121</td>
            </tr>
            <tr valign="top">
              <td>Energy &#38; hydration</td>
              <td>Headaches &#38; body aches</td>
              <td>.067</td>
            </tr>
            <tr valign="top">
              <td>Vitamins &#38; supplements</td>
              <td>Skin health</td>
              <td>.060</td>
            </tr>
            <tr valign="top">
              <td>Chronic illness</td>
              <td>Mental health</td>
              <td>.057</td>
            </tr>
            <tr valign="top">
              <td>Chronic illness</td>
              <td>Hospitalization</td>
              <td>.050</td>
            </tr>
            <tr valign="top">
              <td>Alternative medicine</td>
              <td>Health science</td>
              <td>.050</td>
            </tr>
            <tr valign="top">
              <td>Running &#38; cardio</td>
              <td>Injuries &#38; rehabilitation</td>
              <td>.049</td>
            </tr>
            <tr valign="top">
              <td>Headaches &#38; body aches</td>
              <td>Massage</td>
              <td>.047</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
      <table-wrap position="float" id="table6">
        <label>Table 6</label>
        <caption>
          <p>The 10 most correlated pairs of topics, where each pair contains 1 health topic and 1 nonhealth topic.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="400"/>
          <col width="300"/>
          <col width="300"/>
          <thead>
           <tr valign="top">
              <td colspan="2">Topic pair</td>
           
              <td><break/></td>
            </tr>
            <tr valign="top">
              <td>Topic A</td>
              <td>Topic B</td>
              <td>Pearson Correlation</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Suicide &#38; self-harm</td>
              <td>Grunge/emo</td>
              <td>.107</td>
            </tr>
            <tr valign="top">
              <td>Mental health</td>
              <td>Inspiration</td>
              <td>.081</td>
            </tr>
            <tr valign="top">
              <td>Hospitalization</td>
              <td>Cats</td>
              <td>.080</td>
            </tr>
            <tr valign="top">
              <td>Vaccination</td>
              <td>Politics</td>
              <td>.063</td>
            </tr>
            <tr valign="top">
              <td>Hospitalization</td>
              <td>Religion/Christianity</td>
              <td>.059</td>
            </tr>
            <tr valign="top">
              <td>Yoga</td>
              <td>Spirituality</td>
              <td>.056</td>
            </tr>
            <tr valign="top">
              <td>Alternative medicine</td>
              <td>Spirituality</td>
              <td>.048</td>
            </tr>
            <tr valign="top">
              <td>Fitness training</td>
              <td>Sexuality</td>
              <td>.048</td>
            </tr>
            <tr valign="top">
              <td>Suicide &#38; self-harm</td>
              <td>Poetry &#38; quotes</td>
              <td>.043</td>
            </tr>
            <tr valign="top">
              <td>Gym/fitness</td>
              <td>Inspiration</td>
              <td>.036</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
      <title>Principal Findings</title>
      <p>The topic model results show that a large and diverse set of health topics are discussed on Instagram. Qualitatively, we find that the top hashtags tend to be the best descriptors of topics, while caption words give some indication of what kind of messages are associated with the topics, such as whether they are more informational or conversational. The extracted image tags are generally much less coherent, though they do help characterize the types of images that are associated with each topic. For example, many of the topics related to pain contain images of animals, perhaps because users post cheerful images in response to pain. The tag “posing” appears in some topics, suggesting these posts may be informational rather than personal. The <italic>Cancer</italic> topic contains the image tags “group” and “crowd;” it appears that many posts in this topic are about cancer awareness events. In some cases, image tags were the defining characteristics that distinguished clusters that were otherwise very similar, which suggests that images are informative beyond the hashtags and captions to conduct content analyses of Instagram posts.</p>
      <p>Qualitatively, it appears that in most cases the image tags are not specific enough to be useful for directly identifying posts relevant to a specific health application. However, tags of food and beverages appear to be fairly specific and accurate, suggesting that computer vision may help in identifying posts for studies of diet and food consumption. The only previous work we are aware of that used automatically extracted image tags for this purpose is [<xref ref-type="bibr" rid="ref24">24</xref>], which found that image tags were predictive of lifestyle factors; for example, “glass,” “liquid,” and “beverage” were associated with alcohol consumption. The authors suggested that image tags may be useful for identifying stigmatizing behaviors, where social media users may post images of an activity but not explicitly tag the activity. Even nonstigmatized activities, like general food consumption, may not be tagged by a user in a way that is specific enough to identify by text search, while computer vision may help. We observed that image tags extracted from the computer vision API did not usually identify a specific dish, but could at least identify broad categories, like “meat” and “vegetable,” and in some cases were more specific, like “potato” and “doughnut.” We, therefore, argue that this type of computer vision tool can expand the amount of data available for studying patterns in food consumption.</p>
      <p>We gained additional insights by considering the co-occurrences of health topics in the data. For example, the <italic>Vitamins &#38; supplements</italic> topic is less likely to appear in a post in isolation but instead co-occurs with other topics, likely because supplements are discussed in the context in which they are used. Using this data to study nutrition in a population may, therefore, be able to show how nutrition is discussed and applied to specific aspects of health. Some pairs of topics with high correlations may indicate comorbidities, such as <italic>Chronic illness</italic> and <italic>Mental health</italic> [<xref ref-type="bibr" rid="ref39">39</xref>].</p>
      <p>Co-occurrences with nonhealth topics may give insights into other contexts in which health is discussed. We observed that many health topics frequently co-occur with inspirational topics, such as topics containing poetry and quotes, or topics about nature, as well as topics related to spirituality and religion. These types of posts may give insight into how individuals cope with, and support others with, illness and disease.</p>
      <p>An additional observation in some topics is the use of certain hashtags to identify a specific community of users [<xref ref-type="bibr" rid="ref40">40</xref>], such as the #wlscommunity in the weight loss topics. Online health communities have been studied to understand social support and behavior change in managing health conditions [<xref ref-type="bibr" rid="ref41">41</xref>]. Instagram-based communities may be a unique source for studying similar issues. Communities for specific demographic groups (eg, #girlswholift) are also present. In some cases, demographic associations could be gleaned from the image tags, even if the text tags were not explicitly gendered.</p>
      <p>One of our methodological contributions was to repurpose an existing tool, the polylingual topic model, for a new task of combining different modalities of data in a topic model. We showed that automatically extracted image tags from a computer vision API can be treated as text tokens in an existing topic model. Beyond topic models, our observations of the results suggest that these extracted image tags are in some cases useful descriptors of images. We suggest that this type of tool can be applied to images for health research more broadly.</p>
      <p>We observed the same broad set of topics in Instagram that have previously been seen in Twitter [<xref ref-type="bibr" rid="ref3">3</xref>], suggesting that Instagram could serve as a potential data source for many of the same applications for which Twitter is used. Moreover, the presence of first-person language (eg, “i’ve”) in some topics indicates that health posts on Instagram include personal health mentions, which is an essential characteristic for some types of surveillance [<xref ref-type="bibr" rid="ref42">42</xref>]. This has implications for social media-based health surveillance because this suggests that Instagram could be used as a data source for similar areas of research, while having the potential benefit of covering a larger population than Twitter. We do not suggest that one platform is universally better than another, but instead, using data from multiple platforms can result in better surveillance than reliance on one platform [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>].</p>
      <p>Instagram may complement Twitter as a data source because it has a different demographic distribution. The user base of Instagram is younger, lower income, and more urban compared to Twitter [<xref ref-type="bibr" rid="ref45">45</xref>]. These demographics cover populations that are traditionally harder to reach in health research [<xref ref-type="bibr" rid="ref46">46</xref>], and so Instagram may be well-suited for studying such populations. This argument has been made for using Twitter [<xref ref-type="bibr" rid="ref47">47</xref>], yet Instagram has an even heavier bias toward these populations. Additionally, Instagram has a gender bias that Twitter does not have, being nearly 50% more popular among women than men [<xref ref-type="bibr" rid="ref45">45</xref>].</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Not all health topics are discussed widely on Instagram, which may be a limitation of using Instagram. By far the most common topics in Instagram are related to diet and exercise, while topics on acute illness, which would be needed for a task like influenza surveillance, are the least common. This may explain why all prior work we identified using image-sharing platforms for health research was related to lifestyle factors, such as diet (most common) and physical activity. Nonetheless, topics about infectious disease do exist on Instagram, and so it may be worth investigating the utility of contributing this data to an ensemble surveillance system [<xref ref-type="bibr" rid="ref43">43</xref>]. To the best of our knowledge, no prior work has studied Instagram for infectious disease surveillance, which would be a good candidate for future research. However, this study did not collect Instagram posts from a large enough span of time to validate the data for such a task.</p>
        <p>Another limitation of using Instagram is the limited availability of metadata. When crawling Instagram, it is difficult to sample data uniformly across time, as Instagram does not provide a streaming API analogous to Twitter’s widely-used APIs, which makes it difficult to extract long-term patterns, for example, to validate influenza tracking. Location data also appears to be difficult to obtain when crawling from the web. In our dataset, 46% of posts contained a user-specified location string, but these were not in a standard format, and many of them were names of businesses or other specific locations, without reference to a geographic area. Geolocation from Instagram is less well understood in social media research, as is the inference of other demographic attributes that may be important in public health research. Richer data may be available from certain resellers; for example, Gnip, which is the official seller of Twitter data, also sells Instagram data, which can be searched by either tag or geolocation.</p>
        <p>In addition to limitations of this data, there are limitations with the topic model methodology. Topic model evaluation is notoriously difficult [<xref ref-type="bibr" rid="ref35">35</xref>], though research has found that this methodology can provide overlapping insights with more traditional, manual text analysis [<xref ref-type="bibr" rid="ref48">48</xref>]. There is subjectiveness in choosing the number of topics and labeling and categorizing the topics, which we mitigated by having two researchers involved in each step. An advantage of the topic modeling approach for this study is that it can be applied to the entire dataset of nearly 100,000 posts, and the word distributions highlight the features associated with each topic across the three modalities. Furthermore, with such a large number of topics in the data (ie, 47 health topics identified by the approach used here), a typical sample size for manual content analysis, on the order of 1,000 posts [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref21">21</xref>], would be insufficient for accurately learning the prevalence of each topic.</p>
        <p>Another limitation of topic modeling is that the topics characterize <italic>what</italic> is being discussed, but it is difficult to describe <italic>how</italic> the content is presented. For example, the topic model can identify posts that are related to marijuana, but it does not distinguish between personal marijuana use, information about marijuana, or advertisements for cannabis products—distinctions that have been made in prior work using more qualitative methods [<xref ref-type="bibr" rid="ref20">20</xref>]. However, the topic model is still an essential first step of filtering and retrieval, after which topic-specific posts could be analyzed in more depth.</p>
        <p>We note that there exist other methods for identifying thematic patterns in text beyond probabilistic topic models that have been used in health research, such as network-based clustering on term co-occurrence graphs [<xref ref-type="bibr" rid="ref49">49</xref>]. Most such methods, including topic models, rely on co-occurrence statistics of words and have similar properties and limitations. We used the polylingual topic model due to its ability to integrate different “languages” or modalities of data.</p>
        <p>Finally, the grouping of topics into ten overlapping categories is also limited. Some topics were difficult to categorize, and the boundaries of some categories were difficult to define. However, the goal of the categorization is to present the raw results (available from our dataset in <xref ref-type="app" rid="app1">Multimedia Appendix 1</xref>) more concisely. The mapping of topics to categories is transparent (viewable in <xref ref-type="app" rid="app2">Multimedia Appendix 2</xref>) so that the results can be interpreted correctly.</p>
      </sec>
      <sec>
        <title>Conclusion</title>
        <p>This study shows that health is discussed on Instagram in a variety of ways, and there is potential for computer vision techniques to automatically characterize health-related images, which could extend public health surveillance of social media beyond text-based analysis. Our dataset of nearly 100,000 posts is available to allow for the study of specific topics and image tags in more depth. There are pragmatic reasons why this popular platform has been used in research relatively little compared to platforms like Twitter and Facebook, but our results and discussion point to ideas that image-sharing platforms like Instagram may complement other social media data sources in health research.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <app id="app1">
        <title>Multimedia Appendix 1</title>
        <p>The dataset of 96,426 Instagram posts. The raw data is not included for privacy reasons, but can be collected through the URLs provided. The dataset includes the additional information inferred for each post: the image tags, and the topic model probabilities. Descriptions of all 150 topics are also included, as well as the 269 keywords used to search for posts.</p>
        <media xlink:href="publichealth_v4i2e10150_app1.zip" xlink:title="ZIP File (Zip Archive), 5MB"/>
      </app>
      <app id="app2">
        <title>Multimedia Appendix 2</title>
        <p>The descriptions of the 47 health topics. Each slide contains a table corresponding to one of the ten health categories, along with the top ten hashtag, caption, and image features for each topic in that category. Some topics have the same name, in which case we added the topic index (1-150) in parentheses to the name to differentiate these topics.</p>
        <media xlink:href="publichealth_v4i2e10150_app2.pdf" xlink:title="PDF File (Adobe PDF File), 267KB"/>
      </app>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">API</term>
          <def>
            <p>Application Programming Interface</p>
          </def>
        </def-item>
     
        <def-item>
          <term id="abb2">NLTK</term>
          <def>
            <p>Natural Language Toolkit</p>
          </def>
        </def-item>
    
      </def-list>
    </glossary>
    <fn-group>
      <fn fn-type="conflict">
        <p>MJP serves on the advisory board to Sickweather, a company that uses social media to forecast illness.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Paul</surname>
            <given-names>MJ</given-names>
          </name>
          <name name-style="western">
            <surname>Dredze</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Social Monitoring for Public Health</article-title>
        <source>Synthesis Lectures on Information Concepts, Retrieval, and Services</source>  
        <year>2017</year>  
        <publisher-loc>San Rafael, California</publisher-loc>
        <publisher-name>Morgan &#38; Claypool</publisher-name>
        <fpage>1</fpage>  
        <lpage>185</lpage> </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <collab>Statista</collab>
        </person-group>
        <source>Number of monthly active Twitter users worldwide from 1st quarter to 3rd quarter</source>  
        <year>2010</year>  
        <access-date>2018-02-08</access-date>
        <comment>2018 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.statista.com/statistics/282087/number-of-monthly-active-twitter-users/">https://www.statista.com/statistics/282087/number-of-monthly-active-twitter-users/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6x53kEUB1"/></comment> </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Paul</surname>
            <given-names>MJ</given-names>
          </name>
          <name name-style="western">
            <surname>Dredze</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Discovering health topics in social media using topic models</article-title>
        <source>PLoS One</source>  
        <year>2014</year>  
        <volume>9</volume>  
        <issue>8</issue>  
        <fpage>e103408</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0103408"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1371/journal.pone.0103408</pub-id>
        <pub-id pub-id-type="medline">25084530</pub-id>
        <pub-id pub-id-type="pii">PONE-D-14-00554</pub-id>
        <pub-id pub-id-type="pmcid">PMC4118877</pub-id></nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Signorini</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Segre</surname>
            <given-names>AM</given-names>
          </name>
          <name name-style="western">
            <surname>Polgreen</surname>
            <given-names>PM</given-names>
          </name>
        </person-group>
        <article-title>The use of Twitter to track levels of disease activity and public concern in the U.S. during the influenza A H1N1 pandemic</article-title>
        <source>PLoS One</source>  
        <year>2011</year>  
        <volume>6</volume>  
        <issue>5</issue>  
        <fpage>e19467</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0019467"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1371/journal.pone.0019467</pub-id>
        <pub-id pub-id-type="medline">21573238</pub-id>
        <pub-id pub-id-type="pii">PONE-D-10-02464</pub-id>
        <pub-id pub-id-type="pmcid">PMC3087759</pub-id></nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Sadilek</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Kautz</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Silenzio</surname>
            <given-names>V</given-names>
          </name>
        </person-group>
        <article-title>Modeling spread of disease from social interactions</article-title>
        <year>2012</year>  
        <conf-name>Sixth AAAI International Conference on Weblogs and Social Media</conf-name>
        <conf-date>2012</conf-date>
        <conf-loc>Dublin, Ireland</conf-loc></nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Fried</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Surdeanu</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Kobourov</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Hingle</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Bell</surname>
            <given-names>D</given-names>
          </name>
        </person-group>
        <article-title>Analyzing the language of food on social media</article-title>
        <year>2014</year>  
        <conf-name>IEEE International Conference on Big Data</conf-name>
        <conf-date>2014</conf-date>
        <conf-loc>Washington, DC</conf-loc></nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Nguyen</surname>
            <given-names>QC</given-names>
          </name>
          <name name-style="western">
            <surname>Li</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Meng</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Kath</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Nsoesie</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Li</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Wen</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Building a National Neighborhood Dataset From Geotagged Twitter Data for Indicators of Happiness, Diet, and Physical Activity</article-title>
        <source>JMIR Public Health Surveill</source>  
        <year>2016</year>  
        <month>10</month>  
        <day>17</day>  
        <volume>2</volume>  
        <issue>2</issue>  
        <fpage>e158</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://publichealth.jmir.org/2016/2/e158/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/publichealth.5869</pub-id>
        <pub-id pub-id-type="medline">27751984</pub-id>
        <pub-id pub-id-type="pii">v2i2e158</pub-id>
        <pub-id pub-id-type="pmcid">PMC5088343</pub-id></nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Hanson</surname>
            <given-names>CL</given-names>
          </name>
          <name name-style="western">
            <surname>Burton</surname>
            <given-names>SH</given-names>
          </name>
          <name name-style="western">
            <surname>Giraud-Carrier</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>West</surname>
            <given-names>JH</given-names>
          </name>
          <name name-style="western">
            <surname>Barnes</surname>
            <given-names>MD</given-names>
          </name>
          <name name-style="western">
            <surname>Hansen</surname>
            <given-names>B</given-names>
          </name>
        </person-group>
        <article-title>Tweaking and tweeting: exploring Twitter for nonmedical use of a psychostimulant drug (Adderall) among college students</article-title>
        <source>J Med Internet Res</source>  
        <year>2013</year>  
        <volume>15</volume>  
        <issue>4</issue>  
        <fpage>e62</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jmir.org/2013/4/e62/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.2503</pub-id>
        <pub-id pub-id-type="medline">23594933</pub-id>
        <pub-id pub-id-type="pii">v15i4e62</pub-id>
        <pub-id pub-id-type="pmcid">PMC3636321</pub-id></nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Seaman</surname>
            <given-names>I</given-names>
          </name>
          <name name-style="western">
            <surname>Giraud-Carrier</surname>
            <given-names>C</given-names>
          </name>
        </person-group>
        <article-title>Prevalence and attitudes about illicit and prescription drugs on Twitter</article-title>
        <year>2016</year>  
        <conf-name>IEEE International Conference on Healthcare Informatics</conf-name>
        <conf-date>2016</conf-date>
        <conf-loc>Chicago, IL</conf-loc></nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Ordun</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Blake</surname>
            <given-names>JW</given-names>
          </name>
          <name name-style="western">
            <surname>Rosidi</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Grigoryan</surname>
            <given-names>V</given-names>
          </name>
          <name name-style="western">
            <surname>Reffett</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Aslam</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Gentilcore</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Cyran</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Shelton</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Klenk</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Open source health intelligence for foodborne illness event characterization</article-title>
        <source>Online Journal of Public Health Informatics</source>  
        <year>2013</year>  
        <volume>5</volume>  
        <issue>1</issue>  
        <fpage>e128</fpage> </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Sadilek</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Kautz</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>DiPrete</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Labus</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Portman</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Teitel</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Silenzio</surname>
            <given-names>V</given-names>
          </name>
        </person-group>
        <article-title>Deploying nEmesis: Preventing foodborne illness by data mining social media</article-title>
        <year>2016</year>  
        <conf-name>Twenty-Eighth Annual Conference on Innovative Applications of Artificial Intelligence</conf-name>
        <conf-date>2016</conf-date>
        <conf-loc>Phoenix, AZ</conf-loc></nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>De Choudhury</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Gamon</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Predicting depression via social media</article-title>
        <year>2013</year>  
        <conf-name>International Conference on Weblogs and Social Media</conf-name>
        <conf-date>2013</conf-date>
        <conf-loc>Boston, MA</conf-loc></nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Coppersmith</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Dredze</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Harman</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Hollingshead</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Mitchell</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>CLPsych 2015 Shared Task: Depression and PTSD on Twitter</article-title>
        <year>2015</year>  
        <conf-name>NAACL Workshop on Computational Linguistics and Clinical Psychology</conf-name>
        <conf-date>2015</conf-date>
        <conf-loc>Denver, CO</conf-loc></nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <collab>Statista</collab>
        </person-group>
        <source>Number of monthly active Instagram users from January 2013 to September 2017</source>  
        <year>2013</year>  
        <access-date>2018-02-08</access-date>
        <comment>2018 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.statista.com/statistics/253577/number-of-monthly-active-instagram-users/">https://www.statista.com/statistics/253577/number-of-monthly-active-instagram-users/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6x57Ymx2Y"/></comment> </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Mejova</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Haddadi</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Noulas</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Weber</surname>
            <given-names>I</given-names>
          </name>
        </person-group>
        <article-title>#foodporn: Obesity patterns in culinary interactions</article-title>
        <year>2015</year>  
        <conf-name>International Conference on Digital Health</conf-name>
        <conf-date>2015</conf-date>
        <conf-loc>Florence, Italy</conf-loc></nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>De Choudhury</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Sharma</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Kiciman</surname>
            <given-names>E</given-names>
          </name>
        </person-group>
        <article-title>Characterizing dietary choices, nutrition, and language in food deserts via social media</article-title>
        <year>2016</year>  
        <conf-name>Conference on Computer Supported Cooperative Work and Social Computing</conf-name>
        <conf-date>2016</conf-date>
        <conf-loc>San Francisco, CA</conf-loc></nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Yom-Tov</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Fernandez-Luque</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Weber</surname>
            <given-names>I</given-names>
          </name>
          <name name-style="western">
            <surname>Crain</surname>
            <given-names>SP</given-names>
          </name>
        </person-group>
        <article-title>Pro-anorexia and pro-recovery photo sharing: a tale of two warring tribes</article-title>
        <source>J Med Internet Res</source>  
        <year>2012</year>  
        <volume>14</volume>  
        <issue>6</issue>  
        <fpage>e151</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jmir.org/2012/6/e151/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.2239</pub-id>
        <pub-id pub-id-type="medline">23134671</pub-id>
        <pub-id pub-id-type="pii">v14i6e151</pub-id>
        <pub-id pub-id-type="pmcid">PMC3510717</pub-id></nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Pless</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Begtrup</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Alkulaib</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Counts</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Harnett</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Manning</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Xuan</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Broniatowski</surname>
            <given-names>D</given-names>
          </name>
        </person-group>
        <article-title>Recognizing images of eating disorders in social media</article-title>
        <year>2017</year>  
        <conf-name>AMIA Workshop on Social Media Mining for Health Applications</conf-name>
        <conf-date>2017</conf-date>
        <conf-loc>Washington, DC</conf-loc></nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Chu</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Allem</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Cruz</surname>
            <given-names>TB</given-names>
          </name>
          <name name-style="western">
            <surname>Unger</surname>
            <given-names>JB</given-names>
          </name>
        </person-group>
        <article-title>Vaping on Instagram: cloud chasing, hand checks and product placement</article-title>
        <source>Tob Control</source>  
        <year>2016</year>  
        <month>09</month>  
        <day>22</day>  
        <pub-id pub-id-type="doi">10.1136/tobaccocontrol-2016-053052</pub-id>
        <pub-id pub-id-type="medline">27660111</pub-id>
        <pub-id pub-id-type="pii">tobaccocontrol-2016-053052</pub-id></nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Cavazos-Rehg</surname>
            <given-names>PA</given-names>
          </name>
          <name name-style="western">
            <surname>Krauss</surname>
            <given-names>MJ</given-names>
          </name>
          <name name-style="western">
            <surname>Sowles</surname>
            <given-names>SJ</given-names>
          </name>
          <name name-style="western">
            <surname>Bierut</surname>
            <given-names>LJ</given-names>
          </name>
        </person-group>
        <article-title>Marijuana-Related Posts on Instagram</article-title>
        <source>Prev Sci</source>  
        <year>2016</year>  
        <month>08</month>  
        <volume>17</volume>  
        <issue>6</issue>  
        <fpage>710</fpage>  
        <lpage>20</lpage>  
        <pub-id pub-id-type="doi">10.1007/s11121-016-0669-9</pub-id>
        <pub-id pub-id-type="medline">27262456</pub-id>
        <pub-id pub-id-type="pii">10.1007/s11121-016-0669-9</pub-id>
        <pub-id pub-id-type="pmcid">PMC4939096</pub-id></nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Cherian</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Westbrook</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Ramo</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Sarkar</surname>
            <given-names>U</given-names>
          </name>
        </person-group>
        <article-title>Representations of Codeine Misuse on Instagram: Content Analysis</article-title>
        <source>JMIR Public Health Surveill</source>  
        <year>2018</year>  
        <month>03</month>  
        <day>20</day>  
        <volume>4</volume>  
        <issue>1</issue>  
        <fpage>e22</fpage>  
        <pub-id pub-id-type="doi">10.2196/publichealth.8144</pub-id>
        <pub-id pub-id-type="medline">29559422</pub-id>
        <pub-id pub-id-type="pii">v4i1e22</pub-id></nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Allem</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Escobedo</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Chu</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Boley Cruz</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Unger</surname>
            <given-names>JB</given-names>
          </name>
        </person-group>
        <article-title>Images of Little Cigars and Cigarillos on Instagram Identified by the Hashtag #swisher: Thematic Analysis</article-title>
        <source>J Med Internet Res</source>  
        <year>2017</year>  
        <month>07</month>  
        <day>14</day>  
        <volume>19</volume>  
        <issue>7</issue>  
        <fpage>e255</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.jmir.org/2017/7/e255/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/jmir.7634</pub-id>
        <pub-id pub-id-type="medline">28710057</pub-id>
        <pub-id pub-id-type="pii">v19i7e255</pub-id>
        <pub-id pub-id-type="pmcid">PMC5533944</pub-id></nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Allem</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Chu</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Cruz</surname>
            <given-names>TB</given-names>
          </name>
          <name name-style="western">
            <surname>Unger</surname>
            <given-names>JB</given-names>
          </name>
        </person-group>
        <article-title>Waterpipe Promotion and Use on Instagram: #Hookah</article-title>
        <source>Nicotine Tob Res</source>  
        <year>2017</year>  
        <month>01</month>  
        <day>11</day>  
        <pub-id pub-id-type="doi">10.1093/ntr/ntw329</pub-id>
        <pub-id pub-id-type="medline">28077449</pub-id>
        <pub-id pub-id-type="pii">ntw329</pub-id></nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Garimella</surname>
            <given-names>VRK</given-names>
          </name>
          <name name-style="western">
            <surname>Alfayad</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Weber</surname>
            <given-names>I</given-names>
          </name>
        </person-group>
        <article-title>Social media image analysis for public health</article-title>
        <year>2016</year>  
        <conf-name>CHI Conference on Human Factors in Computing Systems</conf-name>
        <conf-date>2016</conf-date>
        <conf-loc>San Jose, CA</conf-loc></nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Paul</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Dredze</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Exploring health topics in Chinese social media: an analysis of Sina Weibo</article-title>
        <year>2014</year>  
        <conf-name>AAAI Workshop on the World Wide Web and Public Health Intelligence</conf-name>
        <conf-date>2014</conf-date>
        <conf-loc>Québec City, Canada</conf-loc></nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="web">
        <source>Instagram Developer Documentation</source>  
        <access-date>2018-05-04</access-date>
        <comment>2018 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://www.instagram.com/developer/">https://www.instagram.com/developer/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6zApRoL0h"/></comment> </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="web">
        <source>Twitter: Sample realtime tweets</source>  
        <access-date>2018-05-04</access-date>
        <comment>2018 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://developer.twitter.com/en/docs/tweets/sample-realtime/overview/GET_statuse_sample">https://developer.twitter.com/en/docs/tweets/sample-realtime/overview/GET_statuse_sample</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6zApUY2BJ"/></comment> </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Lui</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Baldwin</surname>
            <given-names>T</given-names>
          </name>
        </person-group>
        <article-title>Langid.py: an off-the-shelf language identification tool</article-title>
        <year>2012</year>  
        <conf-name>Association for Computational Linguistics</conf-name>
        <conf-date>2012</conf-date>
        <conf-loc>Jeju, South Korea</conf-loc></nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Bird</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Loper</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Klein</surname>
            <given-names>E</given-names>
          </name>
        </person-group>
        <source>Natural Language Processing with Python</source>  
        <year>2009</year>  
        <publisher-loc>Sebastopol, CA</publisher-loc>
        <publisher-name>O'Reilly Media Inc</publisher-name></nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <collab>Microsoft Azure</collab>
        </person-group>
        <source>Computer Vision API</source>  
        <access-date>2018-02-08</access-date>
        <comment>2018 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://azure.microsoft.com/en-us/services/cognitive-services/computer-vision/">https://azure.microsoft.com/en-us/services/cognitive-services/computer-vision/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6x57iYvRl"/></comment> </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Blei</surname>
            <given-names>DM</given-names>
          </name>
        </person-group>
        <article-title>Probabilistic topic models</article-title>
        <source>Commun. ACM</source>  
        <year>2012</year>  
        <month>04</month>  
        <day>01</day>  
        <volume>55</volume>  
        <issue>4</issue>  
        <fpage>77</fpage>  
        <pub-id pub-id-type="doi">10.1145/2133806.2133826</pub-id></nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Prier</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Smith</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Giraud-Carrier</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Hanson</surname>
            <given-names>C</given-names>
          </name>
        </person-group>
        <article-title>Identifying health-related topics on Twitter: An exploration of tobacco-related tweets as a test topic</article-title>
        <year>2011</year>  
        <conf-name>International Conference on Social Computing, Behavioral-cultural Modeling and Prediction</conf-name>
        <conf-date>2011</conf-date>
        <conf-loc>College Park, MD</conf-loc></nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Ghosh</surname>
            <given-names>DD</given-names>
          </name>
          <name name-style="western">
            <surname>Guha</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <article-title>What are we 'tweeting' about obesity? Mapping tweets with Topic Modeling and Geographic Information System</article-title>
        <source>Cartogr Geogr Inf Sci</source>  
        <year>2013</year>  
        <volume>40</volume>  
        <issue>2</issue>  
        <fpage>90</fpage>  
        <lpage>102</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/25126022"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1080/15230406.2013.776210</pub-id>
        <pub-id pub-id-type="medline">25126022</pub-id>
        <pub-id pub-id-type="pmcid">PMC4128420</pub-id></nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Wallace</surname>
            <given-names>BC</given-names>
          </name>
          <name name-style="western">
            <surname>Paul</surname>
            <given-names>MJ</given-names>
          </name>
          <name name-style="western">
            <surname>Sarkar</surname>
            <given-names>U</given-names>
          </name>
          <name name-style="western">
            <surname>Trikalinos</surname>
            <given-names>TA</given-names>
          </name>
          <name name-style="western">
            <surname>Dredze</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>A large-scale quantitative analysis of latent factors and sentiment in online doctor reviews</article-title>
        <source>J Am Med Inform Assoc</source>  
        <year>2014</year>  
        <volume>21</volume>  
        <issue>6</issue>  
        <fpage>1098</fpage>  
        <lpage>1103</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://jamia.oxfordjournals.org/cgi/pmidlookup?view=long&#38;pmid=24918109"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1136/amiajnl-2014-002711</pub-id>
        <pub-id pub-id-type="medline">24918109</pub-id>
        <pub-id pub-id-type="pii">amiajnl-2014-002711</pub-id>
        <pub-id pub-id-type="pmcid">PMC4215053</pub-id></nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Chang</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Boyd-Graber</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Gerrish</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Blei</surname>
            <given-names>D</given-names>
          </name>
        </person-group>
        <article-title>Reading tea leaves: how humans interpret topic models</article-title>
        <year>2009</year>  
        <conf-name>International Conference on Neural Information Processing Systems (NIPS)</conf-name>
        <conf-date>2009</conf-date>
        <conf-loc>Vancouver, Canada</conf-loc></nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Mimno</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Wallach</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Naradowsky</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Smith</surname>
            <given-names>DA</given-names>
          </name>
          <name name-style="western">
            <surname>McCallum</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>Polylingual topic models</article-title>
        <year>2009</year>  
        <conf-name>Empirical Methods in Natural Language Processing</conf-name>
        <conf-date>2009</conf-date>
        <conf-loc>Singapore</conf-loc></nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>McCallum</surname>
            <given-names>AK</given-names>
          </name>
        </person-group>
        <source>MALLET: A Machine Learning for Language Toolkit</source>  
        <year>2002</year>  
        <access-date>2018-06-18</access-date>
        <comment>
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://mallet.cs.umass.edu/">http://mallet.cs.umass.edu/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="70HBNf8dX"/></comment> </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Santarossa</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Coyne</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Lisinski</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Woodruff</surname>
            <given-names>SJ</given-names>
          </name>
        </person-group>
        <article-title>#fitspo on Instagram: A mixed-methods approach using Netlytic and photo analysis, uncovering the online discussion and author/image characteristics</article-title>
        <source>J Health Psychol</source>  
        <year>2016</year>  
        <month>11</month>  
        <day>01</day>  
        <fpage>1359105316676334</fpage>  
        <pub-id pub-id-type="doi">10.1177/1359105316676334</pub-id>
        <pub-id pub-id-type="medline">27852889</pub-id>
        <pub-id pub-id-type="pii">1359105316676334</pub-id></nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Fishbain</surname>
            <given-names>DA</given-names>
          </name>
          <name name-style="western">
            <surname>Cutler</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Rosomoff</surname>
            <given-names>HL</given-names>
          </name>
          <name name-style="western">
            <surname>Rosomoff</surname>
            <given-names>RS</given-names>
          </name>
        </person-group>
        <article-title>Chronic pain-associated depression: antecedent or consequence of chronic pain? A review</article-title>
        <source>Clin J Pain</source>  
        <year>1997</year>  
        <month>06</month>  
        <volume>13</volume>  
        <issue>2</issue>  
        <fpage>116</fpage>  
        <lpage>137</lpage>  
        <pub-id pub-id-type="medline">9186019</pub-id></nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Cook</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Kenthapadi</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Mishra</surname>
            <given-names>N</given-names>
          </name>
        </person-group>
        <article-title>Group chats on Twitter</article-title>
        <year>2013</year>  
        <conf-name>International Conference on World Wide Web</conf-name>
        <conf-date>2013</conf-date>
        <conf-loc>Rio de Janeiro, Brazil</conf-loc></nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Willis</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Royne</surname>
            <given-names>MB</given-names>
          </name>
        </person-group>
        <article-title>Online Health Communities and Chronic Disease Self-Management</article-title>
        <source>Health Commun</source>  
        <year>2017</year>  
        <month>03</month>  
        <volume>32</volume>  
        <issue>3</issue>  
        <fpage>269</fpage>  
        <lpage>278</lpage>  
        <pub-id pub-id-type="doi">10.1080/10410236.2016.1138278</pub-id>
        <pub-id pub-id-type="medline">27218836</pub-id></nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Karisani</surname>
            <given-names>P</given-names>
          </name>
          <name name-style="western">
            <surname>Agichtein</surname>
            <given-names>E</given-names>
          </name>
        </person-group>
        <article-title>Did you really just have a heart attack? Towards robust detection of personal health mentions in social media</article-title>
        <year>2018</year>  
        <conf-name>The Web Conference (WWW)</conf-name>
        <conf-date>2018</conf-date>
        <conf-loc>Lyon, France</conf-loc></nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Santillana</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Nguyen</surname>
            <given-names>AT</given-names>
          </name>
          <name name-style="western">
            <surname>Dredze</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Paul</surname>
            <given-names>MJ</given-names>
          </name>
          <name name-style="western">
            <surname>Nsoesie</surname>
            <given-names>EO</given-names>
          </name>
          <name name-style="western">
            <surname>Brownstein</surname>
            <given-names>JS</given-names>
          </name>
        </person-group>
        <article-title>Combining Search, Social Media, and Traditional Data Sources to Improve Influenza Surveillance</article-title>
        <source>PLoS Comput Biol</source>  
        <year>2015</year>  
        <month>10</month>  
        <volume>11</volume>  
        <issue>10</issue>  
        <fpage>e1004513</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pcbi.1004513"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1371/journal.pcbi.1004513</pub-id>
        <pub-id pub-id-type="medline">26513245</pub-id>
        <pub-id pub-id-type="pii">PCOMPBIOL-D-15-00856</pub-id>
        <pub-id pub-id-type="pmcid">PMC4626021</pub-id></nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>McGough</surname>
            <given-names>SF</given-names>
          </name>
          <name name-style="western">
            <surname>Brownstein</surname>
            <given-names>JS</given-names>
          </name>
          <name name-style="western">
            <surname>Hawkins</surname>
            <given-names>JB</given-names>
          </name>
          <name name-style="western">
            <surname>Santillana</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Forecasting Zika Incidence in the 2016 Latin America Outbreak Combining Traditional Disease Surveillance with Search, Social Media, and News Report Data</article-title>
        <source>PLoS Negl Trop Dis</source>  
        <year>2017</year>  
        <month>01</month>  
        <volume>11</volume>  
        <issue>1</issue>  
        <fpage>e0005295</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://journals.plos.org/plosntds/article?id=10.1371/journal.pntd.0005295"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1371/journal.pntd.0005295</pub-id>
        <pub-id pub-id-type="medline">28085877</pub-id>
        <pub-id pub-id-type="pii">PNTD-D-16-01733</pub-id>
        <pub-id pub-id-type="pmcid">PMC5268704</pub-id></nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="web">
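        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Greenwood</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Perrin</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Duggan</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <source>Pew Research Center: Social media update 2016</source>  
        <year>2016</year>  
        <comment>
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.pewinternet.org/2016/11/11/social-media-update-2016/">http://www.pewinternet.org/2016/11/11/social-media-update-2016/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6yMIOCpgJ"/></comment> </nlm-citation>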
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Bonevski</surname>
            <given-names>B</given-names>
          </name>
          <name name-style="western">
            <surname>Randell</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Paul</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Chapman</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Twyman</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Bryant</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Brozek</surname>
            <given-names>I</given-names>
          </name>
          <name name-style="western">
            <surname>Hughes</surname>
            <given-names>C</given-names>
          </name>
        </person-group>
        <article-title>Reaching the hard-to-reach: a systematic review of strategies for improving health and medical research with socially disadvantaged groups</article-title>
        <source>BMC Med Res Methodol</source>  
        <year>2014</year>  
        <volume>14</volume>  
        <fpage>42</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://www.biomedcentral.com/1471-2288/14/42"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1186/1471-2288-14-42</pub-id>
        <pub-id pub-id-type="medline">24669751</pub-id>
        <pub-id pub-id-type="pii">1471-2288-14-42</pub-id>
        <pub-id pub-id-type="pmcid">PMC3974746</pub-id></nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Huang</surname>
            <given-names>X</given-names>
          </name>
          <name name-style="western">
            <surname>Smith</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Paul</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Ryzhkov</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Quinn</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Broniatowski</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Dredze</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Examining patterns of influenza vaccination in social media</article-title>
        <year>2017</year>  
        <conf-name>AAAI Joint Workshop on Health Intelligence</conf-name>
        <conf-date>2017</conf-date>
        <conf-loc>San Francisco, CA</conf-loc></nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Baumer</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Mimno</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Guha</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Quan</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Gay</surname>
            <given-names>G</given-names>
          </name>
        </person-group>
        <article-title>Comparing grounded theory and topic modeling: Extreme divergence or unlikely convergence?</article-title>
        <source>Journal of the Association for Information Science and Technology</source>  
        <year>2017</year>  
        <month>04</month>  
        <day>28</day>  
        <volume>68</volume>  
        <issue>6</issue>  
        <fpage>1397</fpage>  
        <lpage>1410</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1002/asi.23786"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1002/asi.23786</pub-id></nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Allem</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Ferrara</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Uppu</surname>
            <given-names>SP</given-names>
          </name>
          <name name-style="western">
            <surname>Cruz</surname>
            <given-names>TB</given-names>
          </name>
          <name name-style="western">
            <surname>Unger</surname>
            <given-names>JB</given-names>
          </name>
        </person-group>
        <article-title>E-Cigarette Surveillance With Social Media Data: Social Bots, Emerging Topics, and Trends</article-title>
        <source>JMIR Public Health Surveill</source>  
        <year>2017</year>  
        <month>12</month>  
        <day>20</day>  
        <volume>3</volume>  
        <issue>4</issue>  
        <fpage>e98</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri" xlink:type="simple" xlink:href="http://publichealth.jmir.org/2017/4/e98/"/>
        </comment>  
        <pub-id pub-id-type="doi">10.2196/publichealth.8641</pub-id>
        <pub-id pub-id-type="medline">29263018</pub-id>
        <pub-id pub-id-type="pii">v3i4e98</pub-id>
        <pub-id pub-id-type="pmcid">PMC5752967</pub-id></nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
