<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JPH</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Public Health Surveill</journal-id>
      <journal-title>JMIR Public Health and Surveillance</journal-title>
      <issn pub-type="epub">2369-2960</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i4e32405</article-id>
      <article-id pub-id-type="pmid">35468092</article-id>
      <article-id pub-id-type="doi">10.2196/32405</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Toward Using Twitter for PrEP-Related Interventions: An Automated Natural Language Processing Pipeline for Identifying Gay or Bisexual Men in the United States</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Bradley</surname>
            <given-names>Heather</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Banda Orozco</surname>
            <given-names>Juan</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Serrano</surname>
            <given-names>Pedro</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Klein</surname>
            <given-names>Ari Z</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Biostatistics, Epidemiology, and Informatics</institution>
            <institution>Perelman School of Medicine</institution>
            <institution>University of Pennsylvania</institution>
            <addr-line>Blockley Hall, 4th Floor</addr-line>
            <addr-line>423 Guardian Drive</addr-line>
            <addr-line>Philadelphia, PA, 19104</addr-line>
            <country>United States</country>
            <phone>1 215 746 1101</phone>
            <email>ariklein@pennmedicine.upenn.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8281-3464</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Meanley</surname>
            <given-names>Steven</given-names>
          </name>
          <degrees>PhD, MPH</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6098-9733</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>O'Connor</surname>
            <given-names>Karen</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7709-3813</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Bauermeister</surname>
            <given-names>José A</given-names>
          </name>
          <degrees>PhD, MPH</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9276-2306</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Gonzalez-Hernandez</surname>
            <given-names>Graciela</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6416-9556</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Biostatistics, Epidemiology, and Informatics</institution>
        <institution>Perelman School of Medicine</institution>
        <institution>University of Pennsylvania</institution>
        <addr-line>Philadelphia, PA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Family and Community Health</institution>
        <institution>School of Nursing</institution>
        <institution>University of Pennsylvania</institution>
        <addr-line>Philadelphia, PA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Ari Z Klein <email>ariklein@pennmedicine.upenn.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>4</month>
        <year>2022</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>25</day>
        <month>4</month>
        <year>2022</year>
      </pub-date>
      <volume>8</volume>
      <issue>4</issue>
      <elocation-id>e32405</elocation-id>
      <history>
        <date date-type="received">
          <day>26</day>
          <month>7</month>
          <year>2021</year>
        </date>
        <date date-type="rev-request">
          <day>8</day>
          <month>11</month>
          <year>2021</year>
        </date>
        <date date-type="rev-recd">
          <day>19</day>
          <month>11</month>
          <year>2021</year>
        </date>
        <date date-type="accepted">
          <day>24</day>
          <month>2</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©Ari Z Klein, Steven Meanley, Karen O'Connor, José A Bauermeister, Graciela Gonzalez-Hernandez. Originally published in JMIR Public Health and Surveillance (https://publichealth.jmir.org), 25.04.2022.</copyright-statement>
      <copyright-year>2022</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Public Health and Surveillance, is properly cited. The complete bibliographic information, a link to the original publication on https://publichealth.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://publichealth.jmir.org/2022/4/e32405" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Pre-exposure prophylaxis (PrEP) is highly effective at preventing the acquisition of HIV. There is a substantial gap, however, between the number of people in the United States who have indications for PrEP and the number of them who are prescribed PrEP. Although Twitter content has been analyzed as a source of PrEP-related data (eg, barriers), methods have not been developed to enable the use of Twitter as a platform for implementing PrEP-related interventions.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>Men who have sex with men (MSM) are the population most affected by HIV in the United States. Therefore, the objectives of this study were to (1) develop an automated natural language processing (NLP) pipeline for identifying men in the United States who have reported on Twitter that they are gay, bisexual, or MSM and (2) assess the extent to which they demographically represent MSM in the United States with new HIV diagnoses.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>Between September 2020 and January 2021, we used the Twitter Streaming Application Programming Interface (API) to collect more than 3 million tweets containing keywords that men may include in posts reporting that they are gay, bisexual, or MSM. We deployed handwritten, high-precision regular expressions—designed to filter out noise and identify actual self-reports—on the tweets and their user profile metadata. We identified 10,043 unique users geolocated in the United States and drew upon a validated NLP tool to automatically identify their ages.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>By manually distinguishing true- and false-positive self-reports in the tweets or profiles of 1000 (10%) of the 10,043 users identified by our automated pipeline, we established that our pipeline has a precision of 0.85. Among the 8756 users for which a US state–level geolocation was detected, 5096 (58.2%) were in the 10 states with the highest numbers of new HIV diagnoses. Among the 6240 users for which a county-level geolocation was detected, 4252 (68.1%) were in counties or states considered priority jurisdictions by the <italic>Ending the HIV Epidemic</italic> initiative. Furthermore, the age distribution of the users reflected that of MSM in the United States with new HIV diagnoses.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Our automated NLP pipeline can be used to identify MSM in the United States who may be at risk of acquiring HIV, laying the groundwork for using Twitter on a large scale to directly target PrEP-related interventions at this population.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>natural language processing</kwd>
        <kwd>social media</kwd>
        <kwd>data mining</kwd>
        <kwd>PrEP</kwd>
        <kwd>pre-exposure prophylaxis</kwd>
        <kwd>HIV</kwd>
        <kwd>AIDS</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Pre-exposure prophylaxis (PrEP) with antiretroviral drugs is highly effective at preventing the acquisition of HIV in men who have sex with men (MSM) [<xref ref-type="bibr" rid="ref1">1</xref>]. There is a substantial gap, however, between the number of people in the United States who have indications for PrEP, including 25% of MSM [<xref ref-type="bibr" rid="ref2">2</xref>], and the number of them who are prescribed PrEP [<xref ref-type="bibr" rid="ref3">3</xref>]; approximately one-third of primary care physicians (PCPs) in the United States who are aware of PrEP have prescribed PrEP or referred a patient for PrEP [<xref ref-type="bibr" rid="ref4">4</xref>]. Although efforts should be made to increase PCPs’ adoption of PrEP recommendations into routine clinical practice, PCP-based interventions are limited because some MSM, especially younger men, face challenges when disclosing their same-sex sexual behaviors to their PCPs [<xref ref-type="bibr" rid="ref5">5</xref>]. Based on the findings of a recent study by Reuter et al [<xref ref-type="bibr" rid="ref6">6</xref>] that examined Twitter users’ attitudes toward being monitored for health-related research, some MSM may be more open to PrEP-related interventions on social media, such as targeted messages or advertisements.</p>
      <p>Hannaford et al [<xref ref-type="bibr" rid="ref7">7</xref>] found that social media can help identify factors for implementing PrEP-related interventions that are not captured by traditional research methods, and they suggested that social media may present novel opportunities to implement PrEP-related interventions. Although Twitter content has been analyzed as a source of PrEP-related data (eg, barriers) [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>], to our knowledge, methods have not been developed to enable the use of Twitter as a platform for PrEP-related interventions. The foremost requirement for implementing PrEP-related interventions on Twitter is to identify users in the populations that have indications for PrEP. Given that MSM are the population most affected by HIV in the United States [<xref ref-type="bibr" rid="ref10">10</xref>], the objectives of this study were to (1) develop an automated natural language processing (NLP) pipeline for identifying men in the United States who have reported on Twitter that they are gay, bisexual, or MSM and (2) assess the extent to which they demographically represent MSM in the United States with new HIV diagnoses. This study seeks to lay the groundwork for using Twitter on a large scale to directly target PrEP-related interventions at MSM who may be at risk of acquiring HIV.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Ethical Considerations</title>
        <p>The Institutional Review Board of the University of Pennsylvania reviewed this study and deemed it exempt human subjects research under Category (4) of Paragraph (b) of the US Code of Federal Regulations Title 45 Section 46.101 for publicly available data sources (45 CFR §46.101(b)(4)).</p>
      </sec>
      <sec>
        <title>Data Collection</title>
        <p>Between September 2020 and January 2021, we used the Twitter Streaming Application Programming Interface (API) to collect more than 3 million tweets containing keywords that men may include in posts reporting that they are gay, bisexual, or MSM. As a preliminary approach, we deployed handwritten, high-precision regular expressions—search patterns designed to automatically match text strings—on the 3 million tweets to filter out noise and identify actual self-reports (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). After automatically removing retweets and “reported speech” (eg, quotations, news headlines) [<xref ref-type="bibr" rid="ref11">11</xref>], the regular expressions matched 8603 tweets that were posted by 6358 users geolocated in the United States [<xref ref-type="bibr" rid="ref12">12</xref>].</p>
        <p>In addition to tweet-based regular expressions, we also deployed handwritten regular expressions on the user profile metadata of the 3 million tweets collected from the Twitter Streaming API (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The regular expressions matched the profile metadata of 4127 users geolocated in the United States [<xref ref-type="bibr" rid="ref12">12</xref>]. After removing duplicate users from our tweet- and profile-based searches, we identified a total of 10,043 unique users. <xref rid="figure1" ref-type="fig">Figure 1</xref> illustrates our automated pipeline for identifying men in the United States who have reported on Twitter that they are gay, bisexual, or MSM. To assess the extent to which they demographically represent MSM in the United States with new HIV diagnoses, we analyzed the state- and county-level geolocations [<xref ref-type="bibr" rid="ref12">12</xref>] of these 10,043 users and drew upon a validated NLP tool [<xref ref-type="bibr" rid="ref13">13</xref>] to automatically identify their ages.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Automated natural language processing pipeline for identifying men in the United States who have reported on Twitter that they are gay, bisexual, or men who have sex with men.</p>
          </caption>
          <graphic xlink:href="publichealth_v8i4e32405_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Pipeline Evaluation</title>
        <p>True positives and false positives were manually distinguished by 2 annotators in a random sample of 1000 (10%) of the 10,043 users that were identified by our automated pipeline, consisting of 500 matching tweets and 500 matching profiles. <italic>True positives</italic> were defined as tweets or profiles in which the users reported that they are gay, bisexual, or MSM. Overall interannotator agreement (Cohen κ) based on independent, dual annotations for all 1000 users was 0.81, which is deemed to be “almost perfect agreement” [<xref ref-type="bibr" rid="ref14">14</xref>]. More specifically, interannotator agreement was 0.83 for the 500 tweets and 0.79 for the 500 profiles. Upon resolving the disagreements, 417 (83.4%) tweets and 430 (86%) profiles were annotated as true positives and 83 (16.6%) tweets and 70 (14%) profiles were annotated as false positives. Based on this evaluation, our automated pipeline has an overall precision of 0.85, where <italic>precision = true positives / (true positives + false positives)</italic>. <xref ref-type="table" rid="table1">Table 1</xref> provides examples of tweets and profiles that were manually annotated as true or false positives. The majority of the profiles that were annotated as false positives belonged to users who mentioned being transgender or nonbinary—populations that are beyond the scope of this study.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Sample manual annotations of tweets and profiles.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="130"/>
            <col width="700"/>
            <col width="170"/>
            <thead>
              <tr valign="top">
                <td>Type</td>
                <td>Text</td>
                <td>Label</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Tweet</td>
                <td>End the FDA’s discriminatory and unscientific policy against gay men like me donating blood.</td>
                <td>True positive</td>
              </tr>
              <tr valign="top">
                <td>Tweet</td>
                <td>As a bi guy we get so little representation, and almost all of its negative. It’s frustrating.</td>
                <td>True positive</td>
              </tr>
              <tr valign="top">
                <td>Tweet</td>
                <td>Today, we remember Matthew Shepard who’s life was cut short as a result of a hate crime due to his identity as a gay male.</td>
                <td>False positive</td>
              </tr>
              <tr valign="top">
                <td>Profile</td>
                <td>A proud black gay guy.</td>
                <td>True positive</td>
              </tr>
              <tr valign="top">
                <td>Profile</td>
                <td>50+ gay trans man, writer, film and food lover. He/him OR they/them.</td>
                <td>False positive</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Demographics</title>
        <p>To assess the utility of our automated pipeline for identifying MSM in the United States who may be particularly at risk of acquiring HIV, we analyzed their state- and county-level geolocations and ages. We detected a US state–level geolocation for 8756 (87.6%) of the 10,043 users identified by our automated pipeline, including users from all 50 states and the District of Columbia. As <xref rid="figure2" ref-type="fig">Figure 2</xref> illustrates, the largest numbers of users were detected in California, New York, Texas, Florida, Illinois, Pennsylvania, Ohio, and Georgia. We detected a county-level geolocation for 6240 (71.2%) of these 8756 users. <xref ref-type="table" rid="table2">Table 2</xref> presents the 15 counties for which we detected at least 100 users. We detected an age of ≥13 years [<xref ref-type="bibr" rid="ref10">10</xref>] for 4782 (47.6%) of the 10,043 users, with a mean age of 31.9 (SD 13.1) years and a median age of 29 years. <xref ref-type="table" rid="table3">Table 3</xref> presents the age distribution, based on each user’s most recent tweet containing a self-report of age.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Number of Twitter users, by state, identified by our automated pipeline between September 2020 and January 2021.</p>
          </caption>
          <graphic xlink:href="publichealth_v8i4e32405_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Counties with at least 100 Twitter users identified by our automated pipeline between September 2020 and January 2021.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td>US county</td>
                <td>Users (N=6240), n (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Los Angeles County, CA</td>
                <td>535 (8.6)</td>
              </tr>
              <tr valign="top">
                <td>New York County, NY</td>
                <td>417 (6.7)</td>
              </tr>
              <tr valign="top">
                <td>Cook County, IL</td>
                <td>318 (5.1)</td>
              </tr>
              <tr valign="top">
                <td>District of Columbia, DC</td>
                <td>237 (3.8)</td>
              </tr>
              <tr valign="top">
                <td>King County, WA</td>
                <td>192 (3.1)</td>
              </tr>
              <tr valign="top">
                <td>Fulton County, GA</td>
                <td>155 (2.5)</td>
              </tr>
              <tr valign="top">
                <td>San Mateo County, CA</td>
                <td>151 (2.4)</td>
              </tr>
              <tr valign="top">
                <td>Multnomah County, OR</td>
                <td>128 (2.1)</td>
              </tr>
              <tr valign="top">
                <td>Kings County, NY</td>
                <td>127 (2)</td>
              </tr>
              <tr valign="top">
                <td>Dallas County, TX</td>
                <td>123 (2)</td>
              </tr>
              <tr valign="top">
                <td>Philadelphia County, PA</td>
                <td>121 (1.9)</td>
              </tr>
              <tr valign="top">
                <td>Harris County, TX</td>
                <td>116 (1.9)</td>
              </tr>
              <tr valign="top">
                <td>Maricopa County, AZ</td>
                <td>111 (1.8)</td>
              </tr>
              <tr valign="top">
                <td>Suffolk County, MA</td>
                <td>110 (1.8)</td>
              </tr>
              <tr valign="top">
                <td>Travis County, TX</td>
                <td>109 (1.7)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Age distribution of Twitter users identified by our automated pipeline between September 2020 and January 2021.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td>Age group (years)</td>
                <td>Users (N=4782), n (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>13-24</td>
                <td>1630 (34.1)</td>
              </tr>
              <tr valign="top">
                <td>25-34</td>
                <td>1644 (34.4)</td>
              </tr>
              <tr valign="top">
                <td>35-44</td>
                <td>704 (14.7)</td>
              </tr>
              <tr valign="top">
                <td>45-54</td>
                <td>449 (9.4)</td>
              </tr>
              <tr valign="top">
                <td>≥55</td>
                <td>355 (7.4)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Our study demonstrates that gay men, bisexual men, or MSM in the United States publicly report their sexual orientation on Twitter and that these users can be accurately identified on a large scale. Moreover, among the 8756 users for which our automated pipeline detected a US state–level geolocation, 5096 (58.2%) were in the 10 states with the highest numbers of new HIV diagnoses [<xref ref-type="bibr" rid="ref10">10</xref>]. Among the 6240 users for which a county-level geolocation was detected, 4252 (68.1%) were in counties or states considered priority jurisdictions by the <italic>Ending the HIV Epidemic</italic> initiative [<xref ref-type="bibr" rid="ref15">15</xref>]. Furthermore, the age distribution of the users reflected the ranking of the most frequent age groups with new HIV diagnoses among MSM in the United States [<xref ref-type="bibr" rid="ref10">10</xref>], with the 25-34 years age group first and the 13-24 years age group second. More specifically, these 2 age groups represent both the majority of the users in this study and the majority of MSM with new HIV diagnoses [<xref ref-type="bibr" rid="ref10">10</xref>]. The mean (31.9 years) and median (29 years) ages of the users are within the age group (25-34 years) with the largest number of new HIV diagnoses, which is also the only age group in which HIV infections have increased since 2014 [<xref ref-type="bibr" rid="ref10">10</xref>]. Therefore, our automated pipeline can be used as the basis for PrEP-related interventions targeted directly at MSM who are largely in the regions and age groups most affected by HIV in the United States, including younger men who may face challenges when discussing their same-sex sexual behaviors with their PCPs [<xref ref-type="bibr" rid="ref5">5</xref>].</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This paper presented an automated NLP pipeline that can be used to identify MSM in the United States who may be at risk of acquiring HIV, laying the groundwork for using Twitter on a large scale to directly target PrEP-related interventions at this population.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Regular expressions.</p>
        <media xlink:href="publichealth_v8i4e32405_app1.docx" xlink:title="DOCX File, 12 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">API</term>
          <def>
            <p>application programming interface</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">MSM</term>
          <def>
            <p>men who have sex with men</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">PCP</term>
          <def>
            <p>primary care physician</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">PrEP</term>
          <def>
            <p>pre-exposure prophylaxis</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors would like to thank Ivan Flores for contributing to software development and Alexis Upshur for contributing to annotating the Twitter data for validation of the pipeline. This research was supported by a grant from the Penn Center for AIDS Research, a National Institutes of Health–funded program (P30 AI 045008).</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>AZK contributed to designing the pipeline, developing the sets of regular expressions, preparing the data set for validation, resolving the annotators’ disagreements, analyzing the demographics, and writing the manuscript. SM contributed to guiding data collection from Twitter and data validation and editing the manuscript. KO contributed to annotating the Twitter data for validation, calculating interannotator agreement, and editing the manuscript. JB contributed to guiding the overall study design and data collection from Twitter and editing the manuscript. GGH contributed to conceptualizing the research study, guiding the overall study design and data collection from Twitter, and editing the manuscript.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Grant</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Lama</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Anderson</surname>
              <given-names>PL</given-names>
            </name>
            <name name-style="western">
              <surname>McMahan</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>AY</given-names>
            </name>
            <name name-style="western">
              <surname>Vargas</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Goicochea</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Casapía</surname>
              <given-names>Martín</given-names>
            </name>
            <name name-style="western">
              <surname>Guanira-Carranza</surname>
              <given-names>JV</given-names>
            </name>
            <name name-style="western">
              <surname>Ramirez-Cardich</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Montoya-Herrera</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Fernández</surname>
              <given-names>Telmo</given-names>
            </name>
            <name name-style="western">
              <surname>Veloso</surname>
              <given-names>VG</given-names>
            </name>
            <name name-style="western">
              <surname>Buchbinder</surname>
              <given-names>SP</given-names>
            </name>
            <name name-style="western">
              <surname>Chariyalertsak</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schechter</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bekker</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Mayer</surname>
              <given-names>KH</given-names>
            </name>
            <name name-style="western">
              <surname>Kallás</surname>
              <given-names>Esper Georges</given-names>
            </name>
            <name name-style="western">
              <surname>Amico</surname>
              <given-names>KR</given-names>
            </name>
            <name name-style="western">
              <surname>Mulligan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Bushman</surname>
              <given-names>LR</given-names>
            </name>
            <name name-style="western">
              <surname>Hance</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ganoza</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Defechereux</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Postle</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>McConnell</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Rooney</surname>
              <given-names>JF</given-names>
            </name>
            <name name-style="western">
              <surname>Jaffe</surname>
              <given-names>HS</given-names>
            </name>
            <name name-style="western">
              <surname>Martinez</surname>
              <given-names>AI</given-names>
            </name>
            <name name-style="western">
              <surname>Burns</surname>
              <given-names>DN</given-names>
            </name>
            <name name-style="western">
              <surname>Glidden</surname>
              <given-names>DV</given-names>
            </name>
            <collab>iPrEx Study Team</collab>
          </person-group>
          <article-title>Preexposure chemoprophylaxis for HIV prevention in men who have sex with men</article-title>
          <source>N Engl J Med</source>
          <year>2010</year>
          <month>12</month>
          <day>30</day>
          <volume>363</volume>
          <issue>27</issue>
          <fpage>2587</fpage>
          <lpage>2599</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/21091279"/>
          </comment>
          <pub-id pub-id-type="doi">10.1056/NEJMoa1011205</pub-id>
          <pub-id pub-id-type="medline">21091279</pub-id>
          <pub-id pub-id-type="pmcid">PMC3079639</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>DK</given-names>
            </name>
            <name name-style="western">
              <surname>Van Handel</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wolitski</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Stryker</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Hall</surname>
              <given-names>HI</given-names>
            </name>
            <name name-style="western">
              <surname>Prejean</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Koenig</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Valleroy</surname>
              <given-names>LA</given-names>
            </name>
          </person-group>
          <article-title>Vital signs: estimated percentages and numbers of adults with indications for preexposure prophylaxis to prevent HIV acquisition—United States, 2015</article-title>
          <source>MMWR Morb Mortal Wkly Rep</source>
          <year>2015</year>
          <month>11</month>
          <day>27</day>
          <volume>64</volume>
          <issue>46</issue>
          <fpage>1291</fpage>
          <lpage>1295</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.15585/mmwr.mm6446a4"/>
          </comment>
          <pub-id pub-id-type="doi">10.15585/mmwr.mm6446a4</pub-id>
          <pub-id pub-id-type="medline">26606148</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>YA</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>DK</given-names>
            </name>
            <name name-style="western">
              <surname>Harris</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Hoover</surname>
              <given-names>KW</given-names>
            </name>
          </person-group>
          <article-title>HIV preexposure prophylaxis, by race and ethnicity — United States, 2014–2016</article-title>
          <source>MMWR Morb Mortal Wkly Rep</source>
          <year>2018</year>
          <month>10</month>
          <day>19</day>
          <volume>67</volume>
          <issue>41</issue>
          <fpage>1147</fpage>
          <lpage>1150</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.15585/mmwr.mm6741a3"/>
          </comment>
          <pub-id pub-id-type="doi">10.15585/mmwr.mm6741a3</pub-id>
          <pub-id pub-id-type="medline">30335734</pub-id>
          <pub-id pub-id-type="pmcid">PMC6193685</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Blackstock</surname>
              <given-names>OJ</given-names>
            </name>
            <name name-style="western">
              <surname>Moore</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>Berkenblit</surname>
              <given-names>GV</given-names>
            </name>
            <name name-style="western">
              <surname>Calabrese</surname>
              <given-names>SK</given-names>
            </name>
            <name name-style="western">
              <surname>Cunningham</surname>
              <given-names>CO</given-names>
            </name>
            <name name-style="western">
              <surname>Fiellin</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>VV</given-names>
            </name>
            <name name-style="western">
              <surname>Phillips</surname>
              <given-names>KA</given-names>
            </name>
            <name name-style="western">
              <surname>Tetrault</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Edelman</surname>
              <given-names>EJ</given-names>
            </name>
          </person-group>
          <article-title>A cross-sectional online survey of HIV pre-exposure prophylaxis adoption among primary care physicians</article-title>
          <source>J Gen Intern Med</source>
          <year>2017</year>
          <month>01</month>
          <volume>32</volume>
          <issue>1</issue>
          <fpage>62</fpage>
          <lpage>70</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/27778215"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s11606-016-3903-z</pub-id>
          <pub-id pub-id-type="medline">27778215</pub-id>
          <pub-id pub-id-type="pii">10.1007/s11606-016-3903-z</pub-id>
          <pub-id pub-id-type="pmcid">PMC5215171</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Petroll</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Mitchell</surname>
              <given-names>JW</given-names>
            </name>
          </person-group>
          <article-title>Health insurance and disclosure of same-sex sexual behaviors among gay and bisexual men in same-sex relationships</article-title>
          <source>LGBT Health</source>
          <year>2015</year>
          <month>03</month>
          <volume>2</volume>
          <issue>1</issue>
          <fpage>48</fpage>
          <lpage>54</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26790018"/>
          </comment>
          <pub-id pub-id-type="doi">10.1089/lgbt.2013.0050</pub-id>
          <pub-id pub-id-type="medline">26790018</pub-id>
          <pub-id pub-id-type="pmcid">PMC4855732</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reuter</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Angyan</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Merchant</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Zimmer</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Public concern about monitoring Twitter users and their conversations to recruit for clinical trials: survey study</article-title>
          <source>J Med Internet Res</source>
          <year>2019</year>
          <month>10</month>
          <day>30</day>
          <volume>21</volume>
          <issue>10</issue>
          <fpage>e15455</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2019/10/e15455/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/15455</pub-id>
          <pub-id pub-id-type="medline">31670698</pub-id>
          <pub-id pub-id-type="pii">v21i10e15455</pub-id>
          <pub-id pub-id-type="pmcid">PMC6914244</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hannaford</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lipshie-Williams</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Starrels</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Arnsten</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Rizzuto</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Jacobs</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>VV</given-names>
            </name>
          </person-group>
          <article-title>The use of online posts to identify barriers to and facilitators of HIV pre-exposure prophylaxis (PrEP) among men who have sex with men: a comparison to a systematic review of the peer-reviewed literature</article-title>
          <source>AIDS Behav</source>
          <year>2018</year>
          <month>04</month>
          <volume>22</volume>
          <issue>4</issue>
          <fpage>1080</fpage>
          <lpage>1095</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/29285638"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s10461-017-2011-3</pub-id>
          <pub-id pub-id-type="medline">29285638</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10461-017-2011-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC5991474</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McLaughlin</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Meng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>An</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nam</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Propagation of information about preexposure prophylaxis (PrEP) for HIV prevention through Twitter</article-title>
          <source>Health Commun</source>
          <year>2016</year>
          <month>08</month>
          <volume>31</volume>
          <issue>8</issue>
          <fpage>998</fpage>
          <lpage>1007</lpage>
          <pub-id pub-id-type="doi">10.1080/10410236.2015.1027033</pub-id>
          <pub-id pub-id-type="medline">26756069</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schwartz</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Grimm</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>PrEP on Twitter: information, barriers, and stigma</article-title>
          <source>Health Commun</source>
          <year>2017</year>
          <month>04</month>
          <volume>32</volume>
          <issue>4</issue>
          <fpage>509</fpage>
          <lpage>516</lpage>
          <pub-id pub-id-type="doi">10.1080/10410236.2016.1140271</pub-id>
          <pub-id pub-id-type="medline">27295507</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>Centers for Disease Control and Prevention</collab>
          </person-group>
          <article-title>Diagnoses of HIV infection in the United States and dependent areas, 2018 (Updated)</article-title>
          <source>HIV Surveillance Report, 2018 (Updated)</source>
          <year>2020</year>
          <month>05</month>
          <access-date>2021-03-23</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cdc.gov/hiv/library/reports/hiv-surveillance/vol-31/index.html">https://www.cdc.gov/hiv/library/reports/hiv-surveillance/vol-31/index.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Klein</surname>
              <given-names>AZ</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Weissenbacher</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Levine</surname>
              <given-names>LD</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez-Hernandez</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>A natural language processing pipeline to advance the use of Twitter data for digital epidemiology of adverse pregnancy outcomes</article-title>
          <source>J Biomed Inform</source>
          <year>2020</year>
          <volume>112S</volume>
          <fpage>100076</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2590-177X(20)30010-X"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.yjbinx.2020.100076</pub-id>
          <pub-id pub-id-type="medline">34417007</pub-id>
          <pub-id pub-id-type="pii">S2590-177X(20)30010-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Paul</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Bergsma</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tran</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Carmen: a Twitter geo-location system with applications to public health</article-title>
          <year>2013</year>
          <conf-name>The Association for the Advancement of Artificial Intelligence (AAAI) Workshop Expanding the Boundaries of Health Informatics Using AI</conf-name>
          <conf-date>Jul 14-15, 2013</conf-date>
          <conf-loc>Bellevue, WA</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Klein</surname>
              <given-names>AZ</given-names>
            </name>
            <name name-style="western">
              <surname>Magge</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez-Hernandez</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>ReportAGE: automatically extracting the exact age of Twitter users based on self-reports in tweets</article-title>
          <source>PLoS One</source>
          <year>2022</year>
          <volume>17</volume>
          <issue>1</issue>
          <fpage>e0262087</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0262087"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0262087</pub-id>
          <pub-id pub-id-type="medline">35077484</pub-id>
          <pub-id pub-id-type="pii">PONE-D-21-08851</pub-id>
          <pub-id pub-id-type="pmcid">PMC8789116</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Viera</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Garrett</surname>
              <given-names>JM</given-names>
            </name>
          </person-group>
          <article-title>Understanding interobserver agreement: the kappa statistic</article-title>
          <source>Fam Med</source>
          <year>2005</year>
          <month>05</month>
          <volume>37</volume>
          <issue>5</issue>
          <fpage>360</fpage>
          <lpage>363</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.stfm.org/fmhub/fm2005/May/Anthony360.pdf"/>
          </comment>
          <pub-id pub-id-type="medline">15883903</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>Centers for Disease Control and Prevention</collab>
          </person-group>
          <source>Ending the HIV epidemic in the U.S. - Jurisdictions</source>
          <access-date>2021-05-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cdc.gov/endhiv/jurisdictions.html">https://www.cdc.gov/endhiv/jurisdictions.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
