<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JPH</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Public Health Surveill</journal-id>
      <journal-title>JMIR Public Health and Surveillance</journal-title>
      <issn pub-type="epub">2369-2960</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v6i2e15917</article-id>
      <article-id pub-id-type="pmid">32352389</article-id>
      <article-id pub-id-type="doi">10.2196/15917</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Comparing Methods for Record Linkage for Public Health Action: Matching Algorithm Validation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Sanchez</surname>
            <given-names>Travis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Bosh</surname>
            <given-names>Karin</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Enamorado</surname>
            <given-names>Ted</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Avoundjian</surname>
            <given-names>Tigran</given-names>
          </name>
          <degrees>MPH, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Epidemiology</institution>
            <institution>School of Public Health</institution>
            <institution>University of Washington</institution>
            <addr-line>1959 NE Pacific Street</addr-line>
            <addr-line>Seattle, WA, 98195</addr-line>
            <country>United States</country>
            <phone>1 5431065</phone>
            <email>tavoun@uw.edu</email>
          </address>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6202-2043</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Dombrowski</surname>
            <given-names>Julia C</given-names>
          </name>
          <degrees>MPH, MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1907-9428</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Golden</surname>
            <given-names>Matthew R</given-names>
          </name>
          <degrees>MPH, MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7449-3774</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Hughes</surname>
            <given-names>James P</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5034-3157</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Guthrie</surname>
            <given-names>Brandon L</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2059-3291</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Baseman</surname>
            <given-names>Janet</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1974-8196</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Sadinle</surname>
            <given-names>Mauricio</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7092-3877</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Epidemiology</institution>
        <institution>School of Public Health</institution>
        <institution>University of Washington</institution>
        <addr-line>Seattle, WA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>HIV/STD Program</institution>
        <institution>Public Health–Seattle and King County</institution>
        <addr-line>Seattle, WA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Division of Allergy and Infectious Diseases</institution>
        <institution>Department of Medicine</institution>
        <institution>University of Washington</institution>
        <addr-line>Seattle, WA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Biostatistics</institution>
        <institution>School of Public Health</institution>
        <institution>University of Washington</institution>
        <addr-line>Seattle, WA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Global Health</institution>
        <institution>School of Public Health</institution>
        <institution>University of Washington</institution>
        <addr-line>Seattle, WA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Tigran Avoundjian <email>tavoun@uw.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <season>Apr-Jun</season>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>30</day>
        <month>4</month>
        <year>2020</year>
      </pub-date>
      <volume>6</volume>
      <issue>2</issue>
      <elocation-id>e15917</elocation-id>
      <history>
        <date date-type="received">
          <day>21</day>
          <month>8</month>
          <year>2019</year>
        </date>
        <date date-type="rev-request">
          <day>17</day>
          <month>10</month>
          <year>2019</year>
        </date>
        <date date-type="rev-recd">
          <day>20</day>
          <month>11</month>
          <year>2019</year>
        </date>
        <date date-type="accepted">
          <day>10</day>
          <month>1</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Tigran Avoundjian, Julia C Dombrowski, Matthew R Golden, James P Hughes, Brandon L Guthrie, Janet Baseman, Mauricio Sadinle. Originally published in JMIR Public Health and Surveillance (http://publichealth.jmir.org), 30.04.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Public Health and Surveillance, is properly cited. The complete bibliographic information, a link to the original publication on http://publichealth.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="http://publichealth.jmir.org/2020/2/e15917/" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Many public health departments use record linkage between surveillance data and external data sources to inform public health interventions. However, little guidance is available to inform these activities, and many health departments rely on deterministic algorithms that may miss many true matches. In the context of public health action, these missed matches lead to missed opportunities to deliver interventions and may exacerbate existing health inequities.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to compare the performance of record linkage algorithms commonly used in public health practice.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We compared five deterministic (exact, Stenger, Ocampo 1, Ocampo 2, and Bosh) and two probabilistic record linkage algorithms (fastLink and beta record linkage [BRL]) using simulations and a real-world scenario. We simulated pairs of datasets with varying numbers of errors per record and the number of matching records between the two datasets (ie, overlap). We matched the datasets using each algorithm and calculated their recall (ie, sensitivity, the proportion of true matches identified by the algorithm) and precision (ie, positive predictive value, the proportion of matches identified by the algorithm that were true matches). We estimated the average computation time by performing a match with each algorithm 20 times while varying the size of the datasets being matched. In a real-world scenario, HIV and sexually transmitted disease surveillance data from King County, Washington, were matched to identify people living with HIV who had a syphilis diagnosis in 2017. We calculated the recall and precision of each algorithm compared with a composite standard based on the agreement in matching decisions across all the algorithms and manual review.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>In simulations, BRL and fastLink maintained a high recall at nearly all data quality levels, while being comparable with deterministic algorithms in terms of precision. Deterministic algorithms typically failed to identify matches in scenarios with low data quality. All the deterministic algorithms had a shorter average computation time than the probabilistic algorithms. BRL had the slowest overall computation time (14 min when both datasets contained 2000 records). In the real-world scenario, BRL had the lowest trade-off between recall (309/309, 100.0%) and precision (309/312, 99.0%).</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Probabilistic record linkage algorithms maximize the number of true matches identified, reducing gaps in the coverage of interventions and maximizing the reach of public health action.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>medical record linkage</kwd>
        <kwd>public health surveillance</kwd>
        <kwd>public health practice</kwd>
        <kwd>data management</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>A central goal of public health surveillance is to provide continuous and systematically collected health-related data to inform public health practice and guide interventions to improve individual and population health [<xref ref-type="bibr" rid="ref1">1</xref>]. For example, health departments in the United States use HIV surveillance data [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref5">5</xref>] to identify people living with HIV (PLWH) who are not engaged in HIV care to provide assistance and services to facilitate care engagement—a strategy known as Data to Care [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. In this way, surveillance data are used to improve both HIV care and prevention as well as to reduce inequities in access and utilization of HIV care resources to improve the well-being of vulnerable populations with HIV.</p>
        <p>When used in isolation from other sources of information, public health surveillance can be inefficient and ineffective. In the case of Data to Care, many PLWH who appear to be out of care in HIV surveillance data because they have not had a recent HIV viral load or CD4 test have actually moved out of the jurisdiction and engaged in HIV care elsewhere [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. Thus, Data to Care strategies that rely entirely on HIV surveillance data involve time-consuming individual case investigations to determine whether persons are truly out of care, although that information is often readily available in other data sources, such as Ryan White–funded care programs, sexually transmitted disease (STD) surveillance, electronic health records, or HIV surveillance systems in other jurisdictions. The Centers for Disease Control and Prevention (CDC) is supporting efforts to match surveillance data between jurisdictions through programs such as the <italic>black box</italic> system, in which HIV surveillance data from multiple jurisdictions are matched to identify PLWH who have moved from one jurisdiction to another [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. In addition, several health departments are seeking to improve real-time record linkage between HIV and STD surveillance to provide HIV care relinkage services as part of STD partner services [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref17">17</xref>].</p>
        <p>Despite the widespread use of record linkage techniques throughout public health, little information is available to guide this process from the perspective of algorithm accuracy and the implications of missing true matches and identifying false matches. There are two primary approaches to record linkage: deterministic algorithms and probabilistic algorithms [<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref20">20</xref>]. Deterministic algorithms use exact matching on specific variables or a set of matching rules to identify matched record pairs [<xref ref-type="bibr" rid="ref18">18</xref>]. In contrast, probabilistic algorithms use statistical methods to identify the optimal set of matches, which often involves estimating and thresholding the probability that two records are a match [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. Probabilistic algorithms typically have higher recall than deterministic algorithms, especially when linking databases that have high rates of data quality errors [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. However, probabilistic algorithms also tend to be more computationally complex than deterministic algorithms and may require more computing resources to implement in practice [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref20">20</xref>].</p>
        <p>Recent studies of record linkage involving health department HIV/STD surveillance data have presented deterministic algorithms to link HIV surveillance data with other data sources, improve the quality of HIV surveillance data, and facilitate Data to Care investigations [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. These algorithms are enticing because they are not computationally complex and can be executed quickly [<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref20">20</xref>]. As they are rule based, deterministic algorithms are intuitive to understand, easy to implement, and easy to modify. In addition (and perhaps more importantly), deterministic algorithms typically have low rates of false-positive matches. As a major concern of working with HIV data is inadvertent disclosure of HIV status, minimizing false matches is crucial to preserving individual privacy. However, although deterministic algorithms may be highly specific, they may be overly conservative in identifying matches, leading to large numbers of missed matches. Missed matches represent missed opportunities to deliver public health interventions to individuals who need them, and depending on their population distribution, missed matches could magnify health inequities. Probabilistic algorithms could potentially offer increased sensitivity compared with deterministic algorithms, while still identifying a small number of false matches.</p>
      </sec>
      <sec>
        <title>Objectives</title>
        <p>The performance of deterministic algorithms compared with probabilistic algorithms in the context of public health record linkage is unknown. The goal of this study was to compare the recall, precision, and computation time of record linkage algorithms often used in HIV/STD programs to better define the trade-offs between these algorithms in a variety of record linkage scenarios.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Study Design</title>
        <p>We compared deterministic and probabilistic record linkage algorithms using two approaches. First, we compared the recall, precision, and computation time of different algorithms using paired simulated datasets, varying the quality of the data and overlap between datasets (ie, the proportion of true matches in each pair of datasets). Second, we conducted a <italic>real-world</italic> matching scenario involving public health surveillance data from Public Health–Seattle &amp; King County (PHSKC) to assess whether our simulation findings were generalizable to record linkage involving real datasets, where the exact error rate and overlap are difficult to assess.</p>
        <p>This study received a human subjects research exemption from the University of Washington Institutional Review Board because it involves the use of simulated data and public health surveillance data used to inform and improve existing operational public health department activities.</p>
      </sec>
      <sec>
        <title>Matching Algorithms</title>
        <p>We compared seven algorithms used to conduct record linkage involving public health surveillance data: exact matching, four deterministic, and two probabilistic algorithms (<xref ref-type="table" rid="table1">Table 1</xref>). The exact matching algorithm identifies the matched pairs of records between two datasets using an exact match on first name, last name, and year of birth. This was chosen as a <italic>base case</italic> algorithm because it uses the simplest rule set to match two datasets. The four deterministic algorithms (<italic>Stenger</italic>, <italic>Ocampo 1</italic>, <italic>Ocampo 2</italic>, and <italic>Bosh</italic>) define rule sets for identifying a match using patient-identifying information, such as first name, last name, date of birth, gender, and race (<xref ref-type="table" rid="table1">Table 1</xref>) [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. The Ocampo and Bosh algorithms also include matching criteria that require social security numbers (SSNs), which were omitted from our study because we did not have SSNs in the datasets used. In addition, the original Ocampo and Bosh algorithms used sex at birth, whereas we have used current gender. These modifications to these algorithms are noted in <xref ref-type="table" rid="table1">Table 1</xref>. These algorithms were chosen because they have been recently cited as matching algorithms used to conduct record linkage involving HIV surveillance data. Notably, the Ocampo algorithms have been used by the CDC to match interstate HIV surveillance data [<xref ref-type="bibr" rid="ref15">15</xref>]. The Stenger algorithm was obtained directly from the PHSKC HIV/STD program, where it has been implemented for several record linkage projects involving HIV surveillance data. 
This algorithm was also recently used by the Mississippi State Department of Health to link their HIV and STD surveillance databases to integrate HIV care relinkage services into STD partner services [<xref ref-type="bibr" rid="ref17">17</xref>].</p>
        <p>The two probabilistic algorithms are <italic>fastLink</italic> and <italic>beta record linkage</italic> (BRL). fastLink is an implementation of the traditional Fellegi-Sunter approach to record linkage [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. This approach uses comparisons of the shared fields between two datasets (ie, first name, last name, year of birth, month of birth, day of birth, gender, and race) to compute the conditional probability that each record pair is a match. Record pairs are classified as <italic>matches</italic> or <italic>nonmatches</italic> based on thresholding these conditional probabilities. BRL is similar to the Fellegi-Sunter approach but uses a Bayesian implementation to explore the space of plausible matching configurations between the datafiles [<xref ref-type="bibr" rid="ref22">22</xref>]. By using a Bayesian approach, BRL allows for quantifying uncertainty on the matching decisions and finds the optimal set of matches by minimizing the expected misclassification errors based on a loss function.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Record linkage algorithms.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="140"/>
            <col width="650"/>
            <col width="210"/>
            <thead>
              <tr valign="top">
                <td>Algorithm</td>
                <td>Match criteria</td>
                <td>Source</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Exact match</td>
                <td>Exact match on first name, last name, AND year of birth</td>
                <td>Not applicable</td>
              </tr>
              <tr valign="top">
                <td>Stenger</td>
                <td>Best record pairs with a score of 50+ based on the following criteria:<break/><list list-type="bullet"><list-item><p>+20 points: first 3 letters of the last name and 2 letters of the first name</p></list-item><list-item><p>+15 points: exact match on the full name</p></list-item><list-item><p>+15 points: match on birth year (±2 years)</p></list-item><list-item><p>+5 points: exact match on the year of birth</p></list-item><list-item><p>+10 points: exact match on the month of birth</p></list-item><list-item><p>+5 points: exact match on the day of birth</p></list-item></list></td>
                <td>Public Health Seattle King County and Avoundjian et al [<xref ref-type="bibr" rid="ref17">17</xref>]</td>
              </tr>
              <tr valign="top">
                <td>Ocampo 1</td>
                <td>Record pairs that met the following criteria:<break/><list list-type="bullet"><list-item><p>Exact<sup>a</sup>: last name, first name, date of birth, race, gender<sup>b</sup>, AND SSN<sup>c</sup> OR</p></list-item><list-item><p>Very high<sup>a</sup>: (last name, first name, date of birth, AND gender<sup>b</sup>) OR SSN OR</p></list-item><list-item><p>High: last name, first name, date of birth, AND (gender<sup>b</sup> OR race)</p></list-item></list></td>
                <td>Ocampo et al [<xref ref-type="bibr" rid="ref16">16</xref>]</td>
              </tr>
              <tr valign="top">
                <td>Ocampo 2</td>
                <td>Record pairs that matched in Ocampo 1 OR met the following criteria:<break/><list list-type="bullet"><list-item><p>Medium high: last name, first name (Soundex), date of birth, AND gender<sup>b</sup></p></list-item></list></td>
                <td>Ocampo et al [<xref ref-type="bibr" rid="ref16">16</xref>]</td>
              </tr>
              <tr valign="top">
                <td>Bosh</td>
                <td>Records that met any of the following matching keys:<break/><list list-type="bullet"><list-item><p>Full last name+first 6 letters of first name+full date of birth</p></list-item><list-item><p>First letter of the last name+letters 3 to 10 of the last name+letters 2 to 9 of the first name+full date of birth</p></list-item><list-item><p>Letters 2 to 7 of the last name+first 6 letters of the last name+full date of birth</p></list-item><list-item><p>First 2 letters of the last name+first 3 letters of the first name+full SSN+full date of birth<sup>d</sup></p></list-item><list-item><p>Full last name+first 3 letters of the first name+full date of birth</p></list-item><list-item><p>Letters 3 to 5 of the last name+first 3 letters of the first name+full date of birth</p></list-item><list-item><p>First 4 letters of the last name+first 4 letters of the first name+full date of birth</p></list-item><list-item><p>First letter of the last name+letters 3 to 10 of the last name+letters 2 to 9 of the first name+month and year of birth<sup>e</sup></p></list-item><list-item><p>First letter of the last name+letters 3 to 10 of the last name+letters 2 to 9 of the first name+day and year of birth<sup>e</sup></p></list-item><list-item><p>Full SSN<sup>d,e</sup></p></list-item><list-item><p>First 5 letters of the last name+first 4 letters of the first name+month and year of birth<sup>e</sup></p></list-item><list-item><p>First letter of the last name+letters 3 to 10 of the last name+letters 2 to 9 of the first name+(day OR month of birth)+year of birth, switching the first and last names in 1 dataset<sup>e</sup></p></list-item><list-item><p>First 5 letters of the last name+first 4 letters of the first name+month and year of birth, switching the first and last names in 1 dataset<sup>e</sup></p></list-item></list></td>
                <td>Bosh et al [<xref ref-type="bibr" rid="ref25">25</xref>]</td>
              </tr>
              <tr valign="top">
                <td>fastLink (Fellegi-Sunter)</td>
                <td>Calculates match/nonmatch weights using an expectation maximization algorithm and computes a match probability for each record pair. Pairs are classified as a match if their match probability is above 0.85. The following fields are used to estimate the match probability:<break/><list list-type="bullet"><list-item><p>First name and last name: partial match using Jaro Winkler string distance, with 3 agreement levels<sup>f</sup></p></list-item><list-item><p>Year of birth, month of birth, day of birth, gender and race: exact match</p></list-item></list></td>
                <td>Enamorado et al [<xref ref-type="bibr" rid="ref26">26</xref>]</td>
              </tr>
              <tr valign="top">
                <td>Beta Record Linkage</td>
                <td>Uses a Gibbs sampler to sample plausible matching configurations and uses a loss function to identify the optimal set of matching pairs. The following fields are used by the algorithm:<break/><list list-type="bullet"><list-item><p>First name and last name: partial match using Levenshtein string distance, with 4 agreement levels<sup>g</sup></p></list-item><list-item><p>Year of birth, month of birth, day of birth, gender, and race: exact match</p></list-item></list></td>
                <td>Sadinle [<xref ref-type="bibr" rid="ref22">22</xref>]</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>We omitted social security number from the exact and very high match tiers because of lack of social security number data.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>Original algorithm used birth sex instead of gender.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>SSN: social security number.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>Key was not implemented because of lack of social security number data.</p>
            </fn>
            <fn id="table1fn5">
              <p><sup>e</sup>These keys require the following additional criteria to be met to be considered a match: exact match on gender OR full date of birth AND first name in the HIV dataset not among the 20 most common names in the HIV dataset AND last name in the HIV dataset not among the 20 most common names in the HIV dataset. Note: the original algorithm used birth sex instead of gender in these criteria. In addition, the original criteria also required a match on digits 1 to 4 and 6 to 9 of social security number, which was not implemented because of lack of social security number data.</p>
            </fn>
            <fn id="table1fn6">
              <p><sup>f</sup>FastLink’s default agreement levels for partially matched fields: 0 to 0.87: not a match, 0.88 to 0.91: partial match, and 0.92+: exact match.</p>
            </fn>
            <fn id="table1fn7">
              <p><sup>g</sup>Beta record linkage’s default agreement levels for partially matched fields: 0 to 0.49: not a match, 0.5 to 0.74: probable nonmatch, 0.75 to 0.98: probable match, and 0.99+: exact match.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Hypothetical Matching Scenario</title>
        <p>To compare record linkage algorithm performance in the context of public health action, we considered the scenario of linking records between HIV and STD surveillance data to identify syphilis cases reported in the past year among PLWH. Such record linkage is conducted by many health departments in the United States as a way to integrate HIV care engagement activities into syphilis partner services. We assumed that both HIV and STD surveillance data contain the following shared fields that can be used for record linkage: first name, last name, date of birth (year, month, and day), gender, and race.</p>
      </sec>
      <sec>
        <title>Simulation Study</title>
        <p>Simulations were used to compare the accuracy of the selected record linkage algorithms in scenarios with varying dataset size, overlap, and measurement error. GeCo (Australian National University, Canberra, Australia), a Python-based program that creates realistic datasets of personal information, was used to generate pairs of datasets based on STD surveillance data from PHSKC’s partner services data system, known as Public Health Information Management System (PHIMS) [<xref ref-type="bibr" rid="ref27">27</xref>]. In each simulation, we generated two datasets containing records of 2000 individuals each. A number of individuals were included in both datasets, which we refer to as the <italic>overlap</italic> between the datasets. We considered scenarios where 5%, 10%, 25%, and 50% of individuals overlapped. To generate each pair of datasets, we used the distribution of values for each field from PHIMS. Using PHIMS, we created frequency tables for first and last names, year of birth, gender (male, female, transgender male, and transgender female), and race/ethnicity (Asian, black, Hispanic/Latinx, Native American/Alaska Native, Native Hawaiian/other Pacific Islander, white, other, and multiple race). We created a joint frequency table for month and day of birth, giving an equal sampling weight for each day of the year. For each individual, a value was sampled from each frequency table to generate a number of clean records, which were then <italic>corrupted</italic> to create the datasets. For each pair of datasets, the first dataset consisted of <italic>clean</italic> records, and the second dataset consisted of <italic>corrupted</italic> records. Each corrupted record has a fixed number of erroneous fields that are selected at random. For each dataset size and overlap scenario, we generated datasets containing 1, 2, 3, 4, and 5 erroneous fields per record. 
The types of errors introduced into each field were selected at random from a set of possibilities that vary from field to field (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The types of errors are edits (insertions, deletions, substitutions, and transpositions of characters in a string), keyboard (typing errors based on a QWERTY keyboard layout), phonetic (using a list of predefined phonetic rules), value swap (an entire value is swapped with another value selected from a predefined list of possible values), and missing values. The probability of missing values was determined by the frequency of missing values for each field in PHSKC’s STD surveillance data. The probabilities of the remaining error types were defined based on the default probabilities provided by GeCo.</p>
        <p>We matched each pair of datasets using each record linkage algorithm. After simulated data were created, we did not further modify the data (eg, modifying date values with missing date parts) before inputting them into any of the algorithms. We measured each algorithm’s <italic>recall</italic> (ie, sensitivity, the proportion of true matches identified by the algorithm) and <italic>precision</italic> (ie, positive predictive value, the proportion of algorithm matches that were true matches). Each matching scenario was simulated 100 times, and we calculated the mean and standard deviation of recall and precision for each algorithm across these replicates. In addition, we measured the computational performance of each algorithm in terms of its average runtime. We ran each matching algorithm 20 times while fixing the overlap between the two datasets (50% of the individuals in the second dataset overlap with those in the first dataset) and the number of erroneous fields (one erroneous field per record) and varying the size of the second dataset (10%, 25%, 50%, and 100% of the first dataset). We then calculated the mean and standard deviation of computation time for each algorithm.</p>
      </sec>
      <sec>
        <title>Real-World Matching Scenario</title>
        <p>In our <italic>real-world</italic> matching scenario, we linked PHSKC HIV (Electronic HIV/AIDS Reporting System [eHARS]) and STD (PHIMS) surveillance data to identify PLWH who had a syphilis diagnosis in 2017. In 2017, there were 885 case-patients with a syphilis infection reported in King County. There were 17,415 PLWH in eHARS, which includes all persons living with diagnosed HIV in Washington state. As there is no shared unique identifier between PHIMS and eHARS, we did not have a gold standard against which we could compare each matching algorithm’s performance. Thus, we defined true matches and true nonmatches using a composite of the matching decisions made by each of the algorithms (<italic>composite standard</italic>). If all the algorithms identified a pair of records as a match, we considered it a true match. If none of the algorithms identified a pair of records as a match, it was considered a true nonmatch. When there was a lack of consensus among the algorithms for a record pair, we manually reviewed the records to determine whether the pair was a true match or nonmatch. As in the simulations described above, we made no modifications to any date values with missing date parts before inputting them into the algorithms (&#60;0.1% of records had missing date parts). We calculated the precision and recall of each algorithm. In addition, we measured the <italic>value and error added</italic> by each algorithm beyond exact matching, which we considered as the baseline algorithm. We measured <italic>value added</italic> as the number of additional true matches and <italic>error added</italic> as the additional false matches identified by each algorithm over and beyond exact matching.</p>
        <p>Dataset generation and corruption were done using GeCo and Python 2.7. All other analyses were done using R version 3.5.2. Python and R programs used to perform simulations, perform the real-world match, and measure computational performance are provided as supplemental material (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Simulations</title>
        <p>The selected deterministic algorithms had a lower recall than the selected probabilistic algorithms, regardless of the overlap or the number of erroneous fields per record (<xref rid="figure1" ref-type="fig">Figure 1</xref> and <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The exact algorithm had a recall of between 56% (5% overlap) and 57% (50% overlap) when there was one erroneous field per record, and its recall decreased as the number of erroneous fields per record increased. The exact matching algorithm’s precision was between 99% and 100% when there were three or fewer erroneous fields per record (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The Stenger, Ocampo 1, and Ocampo 2 algorithms had similar recall and precision but had lower recall than the exact match. When there was only one erroneous field, both the Stenger and Ocampo 1 algorithms had a recall of 30%, whereas the Ocampo 2 algorithm had a recall of 39%, regardless of the dataset size and overlap. The precision for all three algorithms was 100% when there was only one erroneous field per record. All three algorithms failed to identify any matches when there were at least three erroneous fields. The Bosh algorithm had the highest recall of the five deterministic algorithms. When there was one erroneous field per record, the Bosh algorithm’s recall ranged between 74% (5% overlap) and 75% (50% overlap). However, its recall decreased to less than 20% in scenarios with at least three erroneous fields per record. The precision for the Bosh algorithm was high across all scenarios (between 88% and 100%).</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Simulations: record linkage algorithm recall/precision.</p>
          </caption>
          <graphic xlink:href="publichealth_v6i2e15917_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>fastLink and BRL had better recall than the deterministic algorithms. In the one erroneous field per record scenario, both fastLink and BRL had about 100% recall, regardless of the dataset overlap. In the three erroneous field scenario, fastLink’s recall ranged between 73% (5% overlap) and 85% (50% overlap), whereas BRL’s recall ranged between 94% and 99%. In the five erroneous field scenario, fastLink’s recall was between 8% and 27%, whereas BRL’s recall was between 74% and 92%. The precision of both algorithms was high across all scenarios (fastLink: 97%-100% and BRL: 85%-100%).</p>
      </sec>
      <sec>
        <title>Computational Performance</title>
        <p>The exact, Ocampo, and Stenger algorithms took an average of about 0.01 seconds to compute, even when the datasets being compared contained 2000 records (<xref rid="figure2" ref-type="fig">Figure 2</xref>). The Bosh algorithm took between 2 seconds and 18 seconds to compute, depending on the dataset size. The two probabilistic algorithms took a longer time to compute than all the deterministic algorithms. fastLink took an average of between 2.3 min and 4 min to compute. On average, BRL performed faster than fastLink when the second dataset contained 200 records (1.5 min vs 2.3 min) but was the slowest algorithm in every other scenario. BRL, on average, took between 3.6 min (second dataset N=500) and 14.1 min (second dataset N=2000) in the remaining scenarios.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Record linkage algorithm matching computational performance. Average computational time after 20 replications in the scenario where the overlap (50%) and number of erroneous fields per record (1) were fixed and the size of the second dataset was varied (10%, 25%, 50%, and 100% of the first dataset [N=2000]).</p>
          </caption>
          <graphic xlink:href="publichealth_v6i2e15917_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Real-World Matching Scenario</title>
        <p>Among the 885 case-patients with any syphilis infection in King County in 2017, a majority (760/885, 85.8%) were men who have sex with men (MSM). Nearly half of the patients were white (436/885, 49.3%), 12.8% (113/885) were black, and 20.5% (182/885) were Hispanic/Latinx. Among the 17,415 PLWH in PHSKC’s eHARS database, 14,887 (85.48%) were male (12,640/17,415, 72.58% MSM), 10,293 (59.10%) were white, 2965 (17.10%) were black, and 2376 (13.67%) were Hispanic/Latinx.</p>
        <p>There were 367 record pairs classified as a match by any of the algorithms. Of these, the algorithms disagreed on 113 record pairs, which were manually reviewed to determine their true match status. According to our composite standard, there were 309 true matches, representing 35% of all case-patients with a syphilis infection in 2017 and 1.8% of all PLWH in eHARS. The exact matching algorithm identified 256 true matches and one mismatch (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). Compared with this algorithm, the Stenger and Ocampo 1 algorithms identified two fewer true matches and did not have any mismatches. The Ocampo 2 algorithm identified three more matches than the exact matching algorithm and also had no mismatches. The Bosh algorithm identified 36 additional true matches but also identified 20 additional false matches. Both fastLink and BRL identified 53 additional true matches. However, fastLink had 33 additional false matches, whereas BRL only had two additional false matches.</p>
        <p>Compared with our composite standard, all the deterministic algorithms had lower recall than the probabilistic algorithms (<xref rid="figure3" ref-type="fig">Figure 3</xref>). The recall of the exact, Stenger, Ocampo 1, and Ocampo 2 algorithms ranged between 82% and 84%. The recall of the Bosh algorithm was about 94%, and the recall of fastLink and BRL was 100%. The precision of the deterministic algorithms (except for Bosh) was overall higher than the precision of the probabilistic algorithms. The Stenger, Ocampo 1, and Ocampo 2 algorithms had 100% precision, whereas the exact algorithm had 99.6% precision. The precision of the Bosh algorithm was about 93%, and the precision of fastLink was about 90%. BRL had a precision of 99%, which was the lowest trade-off between recall and precision.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Real-world matching scenario: record linkage algorithm recall and precision. PPV: positive predictive value.</p>
          </caption>
          <graphic xlink:href="publichealth_v6i2e15917_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Using simulations, we found that the probabilistic algorithms we evaluated had substantially better recall than the selected deterministic algorithms, while the deterministic algorithms had higher precision. However, in scenarios with three or more erroneous fields per record, nearly all the deterministic algorithms (except the Bosh algorithm) failed to identify any matches, which diminishes their utility in record linkage scenarios where data quality is poor. In contrast, both BRL and fastLink offered high recall without sacrificing much in terms of precision. In addition, in a <italic>real-world</italic> comparison, BRL had the highest recall with only a minimal sacrifice in precision and was the best performing algorithm overall.</p>
        <p>Our findings suggest that although deterministic algorithms offer a high degree of precision, they are highly sensitive to data quality issues and may miss a substantial number of matches even in situations where there is only one erroneous field per record. The recall of deterministic algorithms can be improved by implementing more matching rules (as in the case of the Bosh algorithm [<xref ref-type="bibr" rid="ref25">25</xref>]), but this also results in lower precision. Furthermore, even with additional match keys, deterministic algorithms still do not reach the level of recall offered by probabilistic algorithms.</p>
        <p>Surprisingly, the Bosh and fastLink algorithms had low precision in our real-world match, despite having very high precision in simulations. For fastLink, this may be a limitation of the algorithm, which tends to lose precision in situations where the overlap between datasets is small or there is a large difference in the size of the datasets being linked [<xref ref-type="bibr" rid="ref26">26</xref>]. The lack of SSN may have led to the Bosh algorithm’s lower precision in the real-world match. The false matches identified by the Bosh algorithm were identified because they met matching keys 8 to 14, which require additional criteria to be considered a match (<xref ref-type="table" rid="table1">Table 1</xref>). As noted in the original Bosh article, these additional criteria were added to reduce possible false matches. Although we implemented most of the additional criteria, they include a partial match on SSN (ie, match on digits 1-4 and 6-9 of SSN), which was omitted from this study. If SSN had been included, we might have eliminated the false matches identified by the less strict matching keys, resulting in a higher observed precision for this algorithm.</p>
      </sec>
      <sec>
        <title>Public Health Implications</title>
        <p>In the context of public health action, choosing a record linkage algorithm that prioritizes the identification of true matches is critical to preventing gaps in the provision of public health interventions to those who are most in need of assistance. Choosing overly conservative record linkage algorithms that prioritize precision over recall could increase gaps among these groups in public health prevention delivery and may amplify disparities among marginalized populations. Previous studies have demonstrated that imperfect record linkage algorithms may disproportionately miss women, older individuals, and persons of minoritized races/ethnicities and lower socioeconomic status [<xref ref-type="bibr" rid="ref28">28</xref>-<xref ref-type="bibr" rid="ref31">31</xref>]. The use of probabilistic record linkage methods (such as BRL and fastLink) or more complex deterministic algorithms (such as the Bosh algorithm) would result in a large increase in the reach of public health interventions relying on the linkage of data systems, which offsets small decreases in match precision.</p>
        <p>A disadvantage of probabilistic algorithms is their computational complexity. While the computational time of the deterministic algorithms is generally under 1 second, both probabilistic methods took minutes to compute. For applications that require near-instant record linkage of large databases, probabilistic algorithms may not be practical because of their slow computation time; however, such applications may be relatively uncommon in practice. When record linkage is done on a daily or less frequent basis, the increased computation time of fastLink and BRL is less problematic. Importantly, fastLink was designed to outperform other approaches to probabilistic record linkage algorithms when datasets are very large [<xref ref-type="bibr" rid="ref26">26</xref>]. In these situations, fastLink may have even greater gains compared with slower methods such as BRL, although it may still be slower than deterministic algorithms. In addition, because of their increased computational complexity, BRL and fastLink require more memory and processing power than the deterministic algorithms. Both BRL and fastLink required over 4 GB of RAM and a 64-bit version of R, which may be a limitation of using these algorithms in resource-limited settings. However, 64-bit computing and 4 or more GB of RAM are becoming increasingly common, suggesting that these barriers would be less problematic in the future. As of May 2019, the estimated minimum cost of a new business desktop with these specifications is about US $400.</p>
        <p>Another advantage of deterministic algorithms is that these are easier to implement in different programming languages. Matching rules used by the deterministic algorithms we evaluated are relatively intuitive and translatable to multiple programming languages. Although fastLink has thorough documentation and support, modifications to the algorithm require an understanding of the Fellegi-Sunter record linkage methodology and the R programming language [<xref ref-type="bibr" rid="ref26">26</xref>]. Modifications to BRL are particularly challenging, as there is currently limited documentation on the method [<xref ref-type="bibr" rid="ref22">22</xref>]. In addition, much of the BRL algorithm is implemented in the C programming language, an additional prerequisite to making modifications to the algorithm. To address these barriers, we have provided R programs for each algorithm in a <italic>Load, Clean, Func, Do</italic> framework, a portable and flexible organizational structure for developing R projects, to implement them in practice (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>) [<xref ref-type="bibr" rid="ref32">32</xref>].</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Our study has several limitations. First, in our simulations, we assumed a uniform error rate across all records in each matching scenario. As our probabilistic algorithms use information from all records, this may have misrepresented how well they perform when linking datasets that contain a wide range of erroneous fields per record, including records that have 0 erroneous fields. Indeed, in our real-world match scenario, in which record quality was more variable, BRL had much higher precision than in our simulations, suggesting that it is able to leverage information from record pairs that have high data quality to make decisions about record pairs that have poor data quality.</p>
        <p>Second, both the Bosh and Ocampo algorithms include matching keys that involve SSN, which is not available in PHSKC’s STD surveillance database. This may have resulted in an underestimation of the performance of these algorithms. In the Bosh algorithm, SSN is used as an additional criterion to reduce mismatches for matching keys that are very broad, and its inclusion may have resulted in improved precision. In the original Bosh study, 1.7% of true matches were identified using SSN alone, suggesting that if SSN had been available, we would have observed a very slightly improved recall of the Bosh algorithm, although it probably would not have reached the levels of recall observed with the probabilistic algorithms [<xref ref-type="bibr" rid="ref25">25</xref>]. In addition, if SSN had been available, it could have also been included in both probabilistic algorithms, which could have possibly improved their recall and precision as well.</p>
        <p>Third, we have only considered deterministic and probabilistic algorithms that can be implemented in R and have excluded algorithms that require third-party software (eg, the Link King and CDC’s Link Plus) and novel record linkage methodologies (eg, active, supervised, and unsupervised learning algorithms). Third-party software for record linkage offers a point-and-click interface for implementing probabilistic (and deterministic) record linkage methodologies. Both the Link King and Link Plus, two popular applications for conducting record linkage involving public health surveillance databases, use the Fellegi-Sunter methodology for conducting probabilistic record linkage, which is the same methodology used by fastLink. Supervised learning–based and active learning–based algorithms may yield greater match quality than probabilistic or deterministic algorithms in cases where databases are to be linked prospectively or when training data are available (in the case of supervised learning) [<xref ref-type="bibr" rid="ref19">19</xref>]. These algorithms use data on record pairs that are known to be matches or nonmatches to develop a predictive model that is used to classify record pairs in the databases that are being linked as matches or nonmatches. As these algorithms require a training dataset of known matches and nonmatches (something neither the probabilistic nor the deterministic algorithms we evaluated required), we chose to exclude them from our analysis. Further research is needed to assess the performance and utility of these techniques in conducting record linkage for public health action as well as the feasibility of implementing them in practice.</p>
        <p>Finally, for the probabilistic matching algorithms we evaluated, we only considered their default parameterizations. We chose to evaluate these algorithms using their default (or <italic>out-of-the-box</italic>) implementations, as this would represent a baseline level of their performance. Modifying the parameters for fastLink and BRL, such as the string distance measure used to match string variables or the number of partial agreement levels, could improve their performance. Importantly, fastLink and BRL use different default methods to match string variables (eg, first name and last name). This may partially explain why BRL had better recall than fastLink in our simulations and a lower trade-off between recall and precision in our real-world match. In addition, the use of a blocking scheme, such as grouping record pairs on the first two letters of the first name before they are compared by the algorithm, may have improved both the precision and computational performance of these algorithms. Future studies should consider evaluating the use of blocking on algorithm performance in the public health practice setting.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In conclusion, public health interventions that involve record linkage of multiple data systems should carefully consider their choice of record linkage algorithm. This choice should be based not only on reducing false matches but also on maximizing intervention coverage. Record linkage methodologies that do not seek to maximize true matches, especially in the context of imperfect data quality, limit the reach of public health interventions and could exacerbate existing health disparities. Probabilistic algorithms, such as BRL, can maximize the number of true matches identified without sacrificing precision and should be considered as the first choice when using record linkage for public health action.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Additional details about data generation and tables describing simulation results.</p>
        <media xlink:href="publichealth_v6i2e15917_app1.docx" xlink:title="DOCX File , 24 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Python and R programs used to conduct simulations and real-world match.</p>
        <media xlink:href="publichealth_v6i2e15917_app2.docx" xlink:title="DOCX File , 14 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Real-world matching scenario: value and error added over exact matching algorithm.</p>
        <media xlink:href="publichealth_v6i2e15917_app3.docx" xlink:title="DOCX File , 63 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">BRL</term>
          <def>
            <p>beta record linkage</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CDC</term>
          <def>
            <p>Centers for Disease Control and Prevention</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">eHARS</term>
          <def>
            <p>Electronic HIV/AIDS Reporting System</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">MSM</term>
          <def>
            <p>men who have sex with men</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">PHIMS</term>
          <def>
            <p>Public Health Information Management System</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">PHSKC</term>
          <def>
            <p>Public Health—Seattle &#38; King County</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">PLWH</term>
          <def>
            <p>people living with HIV</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">SSN</term>
          <def>
            <p>social security number</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">STD</term>
          <def>
            <p>sexually transmitted disease</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <source>World Health Organization</source>
          <year>2017</year>
          <access-date>2020-02-24</access-date>
          <comment>Public Health Surveillance<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.who.int/topics/public_health_surveillance/en/">https://www.who.int/topics/public_health_surveillance/en/</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wiewel</surname>
              <given-names>EW</given-names>
            </name>
            <name name-style="western">
              <surname>Braunstein</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Xia</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Shepard</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Torian</surname>
              <given-names>LV</given-names>
            </name>
          </person-group>
          <article-title>Monitoring outcomes for newly diagnosed and prevalent HIV cases using a care continuum created with New York city surveillance data</article-title>
          <source>J Acquir Immune Defic Syndr</source>
          <year>2015</year>
          <month>02</month>
          <day>1</day>
          <volume>68</volume>
          <issue>2</issue>
          <fpage>217</fpage>
          <lpage>26</lpage>
          <pub-id pub-id-type="doi">10.1097/QAI.0000000000000424</pub-id>
          <pub-id pub-id-type="medline">25394192</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Karch</surname>
              <given-names>DL</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Evaluation of the National Human Immunodeficiency Virus Surveillance System for the 2011 diagnosis year</article-title>
          <source>J Public Health Manag Pract</source>
          <year>2014</year>
          <volume>20</volume>
          <issue>6</issue>
          <fpage>598</fpage>
          <lpage>607</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/24253405"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/PHH.0000000000000033</pub-id>
          <pub-id pub-id-type="medline">24253405</pub-id>
          <pub-id pub-id-type="pmcid">PMC4602389</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dombrowski</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Buskin</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Bennett</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Thiede</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Golden</surname>
              <given-names>MR</given-names>
            </name>
          </person-group>
          <article-title>Use of multiple data sources and individual case investigation to refine surveillance-based estimates of the HIV care continuum</article-title>
          <source>J Acquir Immune Defic Syndr</source>
          <year>2014</year>
          <month>11</month>
          <day>1</day>
          <volume>67</volume>
          <issue>3</issue>
          <fpage>323</fpage>
          <lpage>30</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/25140904"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/QAI.0000000000000302</pub-id>
          <pub-id pub-id-type="medline">25140904</pub-id>
          <pub-id pub-id-type="pmcid">PMC4197062</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>Centers for Disease Control and Prevention</collab>
          </person-group>
          <source>Centers for Disease Control and Prevention</source>
          <year>2018</year>
          <access-date>2020-02-24</access-date>
          <comment>Monitoring Selected National HIV Prevention and Care Objectives by Using HIV Surveillance Data - United States and 6 Dependent Areas, 2016<ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cdc.gov/hiv/pdf/library/reports/surveillance/cdc-hiv-surveillance-supplemental-report-vol-23-4.pdf">https://www.cdc.gov/hiv/pdf/library/reports/surveillance/cdc-hiv-surveillance-supplemental-report-vol-23-4.pdf</ext-link>
                                                </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Buchacz</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Parisi</surname>
              <given-names>MK</given-names>
            </name>
            <name name-style="western">
              <surname>Yoshida-Cervantes</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Antunez</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Delgado</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Moss</surname>
              <given-names>NJ</given-names>
            </name>
            <name name-style="western">
              <surname>Scheer</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Using HIV surveillance registry data to re-link persons to care: the RSVP Project in San Francisco</article-title>
          <source>PLoS One</source>
          <year>2015</year>
          <volume>10</volume>
          <issue>3</issue>
          <fpage>e0118923</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pone.0118923"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0118923</pub-id>
          <pub-id pub-id-type="medline">25748668</pub-id>
          <pub-id pub-id-type="pii">PONE-D-14-23841</pub-id>
          <pub-id pub-id-type="pmcid">PMC4352048</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="web">
          <source>Centers for Disease Control and Prevention</source>
          <year>2017</year>
          <month>08</month>
          <access-date>2020-02-24</access-date>
          <comment>Data to Care: Using HIV Surveillance Data to Support the HIV Care Continuum <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://effectiveinterventions.cdc.gov/en/HighImpactPrevention/PublicHealthStrategies/DatatoCare.aspx">https://effectiveinterventions.cdc.gov/en/HighImpactPrevention/PublicHealthStrategies/DatatoCare.aspx</ext-link></comment>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dombrowski</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Carey</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Pitts</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Craw</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Freeman</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Golden</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Bertolli</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>HIV provider and patient perspectives on the Development of a Health Department 'Data to Care' Program: a qualitative study</article-title>
          <source>BMC Public Health</source>
          <year>2016</year>
          <month>06</month>
          <day>10</day>
          <volume>16</volume>
          <fpage>491</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcpublichealth.biomedcentral.com/articles/10.1186/s12889-016-3152-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12889-016-3152-4</pub-id>
          <pub-id pub-id-type="medline">27286654</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12889-016-3152-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC4901404</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Evans</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>van Gorder</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Morin</surname>
              <given-names>SF</given-names>
            </name>
            <name name-style="western">
              <surname>Steward</surname>
              <given-names>WT</given-names>
            </name>
            <name name-style="western">
              <surname>Gaffney</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Charlebois</surname>
              <given-names>ED</given-names>
            </name>
          </person-group>
          <article-title>Acceptance of the use of HIV surveillance data for care engagement: national and local community perspectives</article-title>
          <source>J Acquir Immune Defic Syndr</source>
          <year>2015</year>
          <month>05</month>
          <day>1</day>
          <volume>69</volume>
          <issue>Suppl 1</issue>
          <fpage>S31</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/25867776"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/QAI.0000000000000573</pub-id>
          <pub-id pub-id-type="medline">25867776</pub-id>
          <pub-id pub-id-type="pii">00126334-201505011-00005</pub-id>
          <pub-id pub-id-type="pmcid">PMC4530777</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hood</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Katz</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Bennett</surname>
              <given-names>AB</given-names>
            </name>
            <name name-style="western">
              <surname>Buskin</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Dombrowski</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Hawes</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Golden</surname>
              <given-names>MR</given-names>
            </name>
          </person-group>
          <article-title>Integrating HIV surveillance and field services: data quality and care continuum in King County, Washington, 2010-2015</article-title>
          <source>Am J Public Health</source>
          <year>2017</year>
          <month>12</month>
          <volume>107</volume>
          <issue>12</issue>
          <fpage>1938</fpage>
          <lpage>43</lpage>
          <pub-id pub-id-type="doi">10.2105/AJPH.2017.304069</pub-id>
          <pub-id pub-id-type="medline">29048962</pub-id>
          <pub-id pub-id-type="pmcid">PMC5678383</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sweeney</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Gardner</surname>
              <given-names>LI</given-names>
            </name>
            <name name-style="western">
              <surname>Buchacz</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Garland</surname>
              <given-names>PM</given-names>
            </name>
            <name name-style="western">
              <surname>Mugavero</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Bosshart</surname>
              <given-names>JT</given-names>
            </name>
            <name name-style="western">
              <surname>Shouse</surname>
              <given-names>RL</given-names>
            </name>
            <name name-style="western">
              <surname>Bertolli</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Shifting the paradigm: using HIV surveillance data as a foundation for improving HIV care and preventing HIV infection</article-title>
          <source>Milbank Q</source>
          <year>2013</year>
          <month>09</month>
          <volume>91</volume>
          <issue>3</issue>
          <fpage>558</fpage>
          <lpage>603</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/24028699"/>
          </comment>
          <pub-id pub-id-type="doi">10.1111/milq.12018</pub-id>
          <pub-id pub-id-type="medline">24028699</pub-id>
          <pub-id pub-id-type="pmcid">PMC3790525</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tesoriero</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>BL</given-names>
            </name>
            <name name-style="western">
              <surname>Hart-Malloy</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Cukrovany</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Moncur</surname>
              <given-names>BL</given-names>
            </name>
            <name name-style="western">
              <surname>Bogucki</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Anderson</surname>
              <given-names>BJ</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>MC</given-names>
            </name>
          </person-group>
          <article-title>Improving Retention in HIV Care Through New York's Expanded Partner Services Data-to-Care Pilot</article-title>
          <source>J Public Health Manag Pract</source>
          <year>2017</year>
          <volume>23</volume>
          <issue>3</issue>
          <fpage>255</fpage>
          <lpage>263</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/27902561"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/PHH.0000000000000483</pub-id>
          <pub-id pub-id-type="medline">27902561</pub-id>
          <pub-id pub-id-type="pmcid">PMC5381495</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Buskin</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Kent</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Dombrowski</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Golden</surname>
              <given-names>MR</given-names>
            </name>
          </person-group>
          <article-title>Migration distorts surveillance estimates of engagement in care: results of public health investigations of persons who appear to be out of HIV care</article-title>
          <source>Sex Transm Dis</source>
          <year>2014</year>
          <month>01</month>
          <volume>41</volume>
          <issue>1</issue>
          <fpage>35</fpage>
          <lpage>40</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/24326579"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/OLQ.0000000000000072</pub-id>
          <pub-id pub-id-type="medline">24326579</pub-id>
          <pub-id pub-id-type="pii">00007435-201401000-00007</pub-id>
          <pub-id pub-id-type="pmcid">PMC5689076</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dombrowski</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Bove</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Roscoe</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Harvill</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Firth</surname>
              <given-names>CL</given-names>
            </name>
            <name name-style="western">
              <surname>Khormooji</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Carr</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Schafer</surname>
              <given-names>SD</given-names>
            </name>
            <name name-style="western">
              <surname>Golden</surname>
              <given-names>MR</given-names>
            </name>
            <collab>Northwest Health Department and Centers for AIDS Research (CFAR) Consortium</collab>
          </person-group>
          <article-title>'Out of Care' HIV case investigations: a collaborative analysis across 6 states in the Northwest US</article-title>
          <source>J Acquir Immune Defic Syndr</source>
          <year>2017</year>
          <month>02</month>
          <day>1</day>
          <volume>74</volume>
          <issue>Suppl 2</issue>
          <fpage>S81</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/28079717"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/QAI.0000000000001237</pub-id>
          <pub-id pub-id-type="medline">28079717</pub-id>
          <pub-id pub-id-type="pii">00126334-201702011-00002</pub-id>
          <pub-id pub-id-type="pmcid">PMC5234689</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ocampo</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Hamp</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Rhodes</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Smart</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Pemmaraju</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Poschman</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hess</surname>
              <given-names>KL</given-names>
            </name>
            <name name-style="western">
              <surname>Bhattacharjee</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Flynn</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Anderson</surname>
              <given-names>BJ</given-names>
            </name>
            <name name-style="western">
              <surname>Dowling</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Maccormack</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Doshi</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lum</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Maddox</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Moncur</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Barnhart</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Maxwell</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Aurand</surname>
              <given-names>SB</given-names>
            </name>
            <name name-style="western">
              <surname>Hogan</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Wills</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Prowell</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kassaye</surname>
              <given-names>SG</given-names>
            </name>
            <name name-style="western">
              <surname>Karn</surname>
              <given-names>HE</given-names>
            </name>
            <name name-style="western">
              <surname>Laffoon</surname>
              <given-names>BT</given-names>
            </name>
            <name name-style="western">
              <surname>Collmann</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Improving HIV surveillance data by using the ATra black box system to assist regional deduplication activities</article-title>
          <source>J Acquir Immune Defic Syndr</source>
          <year>2019</year>
          <month>09</month>
          <day>1</day>
          <volume>82</volume>
          <issue>Suppl 1</issue>
          <fpage>S13</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1097/QAI.0000000000002090</pub-id>
          <pub-id pub-id-type="medline">31425390</pub-id>
          <pub-id pub-id-type="pii">00126334-201909011-00003</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ocampo</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Smart</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Allston</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bhattacharjee</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Boggavarapu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Carter</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Castel</surname>
              <given-names>AD</given-names>
            </name>
            <name name-style="western">
              <surname>Collmann</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Flynn</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hamp</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jordan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kassaye</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kharfen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lum</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Pemmaraju</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rhodes</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Stover</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Young</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>Improving HIV surveillance data for public health action in Washington, DC: a novel multiorganizational data-sharing method</article-title>
          <source>JMIR Public Health Surveill</source>
          <year>2016</year>
          <volume>2</volume>
          <issue>1</issue>
          <fpage>e3</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://publichealth.jmir.org/2016/1/e3/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/publichealth.5317</pub-id>
          <pub-id pub-id-type="medline">27227157</pub-id>
          <pub-id pub-id-type="pii">v2i1e3</pub-id>
          <pub-id pub-id-type="pmcid">PMC4869245</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Avoundjian</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Peyton</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Golden</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Integrating HIV and STD surveillance to direct partner services and improve HIV care engagement in Jackson, Mississippi</article-title>
          <year>2018</year>
          <conf-name>CDC STD Prevention Conference</conf-name>
          <conf-date>August 2018</conf-date>
          <conf-loc>Washington, DC</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cdc.confex.com/cdc/std2018/webprogram/Paper39519.html"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Christen</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <source>Data Matching: Concepts and Techniques for Record Linkage, Entity Resolution, and Duplicate Detection</source>
          <year>2012</year>
          <publisher-loc>Berlin</publisher-loc>
          <publisher-name>Springer</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Elmagarmid</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Ipeirotis</surname>
              <given-names>PG</given-names>
            </name>
            <name name-style="western">
              <surname>Verykios</surname>
              <given-names>VS</given-names>
            </name>
          </person-group>
          <article-title>Duplicate record detection: a survey</article-title>
          <source>IEEE Trans Knowl Data Eng</source>
          <year>2006</year>
          <volume>19</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>16</lpage>
          <pub-id pub-id-type="doi">10.1109/tkde.2007.250581</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Winkler</surname>
              <given-names>WE</given-names>
            </name>
          </person-group>
          <source>US Census</source>
          <year>2006</year>
          <access-date>2020-02-24</access-date>
          <comment>Overview of Record Linkage and Current Research Directions <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.census.gov/srd/papers/pdf/rrs2006-02.pdf">https://www.census.gov/srd/papers/pdf/rrs2006-02.pdf</ext-link></comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fellegi</surname>
              <given-names>IP</given-names>
            </name>
            <name name-style="western">
              <surname>Sunter</surname>
              <given-names>AB</given-names>
            </name>
          </person-group>
          <article-title>A theory for record linkage</article-title>
          <source>J Am Stat Assoc</source>
          <year>1969</year>
          <volume>64</volume>
          <issue>328</issue>
          <fpage>1183</fpage>
          <lpage>210</lpage>
          <pub-id pub-id-type="doi">10.1080/01621459.1969.10501049</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sadinle</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Bayesian estimation of bipartite matchings for record linkage</article-title>
          <source>J Am Stat Assoc</source>
          <year>2017</year>
          <month>03</month>
          <day>30</day>
          <volume>112</volume>
          <issue>518</issue>
          <fpage>600</fpage>
          <lpage>12</lpage>
          <pub-id pub-id-type="doi">10.1080/01621459.2016.1148612</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Campbell</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Deck</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Krupski</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Record linkage software in the public domain: a comparison of Link Plus, The Link King, and a 'basic' deterministic algorithm</article-title>
          <source>Health Informatics J</source>
          <year>2008</year>
          <month>03</month>
          <volume>14</volume>
          <issue>1</issue>
          <fpage>5</fpage>
          <lpage>15</lpage>
          <pub-id pub-id-type="doi">10.1177/1460458208088855</pub-id>
          <pub-id pub-id-type="medline">18258671</pub-id>
          <pub-id pub-id-type="pii">14/1/5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Waldenburger</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Nasseh</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Stausberg</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Detecting duplicates at hospital admission: comparison of deterministic and probabilistic record linkage</article-title>
          <source>Stud Health Technol Inform</source>
          <year>2016</year>
          <volume>226</volume>
          <fpage>135</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="medline">27350486</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bosh</surname>
              <given-names>KA</given-names>
            </name>
            <name name-style="western">
              <surname>Coyle</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Muriithi</surname>
              <given-names>NW</given-names>
            </name>
            <name name-style="western">
              <surname>Ramaswamy</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Brantley</surname>
              <given-names>AD</given-names>
            </name>
            <name name-style="western">
              <surname>Stockman</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>VanderBusch</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Westheimer</surname>
              <given-names>EF</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Green</surname>
              <given-names>TA</given-names>
            </name>
            <name name-style="western">
              <surname>Hall</surname>
              <given-names>HI</given-names>
            </name>
          </person-group>
          <article-title>Linking HIV and viral hepatitis surveillance data: evaluating a standard, deterministic matching algorithm using data from 6 US health jurisdictions</article-title>
          <source>Am J Epidemiol</source>
          <year>2018</year>
          <month>11</month>
          <day>1</day>
          <volume>187</volume>
          <issue>11</issue>
          <fpage>2415</fpage>
          <lpage>22</lpage>
          <pub-id pub-id-type="doi">10.1093/aje/kwy161</pub-id>
          <pub-id pub-id-type="medline">30099475</pub-id>
          <pub-id pub-id-type="pii">5067617</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Enamorado</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Fifield</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Imai</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Using a probabilistic model to assist merging of large-scale administrative records</article-title>
          <source>Am Polit Sci Rev</source>
          <year>2019</year>
          <month>01</month>
          <day>2</day>
          <volume>113</volume>
          <issue>2</issue>
          <fpage>353</fpage>
          <lpage>71</lpage>
          <pub-id pub-id-type="doi">10.1017/s0003055418000783</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tran</surname>
              <given-names>K-N</given-names>
            </name>
            <name name-style="western">
              <surname>Vatsalan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Christen</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>GeCo: an online personal data generator and corruptor</article-title>
          <conf-name>22nd ACM international conference on Information &amp; Knowledge Management</conf-name>
          <conf-date>2013</conf-date>
          <conf-loc>San Francisco, CA</conf-loc>
          <publisher-loc>New York, NY</publisher-loc>
          <publisher-name>Association for Computing Machinery</publisher-name>
          <pub-id pub-id-type="doi">10.1145/2505515.2508207</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zingmond</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Ettner</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Linking hospital discharge and death records—accuracy and sources of bias</article-title>
          <source>J Clin Epidemiol</source>
          <year>2004</year>
          <month>01</month>
          <volume>57</volume>
          <issue>1</issue>
          <fpage>21</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1016/S0895-4356(03)00250-6</pub-id>
          <pub-id pub-id-type="medline">15019007</pub-id>
          <pub-id pub-id-type="pii">S0895435603002506</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bohensky</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Jolley</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Sundararajan</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Evans</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Pilcher</surname>
              <given-names>DV</given-names>
            </name>
            <name name-style="western">
              <surname>Scott</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Brand</surname>
              <given-names>CA</given-names>
            </name>
          </person-group>
          <article-title>Data linkage: a powerful research tool with potential problems</article-title>
          <source>BMC Health Serv Res</source>
          <year>2010</year>
          <month>12</month>
          <day>22</day>
          <volume>10</volume>
          <fpage>346</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmchealthservres.biomedcentral.com/articles/10.1186/1472-6963-10-346"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1472-6963-10-346</pub-id>
          <pub-id pub-id-type="medline">21176171</pub-id>
          <pub-id pub-id-type="pii">1472-6963-10-346</pub-id>
          <pub-id pub-id-type="pmcid">PMC3271236</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shaw</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Atkinson</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Blakely</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>(Mis)classification of ethnicity on the New Zealand Cancer Registry: 1981-2004</article-title>
          <source>N Z Med J</source>
          <year>2009</year>
          <month>05</month>
          <day>8</day>
          <volume>122</volume>
          <issue>1294</issue>
          <fpage>10</fpage>
          <lpage>22</lpage>
          <pub-id pub-id-type="medline">19465958</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lariscy</surname>
              <given-names>JT</given-names>
            </name>
          </person-group>
          <article-title>Differential record linkage by Hispanic ethnicity and age in linked mortality studies: implications for the epidemiologic paradox</article-title>
          <source>J Aging Health</source>
          <year>2011</year>
          <month>12</month>
          <volume>23</volume>
          <issue>8</issue>
          <fpage>1263</fpage>
          <lpage>84</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/21934120"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/0898264311421369</pub-id>
          <pub-id pub-id-type="medline">21934120</pub-id>
          <pub-id pub-id-type="pii">0898264311421369</pub-id>
          <pub-id pub-id-type="pmcid">PMC4598042</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Silverman</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <source>The Comprehensive R Archive Network</source>
          <year>2015</year>
          <month>02</month>
          <day>20</day>
          <access-date>2020-02-24</access-date>
          <comment>Package 'makeProject' <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cran.r-project.org/web/packages/makeProject/makeProject.pdf">https://cran.r-project.org/web/packages/makeProject/makeProject.pdf</ext-link></comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
