<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JPH</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Public Health Surveill</journal-id>
      <journal-title>JMIR Public Health and Surveillance</journal-title>
      <issn pub-type="epub">2369-2960</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i9e34472</article-id>
      <article-id pub-id-type="pmid">36053573</article-id>
      <article-id pub-id-type="doi">10.2196/34472</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Privacy of Study Participants in Open-access Health and Demographic Surveillance System Data: Requirements Analysis for Data Anonymization</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Bradley</surname>
            <given-names>Heather</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Sariyar</surname>
            <given-names>Murat</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Herbst</surname>
            <given-names>Kobus</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Templ</surname>
            <given-names>Matthias</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Institute of Data Analysis and Process Design</institution>
            <institution>Zurich University of Applied Sciences</institution>
            <addr-line>Rosenstrasse 3</addr-line>
            <addr-line>Winterthur, 8404</addr-line>
            <country>Switzerland</country>
            <phone>41 793221578</phone>
            <email>matthias.templ@zhaw.ch</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8638-5276</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Kanjala</surname>
            <given-names>Chifundo</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0540-8374</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Siems</surname>
            <given-names>Inken</given-names>
          </name>
          <degrees>MA</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3412-0284</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Institute of Data Analysis and Process Design</institution>
        <institution>Zurich University of Applied Sciences</institution>
        <addr-line>Winterthur</addr-line>
        <country>Switzerland</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Population Health</institution>
        <institution>London School of Hygiene and Tropical Medicine</institution>
        <addr-line>Lilongwe</addr-line>
        <country>Malawi</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Economics and Social Statistics</institution>
        <institution>University of Trier</institution>
        <addr-line>Trier</addr-line>
        <country>Germany</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Matthias Templ <email>matthias.templ@zhaw.ch</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>9</month>
        <year>2022</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>2</day>
        <month>9</month>
        <year>2022</year>
      </pub-date>
      <volume>8</volume>
      <issue>9</issue>
      <elocation-id>e34472</elocation-id>
      <history>
        <date date-type="received">
          <day>25</day>
          <month>10</month>
          <year>2021</year>
        </date>
        <date date-type="rev-request">
          <day>23</day>
          <month>2</month>
          <year>2022</year>
        </date>
        <date date-type="rev-recd">
          <day>19</day>
          <month>4</month>
          <year>2022</year>
        </date>
        <date date-type="accepted">
          <day>10</day>
          <month>5</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©Matthias Templ, Chifundo Kanjala, Inken Siems. Originally published in JMIR Public Health and Surveillance (https://publichealth.jmir.org), 02.09.2022.</copyright-statement>
      <copyright-year>2022</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Public Health and Surveillance, is properly cited. The complete bibliographic information, a link to the original publication on https://publichealth.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://publichealth.jmir.org/2022/9/e34472" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Data anonymization and sharing have become popular topics for individuals, organizations, and countries worldwide. Open-access sharing of anonymized data containing sensitive information about individuals makes the most sense whenever the utility of the data can be preserved and the risk of disclosure can be kept below acceptable levels. In this case, researchers can use the data without access restrictions and limitations.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to highlight the requirements and possible solutions for sharing health surveillance event history data. The challenges lie in the anonymization of multiple event dates and time-varying variables.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>A sequential approach that adds noise to event dates is proposed. This approach maintains the event order and preserves the average time between events. In addition, a nosy neighbor distance-based matching approach to estimate the risk is proposed. Regarding the key variables that change over time, such as educational level or occupation, we make 2 proposals: one based on limiting the intermediate statuses of the individual and the other to achieve k-anonymity in subsets of the data. The proposed approaches were applied to the Karonga health and demographic surveillance system (HDSS) core residency data set, which contains longitudinal data from 1995 to the end of 2016 and includes 280,381 events with time-varying socioeconomic variables and demographic information.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>An anonymized version of the event history data, including longitudinal information on individuals over time, with high data utility, was created.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The proposed anonymization of event history data comprising static and time-varying variables applied to HDSS data led to acceptable disclosure risk and preserved utility, making the data sharable as public use data. It was found that high utility was achieved, even with the highest level of noise added to the core event dates. The details are important to ensure consistency or credibility. Importantly, the sequential noise addition approach presented in this study not only maintains the event order recorded in the original data but also maintains the time between events. We proposed an approach that preserves the data utility well but limits the number of response categories for the time-varying variables. Furthermore, using distance-based neighborhood matching, we simulated an attack under a nosy neighbor situation and by using a worst-case scenario where attackers have full information on the original data. We showed that the disclosure risk is very low, even when assuming that the attacker’s database and information are optimal. The HDSS and medical science research communities in low- and middle-income country settings will be the primary beneficiaries of the results and methods presented in this paper; however, the results will be useful for anyone working on anonymizing longitudinal event history data with time-varying variables for the purposes of sharing.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>longitudinal data and event history data</kwd>
        <kwd>low- and middle-income countries</kwd>
        <kwd>LMIC</kwd>
        <kwd>anonymization</kwd>
        <kwd>health and demographic surveillance system</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Although health research data sharing has many benefits and great value [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>], one of the main concerns is maintaining the privacy of study participants. The rationale for both data sharing and privacy is widely recognized. In the field of medical science research, the issue of privacy is central to good ethical practice. Anonymization of data provides an opportunity to mitigate this tension between sharing data and preserving the privacy of those whose data are shared. However, it is often unclear how data can be shared without unduly compromising the privacy of the individuals included in a data set.</p>
        <p>A fundamental issue with personal data disclosure is whether an attacker can learn anything about an individual if the data or analysis results are provided or predictions are made. On the one hand, one can ask whether an attacker can successfully match individuals with the data at their disposal. In addition, are attackers’ efforts (and related costs) higher than the benefits of disclosing information? On the other hand, the users of the data need data of high utility that allow for high-quality analysis. Data providers are interested in providing such information without disclosing the identities of the individuals in the data.</p>
        <p>Similar to all other areas of health research, longitudinal population studies in low- and middle-income countries (LMIC), such as health and demographic surveillance systems (HDSSs) [<xref ref-type="bibr" rid="ref3">3</xref>], face the challenge of finding the right balance between data sharing and privacy protection.</p>
        <p>The HDSS must take a position that allows the sharing required by research funders and journal publishers [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref4">4</xref>] while minimizing the risk of compromising the privacy of individuals who make their data available for research.</p>
        <p>However, the important issue of health data privacy has not been adequately explored in LMIC in general and HDSSs in particular. HDSSs currently share data in most cases without anonymizing them beyond masking direct identifiers [<xref ref-type="bibr" rid="ref5">5</xref>]. There is a possibility that attackers may use indirect identifiers such as education level, sex, and age—in cases where these are shared [<xref ref-type="bibr" rid="ref6">6</xref>]—to identify participants and, consequently, their health status, which they did not intend to share beyond the boundaries of the research in which they participated. The extent of such risks has not been fully explored in the HDSS data sets, and consequently, no measures have been taken to mitigate these risks; that is, to the best of our knowledge, this has not been addressed in the literature on health, statistics, and privacy.</p>
        <p>Note that for some selected data sets and general anonymization problems, the World Bank Group, PARIS21 and Organization for Economic Cooperation and Development, and the International Household Survey Network supported the development of the anonymization software sdcMicro [<xref ref-type="bibr" rid="ref7">7</xref>], and they all recommend it [<xref ref-type="bibr" rid="ref8">8</xref>]. sdcMicro is actively used in many organizations, ranging from statistical offices [<xref ref-type="bibr" rid="ref9">9</xref>] and social and political science [<xref ref-type="bibr" rid="ref10">10</xref>] to the United Nations High Commissioner for Refugees [<xref ref-type="bibr" rid="ref11">11</xref>] and health [<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref14">14</xref>]. However, there is a need to justify the use of this software for the specific needs arising from longitudinal population health data in LMIC.</p>
        <p>Longitudinal data include records of different attributes of the same participants observed and measured at multiple points in time. Existing theories and software are suitable only for anonymizing and assessing the disclosure risk of cross-sectional data. An extension of this theory is needed to quantify and control the disclosure risk for longitudinal data.</p>
      </sec>
      <sec>
        <title>Karonga HDSS</title>
        <p>An HDSS is a combination of field and computing procedures for collecting demographic, health risk, and exposure and outcome data from a defined population within a defined geographical area on a longitudinal basis [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. HDSSs are set up to monitor open or dynamic population cohorts, building longitudinal databases of this population over time [<xref ref-type="bibr" rid="ref15">15</xref>]. A substantial body of literature has considered various HDSS aspects, including the rationale for their establishment in LMIC [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref15">15</xref>], the definition of core HDSS concepts and processes [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref16">16</xref>], and the reference data model [<xref ref-type="bibr" rid="ref17">17</xref>] among many others. The data set used for illustration is from an HDSS in Malawi, the Karonga HDSS. This HDSS has been described in detail elsewhere [<xref ref-type="bibr" rid="ref18">18</xref>]. Briefly, its surveillance site is in northern rural Malawi and has been in operation since its initial census, conducted from 2002 to 2004. The Karonga HDSS contains longitudinally linked health data from the study population.</p>
        <p>The Karonga HDSS is part of a collaborative research program under the Malawi Epidemiology and Intervention Research Unit [<xref ref-type="bibr" rid="ref19">19</xref>].</p>
      </sec>
      <sec>
        <title>HDSS Core Residency Data</title>
        <p>The generic data set structure on which we based this data anonymization requirements analysis is in the core residency data format. This standard data set is widely used in HDSS for data sharing and analysis [<xref ref-type="bibr" rid="ref19">19</xref>]. An extended version of this data set is comprehensive enough to cover the considerations that need to be made in anonymizing HDSS event history data. This data set essentially comprises the core HDSS events for each individual under surveillance and attributes relating to the individual and to the core events. The events occur in a particular order that defines entry or exit from the study population. The first event for any individual is one of the following: a baseline census enumeration, a birth, or an in-migration. The last event is one of the following: an out-migration, a death, or the end of observation (censoring). The intervening events observed for any individual need to be logical; for example, an individual born within the surveillance area cannot have in-migration as the next event. The core events change the residency status of an individual and, thus, the name of the data set, core residency data [<xref ref-type="bibr" rid="ref20">20</xref>].</p>
        <p>The basic form of the core residency data includes the following variables: an individual identifier, date of birth, sex, core event, and event date. This form contains all the data on the numerators and person-years of surveillance (exposure) required to calculate the demographic rates for the HDSS population and perform event history analyses.</p>
        <p>This basic form can be extended to capture other observations made within the HDSS population. These may include disaggregation of the migration events by distinguishing between migration within the surveillance area (internal) and migration to or from outside the area (external), as well as the inclusion of attributes that change over time, such as education level, occupation, and specific disease status (eg, HIV and tuberculosis).</p>
        <p>To elaborate on the anonymization requirements, we distinguish between three variable groupings that can go into these HDSS core residency data:</p>
        <list list-type="order">
          <list-item>
            <p>Static variables: These are variables in which the observations on individuals do not change over time, such as sex and date of birth.</p>
          </list-item>
          <list-item>
            <p>Status (time-varying) variables: These are variables in which the observations on individuals change over time, such as occupation or education level.</p>
          </list-item>
          <list-item>
            <p>Core events variables: These are the variables in which the observations are specific to the event. The observed event and the event date fall into this category.</p>
          </list-item>
        </list>
        <p>Our approach investigates the requirements for anonymizing variables falling into these 3 groups.</p>
      </sec>
      <sec>
        <title>Karonga Residency Data</title>
        <p>The variables in this data set largely overlap with those found in the publicly available Karonga HDSS core residency data set on the iSHARE data repository [<xref ref-type="bibr" rid="ref21">21</xref>]. The extended version used in this study has status variables on occupation and education level, in addition to those found in the Karonga core residency file.</p>
        <p>This data set contains information recorded from October 1995 to the end of 2016, comprising 14 variables, 280,381 rows (events), and 72,935 individuals ever observed since the HDSS’s inception.</p>
        <p>The main variables of the data set for this work are as follows:</p>
        <list list-type="bullet">
          <list-item>
            <p>Static variables: sex</p>
          </list-item>
          <list-item>
            <p>Status variables: occupation with categories not working, student, unskilled manual, farmer, fisherman, skilled manual, nonmanual, small trader or business, unskilled manual, skilled manual, nonmanual, and professional; and education with categories none, 1 to 3 years primary, 4 to 7 years primary, primary completed, Junior Certificate of Education completed, Malawi School Certificate of Education completed, and tertiary</p>
          </list-item>
          <list-item>
            <p>Core event variables: event code with dates on the baseline, date of birth, in-migration, out-migration, and date of death</p>
          </list-item>
          <list-item>
            <p>Household ID, mother’s ID, father’s ID, and polygamy ID</p>
          </list-item>
        </list>
      </sec>
      <sec>
        <title>Objective</title>
        <p>To contribute toward filling this gap, we propose a set of requirements for anonymizing the HDSS longitudinal data. Our proposal customizes and applies traditional methods that work on the premise of keeping the data quality as high as possible while slightly altering the data until the disclosure risk is below a fixed threshold. The main contributions of this study are as follows:</p>
        <list list-type="bullet">
          <list-item>
            <p>We define anonymization requirements peculiar to longitudinal event history data.</p>
          </list-item>
          <list-item>
            <p>We propose steps to take to meet these requirements, including assessing and controlling for disclosure risk for the static and time-varying variables and core event dates.</p>
          </list-item>
          <list-item>
            <p>We implement the proposed steps and show the results.</p>
          </list-item>
          <list-item>
            <p>We place our proposal within the larger context of data anonymization approaches, outlining how our method of choice contrasts with the alternatives within the LMIC HDSS context.</p>
          </list-item>
        </list>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <p>In this section, we outline the methods and procedures for anonymizing HDSS core residency data.</p>
      <sec>
        <title>Different Concepts for Different Needs</title>
        <p>Our approach of keeping data quality as high as possible by modifying data slightly until the disclosure risk is below a certain threshold does not stand alone but rather is part of a broader ecosystem of data anonymization methods. We briefly review this ecosystem and emphasize that the choice of anonymization approaches depends heavily on the needs of the user group and the cost of implementing the solution. We briefly outline 4 important anonymization concepts before discussing their applicability for sharing HDSS data. They are listed in ascending order of data analysis potential as follows: privacy-preserving computation, synthetic data, secure laboratories, and the approach used in this study (anonymized individual-level data using methods of statistical disclosure control [SDC]). With privacy-preserving computation, data remain on the data owner’s side. This can be extended to a secure multiparty computation with multiple clients (data holders). Two popular privacy-preserving computation methods are differential privacy [<xref ref-type="bibr" rid="ref22">22</xref>] and federated learning with Private Aggregation of Teacher Ensembles [<xref ref-type="bibr" rid="ref23">23</xref>]. However, there are several limitations, as highlighted in the studies by Domingo-Ferrer et al [<xref ref-type="bibr" rid="ref24">24</xref>], Francis et al [<xref ref-type="bibr" rid="ref25">25</xref>], and Bambauer et al [<xref ref-type="bibr" rid="ref26">26</xref>]. Furthermore, the user must trust the predictions without evaluating the model and the data behind the model. Another way of providing anonymized data is by generating synthetic data that exhibit the same characteristics as the original data [<xref ref-type="bibr" rid="ref27">27</xref>], usually using machine learning and statistical modeling methods. 
Synthetic data typically have very low disclosure risk but also have relatively low data utility when the original data possess complex structures [<xref ref-type="bibr" rid="ref6">6</xref>]. Synthetic data can also be used in remote execution environments, whereby registered researchers work on the synthetic data to develop an analysis code, and the staff of the data holder finally runs the code on the original data. The final analysis output is checked for privacy by laboratory staff as this checking can hardly be fully automated [<xref ref-type="bibr" rid="ref28">28</xref>-<xref ref-type="bibr" rid="ref30">30</xref>].</p>
      </sec>
      <sec>
        <title>Difficulties in Using Alternative Concepts</title>
        <p>For HDSS data, using privacy-preserving computation would mean first setting up a framework to compute privacy and, for known users (test data), providing a predictive value for a meaningful piece of information (eg, the date of migration or the health status of a person) based on a machine learning prediction approach. It is evident that these approaches have some difficulties in providing good predictions for complex longitudinal data sets. Privacy-preserving computational approaches are also not sustainable options for health and survival data for LMIC because of the high cost and the users’ need for detailed data, instead of simply receiving predictions for sensitive information or working with aggregated data. Synthetic, close-to-reality data have the potential of being a viable approach; however, the complexity of longitudinal event history data from HDSS makes it difficult to model and represent all relationships and logical conditions adequately. Remote access to secure laboratories offers the advantage of working on real data but can only provide access to a small number of trusted researchers and requires permanent staff to perform output checks and to keep the software on the servers up to date and the server and access secure.</p>
      </sec>
      <sec>
        <title>Methods for SDC</title>
        <p>For these reasons, methods of SDC are the most suitable. The core concept of SDC comprises transforming data in such a way as to reduce the reidentification risks of the persons represented in the data. More precisely, the aim of SDC is to reduce the risk to a level below a predefined threshold on the one hand and to maintain the data quality and analysis potential and research questions on the other. This is a complex task that requires the application and development of complex methods and, in our particular case, the understanding of specific health population data sets.</p>
      </sec>
      <sec>
        <title>Data Release Types: Public Use Versus Scientific Use Files</title>
        <p>In line with lowering the barriers to data access, as encouraged by funders [<xref ref-type="bibr" rid="ref2">2</xref>], and in the interest of implementing sustainable data sharing models, open data through the sharing of the so-called public use files [<xref ref-type="bibr" rid="ref31">31</xref>] would be a typical mechanism for sharing HDSS data. Public use files require that a potential user agrees to the terms of use and then gets access to the data without seeking approval from the data custodians. A reason for this is the resource-efficient publication and distribution of data. Once distributed, there is no need for further labor-intensive steps, as is the case with remote execution and remote access solutions. The next level up would be the scientific use files [<xref ref-type="bibr" rid="ref31">31</xref>]. This requires a potential user to go through a review process by a data access team to confirm that they are a bona fide researcher from a reputable institution. This sharing demands that the custodians set aside staff time to review data access applications, prepare the data for sharing, customize the shared data to suit the request, and communicate and supervise the researchers. These demands of staff time are suboptimal as they will take staff away from their daily work and are rarely sufficiently funded in LMIC medical science research projects.</p>
      </sec>
      <sec>
        <title>Pseudoanonymization</title>
        <p>In pseudoanonymization, a string—the exact name of a person or any other direct identification feature (eg, social security number)—is replaced by a pseudonym, usually a 256-bit hash code produced by a cryptography hash function from a salted string [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. The pseudoanonymization of the HDSS core residency data on the iSHARE data repository is performed in a simplified manner. An ascending ID is assigned per person instead of listing their names or identifiers used in the dynamic HDSS databases. Note that as more data with complex interrelationships are shared through platforms such as the Implementation Network for Sharing Population Information from Research Entities (INSPIRE) data, more elaborate pseudoanonymization will become necessary. However, pseudoanonymization does not solve the data protection problem as it only prevents attacks on direct identifiers.</p>
      </sec>
      <sec>
        <title>Identifying Key Variables—the Disclosure Scenario</title>
        <p>The key question here is what information does an attacker have access to that they could match with the data to be released to identify individuals? Before the key variables (also often called quasi-identifiers) are identified, a check is made to see what other existing data a potential attacker could access and use to link to the current data and identify individuals. This is called the (archive) disclosure scenario [<xref ref-type="bibr" rid="ref34">34</xref>]. Existing data may include census, voters’ roll, population surveys, or administrative data held by government departments and national statistical offices. In most LMIC, not many data sets are available for broad access, and hence, this should not be a major problem.</p>
        <p>The biggest challenge may be that an attacker has additional knowledge of some information pertaining to an individual in the data being released. This is often called the nosy neighbor scenario in the literature [<xref ref-type="bibr" rid="ref34">34</xref>]. An attacker can potentially use this information to identify individuals.</p>
        <p>In general, defining these scenarios requires input from subject matter experts who work with the data being released and who are also aware of other common data.</p>
      </sec>
      <sec>
        <title>Anonymization Methods for Static and Status Variables</title>
        <p>Traditional anonymization of population data uses the concept of uniqueness. By combining several variables (quasi-identifiers from the <italic>Identifying Key Variables—the Disclosure Scenario</italic> section), an individual can be uniquely identified in the data. A key is unique if its frequency is 1, and thus, only one person has the combination of characteristics defined by the key. For example, the key postcode <italic>8404</italic>, citizenship <italic>Austria</italic>, sex <italic>male</italic>, and age <italic>45</italic> is unique in a demographic population data set of Switzerland. A commonly used concept for measuring uniqueness and “almost uniques” is k-anonymity. A data set is k-anonymous if each key (ie, combination of key variables) belongs to at least <italic>k</italic> observations. An approach that also evaluates subsets of key variables is called the special uniques detection algorithm [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. This approach allows for a more detailed analysis and evaluation of uniques in subsets of key variables.</p>
        <p>To achieve k-anonymity and low special uniques detection algorithm scores, the first step typically involves use case–specific recoding of the categorical key variables into broader categories [<xref ref-type="bibr" rid="ref6">6</xref>]. With recoding, the risk can be significantly reduced. If some individuals still have an increased risk and further recoding would lead to an excessive loss of quality of data, local suppression is typically considered next [<xref ref-type="bibr" rid="ref6">6</xref>]. This suppresses certain values to guarantee, for example, k-anonymity. The aim is to find specific patterns in categorical key variables and replace these patterns with missing values. (Heuristic) optimization methods must be applied to find a minimal suppression pattern [<xref ref-type="bibr" rid="ref7">7</xref>].</p>
        <p>If the number of categorical key variables is large or many of these variables have many categories, the number of keys in a data set is large, and many keys will be unique. In this case, recoding and local suppression would significantly change the data to achieve, for example, k-anonymity. Applying the postrandomization method (PRAM) [<xref ref-type="bibr" rid="ref37">37</xref>] to a subset of key variables would be a good alternative to recoding and suppressing all key variables. In the PRAM, values are exchanged between the categories of a variable with certain transition probabilities. An attacker can never be sure whether a value is true or has been swapped.</p>
      </sec>
      <sec>
        <title>Handling Static and Status Variables With Varying Status of a Person Over Time</title>
        <p>Cross-sectional data sets typically contain observations for a single time point, and the application of anonymization methods is generally straightforward (eg, using the guidelines presented by Templ et al [<xref ref-type="bibr" rid="ref6">6</xref>]).</p>
        <p>In the following paragraphs, the extension to longitudinal information, in particular to status variables (eg, <italic>occupation</italic> or <italic>education</italic>), for which the observed values (can) change over time, is discussed. <xref ref-type="table" rid="table1">Table 1</xref> illustrates the problem in a simplified manner using a toy data set with 2 individuals. It can be easily seen that for person 1, both educational and occupational levels have improved over time. When only the baseline status in 2010 is considered, both individuals share the same level of education and occupation category; thus, they are not unique in the data set. If only 2015 were considered, the 2 individuals would also not be unique. If only the latest status of a person is considered, both individuals would be unique in this toy data set, considering the key variables of occupation and education level. Moreover, if each status is reported each year, the 2 individuals would also be unique.</p>
        <p>A number of alternative representations could be used to anonymize the status variables, each of which has its own advantages and disadvantages.</p>
        <p>If only the initial status of a person is reported, the variable would no longer be considered a status variable that changes over time, which simplifies anonymization. The disadvantage is that we can no longer see the progress, for example, in the person’s occupational and educational level over time.</p>
        <p>If only the first and last statuses of a person in a record are reported, all events in between must either be deleted or replaced by the first status or the last status.</p>
        <p>Another very strict alternative would be to delete the link of a person from one year to the other; that is, for each person, another ID is provided from one year to another. However, this makes a longitudinal analysis difficult; thus, the data utility would suffer significantly.</p>
        <p>Postrandomization could be an option, although the order and consistency of educational and occupational levels are either lost or biased to higher levels. For example, it makes no sense to lower a person’s education level over time; therefore, with realistic swapping probabilities in the PRAM, the education level would randomly increase but never decrease.</p>
        <p>Another approach would be to apply traditional anonymization methods to patterns or subsets of the data, whereby individuals with the same pattern of event occurrence are considered as a subset to be anonymized. For example, the 2 individuals in <xref ref-type="table" rid="table1">Table 1</xref> do not have the same pattern as they have a different number of events. This approach leads to a potentially large oversuppression but reduces the disclosure risk heavily. Studies aimed at analyzing the education and occupation of individuals over time might be possible, especially when data analysts impute the suppressed information.</p>
        <p>Before deciding on one of these or even other alternative approaches, one has to think about the disclosure scenario. How likely is it that an attacker can merge their database with the anonymized data set provided to match and identify individuals? How likely is a nosy neighbor scenario and to what extent?</p>
        <p>For an archive scenario, the following assumptions regarding the attacker’s knowledge are made:</p>
        <list list-type="bullet">
          <list-item>
            <p>Only the last status of education of a person is known to the attacker, assuming that the attacker’s database is more or less an up-to-date archive containing the current educational level of a person used for matching. Here, it is neglected that the attacker has access to the historical sociodemographic status data of individuals.</p>
          </list-item>
          <list-item>
            <p>Only the last occupational status is known by an attacker, provided that the attacker’s database is more or less an up-to-date archive containing the current profession of a person used for matching.</p>
          </list-item>
          <list-item>
            <p>The attacker has knowledge of the static variables of sex and birth date.</p>
          </list-item>
          <list-item>
            <p>The attacker does not know the reason for in- and out-migration but knows the birth date, the start date, and the stop date.</p>
          </list-item>
        </list>
        <p>For a nosy neighbor scenario, the following assumptions about the attacker’s knowledge are made:</p>
        <list list-type="bullet">
          <list-item>
            <p>The (changing) status of the education of a person is known to the attacker over time, assuming that the attacker has individual knowledge of the historical development of the educational and occupational levels of a few individuals.</p>
          </list-item>
          <list-item>
            <p>The attacker has knowledge of the static variables of sex and birth date.</p>
          </list-item>
          <list-item>
            <p>The attacker may know the reason for in- and out-migration for certain individuals and the corresponding event time, and they may have knowledge about the birth date of certain individuals.</p>
          </list-item>
        </list>
        <p>As the data go public as an open-access data set, a nosy neighbor scenario is possible and, thus, in focus. Therefore, we use the approach in which only the first and last observed statuses of a person are reported. This is a solution in which the change in a person’s status is reported without their intermediate improvements, whereas local suppression results in a low number of suppressions as not all stages are reported.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Toy data set supporting a simple explanation of the problem of dealing with time-varying information on status variables.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="210"/>
            <col width="250"/>
            <col width="240"/>
            <col width="300"/>
            <thead>
              <tr valign="top">
                <td>Person ID</td>
                <td>(Event) year</td>
                <td>Occupation</td>
                <td>Education level</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>1</td>
                <td>2010</td>
                <td>2</td>
                <td>2</td>
              </tr>
              <tr valign="top">
                <td>1</td>
                <td>2011</td>
                <td>2</td>
                <td>2</td>
              </tr>
              <tr valign="top">
                <td>1</td>
                <td>2012</td>
                <td>3</td>
                <td>2</td>
              </tr>
              <tr valign="top">
                <td>1</td>
                <td>2013</td>
                <td>3</td>
                <td>2</td>
              </tr>
              <tr valign="top">
                <td>1</td>
                <td>2014</td>
                <td>3</td>
                <td>2</td>
              </tr>
              <tr valign="top">
                <td>1</td>
                <td>2015</td>
                <td>3</td>
                <td>3</td>
              </tr>
              <tr valign="top">
                <td>1</td>
                <td>2016</td>
                <td>4</td>
                <td>3</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>2010</td>
                <td>2</td>
                <td>2</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>2015</td>
                <td>3</td>
                <td>3</td>
              </tr>
              <tr valign="top">
                <td>2</td>
                <td>2016</td>
                <td>3</td>
                <td>3</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Handling Event History Dates</title>
        <sec>
          <title>General Considerations</title>
          <p>To prevent (exact) record linkage and closest distance–based neighborhood matching, we suggest adding random noise to the event dates. An adequate obvious choice is to add approximately 100 days randomly. This prevents an attacker from successfully applying record linkage and is likely to prevent distance-based matching.</p>
          <p>However, care must be taken to ensure that the order of events is maintained. For example, if a person has a birth date of May 15, 2009, and we hypothetically assume that this person out-migrated on June 5, 2009, in-migrated on July 6, 2009, and died on August 1, 2009, then a random noise of +40 or –40 to +60 or –60 days will completely upset the event order.</p>
          <p>Thus, we need to modify the event data by adding or subtracting a sufficient number of days so that the individual cannot be identified, although the data utility and event order of the data are retained. More specifically, the addition of noise must be performed with the following constraints: (1) the order of events must be maintained; (2) the time span between events should remain the same as much as possible, naturally fulfilled by adding noise; (3) attacks with record linkage should not be successful; and (4) the number of events per person should remain unchanged.</p>
          <p>This leads to a sequential approach that adds noise for each person, event by event, under certain restrictions, explained in more detail in the following paragraphs. Of course, the main parameter—the level of noise—must be determined on a use case and data set–specific basis.</p>
        </sec>
        <sec>
          <title>Add Noise to One Event Date</title>
          <p>For simplicity, equation 1 shows the case for 3 events, whereby noise is added for 1 person for event 2. <xref rid="figure1" ref-type="fig">Figure 1</xref> shows this case with 3 event dates t<sub>1</sub>, t<sub>2</sub>, and t<sub>3</sub>, and the time span between events 1 and 2 (∆<sub>2;1</sub>) and events 2 and 3 (∆<sub>3;2</sub>).</p>
          <p>It should be noted that extension to any number of events per person is possible and straightforward to implement, although the notation becomes more complicated.</p>
          <p>With <italic>s</italic> ∈ {–1, 1}, a Bernoulli random value with <italic>P</italic>=.50 that determines whether noise is added to or subtracted from the event date, and u ~ U[ε<sub>min</sub>; ε<sub>max</sub>], which controls the amount of noise (in days), a new (anonymized) event date t<sub>2</sub><sup>*</sup> is calculated using the following:</p>
          <disp-formula>
          t<sub>2</sub><sup>*</sup> = t<sub>2</sub> + u · s , if ∆<sub>2;1</sub> &#62; ε<sub>max</sub> ∧ ∆<sub>3;2</sub> &#62; ε<sub>max</sub>
          t<sub>2</sub><sup>*</sup> = t<sub>2</sub> + u , if ∆<sub>2;1</sub> ≤ ε<sub>max</sub> ∧ ∆<sub>3;2</sub> &#62; ε<sub>max</sub>
          t<sub>2</sub><sup>*</sup> = t<sub>2</sub> – u , if ∆<sub>2;1</sub> &#62; ε<sub>max</sub> ∧ ∆<sub>3;2</sub> ≤ ε<sub>max</sub>
          t<sub>2</sub><sup>*</sup> = t<sub>2</sub> – u – (∆<sub>2;1</sub> – 1) , if ∆<sub>2;1</sub> ≤ ε<sub>max</sub> ∧ ∆<sub>3;2</sub> ≤ ε<sub>max</sub> ∧ min(∆<sub>2;1</sub>, ∆<sub>3;2</sub>) = ∆<sub>2;1</sub>
          </disp-formula>
          <disp-formula>
            <graphic xlink:href="publichealth_v8i9e34472_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </disp-formula>
          <p>This ensures that the event order is preserved for t<sub>1</sub>, t<sub>2</sub>, and t<sub>3</sub>. Except for the first case, restrictions were applied as the distance between event data was smaller than the specified minimum noise range.</p>
          <p>An alternative noise addition method is to draw u ~ N(µ, σ<sup>2</sup>) and round it to the next integer value.</p>
          <fig id="figure1" position="float">
            <label>Figure 1</label>
            <caption>
              <p>Schematic overview of 3 event history dates for one person and corresponding time span between the events.</p>
            </caption>
            <graphic xlink:href="publichealth_v8i9e34472_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Add Noise Sequentially Event by Event</title>
          <p>The extension of equation 1 to all events of a person is achieved by the sequential application of noise to each event of a person. First, all recorded data of one person are stored, and the number of events of this individual, as well as the distance between all events, are recorded. For the first event date t<sub>1</sub>, noise is either randomly subtracted or added; more precisely, it can be subtracted without any restrictions, whereas the amount added must be less than the distance to the second event. Subsequently, for all other events recorded, in an additional loop considering one event date at a time, noise is added, as described above (equation 1) according to a predefined noise level (see <italic>Disclosure Risk</italic> and <italic>Data Utility</italic> section for further discussion on the level of noise). Therefore, first, for t<sub>1</sub>, noise is added leading to t<sub>1</sub><sup>*</sup>, and then, noise is added to t<sub>2</sub>, considering possible restrictions from t<sub>3</sub> and t<sub>1</sub><sup>*</sup> to not change the event order. Subsequently, noise is added to t<sub>3</sub> considering t<sub>2</sub><sup>*</sup> and t<sub>4</sub>,..., until the last event date. Using this sequential approach, preservation of the event order is guaranteed.</p>
          <p>Restrictions may occur if 3 consecutive events are very close to each other. If the maximal noise of the respective noise level is larger than the difference between t<sub>2</sub><sup>*</sup> and t<sub>3</sub> and t<sub>3</sub> and t<sub>4</sub>, the procedure is as follows. If the minimum of the event difference min(∆<sub>2;1</sub>;∆<sub>3;2</sub>) is larger than the predefined minimum noise, then take minimum=minimum noise and maximum noise=∆<sub>2;1</sub> and ∆<sub>3;2</sub>, respectively, and sample at random. If the minimum of event difference min(∆<sub>2;1</sub>;∆<sub>3;2</sub>) is smaller than the minimum noise, then sample from a uniform distribution <italic>U</italic>(0; ∆<sub>2;1</sub>); same with ∆<sub>3;2</sub> in the respective sampling direction as maximum or minimum noise. In the case of normal distribution while (noise &#60; ∆<sub>2;1</sub> ∧ noise &#62; ∆<sub>3;2</sub>), draw a new value from <italic>N</italic>(μ=0; σ=50) until a valid noise is obtained.</p>
          <p>Furthermore, we would like to briefly point out that it is necessary to consider the special data structure. It has already been mentioned that the event history dates cannot ideally be represented in columns, as there are different numbers of events and different events per person. Therefore, a separate row for each event in the data set is used to store the event code and date for a person; that is, individuals are represented in multiple rows. If a person was born within the observation period, he or she has an additional entry as an event in addition to the actual date of birth. Thus, if no birth date is registered under event dates, as the individual was born before data collection, then only one number is randomly added to the date of birth of a person in all rows of this person. If birth is also represented as event date information, the same noise (used to noise the event date on birth) has to be taken as for the column holding the birth date of the person; that is, the information on birth date and the event birth date is linked and must be considered adequately and consistently.</p>
          <p>In the <italic>Results</italic> section, the noise level chosen for the HDSS core data set is presented, and further insights into the choice of noise level are provided.</p>
        </sec>
      </sec>
      <sec>
        <title>Putting It All Together</title>
        <p>The event data are particularly important as they are numerical information that can be used for record linkage if the attacker has a database of exact event data. However, an attacker might only know the year of birth and death and then use this information for matching. In addition to the event history dates, variables with varying statuses over time must also be considered. Therefore, the changes in education and occupational levels are limited by indicating only the first and last status (<xref ref-type="boxed-text" rid="box1">Textbox 1</xref>).</p>
        <p>For certain studies, for example, on fertility by educational level, the full history of event dates and changes in the educational level is needed. This is also true for various studies on the occupational level of individuals over time (eg, answering the question of whether well-educated individuals change their occupational levels quicker). In this case, the entire history of event data might be needed, and the previous procedure has to be adapted, in this case, for example, by anonymizing the patterns, as outlined previously.</p>
        <boxed-text id="box1" position="float">
          <title>Steps of putting it together.</title>
          <p>
            <bold>Step 1</bold>
          </p>
          <list list-type="bullet">
            <list-item>
              <p>Add random noise to event dates for each person sequentially, as described in the <italic>Handling Event History Dates</italic> section. This prevents record linkage and nearest-neighbor matching with an external database containing exact event dates and preserves the order of events.</p>
            </list-item>
          </list>
          <p>
            <bold>Step 2</bold>
          </p>
          <list list-type="bullet">
            <list-item>
              <p>Aggregate data (ie, from long to wide representation, where each line represents a person) so that each row contains the information of a person for the static variables (such as sex and birth date), first and latest education, and first and latest occupation and build new variables containing the year of birth, year of death, and number of events of a person.</p>
            </list-item>
          </list>
          <p>
            <bold>Step 3</bold>
          </p>
          <list list-type="bullet">
            <list-item>
              <p>Achieve k-anonymity by local suppression with the methods implemented in sdcMicro [<xref ref-type="bibr" rid="ref7">7</xref>], applied to the variables mentioned in step 2, to avoid uniques and prevent successful matching. If the year of the earliest or latest event or the year of birth is suppressed, the noised year and noised event date should also be suppressed. It should be noted that this was hardly the case as the importance was set such that the year of birth, year of death, and number of events of a person are the most important variables; thus, the suppression algorithm uses the remaining variables to make local suppressions.</p>
            </list-item>
          </list>
          <p>
            <bold>Step 4</bold>
          </p>
          <list list-type="bullet">
            <list-item>
              <p>Disaggregate the anonymized aggregated data (from wide to long representation, where each line represents an event). The data set now includes only the anonymized information on sex and the earliest and latest occupational and educational codes of a person.</p>
            </list-item>
          </list>
        </boxed-text>
      </sec>
      <sec>
        <title>Estimation of the Disclosure Risk</title>
        <p>The theory for estimating disclosure risk in a cross-sectional data set is well implemented, for example, in the R package sdcMicro [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. In fact, for survey sample data, the approach of Franconi et al [<xref ref-type="bibr" rid="ref38">38</xref>] or, for example, Skinner et al [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref39">39</xref>] can be used, or, for population data, the concepts of k-anonymity [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>] or sample uniqueness [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. We introduce an extension of this theory that provides a practical tool for quantifying disclosure risk for event history data.</p>
        <p>Typically in anonymization, methods differ when continuous or categorical information is anonymized [<xref ref-type="bibr" rid="ref6">6</xref>]. In addition, we distinguish between 2 scenarios—the matching of event dates (continuous measurements) and an attack on categorical key variables.</p>
        <p>Event data are considered continuous measurements as there are multiple records for each person on a time scale.</p>
        <p>As k-anonymity is already ensured (step 3) and population data are used, there is no need to quantify the disclosure risk for categorical key variables.</p>
        <p>For continuous event dates, a neighborhood distance-based approach is proposed. Neighborhood matching, as introduced here and further introduced and applied in the <italic>Results</italic> section, assumes that the attacker has a database with exact event dates, which represents a worst-case scenario. For each individual in the anonymized data set, the nearest 3 individuals in the original nonanonymized data are determined by using Euclidean distances between event dates in the original and anonymized files. This is performed with replacement, meaning that the nearest neighbors are available to match for another individual in the data set. If 1 of the 3 nearest neighbors is the correct match, we identify this observation as being of high risk. The number of risky observations is reported. The <italic>Results</italic> section shows the specific settings for our application.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Anonymization of the Karonga HDSS Core Residency Data Set</title>
        <p>First, it should be noted that the data set obviously cannot be spread into columns of events as migration and other event codes have possibly &#62;1 entry, and the number of events differs between individuals. This makes it difficult to anonymize the data as the individuals have different events and different numbers of the same events at different times.</p>
        <p>The key (identifying) variables are listed in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
        <p>Experiments with the HDSS core residency data set have shown that additional identifying variables, namely the ID of the mother of a child, the ID of the father and of the household, and the reason for in-migration and out-migration (reasons are marriage, divorce, start or end of work or education, and others), could potentially enlarge possible matches to approximately 10% of the original possible matches or individuals. Polygamy identifiers are not considered in this study. The usual approach for handling cluster information (eg, persons in households) for risk estimation of (enlarged) risk is, for example, described in Templ et al [<xref ref-type="bibr" rid="ref6">6</xref>] and implemented in sdcMicro under the term of hierarchical risk estimation. However, as no further household information is available in this data set, this approach can be neglected. This is because household information can be used to identify individuals more easily; however, such additional household information is not available in our data set.</p>
        <p>Other socioeconomic or sensitive variables (eg, health status) were not included in the open-access data set.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Key (identifying) variables of the health and demographic surveillance system core residency data set.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="650"/>
            <col width="350"/>
            <thead>
              <tr valign="top">
                <td>Key variable</td>
                <td>Kind</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Biological sex</td>
                <td>Static variable</td>
              </tr>
              <tr valign="top">
                <td>Year of birth</td>
                <td>Static variable</td>
              </tr>
              <tr valign="top">
                <td>Year of death</td>
                <td>Static variable</td>
              </tr>
              <tr valign="top">
                <td>Exact event date</td>
                <td>Core event date<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td>Education</td>
                <td>Status variable</td>
              </tr>
              <tr valign="top">
                <td>Occupation</td>
                <td>Status variable</td>
              </tr>
              <tr valign="top">
                <td>Number of events per person</td>
                <td>Static variable</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Contains dates at which the observed core events occurred (birth, death, in-migration, or out-migration).</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Anonymization of Event Dates (Details Related to Step 1)</title>
        <p>According to the random principle, a drawn number of days is randomly added to or subtracted from the event dates of birth, death, in-migration, and out-migration (equation 1; <italic>Add Noise to One Event Date</italic> section).</p>
        <p>Four levels of noise were considered. In 3 scenarios, integer numbers (noise in days denoted by ε) for each event of a person (with E being the number of events of a person) were drawn with equal probability from the following intervals—depending on the noise level. In addition, a fourth scenario with normally distributed random noise is considered:</p>
        <list list-type="order">
          <list-item>
            <p>Noise level 1: ε<sub>min</sub>=46; ε<sub>max</sub>=62</p>
          </list-item>
          <list-item>
            <p>Noise level 2: ε<sub>min</sub>=76; ε<sub>max</sub>=93</p>
          </list-item>
          <list-item>
            <p>Noise level 3: ε<sub>min</sub>=106; ε<sub>max</sub>=124</p>
          </list-item>
          <list-item>
            <p>Noise level 4: u ~ N(µ=0; σ=50)</p>
          </list-item>
        </list>
        <p>As described previously, random noise is added sequentially to the birth date, in-migration and out-migration dates, and death date to prevent record linkage and nearest-neighbor matching, with an external database containing exact event dates and information on sex, number of events, year of birth, year of death, occupational status, and educational level.</p>
      </sec>
      <sec>
        <title>Anonymization of Static and Status Key Variables (Details to Steps 2 to 3)</title>
        <p>To prevent successful matching, we achieved 3-anonymity through global recoding and local suppression using the heuristic implemented in the R package sdcMicro [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>].</p>
        <p>New variables are built for the year of birth, year of death, and year of the first change of educational and occupational status and used as key variables along with the sex of a person and the number of events of a person. Intermediate changes in educational and occupational levels are dropped. K-anonymity is then achieved by local suppression using the implemented methods in sdcMicro [<xref ref-type="bibr" rid="ref7">7</xref>]. If the year of the latest event or the year of birth is suppressed, the noised year and noised event date are also suppressed. The number of events and the year of birth and death are set to the highest importance so that the implemented (weighted) local suppression algorithm in Templ et al [<xref ref-type="bibr" rid="ref7">7</xref>] likely does not include missing values in these variables. Note that one suppression in a variable with high importance would increase the loss (function) in utility for &#62;1 suppression in a variable with low importance (see Templ et al [<xref ref-type="bibr" rid="ref7">7</xref>] for details).</p>
        <p>After event date anonymization and status variable anonymization, the data are again matched to transform them into their original shape.</p>
      </sec>
      <sec>
        <title>Disclosure Risk</title>
        <p>To assess whether a data set was successfully anonymized, we quantified the disclosure risk. It must be reported only for event dates as, for the categorical key variables, k-anonymity is achieved, which satisfies our need to prevent successful matching.</p>
        <p>The disclosure risk is calculated by matching each individual of the raw data set with the 3 nearest neighbors of the anonymized data with replacement using distance-based matching. In addition, an individual is matched with individuals who were born, had died, or had migrated within plus or minus 1 year of the corresponding date of the true match, respectively, and who have the same (final) education, the same (final) occupation, and the same sex. If an individual has a missing value for one of these variables because of local suppression, that person is still considered a possible match if the rest of the variables meet the requirement.</p>
        <p>If the match is correct, we assume that the attack was successful, and an individual can be reidentified. This means that if a person is among the 3 nearest neighbors, we consider that person unsafe. False-positive matches are not taken into account.</p>
        <p><xref ref-type="table" rid="table3">Table 3</xref> reports the absolute and relative disclosure risk (in percentage) of the anonymized Karonga data set for all 4 scenarios, considering only individuals as possible matches who were born or had died or migrated in the range of +1 or –1 year of the date of birth, death, or migration, respectively, of the real match. We can observe that the risk is very low and that an attacker can hardly reidentify individuals. Note that the disclosure risk is already based on a worst-case scenario with 3 neighbors and by assuming the attacker uses the original nonanonymized data for matching. The low risk can also be explained by the fact that we choose ε<sub>min</sub> to be relatively large; for example, for noise level 1 it is 46, meaning that for each event, the date is changed within at least 46 days. However, for death and birth, the risk increases as death is more unique than any of the other variables. The highest risk is connected with normal noise.</p>
        <p>The computation time for neighborhood-based risk measurement, as proposed here, is high, and an implementation that uses parallel computing is preferable. Currently, the anonymization runs for 4 hours on a single-core Intel(R) Core i7-6700HQ central processing unit (CPU) with 2.60 GHz, and 8 days are spent for the risk assessment on all 4 noise levels on the HDSS core residency data set using 32 CPUs, Intel Xeon(R) Gold 5218 CPU with 2.30 GHz.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Counts of successfully matched individuals and relative disclosure risk (in percentage; number of risky individuals divided by the number of individuals times 100) of the anonymized Karonga data set for all 4 levels of noise based on the matching scenario.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="170"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Scenario</td>
                <td>Birth (number of successful matches)</td>
                <td>Death (number of successful matches)</td>
                <td>IMG<sup>a</sup> (number of successful matches)</td>
                <td>OMG<sup>b</sup> (number of successful matches)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="6">
                  <bold>Absolute risk</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td><italic>U</italic>(46;62)</td>
                <td>1669</td>
                <td>177</td>
                <td>220</td>
                <td>394</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td><italic>U</italic>(76;93)</td>
                <td>1452</td>
                <td>154</td>
                <td>222</td>
                <td>388</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td><italic>U</italic>(106;124)</td>
                <td>1271</td>
                <td>151</td>
                <td>178</td>
                <td>383</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td><italic>N</italic>(μ=0; σ=50)</td>
                <td>1513</td>
                <td>619</td>
                <td>197</td>
                <td>242</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>Relative risk (%)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>U(46;62)</td>
                <td>2.3</td>
                <td>5.0</td>
                <td>0.5</td>
                <td>0.8</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>U(76;93)</td>
                <td>2.0</td>
                <td>4.3</td>
                <td>0.5</td>
                <td>0.8</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>U(106;124)</td>
                <td>1.7</td>
                <td>4.2</td>
                <td>0.4</td>
                <td>0.8</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td><italic>N</italic>(μ=0; σ=50)</td>
                <td>2.1</td>
                <td>17.3</td>
                <td>0.4</td>
                <td>0.5</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>IMG: in-migration.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>OMG: out-migration.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Utility</title>
        <p>Utility measures specialized in a particular field should always be preferred to general measures ([<xref ref-type="bibr" rid="ref42">42</xref>]; eg, as implemented in sdcMicro). To check the data utility after anonymization, visual comparisons of the original nonanonymized and anonymized data sets, as well as chi-square tests comparing contingency tables obtained from original and anonymized data, are shown.</p>
        <p><xref rid="figure2" ref-type="fig">Figure 2</xref> shows the distribution of the date of birth from the original data and the noised data sets. The original data show a heaping in 1925, 1937, and 1945, which is still visible in the modified versions of the data set. This is not surprising as the noise was not too large.</p>
        <p>The 2 midyear population pyramids for 2005 and 2015 are depicted in <xref rid="figure3" ref-type="fig">Figure 3</xref>. We distinguish between the population pyramids for the original nonanonymized data and anonymized data with noise levels of 1 to 4. Almost no differences were observed.</p>
        <p>We do not explicitly show further graphs on the distribution of the date of death, in-migration, and out-migration, as the results are very similar to the previous figures; that is, there are no significant differences in the distributions.</p>
        <p><xref ref-type="table" rid="table4">Table 4</xref> shows summary statistics of the time span between in-migration and subsequent out-migration of individuals. It shows only minimal differences; that is, all statistics are well preserved. The best results are obtained with noise scenario 4 (normally distributed noise). The results for out- to in-migration are comparable, except for the time between out- to in-migration. This can be shown in more detail by a visualization.</p>
        <p><xref rid="figure4" ref-type="fig">Figure 4</xref> visualizes this time span between in-migration and subsequent out-migration, as well as between out-migration and in-migration by box plots. The x-axis is presented on a log<sub>10</sub> scale to better see minimal differences in the distribution of the time span between the original nonanonymized data and the anonymized data (almost no differences can be seen in the original scale). Almost no differences were found in the time span for in-migration to out-migration.</p>
        <p>For the number of days between the out-migration and in-migration of a person, the worst results were obtained by scenario 4 (normally distributed noise). The reason for this difference between in- and out-migration is that people tend to return after out-migration much earlier than they leave the place after in-migration. Normal noise tends to increase the number of days of consecutive events if the events are close together.</p>
        <p><xref ref-type="table" rid="table5">Table 5</xref> presents the results of the statistical test. The cross-tabulation for age class×event code×sex×event time category (2000-2004, 2005-2009, 2010-2014, and 2015-2020) was calculated from the original nonanonymized data and for the anonymized data. The corresponding cell counts were compared with each other by using a chi-square test. The results of the chi-square tests (<xref ref-type="table" rid="table5">Table 5</xref>) showed that the null hypothesis of equality of anonymized and original data can never be rejected.</p>
        <p>Naturally, the differences between the original and anonymized data increase with an increasing level of noise, as can be seen in all the presented tables and visualizations of data utility. The best utility was achieved by adding normal noise (<xref ref-type="table" rid="table5">Table 5</xref>). However, even with noise level 3, the structure is well preserved, and the data utility is very high for all 4 noise levels investigated.</p>
        <p>For the anonymization of the status variables on education and occupation, including sex, number of events of a person, year of birth, and year of death, a few values were suppressed to achieve 3-anonymity (<xref ref-type="table" rid="table6">Table 6</xref>). The highest number of suppressions is present in variable end education (last educational status of a person), with approximately 0.64% (465/72,935) suppression. Overall, 0.14% (808/583,480) of values were suppressed.</p>
        <p>For the static and status variables, one of the most important pieces of information might be the last status of occupation and education. <xref rid="figure5" ref-type="fig">Figure 5</xref> shows the frequencies of the corresponding contingency tables. The differences were minimal and not detectable by visual comparison. This is even more true for the other tabulations.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Distribution of the date of birth of the original data set and for the anonymized data set according to noise levels 1, 2, 3, and 4.</p>
          </caption>
          <graphic xlink:href="publichealth_v8i9e34472_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Population pyramids for 2005 and 2015 midyear population and age structure of the original and anonymized data according to noise levels 1, 2, 3, and 4 for men (left bars) and women (right bars).</p>
          </caption>
          <graphic xlink:href="publichealth_v8i9e34472_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Summary statistics for the number of days between in-migration and subsequent out-migration of a person for noise levels 1 to 4.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="250"/>
            <col width="250"/>
            <col width="250"/>
            <col width="250"/>
            <thead>
              <tr valign="top">
                <td>Scenario</td>
                <td>Values (minimum-maximum)</td>
                <td>Values, mean (SD)</td>
                <td>&#60;100 days (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>(0;0) (original)</td>
                <td>(0-5909)</td>
                <td>862.05 (714)</td>
                <td>2.2</td>
              </tr>
              <tr valign="top">
                <td>U(46;62)</td>
                <td>(0-5805)</td>
                <td>846.67 (716)</td>
                <td>3.4</td>
              </tr>
              <tr valign="top">
                <td>U(76;93)</td>
                <td>(0-5832)</td>
                <td>839.25 (717)</td>
                <td>4.4</td>
              </tr>
              <tr valign="top">
                <td>U(106;124)</td>
                <td>(0-5906)</td>
                <td>831.30 (720)</td>
                <td>5.5</td>
              </tr>
              <tr valign="top">
                <td><italic>N</italic>(μ=0; σ=50)</td>
                <td>(0-5859)</td>
                <td>862.58 (716)</td>
                <td>2.9</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Time span (in log<sub>10</sub> scale) between in-migration and subsequent out-migration and out-migration to subsequent in-migration of the original data set and for the anonymized data sets by noise levels 1, 2, 3, and 4. Regarding in-migration to out-migration and out-migration to in-migration, only individuals who in- or out-migrate, respectively, are considered.</p>
          </caption>
          <graphic xlink:href="publichealth_v8i9e34472_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table5">
          <label>Table 5</label>
          <caption>
            <p>Comparison of 4-dimensional contingency tables of the anonymized and original data using a chi-square test.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="230"/>
            <col width="170"/>
            <col width="170"/>
            <col width="200"/>
            <col width="230"/>
            <thead>
              <tr valign="top">
                <td>Statistics</td>
                <td>U(46;62)</td>
                <td>U(76;93)</td>
                <td>U(106;124)</td>
                <td><italic>N</italic>(μ=0; σ=50)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Test statistic</td>
                <td>46.08</td>
                <td>73.58</td>
                <td>121.39</td>
                <td>37.52</td>
              </tr>
              <tr valign="top">
                <td>Critical value</td>
                <td>237.24</td>
                <td>237.24</td>
                <td>237.24</td>
                <td>237.24</td>
              </tr>
              <tr valign="top">
                <td><italic>P</italic> value</td>
                <td>.99</td>
                <td>.99</td>
                <td>.99</td>
                <td>.99</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <table-wrap position="float" id="table6">
          <label>Table 6</label>
          <caption>
            <p>Percentage of suppressions per variable and total number of suppressions per variable.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="150"/>
            <col width="60"/>
            <col width="120"/>
            <col width="130"/>
            <col width="120"/>
            <col width="130"/>
            <col width="120"/>
            <col width="80"/>
            <col width="90"/>
            <thead>
              <tr valign="top">
                <td>Suppression</td>
                <td>Sex</td>
                <td>Base education</td>
                <td>Base occupation</td>
                <td>End education</td>
                <td>End occupation</td>
                <td>Number of events</td>
                <td>Year of birth</td>
                <td>Year of death</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Suppressions (%)</td>
                <td>0.03</td>
                <td>0.22</td>
                <td>0.07</td>
                <td>0.64</td>
                <td>0.13</td>
                <td>0.02</td>
                <td>0</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>Total suppressions</td>
                <td>23</td>
                <td>160</td>
                <td>53</td>
                <td>465</td>
                <td>94</td>
                <td>13</td>
                <td>0</td>
                <td>0</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Relative frequencies of the latest educational and latest occupational status of individuals for the original and the anonymized data set.</p>
          </caption>
          <graphic xlink:href="publichealth_v8i9e34472_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Providing open data (public use files) is a typical mechanism for HDSS data sharing, which is consistent with the funders’ [<xref ref-type="bibr" rid="ref2">2</xref>] call for lowering barriers to data access and in the interest of implementing sustainable data sharing models. However, more stringent anonymization is required than that for access-restricted and contracted files used for scientific purposes.</p>
        <p>Anonymizing HDSS data is challenging, and no easy-to-apply solutions are available. The details matter to ensure consistency or credibility, and context knowledge is key for successful implementation. The presented approach is novel in several respects. This is the first time that a systematic approach has been adopted to determine the anonymization requirements for residency data from LMIC HDSS studies or for any other longitudinal data generated in these settings. Previously, anonymization of HDSS data was performed on an ad hoc basis. We grouped the variables into static, status (time-varying), and core event–specific variables and tackled the anonymization relating to the variables in each of these groupings.</p>
        <p>We achieved an anonymized data set with very low disclosure risk and high utility, ready for sharing as a public use data file.</p>
        <p>Using distance-based neighborhood matching, we simulated an attack under a nosy neighbor situation and using the worst-case scenario, where attackers have full information on the original data. We showed that the risk of disclosure is very low, even when assuming the worst-case scenario.</p>
        <p>We explicitly defined a procedure for anonymizing core event dates as a major part of the HDSS event history data anonymization. Different levels of noise addition to the event history dates were evaluated for disclosure risk and data utility. It was found that high utility was maintained, even with the highest level of noise. The basic properties of the event data such as order, time span, and number of events were preserved compared with the original data. As can be seen from the application and anonymization of event history dates, it is likely that the noise level and the loss of data utility will balance each other. Thus, a medium level of noise may be recommended to preserve the properties and usefulness of the data. In addition, the preservation of the time intervals between events is important for the successful implementation of this anonymization method. If the interval is too small, the added noise will also be automatically reduced by the algorithm.</p>
        <p>Furthermore, our work explores the extent to which methods or tools such as sdcMicro can be used and for which aspects of longitudinal data. The guides for these tools focus on cross-sectional data and thus do not naturally lend themselves to the anonymization of multiple records per individual, which is the case in the Karonga HDSS core residency data that we used. In this regard, we transformed the time-varying variables of education level and occupation, year of death, year of birth, and the number of events for an individual before feeding them into the sdcMicro R package. The transformation involved limiting the number of transitions an individual had in the time-varying variables over time. This strategy preserves the data utility well, albeit providing fewer details than the original data.</p>
        <p>The HDSS and medical science research communities in LMIC settings will be the primary beneficiaries of the results and methods presented in this paper; however, the results will be useful for anyone working on anonymizing longitudinal data sets, possibly including time-varying information and event history data with time-varying variables for purposes of sharing. If more sensitive variables such as medical conditions are added, l-diversity should also be checked. Alternatively, the PRAM [<xref ref-type="bibr" rid="ref37">37</xref>] should be applied to medical conditions.</p>
      </sec>
      <sec>
        <title>Future Work</title>
        <p>The proposed approach of combining the range of values for the status variables into a baseline value and a final value may not be optimal for some analyses. This is one of the realities of data anonymization; it almost always results in data of lower utility than the original data. Further work is required to explore alternative handling of the status variables to determine the optimal handling of the transitions in the time-varying variables.</p>
        <p>The disclosure risk is calculated based on 3 nearest-neighbor distance-based matchings. This matching strategy is already quite complex, with some constraints described previously, as well as dealing with missing values. However, other matching strategies might be possible, and specialized record linkage software [<xref ref-type="bibr" rid="ref43">43</xref>] might also be considered.</p>
        <p>Further work is also required to determine the right amount of offset for the core event dates. To determine this, it might be important to gather data from the participants to estimate what it would take to sufficiently offset the dates so that the potential nosy neighbors are unable to make guesses even in cases where events such as in-migration are rare.</p>
        <p>Of course, not all data sets might have exactly the same structure as the HDSS residency data set used here. Other longitudinal data sets from HDSS settings, such as those generated from the observation of tuberculosis episodes or sexual partnership episodes, may contain features not fully catered for by our approach here. These issues need to be explored further.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">CPU</term>
          <def>
            <p>central processing unit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">HDSS</term>
          <def>
            <p>health and demographic surveillance system</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">INSPIRE</term>
          <def>
            <p>Implementation Network for Sharing Population Information from Research Entities</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">LMIC</term>
          <def>
            <p>low- and middle-income countries</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">PRAM</term>
          <def>
            <p>postrandomization method</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">SDC</term>
          <def>
            <p>statistical disclosure control</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The work of CK and MT was supported by a start-up grant from Network for the promotion of Institutional Health Partnerships, Switzerland. An interview about this grant and further details about the project can be found in German, English, and French [<xref ref-type="bibr" rid="ref44">44</xref>]. The authors would especially like to thank Dörte Petit and Judith Safford from the University of Bern for their support on this project.</p>
      <p>Malawi Epidemiology and Intervention Research Unit (MEIRU) and Zurich University of Applied Sciences (ZHAW) contributed in kind for some of CK’s and MT’s time on this project to enable them to fully explore the research collaboration and the methods used for anonymization.</p>
      <p>The authors’ gratitude also goes to the study participants and the iSHARE team for providing a platform through which health and demographic surveillance system data can be shared.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pisani</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Aaby</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Breugelmans</surname>
              <given-names>JG</given-names>
            </name>
            <name name-style="western">
              <surname>Carr</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Groves</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Helinski</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kamuya</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kern</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Littler</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Marsh</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Mboup</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Merson</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Sankoh</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Serafini</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Schneider</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Schoenenberger</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Guerin</surname>
              <given-names>PJ</given-names>
            </name>
          </person-group>
          <article-title>Beyond open data: realising the health benefits of sharing data</article-title>
          <source>BMJ</source>
          <year>2016</year>
          <month>10</month>
          <day>10</day>
          <volume>355</volume>
          <fpage>i5295</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/27758792"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmj.i5295</pub-id>
          <pub-id pub-id-type="medline">27758792</pub-id>
          <pub-id pub-id-type="pmcid">PMC6616027</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Walport</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Brest</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Sharing research data to improve public health</article-title>
          <source>Lancet</source>
          <year>2011</year>
          <month>02</month>
          <day>12</day>
          <volume>377</volume>
          <issue>9765</issue>
          <fpage>537</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1016/S0140-6736(10)62234-9</pub-id>
          <pub-id pub-id-type="medline">21216456</pub-id>
          <pub-id pub-id-type="pii">S0140-6736(10)62234-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sankoh</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Byass</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>The INDEPTH Network: filling vital gaps in global epidemiology</article-title>
          <source>Int J Epidemiol</source>
          <year>2012</year>
          <month>06</month>
          <volume>41</volume>
          <issue>3</issue>
          <fpage>579</fpage>
          <lpage>88</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/22798690"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/ije/dys081</pub-id>
          <pub-id pub-id-type="medline">22798690</pub-id>
          <pub-id pub-id-type="pii">dys081</pub-id>
          <pub-id pub-id-type="pmcid">PMC3396316</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Federer</surname>
              <given-names>LM</given-names>
            </name>
            <name name-style="western">
              <surname>Belter</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Joubert</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Livinski</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>YL</given-names>
            </name>
            <name name-style="western">
              <surname>Snyders</surname>
              <given-names>LN</given-names>
            </name>
            <name name-style="western">
              <surname>Thompson</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Data sharing in PLOS ONE: an analysis of data availability statements</article-title>
          <source>PLoS One</source>
          <year>2018</year>
          <month>5</month>
          <day>2</day>
          <volume>13</volume>
          <issue>5</issue>
          <fpage>e0194768</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0194768"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0194768</pub-id>
          <pub-id pub-id-type="medline">29719004</pub-id>
          <pub-id pub-id-type="pii">PONE-D-17-43360</pub-id>
          <pub-id pub-id-type="pmcid">PMC5931451</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Herbst</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Juvekar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bhattacharjee</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Bangha</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Patharia</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Tei</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Gilbert</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Sankoh</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>The INDEPTH data repository: an international resource for longitudinal population and health data from health and demographic surveillance systems</article-title>
          <source>J Empir Res Hum Res Ethics</source>
          <year>2015</year>
          <month>07</month>
          <volume>10</volume>
          <issue>3</issue>
          <fpage>324</fpage>
          <lpage>33</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/10.1177/1556264615594600?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%3dpubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/1556264615594600</pub-id>
          <pub-id pub-id-type="medline">26297754</pub-id>
          <pub-id pub-id-type="pmcid">PMC4547208</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Templ</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <source>Statistical Disclosure Control for Microdata: Methods and Applications in R</source>
          <year>2017</year>
          <publisher-loc>Cham, Switzerland</publisher-loc>
          <publisher-name>Springer</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Templ</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kowarik</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Meindl</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Statistical disclosure control for micro-data using the R package sdcMicro</article-title>
          <source>J Stat Soft</source>
          <year>2015</year>
          <volume>67</volume>
          <issue>4</issue>
          <fpage>1</fpage>
          <lpage>36</lpage>
          <pub-id pub-id-type="doi">10.18637/jss.v067.i04</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="web">
          <article-title>Statistical Disclosure Control (sdcMicro)</article-title>
          <source>International Household Survey Network</source>
          <access-date>2022-02-22</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.ihsn.org/software/disclosure-control-toolbox">http://www.ihsn.org/software/disclosure-control-toolbox</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Templ</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Todorov</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>The software environment R for official statistics and survey methodology</article-title>
          <source>Aust J Stat</source>
          <year>2016</year>
          <month>02</month>
          <day>29</day>
          <volume>45</volume>
          <issue>1</issue>
          <fpage>97</fpage>
          <lpage>124</lpage>
          <pub-id pub-id-type="doi">10.17713/ajs.v45i1.100</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Milliff</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Data security in practitioner-academic partnerships: an agenda for improvement</article-title>
          <source>SSRN J</source>
          <year>2020</year>
          <month>9</month>
          <day>16</day>
          <pub-id pub-id-type="doi">10.2139/ssrn.3693330</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="web">
          <article-title>Statistical Disclosure Control</article-title>
          <source>The Centre for Humanitarian Data</source>
          <year>2019</year>
          <access-date>2021-10-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://centre.humdata.org/guidance-note-statistical-disclosure-control/">https://centre.humdata.org/guidance-note-statistical-disclosure-control/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hummerl</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Data-intensive computing with genomic data</article-title>
          <source>BiobankCloud</source>
          <year>2013</year>
          <access-date>2022-08-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cordis.europa.eu/docs/projects/cnect/1/317871/080/deliverables/001-D52.pdf">https://cordis.europa.eu/docs/projects/cnect/1/317871/080/deliverables/001-D52.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Song</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Waitman</surname>
              <given-names>LR</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>The impact of medical big data anonymization on early acute kidney injury risk prediction</article-title>
          <source>AMIA Jt Summits Transl Sci Proc</source>
          <year>2020</year>
          <month>5</month>
          <day>30</day>
          <volume>2020</volume>
          <fpage>617</fpage>
          <lpage>25</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/32477684"/>
          </comment>
          <pub-id pub-id-type="medline">32477684</pub-id>
          <pub-id pub-id-type="pmcid">PMC7233037</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="web">
          <article-title>COVID-19 Case Privacy Review</article-title>
          <source>GitHub</source>
          <access-date>2021-10-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/CDCgov/covid_case_privacy_review/">https://github.com/CDCgov/covid_case_privacy_review/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <collab>INDEPTH Network</collab>
          </person-group>
          <source>Population and Health in Developing Countries: Population, Health, and Survival at INDEPTH Sites</source>
          <year>2002</year>
          <publisher-loc>Ottawa, ON, Canada</publisher-loc>
          <publisher-name>International Development Research Centre</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wamukoya</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ezeh</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Emina</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Sankoh</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Health and demographic surveillance systems: a step towards full civil registration and vital statistics system in sub-Sahara Africa?</article-title>
          <source>BMC Public Health</source>
          <year>2012</year>
          <month>09</month>
          <day>05</day>
          <volume>12</volume>
          <fpage>741</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcpublichealth.biomedcentral.com/articles/10.1186/1471-2458-12-741"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1471-2458-12-741</pub-id>
          <pub-id pub-id-type="medline">22950896</pub-id>
          <pub-id pub-id-type="pii">1471-2458-12-741</pub-id>
          <pub-id pub-id-type="pmcid">PMC3509035</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Benzler</surname>
              <given-names>BJ</given-names>
            </name>
            <name name-style="western">
              <surname>Herbst</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>MacLeod</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>A data model for demographic surveillance systems</article-title>
          <source>INDEPTH Network</source>
          <year>1998</year>
          <access-date>2021-10-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.indepth-network.org/Resource%20Kit/INDEPTH%20DSS%20Resource%20Kit/LinkedDocuments/HRS2%20DSS%20Reference%20Data%20Model%20Paper.pdf">http://www.indepth-network.org/Resource%20Kit/INDEPTH%20DSS%20Resource%20Kit/LinkedDocuments/HRS2%20DSS%20Reference%20Data%20Model%20Paper.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Crampin</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Dube</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mboma</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Price</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chihana</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Jahn</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Baschieri</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Molesworth</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mwaiyeghele</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Branson</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Floyd</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>McGrath</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Fine</surname>
              <given-names>PE</given-names>
            </name>
            <name name-style="western">
              <surname>French</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Glynn</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Zaba</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Profile: the Karonga health and demographic surveillance system</article-title>
          <source>Int J Epidemiol</source>
          <year>2012</year>
          <month>06</month>
          <volume>41</volume>
          <issue>3</issue>
          <fpage>676</fpage>
          <lpage>85</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/22729235"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/ije/dys088</pub-id>
          <pub-id pub-id-type="medline">22729235</pub-id>
          <pub-id pub-id-type="pii">dys088</pub-id>
          <pub-id pub-id-type="pmcid">PMC3396313</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Crampin</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Kayuni</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Amberbir</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Musicha</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Koole</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Tafatatha</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Branson</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Saul</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mwaiyeghele</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Nkhwazi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Phiri</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Price</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Mwagomba</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Mwansambo</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Jaffar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Nyirenda</surname>
              <given-names>MJ</given-names>
            </name>
          </person-group>
          <article-title>Hypertension and diabetes in Africa: design and implementation of a large population-based study of burden and risk factors in rural and urban Malawi</article-title>
          <source>Emerg Themes Epidemiol</source>
          <year>2016</year>
          <month>2</month>
          <day>1</day>
          <volume>13</volume>
          <fpage>3</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ete-online.biomedcentral.com/articles/10.1186/s12982-015-0039-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12982-015-0039-2</pub-id>
          <pub-id pub-id-type="medline">26839575</pub-id>
          <pub-id pub-id-type="pii">39</pub-id>
          <pub-id pub-id-type="pmcid">PMC4736489</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bocquier</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ginsburg</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Herbst</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Sankoh</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Collinson</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>A training manual for event history data management using Health and Demographic Surveillance System data</article-title>
          <source>BMC Res Notes</source>
          <year>2017</year>
          <month>06</month>
          <day>26</day>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>224</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcresnotes.biomedcentral.com/articles/10.1186/s13104-017-2541-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13104-017-2541-9</pub-id>
          <pub-id pub-id-type="medline">28651610</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13104-017-2541-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC5485641</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dube</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Crampin</surname>
              <given-names>AC</given-names>
            </name>
          </person-group>
          <article-title>Malawi - Karonga HDSS INDEPTH Core Dataset 2003-2017 (Release 2019)</article-title>
          <source>INDEPTH Network Data Repository</source>
          <year>2019</year>
          <access-date>2021-10-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://datacompass.lshtm.ac.uk/id/eprint/1738/">https://datacompass.lshtm.ac.uk/id/eprint/1738/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dwork</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Differential privacy: a survey of results</article-title>
          <source>Proceedings of the 5th International Conference on the Theory and Applications of Models of Computation</source>
          <year>2008</year>
          <conf-name>TAMC '08</conf-name>
          <conf-date>April 25-29, 2008</conf-date>
          <conf-loc>Xi'an, China</conf-loc>
          <fpage>1</fpage>
          <lpage>19</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-540-79228-4_1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abadi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Erlingsson</surname>
              <given-names>Ú</given-names>
            </name>
            <name name-style="western">
              <surname>Goodfellow</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>McMahan</surname>
              <given-names>HB</given-names>
            </name>
            <name name-style="western">
              <surname>Mironov</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Papernot</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Talwar</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>On the protection of private information in machine learning systems: two recent approches</article-title>
          <source>Proceedings of the IEEE 30th Computer Security Foundations Symposium</source>
          <year>2017</year>
          <conf-name>CSF '17</conf-name>
          <conf-date>August 21-25, 2017</conf-date>
          <conf-loc>Santa Barbara, CA, USA</conf-loc>
          <fpage>1</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1109/csf.2017.10</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Domingo-Ferrer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sánchez</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Blanco-Justicia</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>The limits of differential privacy (and its misuse in data release and machine learning)</article-title>
          <source>Commun ACM</source>
          <year>2021</year>
          <month>07</month>
          <volume>64</volume>
          <issue>7</issue>
          <fpage>33</fpage>
          <lpage>5</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2011.02352"/>
          </comment>
          <pub-id pub-id-type="doi">10.1145/3433638</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Francis</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Dear differential privacy, put up or shut up</article-title>
          <source>The Max Planck Institute for Software Systems</source>
          <year>2020</year>
          <month>1</month>
          <day>9</day>
          <access-date>2021-10-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.mpi-sws.org/tr/2020-005.pdf">http://www.mpi-sws.org/tr/2020-005.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bambauer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Muralidhar</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Sarathy</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Fool's gold: an illustrated critique of differential privacy</article-title>
          <source>Vanderbilt J Entertain Technol Law</source>
          <year>2014</year>
          <volume>16</volume>
          <issue>4</issue>
          <fpage>701</fpage>
          <lpage>55</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Skinner</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Holmes</surname>
              <given-names>DJ</given-names>
            </name>
          </person-group>
          <article-title>Estimating the re-identification risk per record in microdata</article-title>
          <source>J Off Stat</source>
          <year>1998</year>
          <volume>14</volume>
          <issue>4</issue>
          <fpage>361</fpage>
          <lpage>72</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hochguertel</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Weiss</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>De facto anonymity in results</article-title>
          <source>FDZ-Arbeitspapier Nr</source>
          <year>2012</year>
          <access-date>2021-10-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://unece.org/fileadmin/DAM/stats/documents/ece/ces/ge.46/2011/50_Hochguertel-Weiss.pdf">https://unece.org/fileadmin/DAM/stats/documents/ece/ces/ge.46/2011/50_Hochguertel-Weiss.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bond</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Brandt</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>de Wolf</surname>
              <given-names>PP</given-names>
            </name>
          </person-group>
          <article-title>Guidelines for the checking of output based on microdata research</article-title>
          <source>Data without Boundaries</source>
          <year>2013</year>
          <access-date>2021-10-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ec.europa.eu/eurostat/cros/system/files/dwb_standalone-document_output-checking-guidelines.pdf">https://ec.europa.eu/eurostat/cros/system/files/dwb_standalone-document_output-checking-guidelines.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Griffiths</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Greci</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Kotrotsios</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Parker</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Scott</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Welpton</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wolters</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Woods</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Handbook on Statistical Disclosure Control for Outputs</article-title>
          <source>figshare</source>
          <year>2019</year>
          <access-date>2021-10-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://figshare.com/articles/book/SDC_Handbook/9958520/1">https://figshare.com/articles/book/SDC_Handbook/9958520/1</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dupriez</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Boyko</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Dissemination of microdata files: principles, procedures and practices</article-title>
          <source>International Household Survey Network</source>
          <year>2010</year>
          <month>8</month>
          <access-date>2021-10-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.ihsn.org/sites/default/files/resources/IHSN-WP005.pdf">http://www.ihsn.org/sites/default/files/resources/IHSN-WP005.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Borde</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Hebare</surname>
              <given-names>PA</given-names>
            </name>
            <name name-style="western">
              <surname>Dhanedhar</surname>
              <given-names>PD</given-names>
            </name>
          </person-group>
          <article-title>Overview of Web password hashing using salt techniques</article-title>
          <source>Int Res J Eng Technol</source>
          <year>2017</year>
          <month>11</month>
          <volume>4</volume>
          <issue>11</issue>
          <fpage>152</fpage>
          <lpage>4</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sauermann</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kanjala</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Templ</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Austin</surname>
              <given-names>CC</given-names>
            </name>
            <collab>RDA COVID-19 WG</collab>
          </person-group>
          <article-title>Preservation of individuals’ privacy in shared COVID-19 related data</article-title>
          <source>SSRN J</source>
          <year>2020</year>
          <month>7</month>
          <day>17</day>
          <pub-id pub-id-type="doi">10.2139/ssrn.3648430</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hundepool</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Domingo-Ferrer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Franconi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Giessing</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schulte Nordholt</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Spicer</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>de Wolf</surname>
              <given-names>PP</given-names>
            </name>
          </person-group>
          <source>Statistical Disclosure Control</source>
          <year>2012</year>
          <publisher-loc>Hoboken, NJ, USA</publisher-loc>
          <publisher-name>Wiley</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Haglin</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Keane</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>A recursive search algorithm for statistical disclosure assessment</article-title>
          <source>Data Min Knowl Disc</source>
          <year>2007</year>
          <month>7</month>
          <day>10</day>
          <volume>16</volume>
          <issue>2</issue>
          <fpage>165</fpage>
          <lpage>96</lpage>
          <pub-id pub-id-type="doi">10.1007/s10618-007-0078-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Haglin</surname>
              <given-names>DJ</given-names>
            </name>
          </person-group>
          <article-title>A new algorithm for finding minimal sample uniques for use in statistical disclosure assessment</article-title>
          <source>Proceedings of the 5th IEEE International Conference on Data Mining</source>
          <year>2005</year>
          <conf-name>ICDM '05</conf-name>
          <conf-date>November 27-30, 2005</conf-date>
          <conf-loc>Houston, TX, USA</conf-loc>
          <fpage>290</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dblp.uni-trier.de/db/conf/icdm/icdm2005.html#ManningH05">http://dblp.uni-trier.de/db/conf/icdm/icdm2005.html#ManningH05</ext-link>
          </comment>
          <pub-id pub-id-type="doi">10.1109/icdm.2005.10</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gouweleeuw</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Kooiman</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Willenborg</surname>
              <given-names>LC</given-names>
            </name>
            <name name-style="western">
              <surname>de Wolf</surname>
              <given-names>PP</given-names>
            </name>
          </person-group>
          <article-title>Post randomisation for statistical disclosure control: theory and implementation</article-title>
          <source>J Off Stat</source>
          <year>1998</year>
          <volume>14</volume>
          <issue>4</issue>
          <fpage>463</fpage>
          <lpage>78</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Franconi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Polettini</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Individual risk estimation in μ-argus: a review</article-title>
          <source>Proceedings of the CASC Project International Workshop on the Privacy in Statistical Databases</source>
          <year>2004</year>
          <conf-name>PSD '04</conf-name>
          <conf-date>June 9-11, 2004</conf-date>
          <conf-loc>Barcelona, Spain</conf-loc>
          <fpage>262</fpage>
          <lpage>72</lpage>
          <pub-id pub-id-type="doi">10.1007/978-3-540-25955-8_20</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Skinner</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Shlomo</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Assessing identification risk in survey microdata using log-linear models</article-title>
          <source>J Am Stat Assoc</source>
          <year>2008</year>
          <month>9</month>
          <volume>103</volume>
          <issue>483</issue>
          <fpage>989</fpage>
          <lpage>1001</lpage>
          <pub-id pub-id-type="doi">10.1198/016214507000001328</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Samarati</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sweeney</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Protecting privacy when disclosing information: k-anonymity and its enforcement through generalization and suppression</article-title>
          <source>Electronic Privacy Information Center</source>
          <year>1998</year>
          <access-date>2021-10-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://epic.org/wp-content/uploads/privacy/reidentification/Samarati_Sweeney_paper.pdf">https://epic.org/wp-content/uploads/privacy/reidentification/Samarati_Sweeney_paper.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Samarati</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Protecting respondents’ identities in microdata release</article-title>
          <source>IEEE Trans Knowl Data Eng</source>
          <year>2001</year>
          <volume>13</volume>
          <issue>6</issue>
          <fpage>1010</fpage>
          <lpage>27</lpage>
          <pub-id pub-id-type="doi">10.1109/69.971193</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Templ</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Quality indicators for statistical disclosure methods: a case study on the structure of earnings survey</article-title>
          <source>J Off Stat</source>
          <year>2015</year>
          <month>12</month>
          <day>16</day>
          <volume>31</volume>
          <issue>4</issue>
          <fpage>737</fpage>
          <lpage>61</lpage>
          <pub-id pub-id-type="doi">10.1515/jos-2015-0043</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sariyar</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Borg</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>The RecordLinkage package: detecting errors in data</article-title>
          <source>R J</source>
          <year>2010</year>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>61</fpage>
          <lpage>7</lpage>
          <pub-id pub-id-type="doi">10.32614/rj-2010-017</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wurz</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A partnership building on health research data from Malawi</article-title>
          <source>Esther Switzerland</source>
          <year>2021</year>
          <month>6</month>
          <day>1</day>
          <access-date>2022-08-02</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.esther-switzerland.ch/a-partnership-building-on-health-research-data-from-malawi/">https://www.esther-switzerland.ch/a-partnership-building-on-health-research-data-from-malawi/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
