<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Public Health Surveill</journal-id><journal-id journal-id-type="publisher-id">publichealth</journal-id><journal-id journal-id-type="index">9</journal-id><journal-title>JMIR Public Health and Surveillance</journal-title><abbrev-journal-title>JMIR Public Health Surveill</abbrev-journal-title><issn pub-type="epub">2369-2960</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v11i1e67050</article-id><article-id pub-id-type="doi">10.2196/67050</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Impact of Primary Health Care Data Quality on Infectious Disease Surveillance in Brazil: Case Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Florentino</surname><given-names>Pilar Tavares Veras</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bertoldo Junior</surname><given-names>Juracy</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Barbosa</surname><given-names>George Caique Gouveia</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Cerqueira-Silva</surname><given-names>Thiago</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Oliveira</surname><given-names>Vinicius de Ara&#x00FA;jo</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Garcia</surname><given-names>Marcio Henrique de Oliveira</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Penna</surname><given-names>Gerson Oliveira</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Boaventura</surname><given-names>Viviane</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ramos</surname><given-names>Pablo Ivan Pereira</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Barral-Netto</surname><given-names>Manoel</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Marcilio</surname><given-names>Izabel</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff8">8</xref></contrib></contrib-group><aff id="aff1"><institution>Centro de Integra&#x00E7;&#x00E3;o de Dados e Conhecimento em Sa&#x00FA;de (CIDACS), Instituto Gon&#x00E7;alo Moniz, Funda&#x00E7;&#x00E3;o Oswaldo Cruz</institution><addr-line>R. 
Mundo, 121 - sala 315 - Trobogy</addr-line><addr-line>Salvador</addr-line><country>Brazil</country></aff><aff id="aff2"><institution>Department of Medical Statistics, London School of Hygiene &#x0026; Tropical Medicine</institution><addr-line>London</addr-line><country>United Kingdom</country></aff><aff id="aff3"><institution>Secretaria de Aten&#x00E7;&#x00E3;o Prim&#x00E1;ria, Minist&#x00E9;rio da Sa&#x00FA;de</institution><addr-line>Bras&#x00ED;lia</addr-line><country>Brazil</country></aff><aff id="aff4"><institution>Secretaria de Vigil&#x00E2;ncia em Sa&#x00FA;de e Ambiente, Minist&#x00E9;rio da Sa&#x00FA;de</institution><addr-line>Bras&#x00ED;lia</addr-line><country>Brazil</country></aff><aff id="aff5"><institution>Escola Fiocruz de Governo, Funda&#x00E7;&#x00E3;o Oswaldo Cruz (Fiocruz)</institution><addr-line>Rio de Janeiro</addr-line><country>Brazil</country></aff><aff id="aff6"><institution>N&#x00FA;cleo de Medicina Tropical, Universidade de Bras&#x00ED;lia</institution><addr-line>Bras&#x00ED;lia</addr-line><country>Brazil</country></aff><aff id="aff7"><institution>Laborat&#x00F3;rio de Medicina e Sa&#x00FA;de P&#x00FA;blica de Precis&#x00E3;o, Instituto Gon&#x00E7;alo Moniz, Funda&#x00E7;&#x00E3;o Oswaldo Cruz</institution><addr-line>Salvador</addr-line><country>Brazil</country></aff><aff id="aff8"><institution>Departamento de Epidemiologia, Escola Bahiana de Medicina e Sa&#x00FA;de P&#x00FA;blica</institution><addr-line>Salvador</addr-line><country>Brazil</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Haridas</surname><given-names>Chinmay</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Olasunkanmi</surname><given-names>Oluwatayo</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Pilar Tavares Veras Florentino, PhD, Centro de Integra&#x00E7;&#x00E3;o de Dados e Conhecimento em Sa&#x00FA;de (CIDACS), Instituto Gon&#x00E7;alo Moniz, Funda&#x00E7;&#x00E3;o Oswaldo Cruz, R. Mundo, 121 - sala 315 - Trobogy, Salvador, 41745-715, Brazil, 55 7131762357; <email>pilar.veras@fiocruz.br</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>21</day><month>2</month><year>2025</year></pub-date><volume>11</volume><elocation-id>e67050</elocation-id><history><date date-type="received"><day>30</day><month>09</month><year>2024</year></date><date date-type="rev-recd"><day>19</day><month>12</month><year>2024</year></date><date date-type="accepted"><day>20</day><month>12</month><year>2024</year></date></history><copyright-statement>&#x00A9; Pilar Tavares Veras Florentino, Juracy Bertoldo Junior, George Caique Gouveia Barbosa, Thiago Cerqueira-Silva, Vinicius de Ara&#x00FA;jo Oliveira, Marcio Henrique de Oliveira Garcia, Gerson Oliveira Penna, Viviane Boaventura, Pablo Ivan Pereira Ramos, Manoel Barral-Netto, Izabel Marcilio. Originally published in JMIR Public Health and Surveillance (<ext-link ext-link-type="uri" xlink:href="https://publichealth.jmir.org">https://publichealth.jmir.org</ext-link>), 21.2.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Public Health and Surveillance, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://publichealth.jmir.org">https://publichealth.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://publichealth.jmir.org/2025/1/e67050"/><abstract><sec><title>Background</title><p>The increase in emerging and re-emerging infectious disease outbreaks underscores the need for robust early warning systems (EWSs) to guide mitigation and response measures. Administrative health care databases provide valuable epidemiological insights without imposing additional burdens on health services. However, these datasets are primarily collected for operational use, making data quality assessment essential to ensure an accurate interpretation of epidemiological analysis. 
This study focuses on the development and implementation of a data quality index (DQI) for surveillance integrated into an EWS for influenza-like illness (ILI) outbreaks using Brazil&#x2019;s nationwide Primary Health Care (PHC) dataset.</p></sec><sec><title>Objective</title><p>We aimed to evaluate the impact of data completeness and timeliness on the performance of an EWS for ILI outbreaks and establish optimal thresholds for a suitable DQI, thereby improving the accuracy of outbreak detection and supporting public health surveillance.</p></sec><sec sec-type="methods"><title>Methods</title><p>A composite DQI was established to measure the completeness and timeliness of PHC data from the Brazilian National Information System on Primary Health Care. Completeness was defined as the proportion of weeks within an 8-week rolling window with any register of encounters. Timeliness was calculated as the interval between the date of encounter and its corresponding registry in the information system. The backfilled PHC dataset served as the gold standard to evaluate the impact of varying data quality levels from the weekly updated real-time PHC dataset on the EWS for ILI outbreaks across 5570 Brazilian municipalities from October 10, 2023, to March 10, 2024.</p></sec><sec sec-type="results"><title>Results</title><p>During the study period, the backfilled dataset recorded 198,335,762 ILI-related encounters, averaging 8,623,294 encounters per week. The EWS detected a median of 4 (IQR 2&#x2010;5) ILI outbreak warnings per municipality using the backfilled dataset. Using the real-time dataset, 12,538 (65%) warnings were concordant with the backfilled dataset. Our analysis revealed that 100% completeness yielded 76.7% concordant warnings, while 80% timeliness resulted in at least 50% concordant warnings. These thresholds were considered optimal for a suitable DQI. Restricting the analysis to municipalities with a suitable DQI increased concordant warnings to 80.4%. 
A median of 71% (IQR 54%-71.9%) of municipalities met the suitable DQI threshold weekly. Municipalities with &#x2265;60% of weeks achieving a suitable DQI demonstrated the highest concordance between backfilled and real-time datasets, with those achieving &#x2265;80% of weeks showing 82.3% concordance.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Our findings highlight the critical role of data quality in improving the EWS&#x2019; performance based on PHC data for detecting ILI outbreaks. The proposed framework for real-time DQI monitoring is a practical approach and can be adapted to other surveillance systems, providing insights for similar implementations. We demonstrate that optimal completeness and timeliness of data significantly impact the EWS&#x2019; ability to detect ILI outbreaks. Continuous monitoring and improvement of data quality should remain a priority to strengthen the reliability and effectiveness of surveillance systems.</p></sec></abstract><kwd-group><kwd>primary health care</kwd><kwd>data quality</kwd><kwd>infectious disease surveillance</kwd><kwd>Brazil</kwd><kwd>early warning system</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>In recent decades, the world has witnessed an unprecedented surge of emerging and re-emerging infectious disease outbreaks, underscoring the need for stronger early warning systems (EWSs) [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. The widespread and growing use of electronic health records (EHRs) has heightened the demand for automated processes in disease surveillance [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>Systematic monitoring of administrative health care databases provides valuable epidemiological insights [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. 
Importantly, using administrative data for health surveillance avoids the overburden of surveillance teams while ensuring timeliness, as no duplication of registry is required [<xref ref-type="bibr" rid="ref3">3</xref>]. This cost-efficient approach enhances the ability to detect outbreaks, particularly in low-resource settings, thus contributing to global security [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>However, an effective automated EWS based on administrative datasets requires that a real-time data quality assessment algorithm is set within the EWS pipeline. Since administrative data are primarily collected for operational purposes, assessing their quality is crucial to accurate interpretation of epidemiological analysis [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. A systematic review on EHR data quality assessment studies found 14 articles describing dedicated data quality programs deployed in real-world settings, while only 4 produced results generally applicable in diverse settings. Ozonze et al [<xref ref-type="bibr" rid="ref9">9</xref>] suggest there is an absence of comprehensive tools for facilitating reliable and consistent data quality assessments.</p><p>Moreover, despite existing methods for evaluating the quality of administrative health data, including EHR data quality assessment [<xref ref-type="bibr" rid="ref8">8</xref>] and indicators for specific programs such as the Data Quality Audit and the Data Quality Self-Assessment for immunization data [<xref ref-type="bibr" rid="ref10">10</xref>], there remains a gap in applying similar methods to data used for health surveillance. 
Although metrics for assessing the quality of surveillance systems are well established [<xref ref-type="bibr" rid="ref3">3</xref>], to the best of our knowledge, these have not been applied to evaluate administrative data when used for epidemiological surveillance purposes.</p><p>This paper describes the development and implementation of a data quality index (DQI) to assess the quality of administrative data used in epidemiological surveillance systems. We focus on applying the DQI to nationwide Brazilian primary health care (PHC) administrative data integrated into an EWS for influenza-like illness (ILI) outbreaks. The study compares the EWS performance across different DQI levels, addressing a critical gap in current research by establishing metrics that ensure accurate and timely outbreak detection while leveraging the cost-efficiency of administrative health databases.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>We developed and implemented a data quality assessment algorithm within &#x00C6;SOP (Alert-Early System of Outbreaks with Pandemic Potential), a previously validated EWS [<xref ref-type="bibr" rid="ref11">11</xref>]. This EWS applies aberration detection algorithms, such as the Early Aberration Reporting System (C2) [<xref ref-type="bibr" rid="ref12">12</xref>], to a time series consisting of weekly counts of ILI-related PHC encounters per municipality, aiming at the early detection of outbreaks. To assess the data quality of the PHC data stream, we established the composite indicator DQI to measure the completeness and timeliness of the data. 
Using the backfilled PHC dataset as a gold standard, we evaluated the impact of data quality in the EWS&#x2019; performance using different levels of data quality of the weekly updated real-time PHC dataset across all 5570 Brazilian municipalities from October 10, 2023, to March 10, 2024.</p></sec><sec id="s2-2"><title>Data Source</title><p>Brazil is an upper middle-income country with approximately 212.6 million people living in 5570 municipalities [<xref ref-type="bibr" rid="ref13">13</xref>], and we included all ILI-related PHC encounters occurring during the study period in our analysis. We analyzed data from the Brazilian Unified Health System (SUS), which stands as one of the largest public health systems globally, providing comprehensive and universal health care to the entire population. The effective management of SUS relies on diverse information systems, among which the Brazilian National Information System on Primary Health Care (SISAB [Sistema de Informa&#x00E7;&#x00E3;o em Sa&#x00FA;de para a Aten&#x00E7;&#x00E3;o B&#x00E1;sica]) plays a crucial role. SISAB is a hierarchical, decentralized information system maintained and managed by the Ministry of Health (MoH), and harbors data on all publicly funded PHC encounters in the country. Data registration is mandatory for the allocation of financial resources from the federal to the municipal level. All encounters are coded by the <italic>ICD-10</italic> (<italic>International Statistical Classification of Diseases, Tenth Revision</italic>) or the International Classification of Primary Care (ICPC-2).</p><p>According to the MoH&#x2019;s guidelines, municipalities are requested to update the system at least on a monthly basis, with a window of 4 months for amendments following each monthly submission. This operational guideline aligns with the SISAB&#x2019;s purpose of informing decision-making for the management of the PHC system in the country. 
However, the EWS uses weekly updates of the SISAB database to detect ILI outbreaks. Therefore, this real-time, weekly updated dataset may present incompleteness and a temporal lag between the dates of encounter and data registration into the system (Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s2-3"><title>The DQI</title><p>We defined the dimensions of completeness and timeliness to develop quantitative indicators for monitoring data quality in the EWS. Completeness is one of the most commonly used dimensions in data quality assessment and may be defined as the proportion of data filled with values for each attribute or entity in the database, while timeliness can be defined as the availability of data for decision-making, measured by the time interval between the occurrence of the measured event and its capture in an information system [<xref ref-type="bibr" rid="ref14">14</xref>].</p><p>In our study, completeness refers to the proportion of weeks in each 8-week rolling window with any register of a PHC encounter. The indicator is measured as a fraction, with the numerator ranging from 0 to 8, and the denominator is 8 weeks, which is expressed as a percentage. Timeliness refers to the time interval, in number of weeks, between the date of the PHC encounter and its registry in the database. The indicator is represented by the proportion of registries occurring in 2 weeks or less from the PHC encounter in the same 8-week rolling window.</p><p>As it is recommended that the diverse quality dimensions should be collectively analyzed for a more comprehensive evaluation of data quality [<xref ref-type="bibr" rid="ref15">15</xref>], we combined the 2 selected indicators in a composite measure, named DQI. 
The DQI is assessed weekly, for each municipality, once the PHC data are updated into the EWS pipeline.</p></sec><sec id="s2-4"><title>Impact of DQI on the EWS&#x2019; Performance</title><p>To decide on the minimum required threshold of completeness and timeliness to derive trustworthy results with the EWS, we applied the EWS algorithm to the retrospectively gathered, backfilled PHC dataset. We compared the results to those obtained when applying the EWS to the weekly updated, herein named real-time PHC dataset (Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Using the backfilled dataset as a reference, we calculated the proportion of concordant warnings detected in the real-time dataset. Accordingly, the DQI is expressed as &#x201C;suitable&#x201D; when the minimum thresholds of both completeness and timeliness are reached and as &#x201C;unsuitable&#x201D; otherwise, the latter indicating that the data quality may not be adequate for reliable EWS outputs.</p><p>Analyses were performed using Python (version 3.9) and R (version 4.3.1) software. The database&#x2019;s description and the scripts are available on GitHub [<xref ref-type="bibr" rid="ref16">16</xref>].</p></sec><sec id="s2-5"><title>Ethical Considerations</title><p>The study protocol and procedures were reviewed and approved by the Ethical Review Board of Oswaldo Cruz Foundation &#x2013; Fiocruz Bahia (protocol CAAE 61444122.0.0000.0040).</p><p>Data on publicly funded PHC encounters were collected and compiled by the MoH for funding reasons. No consent was needed for data collection at this administrative level. For this study, we accessed an aggregated database consisting of the number of encounters per epidemiological week, per municipality, and per diagnostic code. 
The accessed database has no information at the individual level, and given that this study involves secondary analysis of existing deidentified data and does not involve direct interaction with human participants, it is classified as exempt from the requirement for informed consent under applicable ethical guidelines.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>There were 198,335,762 recorded ILI-related encounters in the backfilled PHC dataset, which corresponds to an average of 8,623,294 encounters per week between October 10, 2023, and March 10, 2024. Using the backfilled dataset, the EWS detected a median of 4 (IQR 2&#x2010;5) warnings of ILI outbreaks per municipality in the study period.</p><p><xref ref-type="fig" rid="figure1">Figure 1</xref> illustrates the impact of the DQI on the ability of the EWS to correctly identify potential ILI outbreaks. Using the real-time dataset, the EWS detected 12,538 (65%) warnings of ILI-outbreaks that were concordant with warnings detected in the backfilled dataset (Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The proportion of concordant warnings detected in the real-time dataset, based on different levels of completeness (<xref ref-type="fig" rid="figure2">Figure 2A</xref> and Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) and timeliness (<xref ref-type="fig" rid="figure2">Figure 2B</xref> and Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), indicated that 100% completeness and a minimum of 80% timeliness yielded the highest percentage of concordant warnings. Therefore, these values were established as the thresholds for grading the DQI as suitable or unsuitable for the EWS. 
Restricting the EWS analysis to municipalities with a suitable DQI, the proportion of warnings for ILI outbreaks concordant to the backfilled dataset increased to 80.4% (Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). We found a median of 71% (IQR 54%&#x2010;71.9%) of Brazilian municipalities with a suitable DQI per week in the study period (<xref ref-type="fig" rid="figure3">Figure 3A</xref> and Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><p>Additionally, we analyzed concordant warnings by grouping municipalities based on the proportion of weeks in which they exhibited a suitable DQI (&#x2264;20%, 20%&#x2010;40%, 40%&#x2010;60%, 60%&#x2010;80%, and &#x2265;80%). Our findings revealed that municipalities with over 60% of weeks featuring a suitable DQI had the highest proportion of concordant warnings between the backfilled and real-time datasets (<xref ref-type="fig" rid="figure3">Figure 3B</xref>, Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Primary Health Care encounters due to influenza-like illness per week in (A) Sao Paulo (municipality with a suitable data quality index in less than 60% of the 23 weeks in the study period) and (B) Vitoria da Conquista (municipality with a suitable data quality index for over 80% of the study period). Plots show backfilled (gray line) and real-time (green line) PHC datasets for influenza-like illness (ILI) encounters. Vertical dashed lines show all detected warnings with the Early Aberration Reporting System (EARS), red triangles show nonconcordant warnings between backfilled and real-time datasets, and blue circles show concordant warnings between them. 
ILI: influenza-like illness; PHC: Primary Health Care.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="publichealth_v11i1e67050_fig01.png"/></fig><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Proportion of concordant outbreak warnings detected in backfilled and real-time Brazilian Primary Health Care datasets. Outbreak warnings generated by the Early Aberration Reporting System (EARS) were identified in both backfilled and real-time datasets. Concordant warnings, detected in both datasets within the same week, are represented in dark blue, while nonconcordant warnings, identified only in the backfilled dataset, are shown in light blue. The analysis considers the proportion of concordant and nonconcordant warnings based on real-time dataset. (<bold>A</bold>) Completeness: the percentage of records from the real-time dataset in each 8-week rolling window (ranging from 0% to 100%) and (<bold>B</bold>) timeliness: the proportion of records registered within 2 weeks or less of the PHC encounter, measured within the same 8-week rolling window (ranging from 0% to 100%).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="publichealth_v11i1e67050_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Proportion of Brazilian municipalities with suitable data quality index (DQI) and concordant warnings over time. (<bold>A</bold>) Weekly analysis of the proportion of municipalities with a suitable DQI from epidemiological week 42 of 2023 to week 12 of 2024. (<bold>B</bold>) Proportion of concordant warnings (dark blue), identified in both datasets within the same week, and nonconcordant warnings (light blue), detected only in the backfilled dataset. 
The analysis is based on the proportion of weeks with a suitable DQI in the real-time dataset from Brazilian municipalities.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="publichealth_v11i1e67050_fig03.png"/></fig></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Our study highlights the critical role of data quality in the performance of the EWS for infectious disease surveillance using PHC data. In addition, we provide a practical approach for monitoring data quality in real time, which can be adapted to other settings and data types. Our findings revealed that municipalities with over 60% of weeks featuring a suitable DQI had the highest proportion of concordant warnings between the backfilled and real-time datasets. Introducing the DQI as an algorithm integrated into the EWS can guide data management practices and inform decision-making processes.</p><p>Similar to our findings, a recent systematic review of the effectiveness of EWS found that the improvement of data is pivotal for emergency department&#x2013;based surveillance [<xref ref-type="bibr" rid="ref17">17</xref>]. However, efforts for automatization of data quality assessment are typically scattered [<xref ref-type="bibr" rid="ref9">9</xref>], and the literature on the operationalization of data quality assessment remains scarce. A study on data quality assessment for public health information systems found a lack of systematic procedures for quality assessment. While quality assessment of quantitative data generally used descriptive surveys, the authors argued about the importance of systematic scientific data quality assessment [<xref ref-type="bibr" rid="ref18">18</xref>]. 
To the best of our knowledge, this is the first publication to assess the importance of integrating data quality monitoring into an EWS.</p><p>Fulcher et al [<xref ref-type="bibr" rid="ref19">19</xref>] demonstrated how administrative health data were successfully used to implement a syndromic surveillance system during the COVID-19 pandemic. However, the process of cleaning data and handling missing data was carried out by a dedicated analyst once the updated database became available [<xref ref-type="bibr" rid="ref19">19</xref>]. We anticipate that the framework for a data quality assessment integrated into the EWS pipeline presented here can be adapted to other surveillance systems and can provide insights for similar implementations.</p><p>Using a retrospectively gathered, backfilled PHC dataset, we evaluated the EWS based on optimal data quality conditions. However, administrative data usually exhibit incompleteness and delays, and the EWS should be capable of detecting outbreaks using the available dataset in real time. Our analysis revealed that high levels of completeness (100%) and timeliness (at least 80%) are necessary to achieve the highest proportion of concordant warnings between backfilled and real-time datasets. Additionally, our results indicate that even incremental data quality improvements substantially enhance the EWS&#x2019; performance. Achieving such high standards may pose challenges, particularly in low-resource settings that potentially face limitations due to infrastructure such as unreliable internet connectivity and insufficient computing power. Despite these challenges, we found a weekly median of 71% of Brazilian municipalities achieving the threshold for a suitable DQI for the EWS. 
This result suggests that a significant proportion of municipalities met the minimum threshold for data quality even in constrained settings.</p><p>In this study, we used the SUS database, which covers approximately 75% of the Brazilian population, with great granularity, reaching underserved rural and remote regions [<xref ref-type="bibr" rid="ref20">20</xref>]. This approach allowed us to assess the performance of the EWS across different regions and health service contexts. However, these findings may not be directly applicable to other countries. It is likely that the use of the EWS in different health system structures and data management practices will need adjustments and may require distinct data quality requirements [<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>Another limitation of this study is that we could not assess other dimensions of data quality. Specifically, we could not assess the accuracy of registers in the PHC dataset. Accuracy represents the extent to which the data are free of error and reliable [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. We worked with aggregated, secondary data, and did not have access to the complete EHRs, which precluded us from verifying whether the diagnostic codes in the database accurately reflected patients&#x2019; main clinical problems. It is our perspective that evaluating the accuracy of the <italic>ICD-10</italic> and ICPC-2 is of great importance. However, given the large numbers of PHC encounters registered weekly, misclassifications of the reason for encounter are likely to be nondifferential. 
Additionally, syndromic surveillance systems are designed to operate effectively even with some level of imprecision, as their primary purpose is to detect patterns and trends rather than to provide definitive diagnoses.</p></sec><sec id="s4-2"><title>Conclusion</title><p>Our findings demonstrate that implementing a robust and integrated DQI analysis can significantly enhance the EWS&#x2019; ability to detect ILI outbreaks, contributing to better public health outcomes and ultimately to global health security. Beyond contributing to the existing literature on EWS, this study highlights the importance of systematic data quality assessment. Continuous monitoring and improvement of data quality should be prioritized to ensure the reliability and effectiveness of surveillance systems. Additionally, our study suggests that similar frameworks can be adapted to different contexts. As health systems increasingly use digital health data for decision-making, our approach represents a model for integrating data quality monitoring into surveillance systems, ultimately enhancing the capacity to detect and respond to infectious disease outbreaks effectively.</p></sec></sec></body><back><ack><p>This study was funded by the Rockefeller Foundation&#x2019;s Health Initiative (grant 2023-PPI-007 awarded to MBN). MBN, PIPR, and VB are research fellows from the National Council for Scientific and Technological Development (CNPq, Brazil). TCS acknowledges funding from the Royal Society (NIF\R1\231435). The funders did not interfere in the analysis, interpretation, or decision to submit the manuscript for publication. This study is part of the Alert-Early System of Outbreaks with Pandemic Potential (&#x00C6;SOP), an initiative under development by Brazil&#x2019;s Funda&#x00E7;&#x00E3;o Oswaldo Cruz (Fiocruz) and the Federal University of Rio de Janeiro, and financially supported by the Rockefeller Foundation&#x2019;s Health Initiative. 
Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> was created with Biorender.com and was published with permission. Generative artificial intelligence was not used for ideation or any part of the study design. Additionally, it was not used for reference searches. Its use was limited to grammatical revisions in the manuscript, with the prompt, &#x201C;Please check for readability and possible grammatical mistakes&#x201D; (a transcript of the conversation with the chatbot is provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p></ack><notes><sec><title>Data Availability</title><p>Our agreement with the Brazilian Ministry of Health (MoH) for accessing the referenced databases patently denies authorization of access to any third parties. All requests to access these databases must be addressed to the Brazilian MoH.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: PTVF, IM, and MBN</p><p>Data acquisition: VdAO</p><p>Data curation and processing: VdAO, JBJ, and GCGB</p><p>Formal analysis: PTVF</p><p>Script verification: JBJ, GCGB, and TCS</p><p>Study design: PTVF, JBJ, GCGB, TCS, VdAO, MHdOG, GOP, VB, PIPR, MBN, and IM</p><p>Writing&#x2014;original draft: PTVF and IM</p><p>Writing&#x2014;review and editing: GOP, TCS, VB, MHdOG, VdAO, and PIPR</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">DQI</term><def><p>data quality index</p></def></def-item><def-item><term id="abb2">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb3">EWS</term><def><p>early warning system</p></def></def-item><def-item><term id="abb4"><italic>ICD-10</italic></term><def><p><italic>International Statistical Classification of Diseases, Tenth Revision</italic></p></def></def-item><def-item><term id="abb5">ICPC-2</term><def><p>International Classification of Primary 
Care</p></def></def-item><def-item><term id="abb6">ILI</term><def><p>influenza-like illness</p></def></def-item><def-item><term id="abb7">MoH</term><def><p>Ministry of Health</p></def></def-item><def-item><term id="abb8">PHC</term><def><p>Primary Health Care</p></def></def-item><def-item><term id="abb9">SISAB</term><def><p>Sistema de Informa&#x00E7;&#x00E3;o em Sa&#x00FA;de para a Aten&#x00E7;&#x00E3;o B&#x00E1;sica</p></def></def-item><def-item><term id="abb10">SUS</term><def><p>Brazilian Unified Health System</p></def></def-item><def-item><term id="abb11">&#x00C6;SOP</term><def><p>Alert-Early System of Outbreaks with Pandemic Potential</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Baker</surname><given-names>RE</given-names> </name><name name-style="western"><surname>Mahmud</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>IF</given-names> </name><etal/></person-group><article-title>Infectious disease in an era of global change</article-title><source>Nat Rev Microbiol</source><year>2022</year><month>04</month><volume>20</volume><issue>4</issue><fpage>193</fpage><lpage>205</lpage><pub-id pub-id-type="doi">10.1038/s41579-021-00639-z</pub-id><pub-id pub-id-type="medline">34646006</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Morgan</surname><given-names>OW</given-names> </name><name name-style="western"><surname>Aguilera</surname><given-names>X</given-names> </name><name name-style="western"><surname>Ammon</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Disease surveillance for the COVID-19 era: time for bold changes</article-title><source>The 
Lancet</source><year>2021</year><month>06</month><day>19</day><volume>397</volume><issue>10292</issue><fpage>2317</fpage><lpage>2319</lpage><pub-id pub-id-type="doi">10.1016/S0140-6736(21)01096-5</pub-id><pub-id pub-id-type="medline">34000258</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="web"><article-title>Data quality monitoring and surveillance system evaluation - a handbook of methods and applications</article-title><source>European Centre for Disease Prevention and Control</source><year>2014</year><access-date>2025-02-06</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.ecdc.europa.eu/sites/default/files/media/en/publications/Publications/Data-quality-monitoring-surveillance-system-evaluation-Sept-2014.pdf">https://www.ecdc.europa.eu/sites/default/files/media/en/publications/Publications/Data-quality-monitoring-surveillance-system-evaluation-Sept-2014.pdf</ext-link></comment></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ramos</surname><given-names>PIP</given-names> </name><name name-style="western"><surname>Marcilio</surname><given-names>I</given-names> </name><name name-style="western"><surname>Bento</surname><given-names>AI</given-names> </name><etal/></person-group><article-title>Combining digital and molecular approaches using health and alternate data sources in a next-generation surveillance system for anticipating outbreaks of pandemic potential</article-title><source>JMIR Public Health Surveill</source><year>2024</year><month>01</month><day>9</day><volume>10</volume><fpage>e47673</fpage><pub-id pub-id-type="doi">10.2196/47673</pub-id><pub-id pub-id-type="medline">38194263</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Ulrich</surname><given-names>EH</given-names> </name><name name-style="western"><surname>So</surname><given-names>G</given-names> </name><name name-style="western"><surname>Zappitelli</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chanchlani</surname><given-names>R</given-names> </name></person-group><article-title>A review on the application and limitations of administrative health care data for the study of acute kidney injury epidemiology and outcomes in children</article-title><source>Front Pediatr</source><year>2021</year><volume>9</volume><fpage>742888</fpage><pub-id pub-id-type="doi">10.3389/fped.2021.742888</pub-id><pub-id pub-id-type="medline">34778133</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tassi</surname><given-names>MF</given-names> </name><name name-style="western"><surname>le Meur</surname><given-names>N</given-names> </name><name name-style="western"><surname>St&#x00E9;fic</surname><given-names>K</given-names> </name><name name-style="western"><surname>Grammatico-Guillon</surname><given-names>L</given-names> </name></person-group><article-title>Performance of French medico-administrative databases in epidemiology of infectious diseases: a scoping review</article-title><source>Front Public Health</source><year>2023</year><volume>11</volume><fpage>1161550</fpage><pub-id pub-id-type="doi">10.3389/fpubh.2023.1161550</pub-id><pub-id pub-id-type="medline">37250067</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shaw</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Harron</surname><given-names>KL</given-names> </name><name name-style="western"><surname>Pescarini</surname><given-names>JM</given-names> 
</name><etal/></person-group><article-title>Biases arising from linked administrative data for epidemiological research: a conceptual framework from registration to analyses</article-title><source>Eur J Epidemiol</source><year>2022</year><month>12</month><volume>37</volume><issue>12</issue><fpage>1215</fpage><lpage>1224</lpage><pub-id pub-id-type="doi">10.1007/s10654-022-00934-w</pub-id><pub-id pub-id-type="medline">36333542</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lewis</surname><given-names>AE</given-names> </name><name name-style="western"><surname>Weiskopf</surname><given-names>N</given-names> </name><name name-style="western"><surname>Abrams</surname><given-names>ZB</given-names> </name><etal/></person-group><article-title>Electronic health record data quality assessment and tools: a systematic review</article-title><source>J Am Med Inform Assoc</source><year>2023</year><month>09</month><day>25</day><volume>30</volume><issue>10</issue><fpage>1730</fpage><lpage>1740</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad120</pub-id><pub-id pub-id-type="medline">37390812</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ozonze</surname><given-names>O</given-names> </name><name name-style="western"><surname>Scott</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Hopgood</surname><given-names>AA</given-names> </name></person-group><article-title>Automating electronic health record data quality assessment</article-title><source>J Med Syst</source><year>2023</year><month>02</month><day>13</day><volume>47</volume><issue>1</issue><fpage>23</fpage><pub-id pub-id-type="doi">10.1007/s10916-022-01892-2</pub-id><pub-id pub-id-type="medline">36781551</pub-id></nlm-citation></ref><ref 
id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bloland</surname><given-names>P</given-names> </name><name name-style="western"><surname>MacNeil</surname><given-names>A</given-names> </name></person-group><article-title>Defining &#x0026; assessing the quality, usability, and utilization of immunization data</article-title><source>BMC Public Health</source><year>2019</year><month>04</month><day>4</day><volume>19</volume><issue>1</issue><fpage>380</fpage><pub-id pub-id-type="doi">10.1186/s12889-019-6709-1</pub-id><pub-id pub-id-type="medline">30947703</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cerqueira-Silva</surname><given-names>T</given-names> </name><name name-style="western"><surname>Oliveira</surname><given-names>JF</given-names> </name><name name-style="western"><surname>de Ara&#x00FA;jo Oliveira</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Early warning system using primary health care data in the post-COVID-19 pandemic era: Brazil nationwide case-study</article-title><source>Cad Saude Publica</source><year>2024</year><volume>40</volume><issue>11</issue><fpage>e00010024</fpage><pub-id pub-id-type="doi">10.1590/0102-311XEN010024</pub-id><pub-id pub-id-type="medline">39775767</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Atrubin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Y</given-names> </name></person-group><article-title>Initial evaluation 
of the early aberration reporting system--Florida</article-title><source>MMWR Suppl</source><year>2005</year><month>08</month><day>26</day><volume>54</volume><fpage>123</fpage><lpage>130</lpage><pub-id pub-id-type="medline">16177703</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="web"><article-title>2022 population census: main results</article-title><source>Brazilian Institute of Geography and Statistics</source><year>2022</year><access-date>2024-12-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.ibge.gov.br/en/statistics/social/health/22836-2022-census-3.html">https://www.ibge.gov.br/en/statistics/social/health/22836-2022-census-3.html</ext-link></comment></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fox</surname><given-names>C</given-names> </name><name name-style="western"><surname>Levitin</surname><given-names>A</given-names> </name><name name-style="western"><surname>Redman</surname><given-names>T</given-names> </name></person-group><article-title>The notion of data and its quality dimensions</article-title><source>Inf Process Manag</source><year>1994</year><month>01</month><volume>30</volume><issue>1</issue><fpage>9</fpage><lpage>19</lpage><pub-id pub-id-type="doi">10.1016/0306-4573(94)90020-5</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hassenstein</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Vanella</surname><given-names>P</given-names> </name></person-group><article-title>Data quality&#x2014;concepts and problems</article-title><source>Encyclopedia</source><year>2022</year><month>02</month><volume>2</volume><issue>1</issue><fpage>498</fpage><lpage>510</lpage><pub-id 
pub-id-type="doi">10.3390/encyclopedia2010032</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><article-title>AESOP-data-documentation</article-title><source>GitHub</source><access-date>2025-02-06</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/cidacslab/AESOP-Data-Documentation/tree/main/DataPipeline">https://github.com/cidacslab/AESOP-Data-Documentation/tree/main/DataPipeline</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Meckawy</surname><given-names>R</given-names> </name><name name-style="western"><surname>Stuckler</surname><given-names>D</given-names> </name><name name-style="western"><surname>Mehta</surname><given-names>A</given-names> </name><name name-style="western"><surname>Al-Ahdal</surname><given-names>T</given-names> </name><name name-style="western"><surname>Doebbeling</surname><given-names>BN</given-names> </name></person-group><article-title>Effectiveness of early warning systems in the detection of infectious diseases outbreaks: a systematic review</article-title><source>BMC Public Health</source><year>2022</year><month>11</month><day>29</day><volume>22</volume><issue>1</issue><fpage>2216</fpage><pub-id pub-id-type="doi">10.1186/s12889-022-14625-4</pub-id><pub-id pub-id-type="medline">36447171</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Hailey</surname><given-names>D</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>N</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>P</given-names> </name></person-group><article-title>A review of data quality 
assessment methods for public health information systems</article-title><source>Int J Environ Res Public Health</source><year>2014</year><month>05</month><day>14</day><volume>11</volume><issue>5</issue><fpage>5170</fpage><lpage>5207</lpage><pub-id pub-id-type="doi">10.3390/ijerph110505170</pub-id><pub-id pub-id-type="medline">24830450</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fulcher</surname><given-names>IR</given-names> </name><name name-style="western"><surname>Boley</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Gopaluni</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Syndromic surveillance using monthly aggregate health systems information data: methods with application to COVID-19 in Liberia</article-title><source>Int J Epidemiol</source><year>2021</year><month>08</month><day>30</day><volume>50</volume><issue>4</issue><fpage>1091</fpage><lpage>1102</lpage><pub-id pub-id-type="doi">10.1093/ije/dyab094</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Macinko</surname><given-names>J</given-names> </name><name name-style="western"><surname>Harris</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Rocha</surname><given-names>MG</given-names> </name></person-group><article-title>Brazil&#x2019;s National Program for Improving Primary Care Access and Quality (PMAQ): fulfilling the potential of the world&#x2019;s largest payment for performance system in primary care</article-title><source>J Ambul Care Manage</source><year>2017</year><volume>40</volume><issue>2 Suppl</issue><fpage>S4</fpage><lpage>S11</lpage><pub-id pub-id-type="doi">10.1097/JAC.0000000000000189</pub-id><pub-id 
pub-id-type="medline">28252498</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Additional material.</p><media xlink:href="publichealth_v11i1e67050_app1.docx" xlink:title="DOCX File, 141 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Conversation with the chatbot for grammatical revision.</p><media xlink:href="publichealth_v11i1e67050_app2.pdf" xlink:title="PDF File, 562 KB"/></supplementary-material></app-group></back></article>