<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<?covid-19-tdm?>
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JPH</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Public Health Surveill</journal-id>
      <journal-title>JMIR Public Health and Surveillance</journal-title>
      <issn pub-type="epub">2369-2960</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v8i6e35266</article-id>
      <article-id pub-id-type="pmid">35507921</article-id>
      <article-id pub-id-type="doi">10.2196/35266</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Enhancing COVID-19 Epidemic Forecasting Accuracy by Combining Real-time and Historical Data From Multiple Internet-Based Sources: Analysis of Social Media Data, Online News Articles, and Search Queries</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Sanchez</surname>
            <given-names>Travis</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Wen</surname>
            <given-names>Conghua</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Luo</surname>
            <given-names>Chen</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Jingwei</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6129-2751</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Huang</surname>
            <given-names>Wei</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <address>
            <institution>National Center for Applied Mathematics Shenzhen</institution>
            <addr-line>No. 1088, Xueyuan Avenue</addr-line>
            <addr-line>Nanshan District</addr-line>
            <addr-line>Shenzhen, 518055</addr-line>
            <country>China</country>
            <phone>86 15129077179</phone>
            <email>waynehuangwei@163.com</email>
          </address>
          <xref rid="aff4" ref-type="aff">4</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7150-0844</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Sia</surname>
            <given-names>Choon Ling</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9778-9196</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>Zhuo</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff6" ref-type="aff">6</xref>
          <xref rid="aff7" ref-type="aff">7</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5351-3489</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Wu</surname>
            <given-names>Tailai</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff8" ref-type="aff">8</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2025-3123</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Qingnan</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4373-8000</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>School of Management</institution>
        <institution>Xi’an Jiaotong University</institution>
        <addr-line>Xi'an</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Information Systems</institution>
        <institution>City University of Hong Kong</institution>
        <addr-line>Hong Kong</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>National Center for Applied Mathematics Shenzhen</institution>
        <addr-line>Shenzhen</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>College of Business</institution>
        <institution>Southern University of Science and Technology</institution>
        <addr-line>Shenzhen</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Information Systems and Intelligent Business</institution>
        <institution>School of Management</institution>
        <institution>Xi’an Jiaotong University</institution>
        <addr-line>Xi'an</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>College of Public Health</institution>
        <institution>University of Georgia</institution>
        <addr-line>Athens, GA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff7">
        <label>7</label>
        <institution>School of Economics</institution>
        <institution>University of Nottingham Ningbo China</institution>
        <addr-line>Ningbo</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff8">
        <label>8</label>
        <institution>School of Medicine and Health Management</institution>
        <institution>Tongji Medical College</institution>
        <institution>Huazhong University of Science and Technology</institution>
        <addr-line>Wuhan</addr-line>
        <country>China</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Wei Huang <email>waynehuangwei@163.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>6</month>
        <year>2022</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>16</day>
        <month>6</month>
        <year>2022</year>
      </pub-date>
      <volume>8</volume>
      <issue>6</issue>
      <elocation-id>e35266</elocation-id>
      <history>
        <date date-type="received">
          <day>29</day>
          <month>11</month>
          <year>2021</year>
        </date>
        <date date-type="rev-request">
          <day>22</day>
          <month>1</month>
          <year>2022</year>
        </date>
        <date date-type="rev-recd">
          <day>12</day>
          <month>2</month>
          <year>2022</year>
        </date>
        <date date-type="accepted">
          <day>3</day>
          <month>5</month>
          <year>2022</year>
        </date>
      </history>
      <copyright-statement>©Jingwei Li, Wei Huang, Choon Ling Sia, Zhuo Chen, Tailai Wu, Qingnan Wang. Originally published in JMIR Public Health and Surveillance (https://publichealth.jmir.org), 16.06.2022.</copyright-statement>
      <copyright-year>2022</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Public Health and Surveillance, is properly cited. The complete bibliographic information, a link to the original publication on https://publichealth.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://publichealth.jmir.org/2022/6/e35266" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The SARS-CoV-2 virus and its variants pose extraordinary challenges for public health worldwide. Timely and accurate forecasting of the COVID-19 epidemic is key to sustaining interventions and policies and efficient resource allocation. Internet-based data sources have shown great potential to supplement traditional infectious disease surveillance, and the combination of different Internet-based data sources has shown greater power to enhance epidemic forecasting accuracy than using a single Internet-based data source. However, existing methods incorporating multiple Internet-based data sources only used real-time data from these sources as exogenous inputs but did not take all the historical data into account. Moreover, the predictive power of different Internet-based data sources in providing early warning for COVID-19 outbreaks has not been fully explored.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>The main aim of our study is to explore whether combining real-time and historical data from multiple Internet-based sources could improve the COVID-19 forecasting accuracy over the existing baseline models. A secondary aim is to explore the COVID-19 forecasting timeliness based on different Internet-based data sources.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We first used core terms and symptom-related keyword-based methods to extract COVID-19–related Internet-based data from December 21, 2019, to February 29, 2020. The Internet-based data we explored included 90,493,912 online news articles, 37,401,900 microblogs, and all the Baidu search query data during that period. We then proposed an autoregressive model with exogenous inputs, incorporating real-time and historical data from multiple Internet-based sources. Our proposed model was compared with baseline models, and all the models were tested during the first wave of COVID-19 epidemics in Hubei province and the rest of mainland China separately. We also used lagged Pearson correlations for COVID-19 forecasting timeliness analysis.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Our proposed model achieved the highest accuracy in all 5 accuracy measures, compared with all the baseline models of both Hubei province and the rest of mainland China. In mainland China, except for Hubei, the COVID-19 epidemic forecasting accuracy differences between our proposed model (model i) and all the other baseline models were statistically significant (model 1, t<sub>198</sub>=–8.722, <italic>P</italic>&#60;.001; model 2, t<sub>198</sub>=–5.000, <italic>P</italic>&#60;.001; model 3, t<sub>198</sub>=–1.882, <italic>P</italic>=.06; model 4, t<sub>198</sub>=–4.644, <italic>P</italic>&#60;.001; model 5, t<sub>198</sub>=–4.488, <italic>P</italic>&#60;.001). In Hubei province, our proposed model's forecasting accuracy improved significantly compared with the baseline model using historical new confirmed COVID-19 case counts only (model 1, t<sub>198</sub>=–1.732, <italic>P</italic>=.09). Our results also showed that Internet-based sources could provide a 2- to 6-day earlier warning for COVID-19 outbreaks.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Our approach incorporating real-time and historical data from multiple Internet-based sources could improve forecasting accuracy for epidemics of COVID-19 and its variants, which may help improve public health agencies' interventions and resource allocation in mitigating and controlling new waves of COVID-19 or other relevant epidemics.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>SARS-CoV-2</kwd>
        <kwd>COVID-19</kwd>
        <kwd>epidemic forecasting</kwd>
        <kwd>disease surveillance</kwd>
        <kwd>infectious disease epidemiology</kwd>
        <kwd>social media</kwd>
        <kwd>online news</kwd>
        <kwd>search query</kwd>
        <kwd>autoregression model</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>COVID-19 poses extraordinary challenges for public health systems worldwide. As of November 26, 2021, COVID-19 had affected 222 countries and territories [<xref ref-type="bibr" rid="ref1">1</xref>] and caused 259,502,031 confirmed cases, including 5,183,003 deaths worldwide [<xref ref-type="bibr" rid="ref2">2</xref>]. Moreover, variants of the COVID-19 virus led to further challenges for public health. After the highly contagious Alpha variant swept across Europe and the United States in early 2021, the Delta variant replaced Alpha and became the dominant COVID variant worldwide [<xref ref-type="bibr" rid="ref3">3</xref>]. The Delta variant is around 60% more transmissible than the Alpha variant, is moderately resistant to vaccines [<xref ref-type="bibr" rid="ref4">4</xref>], and caused a new wave of the COVID-19 epidemic in Europe in late 2021 [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. Omicron, an even more worrying variant, was reported from South Africa on November 24, 2021; it is said to out-compete the Delta variant and has been identified in Botswana, Belgium, Hong Kong, and Israel [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. More timely and accurate forecasting of the incidence of COVID-19 and its variants is key to improving the efficiency of resource allocation and timeliness of intervention policy implementation [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref11">11</xref>].</p>
      <p>Internet-based data sources, such as social media data (like microblogs), online news article data, and search query data, accumulate huge amounts of data all the time and have been proven to be an effective supplement to traditional infectious disease surveillance systems [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. The underlying mechanism is that, before experiencing serious symptoms and going to a sentinel hospital, patients with symptoms may search for disease-related information on search engines like Google [<xref ref-type="bibr" rid="ref14">14</xref>], complain about disease-related symptoms on social media like microblogs [<xref ref-type="bibr" rid="ref15">15</xref>], or even share disease-related personal experiences on personal news articles platforms like instant articles [<xref ref-type="bibr" rid="ref16">16</xref>]. This gives Internet-based data the ability to provide early warning for disease outbreaks [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>] or provide supplemental information to enhance epidemic forecasting accuracy [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. For instance, Wilson and Brownstein [<xref ref-type="bibr" rid="ref19">19</xref>] retrieved official public health emergency–related online articles to support the early warning of Listeria outbreaks. Yang et al [<xref ref-type="bibr" rid="ref14">14</xref>] proposed an autoregression model with Google search query data (ARGO) to improve the forecasting accuracy for influenza epidemics [<xref ref-type="bibr" rid="ref14">14</xref>]. McGough et al [<xref ref-type="bibr" rid="ref20">20</xref>] produced an improved estimation for the Zika virus in Latin America with a 1-week lead time. 
They used a multivariable linear regression model, combining real-time search query data, social media data (Twitter), outbreak news report counts, and historical officially reported case counts [<xref ref-type="bibr" rid="ref20">20</xref>]. Internet-based data contain a large volume of unstructured text data [<xref ref-type="bibr" rid="ref21">21</xref>] accompanied by noise caused by linguistic errors or misinformation [<xref ref-type="bibr" rid="ref22">22</xref>]. To deal with Internet-based data, researchers have adopted a combination of methods, which include, but are not limited to, natural language processing, classification or clustering algorithms based on machine learning, and time-series models [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>].</p>
      <p>As COVID-19 has been and continues to be the most consequential infectious disease worldwide in this century, many researchers have used various Internet-based data sources to supplement COVID-19 surveillance [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. Like previous research on other infectious diseases, COVID-19 forecasting research based on Internet-based data focuses mainly on 2 aspects: improving forecasting accuracy and improving forecasting timeliness. To improve COVID-19 forecasting accuracy, Shen et al [<xref ref-type="bibr" rid="ref26">26</xref>] used the Granger causality test and showed that adding COVID-19 symptom–related microblogs could help enhance the COVID-19 predictive power. Liu et al [<xref ref-type="bibr" rid="ref11">11</xref>] adopted a multivariable model and showed that adding real-time search query data and news article data into the traditional COVID-19 forecasting model could lead to more accurate forecasting results. The combination of different Internet-based data sources has shown greater power to enhance the forecasting accuracy of infectious diseases (including COVID-19) than using a single Internet-based data source [<xref ref-type="bibr" rid="ref20">20</xref>]. However, existing methods incorporating more than one Internet-based data source used only real-time data from these sources as exogenous inputs but did not use historical data from all possible sources.</p>
      <p>As for improving COVID-19 forecasting timeliness, Yuan et al [<xref ref-type="bibr" rid="ref10">10</xref>] examined the lagged correlation between COVID-19 symptoms and core term–related search queries and daily new COVID-19 cases in the United States. They found that COVID-19–related search queries could provide a 12- to 14-day earlier warning for COVID-19 epidemics [<xref ref-type="bibr" rid="ref10">10</xref>]. Similarly, Li et al [<xref ref-type="bibr" rid="ref27">27</xref>] [<xref ref-type="bibr" rid="ref26">26</xref>] proved that the Baidu search index and Weibo (social media platform similar to Twitter) index could both provide warning for COVID-19 outbreaks in China 8 days to 12 days earlier. However, the power of different Internet-based data sources to improve COVID-19 epidemic forecasting timeliness has not been fully explored [<xref ref-type="bibr" rid="ref16">16</xref>]. The length of early warning time that Internet-based data could provide is not consistent across studies, varying from 0 [<xref ref-type="bibr" rid="ref28">28</xref>] to 21 days [<xref ref-type="bibr" rid="ref29">29</xref>]. Moreover, even though unofficial online news articles have shown great potential in supplementing COVID-19 surveillance [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>], few studies have explored using unofficial online news articles to improve COVID-19 forecasting timeliness.</p>
      <p>Our study explored whether combining real-time and historical data from multiple Internet-based sources could improve COVID-19 forecasting accuracy over the existing baseline models. We also compared COVID-19 forecasting timeliness based on different Internet-based data sources.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Collection and Processing</title>
        <p>We focused on the first wave of the COVID-19 epidemic in mainland China and compiled data on daily new confirmed COVID-19 case counts, online news articles, microblogs, and search queries from various sources. Following a previous study [<xref ref-type="bibr" rid="ref26">26</xref>], we collected data from mainland China, with separate analyses for Hubei province and the remaining provinces. The official laboratory-confirmed case counts in mainland China, except Hubei province, can be retrieved since January 19, 2020 [<xref ref-type="bibr" rid="ref21">21</xref>], while the official laboratory-confirmed case counts in Hubei province can be retrieved since January 10, 2020 [<xref ref-type="bibr" rid="ref11">11</xref>]. The max time lags we explored were 20 days, following the example from previous studies [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. Thus, we traced the Internet-based sources to December 21, 2019. We chose the end of our study period as February 29, 2020, when the primary wave of the COVID-19 epidemic in China had passed and the new confirmed case number decreased to single figures [<xref ref-type="bibr" rid="ref21">21</xref>].</p>
        <p>Daily new confirmed COVID-19 case counts were collected from the Chinese Center for Disease Control and Prevention (China CDC) website [<xref ref-type="bibr" rid="ref32">32</xref>], which started collecting data on January 16, 2020. Earlier counts in Hubei province between January 10, 2020, and January 16, 2020, were compiled based on reports from the Health Commission of Hubei Province [<xref ref-type="bibr" rid="ref33">33</xref>]. We then collected online news article data and microblog data from Sina Network Opinion Surveillance System (SNOSS) [<xref ref-type="bibr" rid="ref34">34</xref>], a commercially available web-based platform that collects various Internet-based data in mainland China. Search query data were collected from the Baidu Index website [<xref ref-type="bibr" rid="ref35">35</xref>]. We were the first to identify online news articles about COVID-19 and COVID-19–related microblogs using an approach based on COVID-19 core terms and symptom-related keywords. We also used COVID-19–related symptoms and core terms to extract COVID-19–related search queries, following a previous study [<xref ref-type="bibr" rid="ref36">36</xref>]. Detailed Internet-based data extraction and filtering methods are described in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
      </sec>
      <sec>
        <title>Statistical Analysis</title>
        <p>We first described the Internet-based data we retrieved and the COVID-19–related data we extracted. We then summarized all the COVID-19 forecasting-related data in 1 figure, including the fraction of online news articles and microblogs, search query counts, and lab-confirmed new case counts in mainland China, except Hubei, and Hubei province. All the data were normalized into an interval of 0 to 100 for better comparison. The figures aimed to show the Internet-based data sources’ potential to provide warnings for COVID-19 epidemics.</p>
        <p>We also conducted lagged Pearson correlation analyses to evaluate the strength of relationships between different Internet-based data sources and daily new confirmed COVID-19 case counts. The max time lag explored was 20 days [<xref ref-type="bibr" rid="ref26">26</xref>]. Because outliers can have a large influence on the Pearson correlation [<xref ref-type="bibr" rid="ref37">37</xref>], we replaced the outlier data in Hubei on February 12, 2020, with the average of the 2 nearest neighbors [<xref ref-type="bibr" rid="ref38">38</xref>]. A high correlation threshold of 0.7 was used, based on previous research [<xref ref-type="bibr" rid="ref27">27</xref>].</p>
      </sec>
      <sec>
        <title>Model Formulation</title>
        <p>Following previous infectious disease surveillance research [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref39">39</xref>], including COVID-19 forecasting research [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref26">26</xref>], we proposed an autoregressive model with exogenous inputs [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>]. We used the proportion of daily new confirmed COVID-19 case counts as a dependent variable. For the proportions of daily new confirmed case counts bounded between 0 and 1, we used logit transformation on the variable to turn it into unbounded scores [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref42">42</xref>]. The proportion was calculated by dividing the number of new confirmed COVID-19 cases by the related population, which was based on the latest Chinese national population census [<xref ref-type="bibr" rid="ref43">43</xref>]. We then proposed our model by adding log-transformed COVID-19–related Internet-based data as exogenous inputs, including the fraction of online news articles, microblogs, and search query counts. Let <italic>p<sub>t</sub></italic> be the new confirmed COVID-19 case proportion. For days when <italic>p<sub>t</sub></italic> = 0, we added a small positive number, <italic>λ</italic>, in the logit transformation. <italic>λ</italic> was calculated by dividing the square of the first quartile by the third quartile of all the proportions [<xref ref-type="bibr" rid="ref44">44</xref>]. Let <italic>y<sub>t</sub></italic> = <italic>logit</italic>(<italic>p<sub>t</sub></italic> + <italic>λ</italic>) be the logit-transformed new confirmed COVID-19 case proportion at day t. 
Let <italic>x<sub>t</sub></italic> be the log-transformed fraction of COVID-19–related online news articles at day t, <italic>z<sub>t</sub></italic> be the log-transformed fraction of COVID-19–related microblogs at day t, and <italic>s<sub>t</sub></italic> be the log-transformed COVID-19–related search volume at day t. We chose “fever” to represent search queries, for it showed the highest correlations with new confirmed COVID-19 counts.</p>
        <p>We proposed our autoregressive model with exogenous inputs, denoted as</p>
        <disp-formula>
          <graphic xlink:href="publichealth_v8i6e35266_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>Incorporating the real-time and historical data from online news articles, microblogs, and search query volume:</p>
        <disp-formula>
          <graphic xlink:href="publichealth_v8i6e35266_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>Where <italic>a<sub>i</sub></italic> quantifies the contribution from the historical new confirmed COVID-19 case counts, <italic>b<sub>j</sub></italic> quantifies the contribution from the historical fraction of COVID-19–related online news articles, <italic>c<sub>h</sub></italic> quantifies the contribution from the historical fraction of COVID-19–related microblogs, <italic>d<sub>k</sub></italic> quantifies the contribution from the historical COVID-19–related search queries, <italic>M</italic> is a binary variable that equals 1 when data are in Hubei and equals 0 when data are outside Hubei, <italic>f</italic> is a constant term, and <italic>ɛ<sub>t</sub></italic> is a vector of independent random disturbance. <italic>I<sub>t</sub></italic> is a time-varying binary variable that equals 1 on February 12, 2020, when Hubei adopted the fifth edition of the diagnostic criteria. <italic>I<sub>t</sub></italic> controls for the exogenous shock of case counts on that day [<xref ref-type="bibr" rid="ref26">26</xref>]. <italic>lag<sub>NC</sub></italic>, <italic>lag<sub>News</sub></italic>, <italic>lag<sub>Mblog</sub></italic>, and <italic>lag<sub>Query</sub></italic> ranged from 1 to 20 and were the optimal values that led to the highest forecasting accuracy (lowest root-mean-square error [RMSE]) for related baseline models described in the next paragraph using a single Internet-based data source (see Table S1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> for detailed lag selections).</p>
        <p>We considered 5 baseline models, including (1) AR(<italic>lag<sub>NC</sub></italic>): autoregression model based on historical new confirmed COVID-19 case counts only [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref26">26</xref>], (2) AR(<italic>lag<sub>NC</sub></italic>)+News(<italic>lag<sub>News</sub></italic>): autoregression model adding the fraction of COVID-19–related online news articles as an exogenous input [<xref ref-type="bibr" rid="ref16">16</xref>], (3) AR(<italic>lag<sub>NC</sub></italic>)+Mblog(<italic>lag<sub>Mblog</sub></italic>): autoregression model adding the fraction of microblogs as an exogenous input [<xref ref-type="bibr" rid="ref26">26</xref>], (4) AR(<italic>lag<sub>NC</sub></italic>)+Query(<italic>lag<sub>Query</sub></italic>): autoregression model adding search volume as an exogenous input [<xref ref-type="bibr" rid="ref36">36</xref>], and (5) AR(<italic>lag<sub>NC</sub></italic>)+News(1)+Mblog(1)+Query(1): multivariable linear model adding the fraction of real-time online news articles, the fraction of microblogs, and search query volume into historical official COVID-19 report data [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref20">20</xref>] (see <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref> for detailed model formulations).</p>
        <p>Retrospective estimations of the daily proportion of confirmed COVID-19 counts were produced through the proposed model and baseline models. The estimation period was from January 19, 2020, to February 29, 2020, for mainland China, except for Hubei. For Hubei province, even though the official laboratory-confirmed COVID-19 cases can be retrieved since January 10, 2020, there was a severe lack of laboratory testing capacity at the beginning of this unexpected epidemic. Specifically, there were thousands of COVID-19–suspected cases that could not be confirmed due to the lack of testing capacity before January 27, 2020, and the daily test capacity in Hubei had to be extended 10 times on January 27, 2020 to address this issue [<xref ref-type="bibr" rid="ref45">45</xref>]. The officially reported daily new confirmed COVID-19 case counts before January 27, 2020 reflected the testing capacity rather than the evolution of the epidemic. Thus, we tested the proposed model and other baseline models from January 27, 2020, to February 29, 2020, in Hubei.</p>
        <p>We used the variance inflation factor (VIF) to measure multicollinearity in the independent variables. A VIF over 4 indicates a moderate level of multicollinearity, and a VIF exceeding 10 shows severe multicollinearity [<xref ref-type="bibr" rid="ref46">46</xref>]. A repeated k-fold cross-validation [<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref48">48</xref>] was adopted to evaluate the proposed model and baseline models. In this study, we split the data into 10 folds and repeated the cross-validation procedure 10 times [<xref ref-type="bibr" rid="ref47">47</xref>]. We adopted the 5 most commonly used accuracy measures to compare the models’ forecasting results with the actual daily new confirmed COVID-19 case counts. The accuracy measures included the RMSE, mean absolute error (MAE), mean absolute percentage error (MAPE), correlation with forecasting target, and correlation of increment with forecasting target (the formulas for the accuracy indexes are presented in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>) [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref49">49</xref>]. We conducted the analyses in R version 4.0.2 using the packages caret [<xref ref-type="bibr" rid="ref50">50</xref>] version 6.0-86 and DAAG [<xref ref-type="bibr" rid="ref51">51</xref>] version 1.24.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Internet-Based Data Statistics</title>
        <p>Overall, we extracted 608,335 (out of 75,431,068) and 123,955 (out of 15,062,844) COVID-19–related online news articles for mainland China, except Hubei, and Hubei province, respectively. Unofficial online news articles accounted for about 92.8% (83,966,946/90,493,912) of all the news articles traced. We also identified 476,932 (out of 32,475,162) and 191,296 (out of 4,926,738) COVID-19–related microblogs posted in mainland China, except Hubei, and Hubei province, respectively. For the COVID-19–related search queries, we retrieved 24,165,139 queries in mainland China, except Hubei, and 988,402 related queries in Hubei province. The daily new confirmed COVID-19 case counts, the fraction of COVID-19–related online news articles, the fraction of COVID-19–related microblogs, and COVID-19–related search query counts are displayed in <xref rid="figure1" ref-type="fig">Figures 1</xref> and <xref rid="figure2" ref-type="fig">2</xref>.</p>
        <p><xref rid="figure1" ref-type="fig">Figure 1</xref> shows that the first peak of daily confirmed COVID-19 case counts was reached on January 30, 2020, in provinces except Hubei. Compared with the official COVID-19 case counts, the peak in COVID-19–related online news articles was 2 days earlier (January 28, 2020), the peak in microblogs was 3 days earlier (January 27, 2020), and the peaks in search queries were 4 days to 7 days earlier (from January 23, 2020, to January 26, 2020).</p>
        <p><xref rid="figure2" ref-type="fig">Figure 2</xref> shows that the highest peak of daily new confirmed COVID-19 case counts was reached on February 4, 2020, in Hubei province. Compared with the peak of official COVID-19 case counts, the peak in COVID-19–related online news articles was 12 days earlier (January 23, 2020), the peak in microblogs was 13 days earlier (January 22, 2020), and the peaks in search queries were 10 days to 12 days earlier (from January 23, 2020, to January 25, 2020). An outlier of incidence was found on February 12, 2020, when the new confirmed COVID-19 case counts increased dramatically as Hubei province started implementing the fifth edition of the COVID-19 diagnostic criteria. The new diagnostic criteria introduced more flexible diagnostic standards and turned many previously suspected cases into confirmed cases. This outlier could impact the forecasting accuracy and has been dealt with carefully in the model formulation and data analysis.</p>
        <p>Lagged Pearson correlation analyses between different Internet-based data sources and daily new confirmed COVID-19 case counts were also conducted to illustrate the predictive power. The highest correlations for different sources with different time lags are summarized in <xref ref-type="table" rid="table1">Table 1</xref> (see Tables S2 and S3 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> for more details).</p>
        <p><xref ref-type="table" rid="table1">Table 1</xref> shows that, in mainland China except Hubei, the highest correlation for online news articles was 0.619 with 2 days’ time lag, the highest correlation for microblogs was 0.613 with 2 days’ time lag, and the highest correlations for search queries ranged from 0.831 to 0.949 with time lags of 3 days to 6 days. In Hubei province, the highest correlation for online news articles was 0.667 with 14 days’ time lag, the highest correlation for microblogs was 0.632 with 7 days’ time lag, and the highest correlations for search queries ranged from 0.750 to 0.826 with time lags of 10 days to 12 days. Although the highest correlations for online news articles and microblogs were below the high correlation threshold (0.7), these correlations were all above 0.6, which was relatively high.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Daily time series of new confirmed COVID-19 case counts (NC), the fraction of COVID-19–related microblogs (Mblog), the fraction of COVID-19–related online news articles (News), and numbers of COVID-19–related search queries with the keyword “fever,” “dry cough,” “chest distress,” “pneumonia,” or “coronavirus” in mainland China, except Hubei province, from December 21, 2019 to February 29, 2020.</p>
          </caption>
          <graphic xlink:href="publichealth_v8i6e35266_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Daily time series of new confirmed COVID-19 case counts (NC), the fraction of COVID-19–related microblogs (Mblog), the fraction of COVID-19–related online news articles (News), and numbers of COVID-19–related search queries with the keyword “fever,” “dry cough,” “chest distress,” “pneumonia,” or “coronavirus” in Hubei province from December 21, 2019 to February 29, 2020.</p>
          </caption>
          <graphic xlink:href="publichealth_v8i6e35266_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Strongest correlation coefficients, <italic>P</italic> values, and related time lag between new confirmed COVID-19 case counts and the fraction of COVID-19–related microblogs, fraction of COVID-19–related online news articles, and numbers of COVID-19–related search queries between December 21, 2019, and February 29, 2020.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="320"/>
            <col width="140"/>
            <col width="80"/>
            <col width="120"/>
            <col width="0"/>
            <col width="140"/>
            <col width="80"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td>Source</td>
                <td colspan="4">Outside Hubei</td>
                <td colspan="3">Hubei</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Highest correlation</td>
                <td><italic>P</italic> value</td>
                <td>Days earlier</td>
                <td colspan="2">Highest correlation</td>
                <td><italic>P</italic> value</td>
                <td>Days earlier</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>News articles</td>
                <td>0.619</td>
                <td>&#60;.001</td>
                <td>2</td>
                <td colspan="2">0.667</td>
                <td>&#60;.001</td>
                <td>14</td>
              </tr>
              <tr valign="top">
                <td>Microblogs</td>
                <td>0.613</td>
                <td>&#60;.001</td>
                <td>2</td>
                <td colspan="2">0.632</td>
                <td>&#60;.001</td>
                <td>7</td>
              </tr>
              <tr valign="top">
                <td>Search for “fever”</td>
                <td>0.949</td>
                <td>&#60;.001</td>
                <td>4</td>
                <td colspan="2">0.826</td>
                <td>&#60;.001</td>
                <td>12</td>
              </tr>
              <tr valign="top">
                <td>Search for “dry cough”</td>
                <td>0.831</td>
                <td>&#60;.001</td>
                <td>6</td>
                <td colspan="2">0.775</td>
                <td>&#60;.001</td>
                <td>12</td>
              </tr>
              <tr valign="top">
                <td>Search for “chest distress”</td>
                <td>0.867</td>
                <td>&#60;.001</td>
                <td>3</td>
                <td colspan="2">0.806</td>
                <td>&#60;.001</td>
                <td>10</td>
              </tr>
              <tr valign="top">
                <td>Search for “pneumonia”</td>
                <td>0.854</td>
                <td>&#60;.001</td>
                <td>5</td>
                <td colspan="2">0.750</td>
                <td>&#60;.001</td>
                <td>11</td>
              </tr>
              <tr valign="top">
                <td>Search for “coronavirus”</td>
                <td>0.831</td>
                <td>&#60;.001</td>
                <td>6</td>
                <td colspan="2">0.765</td>
                <td>&#60;.001</td>
                <td>12</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Model Evaluation</title>
        <p>The forecasting results for our proposed model and baseline models are presented in <xref ref-type="table" rid="table2">Tables 2</xref> and <xref ref-type="table" rid="table3">3</xref>. Optimal lags of different data sources, which result in the lowest RMSE for related baseline models incorporating a single Internet-based data source, are shown (see Table S1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> for the optimal lag selection). The last 2 columns show the paired <italic>t</italic> test results comparing our proposed model with the baseline models.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>COVID-19 epidemic forecasting model comparison for mainland China, except Hubei, between January 19, 2020, and February 29, 2020.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="290"/>
            <col width="110"/>
            <col width="80"/>
            <col width="80"/>
            <col width="80"/>
            <col width="100"/>
            <col width="110"/>
            <col width="80"/>
            <col width="70"/>
            <thead>
              <tr valign="bottom">
                <td>Model (lag)</td>
                <td>Model number</td>
                <td>RMSE<sup>a</sup></td>
                <td>MAE<sup>b</sup></td>
                <td>MAPE<sup>c</sup></td>
                <td>Correlation</td>
                <td>Incremental correlation</td>
                <td>
                  t<sub>198</sub>
                </td>
                <td><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>AR(7)+News(1)+ Mblog(10)+Query(1)</td>
                <td>model i</td>
                <td>87.461</td>
                <td>47.780</td>
                <td>0.154</td>
                <td>0.960</td>
                <td>0.435</td>
                <td>N/A<sup>d</sup></td>
                <td>N/A</td>
              </tr>
              <tr valign="top">
                <td>AR(7)</td>
                <td>model 1</td>
                <td>152.182</td>
                <td>97.852</td>
                <td>0.579</td>
                <td>0.852</td>
                <td>0.006</td>
                <td>–8.722</td>
                <td>&#60;.001</td>
              </tr>
              <tr valign="top">
                <td>AR(7)+News(1)</td>
                <td>model 2</td>
                <td>117.223</td>
                <td>68.158</td>
                <td>0.374</td>
                <td>0.911</td>
                <td>0.066</td>
                <td>–5.000</td>
                <td>&#60;.001</td>
              </tr>
              <tr valign="top">
                <td>AR(7)+Mblog(10)</td>
                <td>model 3</td>
                <td>93.754</td>
                <td>51.375</td>
                <td>0.185</td>
                <td>0.948</td>
                <td>0.403</td>
                <td>–1.882</td>
                <td>.06</td>
              </tr>
              <tr valign="top">
                <td>AR(7)+Query(1)</td>
                <td>model 4</td>
                <td>138.724</td>
                <td>85.024</td>
                <td>0.421</td>
                <td>0.905</td>
                <td>0.168</td>
                <td>–4.644</td>
                <td>&#60;.001</td>
              </tr>
              <tr valign="top">
                <td>AR(7)+News(1)+ Mblog(1)+Query(1)</td>
                <td>model 5</td>
                <td>90.494</td>
                <td>53.332</td>
                <td>0.306</td>
                <td>0.954</td>
                <td>0.167</td>
                <td>–4.488</td>
                <td>&#60;.001</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>RMSE: root-mean-square error.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>MAE: mean absolute error.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>MAPE: mean absolute percentage error.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>N/A: not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>COVID-19 epidemic forecasting model comparison for Hubei province, China, between January 27, 2020, and February 29, 2020.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="290"/>
            <col width="110"/>
            <col width="80"/>
            <col width="80"/>
            <col width="80"/>
            <col width="100"/>
            <col width="110"/>
            <col width="80"/>
            <col width="70"/>
            <thead>
              <tr valign="bottom">
                <td>Model (lag)</td>
                <td>Model number</td>
                <td>RMSE<sup>a</sup></td>
                <td>MAE<sup>b</sup></td>
                <td>MAPE<sup>c</sup></td>
                <td>Correlation</td>
                <td>Incremental correlation</td>
                <td>
                  t<sub>198</sub>
                </td>
                <td><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>AR(1)+News(3)+ Mblog(1)+Query(3)</td>
                <td>model i</td>
                <td>325.216</td>
                <td>225.620</td>
                <td>0.168</td>
                <td>0.990</td>
                <td>0.984</td>
                <td>N/A<sup>d</sup></td>
                <td>N/A</td>
              </tr>
              <tr valign="top">
                <td>AR(1)</td>
                <td>model 1</td>
                <td>658.238</td>
                <td>403.665</td>
                <td>0.267</td>
                <td>0.963</td>
                <td>0.958</td>
                <td>–1.732</td>
                <td>.09</td>
              </tr>
              <tr valign="top">
                <td>AR(1)+News(2)</td>
                <td>model 2</td>
                <td>488.974</td>
                <td>325.731</td>
                <td>0.226</td>
                <td>0.978</td>
                <td>0.976</td>
                <td>–1.196</td>
                <td>.24</td>
              </tr>
              <tr valign="top">
                <td>AR(1)+Mblog(1)</td>
                <td>model 3</td>
                <td>431.457</td>
                <td>311.196</td>
                <td>0.228</td>
                <td>0.983</td>
                <td>0.977</td>
                <td>–0.252</td>
                <td>.80</td>
              </tr>
              <tr valign="top">
                <td>AR(1)+Query(3)</td>
                <td>model 4</td>
                <td>437.368</td>
                <td>286.900</td>
                <td>0.201</td>
                <td>0.983</td>
                <td>0.976</td>
                <td>–0.364</td>
                <td>.72</td>
              </tr>
              <tr valign="top">
                <td>AR(1)+News(1)+ Mblog(1)+Query(1)</td>
                <td>model 5</td>
                <td>360.725</td>
                <td>272.602</td>
                <td>0.206</td>
                <td>0.988</td>
                <td>0.981</td>
                <td>–0.965</td>
                <td>.34</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>RMSE: root-mean-square error.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>MAE: mean absolute error.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>MAPE: mean absolute percentage error.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup>N/A: not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>The results from the 5 accuracy measures were interpreted. The results in <xref ref-type="table" rid="table2">Tables 2</xref> and <xref ref-type="table" rid="table3">3</xref> show that our proposed model (model i) achieved the highest accuracy in all 5 accuracy measures, compared with all the baseline models in both Hubei province and the rest of mainland China. Plots depicting forecasting results and estimation errors for the proposed model and baseline models are also shown in <xref rid="figure3" ref-type="fig">Figures 3</xref> and <xref rid="figure4" ref-type="fig">4</xref>.</p>
        <p>We then assessed the statistical significance of the forecasting accuracy improvement between different models based on paired <italic>t</italic> tests on the models’ RMSEs. For mainland China, except Hubei, <xref ref-type="table" rid="table2">Table 2</xref> and <xref rid="figure3" ref-type="fig">Figure 3</xref> show that our proposed model (model i) could significantly improve the forecasting accuracy (at a significance level of .10), compared with all the other baseline models (model 1, t<sub>198</sub>=–8.722, <italic>P</italic>&#60;.001; model 2, t<sub>198</sub>=–5.000, <italic>P</italic>&#60;.001; model 3, t<sub>198</sub>=–1.882, <italic>P</italic>=.06; model 4, t<sub>198</sub>=–4.644, <italic>P</italic>&#60;.001; model 5, t<sub>198</sub>=–4.488, <italic>P</italic>&#60;.001). For Hubei province, <xref ref-type="table" rid="table3">Table 3</xref> and <xref rid="figure4" ref-type="fig">Figure 4</xref> show that the forecasting accuracy of our proposed model (model i) improved significantly (at a significance level of .10) compared with the forecasting model using historical new confirmed COVID-19 case counts only (model 1, t<sub>198</sub>=–1.732, <italic>P</italic>=.09) but did not differ significantly from that of the other baseline models (model 2, t<sub>198</sub>=–1.196, <italic>P</italic>=.24; model 3, t<sub>198</sub>=–0.252, <italic>P</italic>=.80; model 4, t<sub>198</sub>=–0.364, <italic>P</italic>=.72; model 5, t<sub>198</sub>=–0.965, <italic>P</italic>=.34). The forecasting accuracy differences between other baseline models using Internet-based data sources and model 1 are not significant (model 2, t<sub>198</sub>=–0.900, <italic>P</italic>=.37; model 3, t<sub>198</sub>=–1.630, <italic>P</italic>=.11; model 4, t<sub>198</sub>=–1.324, <italic>P</italic>=.19; model 5, t<sub>198</sub>=–0.786, <italic>P</italic>=.43).</p>
        <p>We also evaluated the practical significance of the forecasting models from the perspective of MAPE. For provinces outside Hubei of mainland China in <xref ref-type="table" rid="table2">Table 2</xref>, our proposed model showed significant accuracy improvement. Specifically, our proposed forecasting model's unexplained error percentage was 15.4%, while the unexplained error percentages for the other models were as follows: forecasting model based on historical new confirmed COVID-19 case counts only (model 1), 57.9%; model incorporating COVID-19–related online news articles (model 2), 37.4%; model incorporating COVID-19–related microblogs (model 3), 18.5%; model incorporating COVID-19–related search queries (model 4), 42.1%; model combining real-time Internet-based sources into historical new COVID-19 case counts (model 5), 30.6%. Meanwhile, for Hubei province in <xref ref-type="table" rid="table3">Table 3</xref>, the improvement in accuracy with our proposed model was also nearly significant. The unexplained error percentage for our proposed model was 16.8%, while the unexplained error percentages for the other models were as follows: model 1, 26.7%; model 2, 22.6%; model 3, 22.8%; model 4, 20.1%; model 5, 20.6%.</p>
        <p>The collinearity diagnostics revealed that real-time social media data, online news articles, and search queries are independent of each other in supplementing COVID-19 surveillance. More detailed results and discussions are presented in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>(A) Forecasting results for mainland China, except Hubei, between January 19, 2020 and February 29, 2020, during which the daily estimations of our proposed model and baseline models were compared against the daily new confirmed COVID-19 case counts (NC), and (B) the estimation error, defined as the estimated value minus the daily new confirmed COVID-19 case counts.</p>
          </caption>
          <graphic xlink:href="publichealth_v8i6e35266_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>(A) Forecasting results for Hubei province between January 27, 2020 and February 29, 2020, during which the daily estimations of our proposed model and baseline models were compared against the daily new confirmed COVID-19 case counts (NC), and (B) the estimation error, defined as the estimated value minus the daily new confirmed COVID-19 case counts.</p>
          </caption>
          <graphic xlink:href="publichealth_v8i6e35266_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>The SARS-CoV-2 virus and its variants pose extraordinary challenges for public health systems worldwide. More accurate forecasting of COVID-19 epidemics is key to improving the efficiency of resource allocation and the implementation of intervention policies [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. Our proposed model innovatively incorporates both real-time and historical data from multiple Internet-based sources for COVID-19 epidemic forecasting. Tested during the first wave of the COVID-19 epidemic in mainland China, except Hubei, our proposed model showed statistically significant improved forecasting accuracy compared with the other baseline models. Tested in Hubei province, our proposed model outperformed all the baseline models in all 5 accuracy indexes, revealed significant practical influence, and showed statistically significant improved forecasting accuracy compared with baseline model 1 using the lab-confirmed case count only. Other baseline models incorporating different Internet-based data sources did not show significant differences compared with baseline model 1. This may be because people knew little of the disease at first and all talked online about the novel coronavirus pneumonia in Wuhan, Hubei, which could lead to disturbances in the Internet-based data sources [<xref ref-type="bibr" rid="ref52">52</xref>]. In this condition, a single Internet-based data source or real-time data only may not be able to improve the COVID-19 forecasting accuracy, and our proposed model shows the ability to mitigate the disturbance and enhance COVID-19 surveillance by combining real-time and historical data from multiple Internet-based data sources.</p>
        <p>This study also explored COVID-19 forecasting timeliness using different Internet-based data sources. Unlike previous studies that mainly focused on official online news articles, our study also took into account unofficial online news articles, which accounted for about 92.8% of all online news articles. The results show that COVID-19–related online news articles could provide a warning for the COVID-19 epidemic in mainland China, except Hubei, about 2 days earlier and in Hubei about 12 days to 14 days earlier. A similar early warning ability was also shown for microblogs and search queries. We found significant differences in the lag in an early warning for mainland China, except Hubei, and Hubei province, which may be caused by 2 reasons. First, Hubei experienced an extreme shortage of testing capacity in the beginning [<xref ref-type="bibr" rid="ref26">26</xref>], which could have delayed the peak of lab-confirmed new case counts. Second, at the beginning of the first COVID-19 epidemic, people were curious about this unknown disease and tended to search or post related information even when they did not have associated symptoms [<xref ref-type="bibr" rid="ref52">52</xref>]. This could advance the corresponding peak in Internet-based sources. As of the time of this writing, people were familiar with COVID-19–related information, and Internet-based sources, including online news articles, are supposed to provide a 2- to 6-day early warning for COVID-19 outbreaks.</p>
        <p>Our study innovatively proposes core terms and symptom-related keyword-based approaches to extract COVID-19–related Internet-based data sources. The keyword-based approaches allow us to constantly and conveniently update the core terms and symptoms to keep up with the mutation of the COVID-19 virus. For example, people infected with the Delta variant are more likely to have a “runny nose,” “headache,” or “sore throat” and less likely to experience “loss of smell” [<xref ref-type="bibr" rid="ref53">53</xref>]. Researchers then could focus more on the core term of “Delta variant” and the symptoms of “runny nose,” “headache,” and “sore throat” in online public data–based COVID-19 surveillance for this new round of epidemic in Europe [<xref ref-type="bibr" rid="ref6">6</xref>]. We thus argue that our proposed model could help governments better prepare and respond to a new wave of COVID-19 and its variants.</p>
        <p>Another interesting finding of our study is that the peak of daily new confirmed case counts in Hubei was reached on February 4, 2020, while the peak in the rest of mainland China was reached on January 30, 2020 (5 days earlier than Hubei Province). This finding was contrary to our common sense, for Hubei was the epicenter of the initial outbreak, and the rest of mainland China was influenced by this epidemic later. One possible reason for the delay of the COVID-19 epidemic peak in Hubei was the extreme shortage of medical resources at the beginning of the epidemic, including testing ability and hospital beds [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref45">45</xref>]. Many suspected cases could not be tested until the testing ability was extended 10 times on January 27 [<xref ref-type="bibr" rid="ref45">45</xref>]. And until 15 mobile cabin hospitals were built in early February 2020, many confirmed cases with no or mild symptoms had to be quarantined at home rather than stay in the hospital, which increased the risk of COVID-19 transmission [<xref ref-type="bibr" rid="ref54">54</xref>]. Different from Hubei, the rest of mainland China experienced a much smaller number of COVID-19 cases and had much more adequate medical resources [<xref ref-type="bibr" rid="ref26">26</xref>], which made it possible to test and quarantine all the COVID-19 suspected cases in time. Thus, even though the rest of mainland China was influenced by the COVID-19 epidemic later than Hubei province, it is possible that the rest of mainland China could control the disease and reach the peak of daily new confirmed case counts earlier than Hubei. Future research could explore the factors contributing to the delay or advance of the epidemic peaks.</p>
        <p>Overall, the results show that incorporating both real-time and historical data from multiple Internet-based sources into the COVID-19 forecasting model could significantly improve the forecasting accuracy, compared with other baseline models. Internet-based data sources, including online news articles, microblogs, and search queries, could provide early warning for COVID-19 outbreaks. These findings have broad public health implications. Internet-based data are timely, low-cost, and rich in information, making them critical in the surveillance of COVID-19 outbreaks. This application is even more important in rural areas, where the health infrastructure does not allow for widespread screening. COVID-19 surveillance using Internet-based data could provide much-needed information to help the government trace the outbreak and more effectively allocate resources, including testing capacity, oxygen cylinders, and hospital beds. Internet-based platforms allow users to capture detailed real-time snapshots of COVID-19–related events that happen to them or near them. As the COVID-19 virus continues to mutate, Internet-based sources with richer information have the potential to identify novel COVID-19 variants through deeper information analysis.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>There are several limitations and potential future directions of this study that we would like to mention. First, our study only used retrospective data from mainland China and did not test the proposed model in countries that are currently experiencing an epidemic of COVID-19 and its variants. This is mainly because of data accessibility. We could not find available databases or online platforms that allowed us to access a large volume of real-time and historical microblogs and unofficial online news articles in other countries. We encourage future work to use the proposed method in different countries to test its generalizability and robustness.</p>
        <p>Second, our study did not incorporate machine learning methods in the data filtering process. In this study, we explored the full database of Internet-based sources in mainland China from the SNOSS and Baidu Search Index, where the raw data are not available for downloading and further analysis. Future research could apply advanced machine learning methods to the raw data of various Internet-based sources to achieve more accurate epidemic-related data extraction and deeper information analyses. For example, future research can use the support vector machine to help extract COVID-19–related online data [<xref ref-type="bibr" rid="ref55">55</xref>] or use a topic modeling algorithm to generate major themes about the COVID-19 epidemic [<xref ref-type="bibr" rid="ref56">56</xref>]. Deeper content analyses could help identify real-time characteristics of the COVID-19 epidemic, which may act as early warning signals for new emerging COVID-19 variants or other epidemics.</p>
        <p>Finally, our study mainly used symptom- and core term–related keywords to extract COVID-19–related Internet-based data, which have been shown to provide the most accurate predictions compared with other types of keywords [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. Our underlying assumption is that, before developing severe symptoms and going to a sentinel hospital, patients with mild symptoms would likely search for or post COVID-19–related symptoms or core terms online. Our Internet-based method could identify patients with COVID-19 symptoms but lose sight of patients in the incubation period with no symptoms, which means our method could only provide a warning 2 days to 6 days ahead of epidemic outbreaks. As our study’s major aim was to improve the COVID-19 forecasting accuracy, we did not explore new methods to improve the forecasting timeliness of Internet-based data in our study. We call for future studies to explore novel Internet-based sources, such as traffic and weather data [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref57">57</xref>], to help improve the forecasting timeliness for COVID-19 epidemics.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>COVID-19 and its variants have been and continue to be a major public health threat worldwide. COVID-19 core term– and symptom-related Internet-based data could provide invaluable warning signals to the public and supplement existing COVID-19 surveillance systems. This study showed that our proposed COVID-19 forecasting method, incorporating both real-time and historical data from multiple Internet-based sources, could significantly improve the forecasting accuracy compared with other baseline models. Our results also show that Internet-based sources, including online news articles, could provide a warning 2 days to 6 days earlier for COVID-19 outbreaks.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Detailed descriptions of the Internet-based data extraction and filtering methods.</p>
        <media xlink:href="publichealth_v8i6e35266_app1.docx" xlink:title="DOCX File , 28 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Supplementary tables.</p>
        <media xlink:href="publichealth_v8i6e35266_app2.docx" xlink:title="DOCX File , 37 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Descriptions and formulations of baseline models.</p>
        <media xlink:href="publichealth_v8i6e35266_app3.docx" xlink:title="DOCX File , 19 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>Accuracy indexes.</p>
        <media xlink:href="publichealth_v8i6e35266_app4.docx" xlink:title="DOCX File , 13 KB"/>
      </supplementary-material>
      <supplementary-material id="app5">
        <label>Multimedia Appendix 5</label>
        <p>Collinearity diagnostics.</p>
        <media xlink:href="publichealth_v8i6e35266_app5.docx" xlink:title="DOCX File , 23 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">CDC</term>
          <def>
            <p>Center for Disease Control and Prevention</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">MAE</term>
          <def>
            <p>mean absolute error</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">MAPE</term>
          <def>
            <p>mean absolute percentage error</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">RMSE</term>
          <def>
            <p>root-mean-squared error</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">SNOSS</term>
          <def>
            <p>Sina Network Opinion Surveillance System</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">VIF</term>
          <def>
            <p>variance inflation factor</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>JL would like to acknowledge the partial grant support for the research (71731009, 72061127002, 92146005). WH would also like to acknowledge the partial grant support (2018WZDXM020, 71722014, 71732006, 91546119). CLS would also like to acknowledge the partial grant support (Hong Kong’s RGC-GRF grant 9042571 and CityU 11504417). This research was also partially supported by Shenzhen Key Research Base in Arts &#38; Social Sciences and the National Laboratory of Mechanical Manufacture Systems Engineering, Xi’an Jiaotong University.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <article-title>COVID-19 Weekly Epidemiological Update</article-title>
          <source>World Health Organization</source>
          <year>2021</year>
          <month>01</month>
          <day>31</day>
          <access-date>2022-06-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.who.int/docs/default-source/coronaviruse/situation-reports/20210202_weekly_epi_update_25.pdf">https://www.who.int/docs/default-source/coronaviruse/situation-reports/20210202_weekly_epi_update_25.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="web">
          <article-title>Coronavirus (COVID-19) Dashboard</article-title>
          <source>World Health Organization</source>
          <access-date>2022-06-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://covid19.who.int/">https://covid19.who.int/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lovelace</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>WHO says delta is the fastest and fittest Covid variant and will "pick off" most vulnerable</article-title>
          <source>CNBC</source>
          <year>2021</year>
          <month>06</month>
          <day>23</day>
          <access-date>2022-06-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cnbc.com/2021/06/21/covid-delta-who-says-variant-is-the-fastest-and-fittest-and-will-pick-off-most-vulnerable-.html">https://www.cnbc.com/2021/06/21/covid-delta-who-says-variant-is-the-fastest-and-fittest-and-will-pick-off-most-vulnerable-.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Callaway</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Delta coronavirus variant: scientists brace for impact</article-title>
          <source>Nature</source>
          <year>2021</year>
          <month>07</month>
          <day>22</day>
          <volume>595</volume>
          <issue>7865</issue>
          <fpage>17</fpage>
          <lpage>18</lpage>
          <pub-id pub-id-type="doi">10.1038/d41586-021-01696-3</pub-id>
          <pub-id pub-id-type="medline">34158664</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-021-01696-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="web">
          <article-title>Delta coronavirus variant has spread to 185 countries, says WHO</article-title>
          <source>Business Standard</source>
          <year>2021</year>
          <month>09</month>
          <day>22</day>
          <access-date>2022-06-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.business-standard.com/article/current-affairs/delta-coronavirus-variant-has-spread-to-185-countries-says-who-121092200721_1.html">https://tinyurl.com/msfw9ts</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="web">
          <article-title>Covid: WHO says it is very worried about Europe surge</article-title>
          <source>BBC</source>
          <year>2021</year>
          <month>11</month>
          <day>20</day>
          <access-date>2022-06-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.bbc.com/news/world-europe-59358074">https://www.bbc.com/news/world-europe-59358074</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="web">
          <article-title>What we know about the new omicron COVID-19 variant</article-title>
          <source>The Japan Times</source>
          <year>2021</year>
          <month>11</month>
          <day>27</day>
          <access-date>2022-06-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.japantimes.co.jp/news/2021/11/27/world/covid-variant-omicron-explainer/">https://www.japantimes.co.jp/news/2021/11/27/world/covid-variant-omicron-explainer/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="web">
          <article-title>Covid: New variant classed 'of concern' and named Omicron</article-title>
          <source>BBC</source>
          <year>2021</year>
          <month>11</month>
          <day>27</day>
          <access-date>2022-06-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.bbc.com/news/world-59438723">https://www.bbc.com/news/world-59438723</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yousefinaghani</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dara</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Mubareka</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sharif</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Prediction of COVID-19 waves using social media and Google search: a case study of the US and Canada</article-title>
          <source>Front Public Health</source>
          <year>2021</year>
          <volume>9</volume>
          <fpage>656635</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.3389/fpubh.2021.656635"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fpubh.2021.656635</pub-id>
          <pub-id pub-id-type="medline">33937179</pub-id>
          <pub-id pub-id-type="pmcid">PMC8085269</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hussain</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Trends and prediction in daily new cases and deaths of COVID-19 in the United States: an internet search-interest based model</article-title>
          <source>Explor Res Hypothesis Med</source>
          <year>2020</year>
          <month>04</month>
          <day>18</day>
          <volume>5</volume>
          <issue>2</issue>
          <fpage>1</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/32348380"/>
          </comment>
          <pub-id pub-id-type="doi">10.14218/ERHM.2020.00023</pub-id>
          <pub-id pub-id-type="medline">32348380</pub-id>
          <pub-id pub-id-type="pii">ERHM.2020.00023</pub-id>
          <pub-id pub-id-type="pmcid">PMC7176069</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Clemente</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Poirier</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ding</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chinazzi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Vespignani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Santillana</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Real-time forecasting of the COVID-19 outbreak in Chinese provinces: machine learning approach using novel digital data and estimates from mechanistic models</article-title>
          <source>J Med Internet Res</source>
          <year>2020</year>
          <month>08</month>
          <day>17</day>
          <volume>22</volume>
          <issue>8</issue>
          <fpage>e20285</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2020/8/e20285/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/20285</pub-id>
          <pub-id pub-id-type="medline">32730217</pub-id>
          <pub-id pub-id-type="pii">v22i8e20285</pub-id>
          <pub-id pub-id-type="pmcid">PMC7459435</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Katarya</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Social media based surveillance systems for healthcare using machine learning: A systematic review</article-title>
          <source>J Biomed Inform</source>
          <year>2020</year>
          <month>08</month>
          <volume>108</volume>
          <fpage>103500</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1532-0464(20)30128-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2020.103500</pub-id>
          <pub-id pub-id-type="medline">32622833</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(20)30128-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC7331523</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fung</surname>
              <given-names>IC</given-names>
            </name>
            <name name-style="western">
              <surname>Tse</surname>
              <given-names>ZTH</given-names>
            </name>
            <name name-style="western">
              <surname>Cheung</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Miu</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Ebola and the social media</article-title>
          <source>Lancet</source>
          <year>2014</year>
          <month>12</month>
          <day>20</day>
          <volume>384</volume>
          <issue>9961</issue>
          <fpage>2207</fpage>
          <pub-id pub-id-type="doi">10.1016/S0140-6736(14)62418-1</pub-id>
          <pub-id pub-id-type="medline">25625391</pub-id>
          <pub-id pub-id-type="pii">S0140-6736(14)62418-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Santillana</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kou</surname>
              <given-names>SC</given-names>
            </name>
          </person-group>
          <article-title>Accurate estimation of influenza epidemics using Google search data via ARGO</article-title>
          <source>Proc Natl Acad Sci U S A</source>
          <year>2015</year>
          <month>11</month>
          <day>24</day>
          <volume>112</volume>
          <issue>47</issue>
          <fpage>14473</fpage>
          <lpage>8</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26553980"/>
          </comment>
          <pub-id pub-id-type="doi">10.1073/pnas.1515373112</pub-id>
          <pub-id pub-id-type="medline">26553980</pub-id>
          <pub-id pub-id-type="pii">1515373112</pub-id>
          <pub-id pub-id-type="pmcid">PMC4664296</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Achrekar</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Gandhe</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lazarus</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>SH</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Twitter Improves Seasonal Influenza Prediction</article-title>
          <year>2012</year>
          <conf-name>International Conference on Health Informatics - HEALTHINF</conf-name>
          <conf-date>February 1-4, 2012</conf-date>
          <conf-loc>Vilamoura, Algarve, Portugal</conf-loc>
          <fpage>61</fpage>
          <lpage>70</lpage>
          <pub-id pub-id-type="doi">10.5220/0003780600610070</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sia</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Enhancing influenza epidemics forecasting accuracy in China with both official and unofficial online news articles, 2019-2020</article-title>
          <source>Int J Environ Res Public Health</source>
          <year>2021</year>
          <month>06</month>
          <day>18</day>
          <volume>18</volume>
          <issue>12</issue>
          <fpage>6591</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=ijerph18126591"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/ijerph18126591</pub-id>
          <pub-id pub-id-type="medline">34207479</pub-id>
          <pub-id pub-id-type="pii">ijerph18126591</pub-id>
          <pub-id pub-id-type="pmcid">PMC8296334</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ginsberg</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mohebbi</surname>
              <given-names>MH</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>RS</given-names>
            </name>
            <name name-style="western">
              <surname>Brammer</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Smolinski</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Brilliant</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Detecting influenza epidemics using search engine query data</article-title>
          <source>Nature</source>
          <year>2009</year>
          <month>02</month>
          <day>19</day>
          <volume>457</volume>
          <issue>7232</issue>
          <fpage>1012</fpage>
          <lpage>4</lpage>
          <pub-id pub-id-type="doi">10.1038/nature07634</pub-id>
          <pub-id pub-id-type="medline">19020500</pub-id>
          <pub-id pub-id-type="pii">nature07634</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chughtai</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Macintyre</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Utility and potential of rapid epidemic intelligence from internet-based sources</article-title>
          <source>Int J Infect Dis</source>
          <year>2017</year>
          <month>10</month>
          <volume>63</volume>
          <fpage>77</fpage>
          <lpage>87</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1201-9712(17)30199-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.ijid.2017.07.020</pub-id>
          <pub-id pub-id-type="medline">28765076</pub-id>
          <pub-id pub-id-type="pii">S1201-9712(17)30199-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Brownstein</surname>
              <given-names>JS</given-names>
            </name>
          </person-group>
          <article-title>Early detection of disease outbreaks using the Internet</article-title>
          <source>CMAJ</source>
          <year>2009</year>
          <month>04</month>
          <day>14</day>
          <volume>180</volume>
          <issue>8</issue>
          <fpage>829</fpage>
          <lpage>31</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.cmaj.ca/cgi/pmidlookup?view=long&#38;pmid=19364791"/>
          </comment>
          <pub-id pub-id-type="doi">10.1503/cmaj.090215</pub-id>
          <pub-id pub-id-type="medline">19364791</pub-id>
          <pub-id pub-id-type="pii">180/8/829</pub-id>
          <pub-id pub-id-type="pmcid">PMC2665960</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McGough</surname>
              <given-names>SF</given-names>
            </name>
            <name name-style="western">
              <surname>Brownstein</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Hawkins</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Santillana</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Forecasting Zika incidence in the 2016 Latin America outbreak combining traditional disease surveillance with search, social media, and news report data</article-title>
          <source>PLoS Negl Trop Dis</source>
          <year>2017</year>
          <month>01</month>
          <volume>11</volume>
          <issue>1</issue>
          <fpage>e0005295</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pntd.0005295"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pntd.0005295</pub-id>
          <pub-id pub-id-type="medline">28085877</pub-id>
          <pub-id pub-id-type="pii">PNTD-D-16-01733</pub-id>
          <pub-id pub-id-type="pmcid">PMC5268704</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Bao</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Role of media coverage in mitigating COVID-19 transmission: Evidence from China</article-title>
          <source>Technol Forecast Soc Change</source>
          <year>2021</year>
          <month>02</month>
          <volume>163</volume>
          <fpage>120435</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/33162619"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.techfore.2020.120435</pub-id>
          <pub-id pub-id-type="medline">33162619</pub-id>
          <pub-id pub-id-type="pii">S0040-1625(20)31261-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC7604032</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bridgman</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Merkley</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Loewen</surname>
              <given-names>PJ</given-names>
            </name>
            <name name-style="western">
              <surname>Owen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ruths</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Teichmann</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhilin</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>The causes and consequences of COVID-19 misperceptions: Understanding the role of news and social media</article-title>
          <source>HKS Misinfo Review</source>
          <year>2020</year>
          <month>06</month>
          <day>18</day>
          <fpage>1</fpage>
          <pub-id pub-id-type="doi">10.37016/mr-2020-028</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Poudyal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>RK</given-names>
            </name>
          </person-group>
          <article-title>Forecasting housing prices under different market segmentation assumptions</article-title>
          <source>Urban Studies</source>
          <year>2009</year>
          <month>01</month>
          <day>01</day>
          <volume>46</volume>
          <issue>1</issue>
          <fpage>167</fpage>
          <lpage>187</lpage>
          <pub-id pub-id-type="doi">10.1177/0042098008098641</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ardabili</surname>
              <given-names>SF</given-names>
            </name>
            <name name-style="western">
              <surname>Mosavi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ghamisi</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ferdinand</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Varkonyi-Koczy</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Reuter</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Rabczuk</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Atkinson</surname>
              <given-names>PM</given-names>
            </name>
          </person-group>
          <article-title>COVID-19 outbreak prediction with machine learning</article-title>
          <source>Algorithms</source>
          <year>2020</year>
          <month>10</month>
          <day>01</day>
          <volume>13</volume>
          <issue>10</issue>
          <fpage>249</fpage>
          <pub-id pub-id-type="doi">10.3390/a13100249</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kwan</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>JY</given-names>
            </name>
            <name name-style="western">
              <surname>Shin</surname>
              <given-names>JI</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>KH</given-names>
            </name>
            <name name-style="western">
              <surname>Hong</surname>
              <given-names>SH</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>YJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kronbichler</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Koyanagi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jacob</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ghayda</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Public interest in immunity and the justification for intervention in the early stages of the COVID-19 pandemic: analysis of Google trends data</article-title>
          <source>J Med Internet Res</source>
          <year>2021</year>
          <month>06</month>
          <day>18</day>
          <volume>23</volume>
          <issue>6</issue>
          <fpage>e26368</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2021/6/e26368/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/26368</pub-id>
          <pub-id pub-id-type="medline">34038375</pub-id>
          <pub-id pub-id-type="pii">v23i6e26368</pub-id>
          <pub-id pub-id-type="pmcid">PMC8216330</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Liao</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Using reports of symptoms and diagnoses on social media to predict COVID-19 case counts in mainland China: observational infoveillance study</article-title>
          <source>J Med Internet Res</source>
          <year>2020</year>
          <month>05</month>
          <day>28</day>
          <volume>22</volume>
          <issue>5</issue>
          <fpage>e19421</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2020/5/e19421/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/19421</pub-id>
          <pub-id pub-id-type="medline">32452804</pub-id>
          <pub-id pub-id-type="pii">v22i5e19421</pub-id>
          <pub-id pub-id-type="pmcid">PMC7257484</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pang</surname>
              <given-names>CP</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Retrospective analysis of the possibility of predicting the COVID-19 outbreak from Internet searches and social media data, China, 2020</article-title>
          <source>Euro Surveill</source>
          <year>2020</year>
          <month>03</month>
          <volume>25</volume>
          <issue>10</issue>
          <fpage>1</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.eurosurveillance.org/content/10.2807/1560-7917.ES.2020.25.10.2000199"/>
          </comment>
          <pub-id pub-id-type="doi">10.2807/1560-7917.ES.2020.25.10.2000199</pub-id>
          <pub-id pub-id-type="medline">32183935</pub-id>
          <pub-id pub-id-type="pmcid">PMC7078825</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Effenberger</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kronbichler</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shin</surname>
              <given-names>JI</given-names>
            </name>
            <name name-style="western">
              <surname>Mayer</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Tilg</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Perco</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Association of the COVID-19 pandemic with Internet search volumes: a Google trends analysis</article-title>
          <source>Int J Infect Dis</source>
          <year>2020</year>
          <month>06</month>
          <volume>95</volume>
          <fpage>192</fpage>
          <lpage>197</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1201-9712(20)30249-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.ijid.2020.04.033</pub-id>
          <pub-id pub-id-type="medline">32305520</pub-id>
          <pub-id pub-id-type="pii">S1201-9712(20)30249-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC7162745</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Venkatesh</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Gandhi</surname>
              <given-names>PA</given-names>
            </name>
          </person-group>
          <article-title>Prediction of COVID-19 outbreaks using Google trends in India: a retrospective analysis</article-title>
          <source>Healthc Inform Res</source>
          <year>2020</year>
          <month>07</month>
          <volume>26</volume>
          <issue>3</issue>
          <fpage>175</fpage>
          <lpage>184</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.e-hir.org/DOIx.php?id=10.4258/hir.2020.26.3.175"/>
          </comment>
          <pub-id pub-id-type="doi">10.4258/hir.2020.26.3.175</pub-id>
          <pub-id pub-id-type="medline">32819035</pub-id>
          <pub-id pub-id-type="pii">hir.2020.26.3.175</pub-id>
          <pub-id pub-id-type="pmcid">PMC7438693</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Valentin</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mercier</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lancelot</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Roche</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Arsevska</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Monitoring online media reports for early detection of unknown diseases: Insight from a retrospective study of COVID-19 emergence</article-title>
          <source>Transbound Emerg Dis</source>
          <year>2021</year>
          <month>05</month>
          <volume>68</volume>
          <issue>3</issue>
          <fpage>981</fpage>
          <lpage>986</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/32683774"/>
          </comment>
          <pub-id pub-id-type="doi">10.1111/tbed.13738</pub-id>
          <pub-id pub-id-type="medline">32683774</pub-id>
          <pub-id pub-id-type="pmcid">PMC7405088</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Linguistic Feature and Temporal Pattern of User-Generated News: Evidence from an Online News Portal in China</article-title>
          <year>2018</year>
          <month>06</month>
          <conf-name>PACIS 2018</conf-name>
          <conf-date>June 26-30, 2018</conf-date>
          <conf-loc>Yokohama, Japan</conf-loc>
          <fpage>19</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aisel.aisnet.org/pacis2018/19"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="web">
          <article-title>COVID-19 Epidemic Dynamics</article-title>
          <source>Chinese Center for Disease Control and Prevention</source>
          <access-date>2022-06-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.chinacdc.cn/jkzt/crb/zl/szkb_11803/jszl_11809/">http://www.chinacdc.cn/jkzt/crb/zl/szkb_11803/jszl_11809/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="web">
          <article-title>News</article-title>
          <source>Health Commission of Hubei Province</source>
          <access-date>2022-06-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://wjw.hubei.gov.cn/bmdt/dtyw/">http://wjw.hubei.gov.cn/bmdt/dtyw/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="web">
          <source>Sina Network Opinion Surveillance System</source>
          <access-date>2022-06-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://yqt.mdata.net/">https://yqt.mdata.net/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="web">
          <source>Baidu Index</source>
          <access-date>2022-06-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://index.baidu.com/">http://index.baidu.com/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shia</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Prediction of number of cases of 2019 novel coronavirus (COVID-19) using social media search index</article-title>
          <source>Int J Environ Res Public Health</source>
          <year>2020</year>
          <month>03</month>
          <day>31</day>
          <volume>17</volume>
          <issue>7</issue>
          <fpage>2365</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=ijerph17072365"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/ijerph17072365</pub-id>
          <pub-id pub-id-type="medline">32244425</pub-id>
          <pub-id pub-id-type="pii">ijerph17072365</pub-id>
          <pub-id pub-id-type="pmcid">PMC7177617</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goodwin</surname>
              <given-names>LD</given-names>
            </name>
            <name name-style="western">
              <surname>Leech</surname>
              <given-names>NL</given-names>
            </name>
          </person-group>
          <article-title>Understanding correlation: factors that affect the size of r</article-title>
          <source>The Journal of Experimental Education</source>
          <year>2006</year>
          <month>04</month>
          <volume>74</volume>
          <issue>3</issue>
          <fpage>249</fpage>
          <lpage>266</lpage>
          <pub-id pub-id-type="doi">10.3200/jexe.74.3.249-266</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Beretta</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Santaniello</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Nearest neighbor imputation algorithms: a critical evaluation</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2016</year>
          <month>07</month>
          <day>25</day>
          <volume>16 Suppl 3</volume>
          <fpage>74</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-016-0318-z"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-016-0318-z</pub-id>
          <pub-id pub-id-type="medline">27454392</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-016-0318-z</pub-id>
          <pub-id pub-id-type="pmcid">PMC4959387</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hswen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Brownstein</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hawkins</surname>
              <given-names>JB</given-names>
            </name>
          </person-group>
          <article-title>Use of a digital health application for influenza surveillance in China</article-title>
          <source>Am J Public Health</source>
          <year>2017</year>
          <month>07</month>
          <volume>107</volume>
          <issue>7</issue>
          <fpage>1130</fpage>
          <lpage>1136</lpage>
          <pub-id pub-id-type="doi">10.2105/AJPH.2017.303767</pub-id>
          <pub-id pub-id-type="medline">28520492</pub-id>
          <pub-id pub-id-type="pmcid">PMC5463210</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Combining time series models for forecasting</article-title>
          <source>International Journal of Forecasting</source>
          <year>2004</year>
          <month>01</month>
          <volume>20</volume>
          <issue>1</issue>
          <fpage>69</fpage>
          <lpage>84</lpage>
          <pub-id pub-id-type="doi">10.1016/s0169-2070(03)00004-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Simpkins</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>System Identification: Theory for the User, 2nd Edition (Ljung, L.; 1999) [On the Shelf]</article-title>
          <source>IEEE Robot Autom Mag</source>
          <year>2012</year>
          <month>06</month>
          <volume>19</volume>
          <issue>2</issue>
          <fpage>95</fpage>
          <lpage>96</lpage>
          <pub-id pub-id-type="doi">10.1109/MRA.2012.2192817</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lesaffre</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Rizopoulos</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Tsonaka</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>The logistic transform for bounded outcome scores</article-title>
          <source>Biostatistics</source>
          <year>2007</year>
          <month>01</month>
          <day>05</day>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>72</fpage>
          <lpage>85</lpage>
          <pub-id pub-id-type="doi">10.1093/biostatistics/kxj034</pub-id>
          <pub-id pub-id-type="medline">16597671</pub-id>
          <pub-id pub-id-type="pii">kxj034</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="web">
          <article-title>Seventh National Population Census</article-title>
          <source>National Bureau of Statistics</source>
          <access-date>2022-06-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.stats.gov.cn/ztjc/zdtjgz/zgrkpc/dqcrkpc/">http://www.stats.gov.cn/ztjc/zdtjgz/zgrkpc/dqcrkpc/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Stahel</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <source>Statistische Datenanalyse: Eine Einführung für Naturwissenschaftler</source>
          <year>2002</year>
          <publisher-loc>Braunschweig, Germany</publisher-loc>
          <publisher-name>Springer Vieweg Verlag</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="web">
          <article-title>Wuhan: Daily Testing Ability Raising From 200 to 2000</article-title>
          <source>Health Commission of Hubei Province</source>
          <year>2020</year>
          <month>01</month>
          <day>29</day>
          <access-date>2022-06-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://wjw.hubei.gov.cn/bmdt/ztzl/fkxxgzbdgrfyyq/fkdt/202001/t20200129_2016053.shtml">http://wjw.hubei.gov.cn/bmdt/ztzl/fkxxgzbdgrfyyq/fkdt/202001/t20200129_2016053.shtml</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>O’Brien</surname>
              <given-names>RM</given-names>
            </name>
          </person-group>
          <article-title>A caution regarding rules of thumb for variance inflation factors</article-title>
          <source>Qual Quant</source>
          <year>2007</year>
          <month>03</month>
          <day>13</day>
          <volume>41</volume>
          <issue>5</issue>
          <fpage>673</fpage>
          <lpage>690</lpage>
          <pub-id pub-id-type="doi">10.1007/s11135-006-9018-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>James</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Witten</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hastie</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Tibshirani</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <source>An Introduction to Statistical Learning with Applications in R</source>
          <year>2013</year>
          <publisher-loc>New York, NY</publisher-loc>
          <publisher-name>Springer</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ayyoubzadeh</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Ayyoubzadeh</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Zahedi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmadi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>R Niakan Kalhori</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Predicting COVID-19 incidence through analysis of Google trends data in Iran: data mining and deep learning pilot study</article-title>
          <source>JMIR Public Health Surveill</source>
          <year>2020</year>
          <month>04</month>
          <day>14</day>
          <volume>6</volume>
          <issue>2</issue>
          <fpage>e18828</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://publichealth.jmir.org/2020/2/e18828/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/18828</pub-id>
          <pub-id pub-id-type="medline">32234709</pub-id>
          <pub-id pub-id-type="pii">v6i2e18828</pub-id>
          <pub-id pub-id-type="pmcid">PMC7159058</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <source>Assessing Forecast Accuracy Measures</source>
          <year>2004</year>
          <month>03</month>
          <day>14</day>
          <access-date>2022-06-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.69.1016&#38;rep=rep1&#38;type=pdf">https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.69.1016&#38;rep=rep1&#38;type=pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kuhn</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Building predictive models in R using the caret package</article-title>
          <source>J Stat Softw</source>
          <year>2008</year>
          <volume>28</volume>
          <issue>5</issue>
          <fpage>1</fpage>
          <pub-id pub-id-type="doi">10.18637/jss.v028.i05</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Maindonald</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Braun</surname>
              <given-names>WJ</given-names>
            </name>
          </person-group>
          <article-title>DAAG: Data Analysis and Graphics Data and Functions</article-title>
          <source>Cran R</source>
          <year>2015</year>
          <access-date>2022-06-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cran.r-project.org/package=DAAG">https://cran.r-project.org/package=DAAG</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lazer</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kennedy</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>King</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Vespignani</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Big data. The parable of Google Flu: traps in big data analysis</article-title>
          <source>Science</source>
          <year>2014</year>
          <month>03</month>
          <day>14</day>
          <volume>343</volume>
          <issue>6176</issue>
          <fpage>1203</fpage>
          <lpage>1205</lpage>
          <pub-id pub-id-type="doi">10.1126/science.1248506</pub-id>
          <pub-id pub-id-type="medline">24626916</pub-id>
          <pub-id pub-id-type="pii">343/6176/1203</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Katella</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>5 Things To Know About the Delta Variant</article-title>
          <source>Yale Medicine</source>
          <year>2022</year>
          <month>03</month>
          <day>01</day>
          <access-date>2022-06-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.yalemedicine.org/news/5-things-to-know-delta-variant-covid">https://www.yalemedicine.org/news/5-things-to-know-delta-variant-covid</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="web">
          <article-title>A total of 16 mobile cabin hospitals have been built and 15 have been put into operation in Wuhan</article-title>
          <source>SOHU</source>
          <year>2020</year>
          <month>05</month>
          <day>14</day>
          <access-date>2022-06-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.sohu.com/a/395142180_118392">https://www.sohu.com/a/395142180_118392</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Broniatowski</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Paul</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>National and local influenza surveillance through Twitter: an analysis of the 2012-2013 influenza epidemic</article-title>
          <source>PLoS One</source>
          <year>2013</year>
          <volume>8</volume>
          <issue>12</issue>
          <fpage>e83672</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0083672"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0083672</pub-id>
          <pub-id pub-id-type="medline">24349542</pub-id>
          <pub-id pub-id-type="pii">PONE-D-13-35058</pub-id>
          <pub-id pub-id-type="pmcid">PMC3857320</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Saroha</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Tse</surname>
              <given-names>ZTH</given-names>
            </name>
            <name name-style="western">
              <surname>Ip</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Fung</surname>
              <given-names>IC</given-names>
            </name>
          </person-group>
          <article-title>How people react to Zika virus outbreaks on Twitter? A computational content analysis</article-title>
          <source>Am J Infect Control</source>
          <year>2016</year>
          <month>12</month>
          <day>01</day>
          <volume>44</volume>
          <issue>12</issue>
          <fpage>1700</fpage>
          <lpage>1702</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ajic.2016.04.253</pub-id>
          <pub-id pub-id-type="medline">27566874</pub-id>
          <pub-id pub-id-type="pii">S0196-6553(16)30623-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Kou</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Examining the relationships between air pollutants and the incidence of acute aortic dissection with electronic medical data in a moderately polluted area of Northwest China</article-title>
          <source>Inquiry</source>
          <year>2021</year>
          <month>12</month>
          <day>28</day>
          <volume>58</volume>
          <fpage>469580211065691</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/10.1177/00469580211065691?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%3dpubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/00469580211065691</pub-id>
          <pub-id pub-id-type="medline">34961361</pub-id>
          <pub-id pub-id-type="pmcid">PMC8721698</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
