<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JPH</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Public Health Surveill</journal-id>
      <journal-title>JMIR Public Health and Surveillance</journal-title>
      <issn pub-type="epub">2369-2960</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v6i3e12842</article-id>
      <article-id pub-id-type="pmid">32701458</article-id>
      <article-id pub-id-type="doi">10.2196/12842</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>An Automated Approach for Finding Spatio-Temporal Patterns of Seasonal Influenza in the United States: Algorithm Validation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Sanchez</surname>
            <given-names>Travis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Velappan</surname>
            <given-names>Nileena</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Daughton</surname>
            <given-names>Ashlynn</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Sambaturu</surname>
            <given-names>Prathyush</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1795-5287</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Bhattacharya</surname>
            <given-names>Parantapa</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3626-9939</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Chen</surname>
            <given-names>Jiangzhuo</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2729-3881</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Lewis</surname>
            <given-names>Bryan</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0793-6082</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Marathe</surname>
            <given-names>Madhav</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1653-0658</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Venkatramanan</surname>
            <given-names>Srinivasan</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-0874-8692</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Vullikanti</surname>
            <given-names>Anil</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>University of Virginia</institution>
            <addr-line>Biocomplexity Institute and Initiative, 995 Research Park Boulevard</addr-line>
            <addr-line>Charlottesville, VA, 22911</addr-line>
            <country>United States</country>
            <phone>1 540 577 3102</phone>
            <email>vsakumar@virginia.edu</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8597-6197</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>University of Virginia</institution>
        <addr-line>Charlottesville, VA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Anil Vullikanti <email>vsakumar@virginia.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <season>Jul-Sep</season>
        <year>2020</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>4</day>
        <month>9</month>
        <year>2020</year>
      </pub-date>
      <volume>6</volume>
      <issue>3</issue>
      <elocation-id>e12842</elocation-id>
      <history>
        <date date-type="received">
          <day>20</day>
          <month>11</month>
          <year>2018</year>
        </date>
        <date date-type="rev-request">
          <day>18</day>
          <month>1</month>
          <year>2019</year>
        </date>
        <date date-type="rev-recd">
          <day>16</day>
          <month>6</month>
          <year>2019</year>
        </date>
        <date date-type="accepted">
          <day>3</day>
          <month>4</month>
          <year>2020</year>
        </date>
      </history>
      <copyright-statement>©Prathyush Sambaturu, Parantapa Bhattacharya, Jiangzhuo Chen, Bryan Lewis, Madhav Marathe, Srinivasan Venkatramanan, Anil Vullikanti. Originally published in JMIR Public Health and Surveillance (http://publichealth.jmir.org), 04.09.2020.</copyright-statement>
      <copyright-year>2020</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Public Health and Surveillance, is properly cited. The complete bibliographic information, a link to the original publication on http://publichealth.jmir.org, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="http://publichealth.jmir.org/2020/3/e12842/" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Agencies such as the Centers for Disease Control and Prevention (CDC) currently release influenza-like illness incidence data, along with descriptive summaries of simple spatio-temporal patterns and trends. However, public health researchers, government agencies, as well as the general public, are often interested in deeper patterns and insights into how the disease is spreading, with additional context. Analysis by domain experts is needed for deriving such insights from incidence data.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>Our goal was to develop an automated approach for finding interesting spatio-temporal patterns in the spread of a disease over a large region, such as regions which have specific characteristics (eg, high incidence in a particular week, those which showed a sudden change in incidence) or regions which have significantly different incidence compared to earlier seasons.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We developed techniques from the area of transactional data mining for characterizing and finding interesting spatio-temporal patterns in disease spread in an automated manner. A key part of our approach involved using the principle of minimum description length for representing a given target set in terms of combinations of attributes (referred to as clauses); we considered both positive and negative clauses, relaxed descriptions which approximately represent the set, and used integer programming to find such descriptions. Finally, we designed an automated approach, which examines a large space of sets corresponding to different spatio-temporal patterns, and ranks them based on the ratio of their size to their description length (referred to as the compression ratio).</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>We applied our methods using minimum description length to find spatio-temporal patterns in the spread of seasonal influenza in the United States using state level influenza-like illness activity indicator data from the CDC. We observed that the compression ratios were over 2.5 for 50% of the chosen sets, when approximate descriptions and negative clauses were allowed. Sets with high compression ratios (eg, over 2.5) corresponded to interesting patterns in the spatio-temporal dynamics of influenza-like illness. Our approach also outperformed description by solution in terms of the compression ratio.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Our approach, which is an unsupervised machine learning method, can provide new insights into patterns and trends in the disease spread in an automated manner. Our results show that the description complexity is an effective approach for characterizing sets of interest, which can be easily extended to other diseases and regions beyond influenza in the US. Our approach can also be easily adapted for automated generation of narratives.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>epidemic data analysis</kwd>
        <kwd>summarization</kwd>
        <kwd>spatio-temporal patterns</kwd>
        <kwd>transactional data mining</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Large-scale spatio-temporal analyses and forecasts are becoming increasingly common for several diseases, such as influenza [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref4">4</xref>]. There is a lot of public interest in analysis of spatio-temporal trends relating to how these diseases are spreading across the United States—this includes statements about whether the season has officially started, a listing of regions which have differing levels of activity, and the contrast between the current season and earlier seasons. Such analyses have a broad readership and are popular among news media, the general public, and government agencies, as well as public health organizations; this is evidenced by spatio-temporal pattern reports [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>] about the spread of influenza from news agencies and blogs.</p>
      <p>Such patterns are typically identified manually by domain experts who have significant expertise on specific diseases. Data for such analyses often comes from public health agencies, such as the Centers for Disease Control and Prevention (CDC) [<xref ref-type="bibr" rid="ref7">7</xref>] and World Health Organization. Reports generated by the CDC contain raw surveillance data on metrics (eg, activity level from outpatient visits and rates of hospitalization) across states in the US. In addition, summaries of regions with specific characteristics (eg, those which have high activity levels) are also included in the reports [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. For instance, one CDC report [<xref ref-type="bibr" rid="ref8">8</xref>] summarizes the states with high influenza-like illness activity for the week ending on March 4, 2017 with the number of states followed by a list of the state names.</p>
      <p>Such descriptive listings are easy to construct from raw data but are tedious to read and do not provide deeper insight into the disease spread. In contrast, the analysis by Mashable [<xref ref-type="bibr" rid="ref6">6</xref>] is a succinct description of the set of states which have widespread activity, namely, all states in the contiguous US, except Oregon. An analysis by the New York Times [<xref ref-type="bibr" rid="ref5">5</xref>] was also a good and succinct description of the set of states which have reported widespread activity for 3 consecutive weeks. In addition to descriptions of the set of states with a particular activity level, sets exhibiting specific temporal patterns might also be of interest. An example is the set of states which maintained stable high activity for 3 consecutive weeks, ending in the week of January 27, 2018; most states had high influenza-like illness activity level 4 weeks prior, plus the states of New Jersey, New Mexico, Virginia, Washington, and Wyoming. Such descriptions involve identification of features common to these states, which provide additional insights on the outbreak.</p>
      <p>The overall objective of our work was to automate the process of identifying interesting spatio-temporal patterns from disease surveillance data and generating succinct descriptions for them. In order to do this, we encoded the incidence data as binary matrices (presence or absence of a feature) and used techniques from pattern mining [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>] in transactional data to find insights into epidemic spread; we demonstrated its utility using seasonal influenza in the US as a case study.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data</title>
        <p>We used the state level influenza-like illness activity indicator data available from the CDC [<xref ref-type="bibr" rid="ref11">11</xref>]. In the data set, each state for each week during a given influenza season is assigned an activity level from 1 to 10 based on the severity of influenza prevalence in that week (measured using the percentage of outpatient visits that show influenza-like symptoms) [<xref ref-type="bibr" rid="ref12">12</xref>]. These activity levels are also grouped by coarser labels such as minimal (1-3), low (4-5), moderate (6-7), or high (8-10) [<xref ref-type="bibr" rid="ref13">13</xref>]. We also incorporated the geographic spread index as published by CDC in [<xref ref-type="bibr" rid="ref14">14</xref>], which categorizes the states based on the internal spatial spread of influenza. We used a number of features associated with each state which are defined by the CDC and can be categorized as follows:</p>
        <p>1. Geographical or spatial, which included features such as Great Lakes, southeast, mid-Atlantic;</p>
        <p>2. Temporal, which included features such as activity level (eg, high, moderate, and low) in the <italic>t</italic>th week before the current (at that time) week, geographical spread (eg, widespread or local) in the <italic>t</italic>th week prior, whether the number of infections has crossed a threshold, whether the peak has been reached, and similarity with the past season. In the description below, these features are denoted by <italic>was1_high</italic> (states with high influenza-like illness activity 1 week prior), <italic>was2_moderate</italic> (states with moderate influenza-like illness activity 2 weeks prior), <italic>was52_high</italic> (states with high activity 52 weeks prior), and so on. These features capture the spatial, temporal, and severity aspects of the reported cases. A full list of attributes and their descriptions is presented in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>We used data corresponding to weeks from 2014 to 2017. To generate narratives for a particular week, we used data from these reports for that week, the previous 3 weeks, and 52 weeks prior to construct the temporal features for each state. This was expressed as a data matrix <italic>D</italic> whose rows correspond to regions (<italic>n</italic>=51, representing 50 states and the District of Columbia) and whose columns correspond to features (<italic>m</italic>=42 spatial, temporal, or severity features). Therefore, the data matrix for a given week had <italic>n</italic>×<italic>m</italic>=2142 entries.</p>
      </sec>
      <sec>
        <title>Problem Formulation</title>
        <p>Let <italic>D<sub>n×m</sub></italic> be the data matrix, where each row corresponds to a state and each column to a feature, and <italic>D<sub>ij</sub></italic>=1 if state <italic>i</italic> has feature <italic>j</italic>. Let <italic>U</italic>={<italic>e</italic><sub>1</sub><italic>,..., e<sub>n</sub></italic>} be the universe of elements, in our case, the set of all states. Let <italic>D<sub>j</sub></italic>={<italic>i</italic>: <italic>D<sub>ij</sub></italic>=1} denote the set of elements having feature <italic>j</italic>. Let <italic>S</italic>(<italic>j</italic><sub>1</sub>,..., <italic>j<sub>k</sub></italic>)= <inline-graphic xlink:href="publichealth_v6i3e12842_fig4.png" xlink:type="simple" mimetype="image"/> ∩...∩ <inline-graphic xlink:href="publichealth_v6i3e12842_fig5.png" xlink:type="simple" mimetype="image"/> denote the set of elements that have features (<italic>j</italic><sub>1</sub>,..., <italic>j<sub>k</sub></italic>) (denoted by <bold><italic>j</italic></bold>), referred to as a conjunctive clause. The clause S(<bold><italic>j</italic></bold>) has length <italic>k</italic>, meaning that it is formed by the intersection of <italic>k</italic> features.</p>
        <p>Given a target set <italic>T</italic> ⊆ <italic>U</italic>, we consider expressions of <italic>T</italic> in terms of unions and differences, ie,</p>
        <p><inline-graphic xlink:href="publichealth_v6i3e12842_fig6.png" xlink:type="simple" mimetype="image"/>, (<bold>1</bold>)</p>
        <p>with an associated cost</p>
        <p><inline-graphic xlink:href="publichealth_v6i3e12842_fig7.png" xlink:type="simple" mimetype="image"/>, (<bold>2</bold>)</p>
        <p>where <italic>α</italic> and <italic>β</italic> are the constant parameters associated with positive,</p>
        <p><inline-graphic xlink:href="publichealth_v6i3e12842_fig8.png" xlink:type="simple" mimetype="image"/>, (<bold>3</bold>)</p>
        <p>and negative clauses,</p>
        <p><inline-graphic xlink:href="publichealth_v6i3e12842_fig9.png" xlink:type="simple" mimetype="image"/>, (<bold>4</bold>)</p>
        <p>respectively, and</p>
        <p><inline-graphic xlink:href="publichealth_v6i3e12842_fig10.png" xlink:type="simple" mimetype="image"/>, (<bold>5</bold>)</p>
        <p>denotes the number of features involved in a clause</p>
        <p><inline-graphic xlink:href="publichealth_v6i3e12842_fig11.png" xlink:type="simple" mimetype="image"/>. (<bold>6</bold>)</p>
        <p>The negative clauses describe the elements which need to be removed from the set of positive clauses, in order to exactly cover the elements of <italic>T</italic>.</p>
        <p>Given a subset <italic>T</italic> ⊆ <italic>U</italic> (referred to as a target set), and a data set <italic>D</italic>, the minimum description length problem involves finding a set of tuples <bold><italic>j</italic><sup>1</sup></bold>,..., <bold><italic>j<sup>s</sup></italic></bold>, such that <italic>T</italic> is represented in terms of unions and differences and the associated cost (represented by equation 2) is minimized.</p>
        <p>In order to make the descriptions interpretable, we will restrict the sizes of these clauses (ie, the number <inline-graphic xlink:href="publichealth_v6i3e12842_fig12.png" xlink:type="simple" mimetype="image"/> of columns whose intersection is allowed); herein, we will focus on <inline-graphic xlink:href="publichealth_v6i3e12842_fig12.png" xlink:type="simple" mimetype="image"/>≤2, though our approach extends to any <italic>k</italic>.</p>
        <p>Our main idea for finding patterns of interest was to explore the space of all target sets and identify those which have low cost descriptions. This was motivated by the minimum description length principle, that forms the basis of many machine learning methods to find such descriptions; we refer to [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>] for details on this topic.</p>
        <p>In some cases, the target set <italic>T</italic> does not have a small description, but we can find a set <italic>T’</italic> which is close to <italic>T</italic> and has a smaller description than <italic>T</italic>. We model this as finding a representation for a subset <italic>T’</italic> such that <italic>T’≈T</italic>, which is formalized as the minimum approximate description length problem. Given a target set <italic>T</italic> ⊆ <italic>U</italic>, a data set <italic>D</italic>, and constant parameters <italic>α</italic>, <italic>β</italic>, <italic>γ</italic>, the minimum approximate description length problem involves finding a set of tuples <bold><italic>j</italic><sup>1</sup></bold>,..., <bold><italic>j<sup>s</sup></italic></bold>, for representation of <italic>T’</italic> as unions and differences, such that the symmetric difference of <italic>T</italic> and <italic>T’</italic> is of size at most <italic>γ</italic>&#124;<italic>T</italic>&#124;, and the associated cost is minimized. Since minimum approximate description length is a generalization of minimum description length, we only consider the minimum approximate description length problem in the rest of the paper. The minimum description length and minimum approximate description length problems are both NP-complete, even when <inline-graphic xlink:href="publichealth_v6i3e12842_fig12.png" xlink:type="simple" mimetype="image"/>=1, which corresponds to the set cover problem (refer to [<xref ref-type="bibr" rid="ref17">17</xref>] for discussion on this topic).</p>
      </sec>
      <sec>
        <title>Approach and Implementation</title>
        <p>We used an integer programming approach described in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, which is able to scale well for the problems of interest in epidemic analysis. We used Gurobi optimization software [<xref ref-type="bibr" rid="ref18">18</xref>] to solve the resulting integer program. The size of the instances encountered results in programs that can be solved very efficiently.</p>
      </sec>
      <sec>
        <title>Generate Set Descriptions</title>
        <p>We considered the set of states with a high activity level in a given week, as a target set <italic>T</italic> and prepared the data matrix <italic>D</italic>. These states had value 1 in the column named high in the matrix. Then, we used our method to compute the succinct descriptions for the target set <italic>T</italic> for the parameters (<italic>α</italic>, <italic>β</italic>, <italic>γ</italic>)=(2, 2, 0). From the minimum description length principle, a set <italic>T</italic> was likely to be an interesting pattern if it had a high compression ratio.</p>
        <p>We also studied the impact of the parameter <italic>γ</italic> on the description length. Recall that this parameter controls how accurately we attempt to describe the target set. A larger <italic>γ</italic> would mean greater error but should lead to a more succinct description. The target set <italic>T</italic> was the set of states with high activity in a given week. We ran our method for the given week with target set <italic>T</italic> for each value of <italic>γ</italic> ∈ {0.1, 0.2, 0.3}.</p>
      </sec>
      <sec>
        <title>Ranking Set Descriptions</title>
        <p>It was not known a priori which target sets would give interesting patterns. We searched from a large space of possible target sets corresponding to all clauses with up to <italic>k</italic> terms (ie, sets formed by intersections of up to <italic>k</italic> columns), computed their minimum description length scores, and ranked them based on their compression ratio, and other characteristics.</p>
      </sec>
      <sec>
        <title>Baselines and Evaluation Measures</title>
        <p>The work of Xiang et al [<xref ref-type="bibr" rid="ref19">19</xref>] is directly related to our approach and can be considered as a special case of minimum description length, where only positive clauses are allowed. We referred to this as description by solution. We used the number of clauses used by description by solution and minimum approximate description length for comparison.</p>
        <p>We used the compression ratio as a metric for evaluating the performance of our method. The number of clauses used for minimum approximate description length for a target set <italic>T</italic> was <italic>s</italic>. The compression ratio provided by minimum approximate description length was defined as the ratio of the target set size &#124;<italic>T</italic>&#124; to the number of clauses used in the solution to minimum approximate description length, compression ratio=&#124;<italic>T</italic>&#124;/<italic>s</italic>.</p>
        <p>We also provided a scoring system to determine the interestingness of a target set. Sets consisting of states with high activity level were likely to be more interesting than those with moderate, low or minimal activity levels; therefore, these were assigned scores of 4, 3, 2, and 1 for high, moderate, low, and minimal activity level, respectively. Next, states exhibiting a sudden change in activity level (eg, from low to high, or vice versa) were considered more interesting than those having no change in activity levels; therefore, we assigned a score of 5 for the former type and 2 for the latter. Then, a set of states with high activity that week and minimal activity 1 week prior had a score of 9, while a set of states with minimal activity that week and minimal activity 1 week prior had a score of 3. This process is described in detail in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The score assigned to each target set or description measured its interestingness.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Generate Set Descriptions</title>
        <p>The text descriptions (manually generated) in <xref ref-type="table" rid="table1">Table 1</xref> correspond to solutions computed using our method. The mean compression ratio was 2.63. This showed that our method could easily find succinct descriptions for different kinds of target sets.</p>
        <p>Qualitatively, some descriptions (<xref ref-type="table" rid="table1">Table 1</xref>) involved large target sets (eg, February 18, 2017 and January 3, 2015 which correspond to 27 and 29 states, respectively). The CDC descriptions for these weeks were long lists, which were unlikely to give useful insights or identify any patterns. The description for the week of January 3, 2015 was succinct. Almost all the states with high or moderate activity level in the previous week had high activity in that week, 3 new states that were not experiencing high or moderate activity in the previous week had high activity, and Florida and Georgia experienced a sharp decline in activity levels within the week.</p>
        <p>We also noted that some of the descriptions may not be insightful. For instance, the description for the week of April 8, 2017 was simply a list of 2 states; it is possible that there were no common characteristics between the 2 states, so this was the most succinct. The description for the week of February 18, 2017 was quite complicated. It combined 3 sets of states with different activity levels in different times in the past. <xref rid="figure1" ref-type="fig">Figure 1</xref> shows that a set of 10 states with high influenza-like illness for the week of January 21, 2017 was represented using 6 clauses. The compression ratio achieved was 1.67 as we only use 6 clauses instead of listing 10 state names. However, automated generation of such descriptions will allow a human expert to filter and select appropriate descriptions, instead of creating them from scratch.</p>
        <p>The compression ratio increased as we increased the relaxation factor <italic>γ</italic> (<xref ref-type="table" rid="table2">Table 2</xref>). <xref rid="figure2" ref-type="fig">Figure 2</xref> shows that a set of 29 states with high influenza-like illness activity for the week of January 3, 2015 can be represented using only 3 clauses, although 8 of the 29 states are omitted from the description (shown in the light blue region) because the relaxation parameter is set to 0.3.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Description for the set of states with high activity levels.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="150"/>
            <col width="300"/>
            <col width="90"/>
            <col width="300"/>
            <col width="40"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td>Week</td>
                <td>Descriptions of states with high influenza-like illness activity in the week</td>
                <td>Number of clauses</td>
                <td>Target set</td>
                <td>&#124;<italic>T</italic>&#124;</td>
                <td>Compression ratio</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>January 21, 2017</td>
                <td>Kansas, New York, Washington, and states with high activity 2 weeks back excluding Oregon and Utah</td>
                <td>6</td>
                <td>Alabama, Georgia, Kansas, Kentucky, Missouri, New Jersey, New York, Oklahoma, South Carolina, Washington</td>
                <td>10</td>
                <td>1.67</td>
              </tr>
              <tr valign="top">
                <td>February 18, 2017</td>
                <td>Alaska, Illinois, Maryland, Minnesota, states with high activity a week prior, states with low activity 2 weeks prior, and states with minimal activity 3 weeks prior excluding Wyoming</td>
                <td>7</td>
                <td>Alabama, Alaska, Arkansas, Connecticut, Georgia, Illinois, Indiana, Kansas, Kentucky, Louisiana, Maryland, Michigan, Minnesota, Mississippi, Missouri, New Jersey, New Mexico, New York, North Carolina, Oklahoma, Pennsylvania, Rhode Island, South Carolina, South Dakota, Tennessee, Texas, Virginia</td>
                <td>27</td>
                <td>3.86</td>
              </tr>
              <tr valign="top">
                <td>March 25, 2017</td>
                <td>States with high activity for last 2 weeks, excluding Louisiana, Mississippi and Texas</td>
                <td>4</td>
                <td>Alabama, Arkansas, Georgia, Kansas, Kentucky, North Carolina, Oklahoma, South Carolina, Tennessee, Virginia</td>
                <td>10</td>
                <td>2.50</td>
              </tr>
              <tr valign="top">
                <td>April 8, 2017</td>
                <td>Kentucky, South Carolina</td>
                <td>2</td>
                <td>Kentucky, South Carolina</td>
                <td>2</td>
                <td>1.00</td>
              </tr>
              <tr valign="top">
                <td>January 3, 2015</td>
                <td>California, Nevada, New York, and states with high or moderate activity levels a week prior excluding Florida and Georgia</td>
                <td>7</td>
                <td>Alabama, Arkansas, California, Colorado, Hawaii, Idaho, Illinois, Indiana, Kansas, Kentucky, Louisiana, Maryland, Minnesota, Mississippi, Missouri, Nevada, New Mexico, New York, North Carolina, Ohio, Oklahoma, Pennsylvania, South Carolina, Tennessee, Texas, Utah, Virginia, West Virginia, Wisconsin</td>
                <td>29</td>
                <td>4.14</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>The set representation of the description for the week of January 21, 2017. Each circle is a set and the states in the set are listed with their respective abbreviations. The states in the blue region correspond to the target set T. Oregon and Utah are the singleton subsets (in dark blue) with high influenza-like illness activity two weeks prior but not in that week. AL: Alabama; GA: Georgia; ILI: influenza-like illness; KY: Kentucky; KS: Kansas; MO: Missouri; NJ: New Jersey; NY: New York; OK: Oklahoma; OR: Oregon; SC: South Carolina; UT: Utah; WA: Washington.</p>
          </caption>
          <graphic xlink:href="publichealth_v6i3e12842_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Impact of varying relaxation factor γ on the description and compression ratio using 2 examples.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="170"/>
            <col width="470"/>
            <col width="160"/>
            <col width="170"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Week, <italic>γ</italic></td>
                <td>Description</td>
                <td>Clauses, number</td>
                <td>Compression ratio</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="2">
                  <bold>January 21, 2017</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>0</td>
                <td>Kansas, New York, Washington, and states with high activity 2 weeks prior, excluding Oregon and Utah</td>
                <td>6</td>
                <td>1.67</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>0.1</td>
                <td>Kansas, Washington, and states with high activity 2 weeks prior, excluding Oregon and Utah</td>
                <td>5</td>
                <td>2</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>0.2</td>
                <td>New York and states with high activity 2 weeks back, excluding Oregon and Utah</td>
                <td>4</td>
                <td>2.5</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>0.3</td>
                <td>States with high activity 2 weeks back, excluding Oregon and Utah</td>
                <td>3</td>
                <td>3.33</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <bold>January 3, 2015</bold>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>0</td>
                <td>California, Nevada, New York, and states with high or moderate activity levels a week prior, excluding Florida and Georgia</td>
                <td>7</td>
                <td>4.14</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>0.1</td>
                <td>New York, and states with high or moderate activity levels a week prior, excluding Florida and Georgia</td>
                <td>5</td>
                <td>5.8</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>0.2</td>
                <td>States with high or moderate activity levels a week prior, excluding Florida and Georgia</td>
                <td>4</td>
                <td>7.25</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>0.3</td>
                <td>States with high activity level a week prior, excluding Florida and Georgia</td>
                <td>3</td>
                <td>9.67</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>The set representation of the description of the set of states with high influenza-like illness activity on January 3, 2015. The blue set corresponds to the states with high activity 1 week prior. The dark blue colored singletons Florida and Georgia are subsets of the blue set but do not have high activity in the current week. The light blue colored set consists of the states omitted from the description due to relaxation. AL: Alabama; AR: Arkansas; CA: California; CO: Colorado; HI: Hawaii; ID: Idaho; IL: Illinois; IN: Indiana; KS: Kansas; KY: Kentucky; LA: Louisiana; MD: Maryland; MN: Minnesota; MS: Mississippi; MO: Missouri; NV: Nevada; NM: New Mexico; NY: New York; NC: North Carolina; OH: Ohio; OK: Oklahoma; PA: Pennsylvania; SC: South Carolina; TN: Tennessee; TX: Texas; UT: Utah; VA: Virginia; WV: West Virginia; WI: Wisconsin.</p>
          </caption>
          <graphic xlink:href="publichealth_v6i3e12842_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Ranking Set Descriptions</title>
        <p>We found that the top scoring narratives were generally trends. An example of a trend found by our method was a gradual increase in activity levels over consecutive weeks; the states Alabama, Georgia, Mississippi, and Tennessee had high activity in the week of March 12, 2016, had moderate activity the previous week, and had minimal activity 2 weeks prior. Another trend was stable high activity for consecutive weeks; in the week ending January 27, 2018, New Jersey, New Mexico, Virginia, Washington, and Wyoming, and states with high activity 4 weeks earlier, excluding Nebraska and Tennessee, had high activity levels for 3 consecutive weeks. Another trend was a gradual decrease in influenza-like illness activity over consecutive weeks; for the week of February 1, 2014, the activity levels in North Carolina decreased from high to moderate to low in 3 consecutive weeks.</p>
        <p>Examples of surprise events identified by our methods were (1) the activity level in North Carolina, New Mexico, South Dakota, and Wyoming jumped from low to high within 1 week, for the week ending February 4, 2017 and (2) the activity level in New Hampshire and Tennessee changed from high to low within 1 week, for the week ending February 2, 2013.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Interestingness scores.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="150"/>
            <col width="80"/>
            <col width="440"/>
            <col width="270"/>
            <col width="60"/>
            <thead>
              <tr valign="top">
                <td>Week</td>
                <td><italic>α</italic>, <italic>β</italic>, <italic>γ</italic></td>
                <td>Target set or pattern</td>
                <td>Description</td>
                <td>Score</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td rowspan="3">January 27, 2018</td>
                <td rowspan="3">(0, 2, 2)</td>
                <td>States with high activity the specified week, low activity 2 weeks prior, and moderate activity 3 weeks prior</td>
                <td>Hawaii, Maryland, North Carolina, Ohio</td>
                <td>14</td>
              </tr>
              <tr valign="top">
                <td>States with moderate activity 1 week prior, minimal activity 2 weeks prior, and low activity 3 weeks prior</td>
                <td>North Dakota</td>
                <td>13</td>
              </tr>
              <tr valign="top">
                <td>States with low activity 2 weeks prior, moderate activity 3 weeks prior, and minimal activity 4 weeks prior</td>
                <td>Maryland, North Carolina, Ohio</td>
                <td>7</td>
              </tr>
              <tr valign="top">
                <td rowspan="2">February 25, 2017</td>
                <td rowspan="2">(0.3, 2, 4)</td>
                <td>States with high activity 1 week prior, low activity 2 weeks prior, and moderate activity 3 weeks prior</td>
                <td>Iowa</td>
                <td>14</td>
              </tr>
              <tr valign="top">
                <td>States that had moderate activity levels 1 week prior, minimal activity levels 3 weeks prior, and minimal activity levels 4 weeks prior</td>
                <td>Massachusetts, Ohio, Wisconsin</td>
                <td>8</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Comparison With Baselines</title>
        <p>Minimum approximate description length provided summaries at less cost than those provided by description by solution for the weeks of January 21, 2017; February 18, 2017; and March 3, 2017 (<xref rid="figure3" ref-type="fig">Figure 3</xref>). For the remaining weeks, minimum approximate description length provided summaries at a cost equivalent to those provided by description by solution.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Solution comparison: minimum approximate description length versus description by solution.</p>
          </caption>
          <graphic xlink:href="publichealth_v6i3e12842_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings and Previous Work</title>
        <p>There has been a lot of previous work [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref22">22</xref>] on finding spatio-temporal patterns in different data sets. These have typically used unsupervised machine learning methods, and we refer the readers to [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>] for surveys on different algorithms and their applications to various data sets. As is the case with other unsupervised methods, the specific technique depends on the application. We note that mining patterns from transactional data has been successfully used in many areas, such as analysis of retail transaction data [<xref ref-type="bibr" rid="ref23">23</xref>], biomedical data analysis [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref24">24</xref>], and information retrieval [<xref ref-type="bibr" rid="ref25">25</xref>]. The approach of finding patterns based on compression and small description has been found to be useful in many settings [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref28">28</xref>]. We found that our description length-based approach gives useful insights into spatio-temporal patterns in incidence of influenza-like illness, especially when negative clauses are allowed. However, no prior methods handle negative clauses, to the best of our knowledge. In addition to negative clauses, we found that the relaxed versions can also significantly reduce the complexity of descriptions in many cases.</p>
        <p>Our ranking method also provides a systematic approach to identify trends and surprises in the spread of influenza-like illness. However, the descriptions of high score are not always intuitive or interesting, which is often the case with unsupervised machine learning methods. Instead, our ranking-based approach (or other variations of it) could help provide new insights to a domain expert, who might be able to find interesting spatio-temporal patterns more easily. Thus, such an approach could be a first step in processing epidemic incidence data. We believe that including more characteristics for the data (ie, more columns in the data matrix <italic>D</italic>) can help find more succinct descriptions. Furthermore, the integer programming–based approach is quite powerful, and more constraints can be easily added to generate descriptions with specific kinds of properties. Though the descriptions reported here were generated manually based on the outputs, the outputs are well structured, and the descriptions could conceivably be generated easily using natural language processing techniques.</p>
        <p>We compared the performance of our method with 2 other pattern detection methods in the literature, neither of which, as mentioned earlier, considers negative clauses. The first method, called Apriori [<xref ref-type="bibr" rid="ref23">23</xref>], is a very popular approach for association rule mining and pattern detection in a database containing transactions. Each transaction is seen as a set of items called an item set. The Apriori algorithm finds the frequent item sets in the database, that is, the item sets that appear frequently among the transactions of the database. We observed that the rules generated by Apriori using Weka [<xref ref-type="bibr" rid="ref29">29</xref>] are trivial in nature and are not highly informative.</p>
        <p>The work of Xiang et al [<xref ref-type="bibr" rid="ref19">19</xref>] (description by solution) can be considered as a special case of minimum description length, where only positive clauses are allowed. Xiang et al [<xref ref-type="bibr" rid="ref19">19</xref>] give a logarithmic approximation for the description by solution problem for such instances. We implemented an integer linear program to solve this problem exactly. By comparing the solutions provided by minimum approximate description length with those of description by solution, we demonstrated the benefit of allowing differences in generating compact descriptions. We note that using additional attributes for the regions might allow for more succinct descriptions.</p>
        <p>Our methodology could be easily extended to other diseases and applications involving spatio-temporal data, since the method can handle very general kinds of features and clauses formed by them. The ranking method would have to be designed based on the specific domain. Also, we expect our method could scale to much larger data sets easily.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>The feature values may be real numbers (eg, the similarity with a past season can be a correlation metric) rather than binary values. One way to handle this issue would be to map the nonbinary values to binary using discretization of the weights. Since we limited our focus to only meaningful features, our current approach explores target sets with temporal properties over small time intervals. If the number of features were to increase by a few orders of magnitude beyond what we considered, the integer linear program may not be able to scale well. One way to address this problem would be to design scalable heuristics that give some theoretical or experimental guarantees.</p>
      </sec>
      <sec>
        <title>Conclusion</title>
        <p>Automated generation of interesting spatio-temporal patterns and trends is an important problem, and can be especially useful to public health experts, as well as the general public. Our approach, based on techniques from pattern mining, provides a short list of patterns in influenza-like illness data from the CDC. We found that sets with high compression ratio tend to have common characteristics, which are often interesting. This is, however, an unsupervised machine learning method, and its output needs to be verified manually. Our ranking method is one way to select interesting patterns in an automated manner. The techniques developed in this paper could potentially be applied to other diseases and other public health domains.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Additional material.</p>
        <media xlink:href="publichealth_v6i3e12842_app1.docx" xlink:title="DOCX File , 142 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">CDC</term>
          <def>
            <p>Centers for Disease Control and Prevention</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The work of the authors has been partially supported by the following: National Science Foundation grants (IIS-1633028, CCF-1918656 and ACI-1443054) and Defense Threat Reduction Agency grants (HDTRA1-11-1-0016 and HDTRA1-17-D-0023).</p>
    </ack>
    <fn-group>
      <fn fn-type="con">
        <p>PS, PB, BL, and AV designed the study. PS, PB, and AV developed the methods. All authors helped in the evaluation and writing.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chakraborty</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Khadivi</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Mahendiran</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Butler</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Nsoesie</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Mekaru</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Brownstein</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Marathe</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ramakrishnan</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Forecasting a moving target: ensemble models for ILI case count predictions</article-title>
          <year>2014</year>
          <conf-name>SIAM International Conference on Data Mining</conf-name>
          <conf-date>2014</conf-date>
          <conf-loc>Philadelphia</conf-loc>
          <fpage>262</fpage>
          <lpage>270</lpage>
          <pub-id pub-id-type="doi">10.1137/1.9781611973440.30</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tizzoni</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bajardi</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Poletto</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ramasco</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Balcan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Gonçalves</surname>
              <given-names>Bruno</given-names>
            </name>
            <name name-style="western">
              <surname>Perra</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Colizza</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Vespignani</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Real-time numerical forecast of global epidemic spreading: case study of 2009 A/H1N1pdm</article-title>
          <source>BMC Med</source>
          <year>2012</year>
          <month>12</month>
          <day>13</day>
          <volume>10</volume>
          <fpage>165</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedicine.biomedcentral.com/articles/10.1186/1741-7015-10-165"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1741-7015-10-165</pub-id>
          <pub-id pub-id-type="medline">23237460</pub-id>
          <pub-id pub-id-type="pii">1741-7015-10-165</pub-id>
          <pub-id pub-id-type="pmcid">PMC3585792</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Chakraborty</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Mekaru</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Brownstein</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ramakrishnan</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Dynamic poisson autoregression for influenza-like-illness case count prediction</article-title>
          <year>2015</year>
          <conf-name>ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name>
          <conf-date>2015</conf-date>
          <conf-loc>Sydney</conf-loc>
          <fpage>1285</fpage>
          <lpage>1294</lpage>
          <pub-id pub-id-type="doi">10.1145/2783258.2783291</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brooks</surname>
              <given-names>LC</given-names>
            </name>
            <name name-style="western">
              <surname>Farrow</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Hyun</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tibshirani</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Rosenfeld</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Flexible modeling of epidemics with an empirical bayes framework</article-title>
          <source>PLoS Comput Biol</source>
          <year>2015</year>
          <month>08</month>
          <volume>11</volume>
          <issue>8</issue>
          <fpage>e1004382</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pcbi.1004382"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pcbi.1004382</pub-id>
          <pub-id pub-id-type="medline">26317693</pub-id>
          <pub-id pub-id-type="pii">PCOMPBIOL-D-14-01884</pub-id>
          <pub-id pub-id-type="pmcid">PMC4552841</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McNeil Jr.</surname>
              <given-names>DG</given-names>
            </name>
          </person-group>
          <article-title>This Flu Season Is the Worst in Nearly a Decade</article-title>
          <source>The New York Times</source>
          <year>2018</year>
          <month>01</month>
          <day>26</day>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.nytimes.com/2018/01/26/health/flu-rates-deaths.html">https://www.nytimes.com/2018/01/26/health/flu-rates-deaths.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kaufman</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Flu Season Has Gone From Bad to Worse, CDC Reports, as 17 More Children Die in the US</article-title>
          <source>Mashable</source>
          <year>2018</year>
          <month>02</month>
          <day>02</day>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mashable.com/2018/02/02/cdc-says-2018-flu-season-worse-children-deaths/#6KaneYhQEmqf">https://mashable.com/2018/02/02/cdc-says-2018-flu-season-worse-children-deaths/#6KaneYhQEmqf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="web">
          <article-title>2017-2018 Influenza Season Week 6 ending February 10, 2018</article-title>
          <source>CDC FluView</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cdc.gov/flu/weekly/weeklyarchives2017-2018/Week06.htm">https://www.cdc.gov/flu/weekly/weeklyarchives2017-2018/Week06.htm</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="web">
          <article-title>2016-2017 Influenza Season Week 9 ending March 4, 2017</article-title>
          <source>CDC FluView</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cdc.gov/flu/weekly/weeklyarchives2016-2017/Week09.htm">https://www.cdc.gov/flu/weekly/weeklyarchives2016-2017/Week09.htm</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Han</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Xin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Frequent pattern mining: current status and future directions</article-title>
          <source>Data Min Knowl Disc</source>
          <year>2007</year>
          <month>1</month>
          <day>27</day>
          <volume>15</volume>
          <issue>1</issue>
          <fpage>55</fpage>
          <lpage>86</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/article/10.1007/s10618-006-0059-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s10618-006-0059-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Han</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Pei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mortazavi-Asl</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Dayal</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>M-C</given-names>
            </name>
          </person-group>
          <article-title>FreeSpan: frequent pattern-projected sequential pattern mining</article-title>
          <source>KDD '00: Proceedings of the Sixth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</source>
          <year>2000</year>
          <publisher-loc>New York</publisher-loc>
          <publisher-name>ACM</publisher-name>
          <fpage>355</fpage>
          <lpage>359</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="web">
          <article-title>FluView</article-title>
          <source>CDC</source>
          <access-date>2019-12-02</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://gis.cdc.gov/grasp/fluview/main.html">https://gis.cdc.gov/grasp/fluview/main.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="web">
          <article-title>National, Regional, and State Level Outpatient Illness and Viral Surveillance</article-title>
          <source>FluView Interactive</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://gis.cdc.gov/grasp/fluview/fluportaldashboard.html">https://gis.cdc.gov/grasp/fluview/fluportaldashboard.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="web">
          <article-title>Past Weekly Surveillance Reports</article-title>
          <source>CDC</source>
          <access-date>2019-06-16</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cdc.gov/flu/weekly/pastreports.htm">https://www.cdc.gov/flu/weekly/pastreports.htm</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="web">
          <article-title>A Weekly Influenza Surveillance Report Prepared by the Influenza Division Weekly Influenza Activity Estimates Reported by State and Territorial Epidemiologists</article-title>
          <source>FluView Interactive</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://gis.cdc.gov/grasp/fluview/FluView8.html">https://gis.cdc.gov/grasp/fluview/FluView8.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Grünwald</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>The Minimum Description Length Principle</article-title>
          <source>MIT Press</source>
          <year>2007</year>
          <access-date>2018-11-16</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mitpress.mit.edu/books/minimum-description-length-principle">https://mitpress.mit.edu/books/minimum-description-length-principle</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Grünwald</surname>
              <given-names>PD</given-names>
            </name>
            <name name-style="western">
              <surname>Myung</surname>
              <given-names>JI</given-names>
            </name>
            <name name-style="western">
              <surname>Pitt</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <source>Advances in Minimum Description Length: Theory and Applications</source>
          <year>2005</year>
          <publisher-name>The MIT Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Garey</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>DS</given-names>
            </name>
          </person-group>
          <source>Computers and Intractability: A Guide to the Theory of NP-Completeness</source>
          <year>1979</year>
          <publisher-name>W.H. Freeman and Co</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="web">
          <article-title>Gurobi - The fastest solver</article-title>
          <source>Gurobi</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.gurobi.com/">http://www.gurobi.com/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xiang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Fuhry</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Dragan</surname>
              <given-names>FF</given-names>
            </name>
          </person-group>
          <article-title>Summarizing transactional databases with overlapped hyperrectangles</article-title>
          <source>Data Min Knowl Disc</source>
          <year>2010</year>
          <month>10</month>
          <day>24</day>
          <volume>23</volume>
          <issue>2</issue>
          <fpage>215</fpage>
          <lpage>251</lpage>
          <pub-id pub-id-type="doi">10.1007/s10618-010-0203-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Atluri</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Karpatne</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <source>ACM Computing Surveys</source>
          <year>2018</year>
          <publisher-loc>New York</publisher-loc>
          <publisher-name>ACM Journals</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <source>Frequent Pattern Mining</source>
          <year>2014</year>
          <publisher-loc>New York</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>283</fpage>
          <lpage>306</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J-G</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Whang</surname>
              <given-names>K-Y</given-names>
            </name>
          </person-group>
          <article-title>Trajectory Clustering: A Partition-and-Group Framework</article-title>
          <source>SIGMOD '07: Proceedings of the 2007 ACM SIGMOD International Conference on Management of Data</source>
          <year>2007</year>
          <publisher-loc>New York</publisher-loc>
          <publisher-name>ACM</publisher-name>
          <fpage>593</fpage>
          <lpage>604</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Agrawal</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Srikant</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Fast algorithms for mining association rules in large databases</article-title>
          <year>1994</year>
          <conf-name>International Conference on Very Large Data Bases (VLDB)</conf-name>
          <conf-date>1994</conf-date>
          <conf-loc>Santiago de Chile</conf-loc>
          <fpage>487</fpage>
          <lpage>499</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://dl.acm.org/citation.cfm?id=645920.672836">http://dl.acm.org/citation.cfm?id=645920.672836</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Madeira</surname>
              <given-names>SC</given-names>
            </name>
            <name name-style="western">
              <surname>Oliveira</surname>
              <given-names>AL</given-names>
            </name>
          </person-group>
          <article-title>Biclustering algorithms for biological data analysis: a survey</article-title>
          <source>IEEE/ACM Trans Comput Biol Bioinform</source>
          <year>2004</year>
          <volume>1</volume>
          <issue>1</issue>
          <fpage>24</fpage>
          <lpage>45</lpage>
          <pub-id pub-id-type="doi">10.1109/TCBB.2004.2</pub-id>
          <pub-id pub-id-type="medline">17048406</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Pham</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Automatic pattern taxonomy extraction for web mining</article-title>
          <year>2004</year>
          <conf-name>International Conference on Web Intelligence</conf-name>
          <conf-date>2004</conf-date>
          <conf-loc>Beijing</conf-loc>
          <fpage>242</fpage>
          <lpage>248</lpage>
          <pub-id pub-id-type="doi">10.1109/wi.2004.10132</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chandola</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Summarization—compressing data into an informative representation</article-title>
          <year>2005</year>
          <conf-name>IEEE International Conference on Data Mining (ICDM'05)</conf-name>
          <conf-date>2005</conf-date>
          <conf-loc>Houston</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icdm.2005.137</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Miettinen</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Vreeken</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Model order selection for boolean matrix factorization</article-title>
          <year>2011</year>
          <conf-name>ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD)</conf-name>
          <conf-date>2011</conf-date>
          <conf-loc>San Diego</conf-loc>
          <fpage>51</fpage>
          <lpage>59</lpage>
          <pub-id pub-id-type="doi">10.1145/2020408.2020424</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vreeken</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>van Leeuwen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Siebes</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Krimp: mining itemsets that compress</article-title>
          <source>Data Min Knowl Disc</source>
          <year>2010</year>
          <month>10</month>
          <day>16</day>
          <volume>23</volume>
          <issue>1</issue>
          <fpage>169</fpage>
          <lpage>214</lpage>
          <pub-id pub-id-type="doi">10.1007/s10618-010-0202-x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Witten</surname>
              <given-names>IH</given-names>
            </name>
            <name name-style="western">
              <surname>Frank</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Hall</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Pal</surname>
              <given-names>CJ</given-names>
            </name>
          </person-group>
          <source>Data Mining, Fourth Edition: Practical Machine Learning Tools and Techniques</source>
          <year>2016</year>
          <publisher-loc>San Francisco, CA, USA</publisher-loc>
          <publisher-name>Morgan Kaufmann Publishers Inc</publisher-name>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
