<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JPH</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Public Health Surveill</journal-id>
      <journal-title>JMIR Public Health and Surveillance</journal-title>
      <issn pub-type="epub">2369-2960</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
    <article-id pub-id-type="publisher-id">v4i2e56</article-id>
    <article-id pub-id-type="pmid"/>
    <article-id pub-id-type="doi">10.2196/publichealth.9536</article-id>
    <article-categories>
      <subj-group subj-group-type="heading">
        <subject>Original Paper</subject>
      </subj-group>
      <subj-group subj-group-type="article-type">
        <subject>Original Paper</subject>
      </subj-group>
    </article-categories>
    <title-group>
      <article-title>Accurately Inferring Compliance to Five Major Food Guidelines Through Simplified Surveys: Applying Data Mining to the UK National Diet and Nutrition Survey</article-title>
    </title-group>
    <contrib-group>
      <contrib contrib-type="editor">
        <name>
          <surname>Sanchez</surname>
          <given-names>Travis</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Selya</surname>
          <given-names>Arielle</given-names>
        </name>
      </contrib>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Beheshti</surname>
          <given-names>Rahmat</given-names>
        </name>
      </contrib>
      <contrib contrib-type="reviewer">
        <name>
          <surname>Wark</surname>
          <given-names>Petra</given-names>
        </name>
      </contrib>
    </contrib-group>
    <contrib-group>
      <contrib contrib-type="author" id="contrib1">
        <name name-style="western">
          <surname>Rosso</surname>
          <given-names>Nicholas</given-names>
        </name>
        <xref rid="aff1" ref-type="aff">1</xref>
        <ext-link ext-link-type="orcid">http://orcid.org/0000-0002-2967-2981</ext-link>
      </contrib>
      <contrib contrib-type="author" id="contrib2" corresp="yes">
      <name name-style="western">
        <surname>Giabbanelli</surname>
        <given-names>Philippe</given-names>
      </name>
      <degrees>PhD</degrees>
      <xref rid="aff2" ref-type="aff">2</xref>
      <address>
        <institution>Data Analytics for Complex Human Behaviors Laboratory</institution>
        <institution>Department of Computer Science</institution>
        <institution>Furman University</institution>
        <addr-line>3300 Poinsett Hwy</addr-line>
        <addr-line>Greenville, SC, 29613</addr-line>
        <country>United States</country>
        <phone>1 864 294 2097</phone>
        <email>giabbanelli@gmail.com</email>
      </address>  
      <ext-link ext-link-type="orcid">http://orcid.org/0000-0001-6816-355X</ext-link></contrib>
    </contrib-group>
    <aff id="aff1">
    <sup>1</sup>
    <institution>Data Analytics for Complex Human Behaviors Laboratory</institution>
    <institution>Computer Science Department</institution>  
    <institution>Northern Illinois University</institution>  
    <addr-line>DeKalb, IL</addr-line>
    <country>United States</country></aff>
    <aff id="aff2">
    <sup>2</sup>
    <institution>Data Analytics for Complex Human Behaviors Laboratory</institution>
    <institution>Department of Computer Science</institution>  
    <institution>Furman University</institution>  
    <addr-line>Greenville, SC</addr-line>
    <country>United States</country></aff>
    <author-notes>
      <corresp>Corresponding Author: Philippe Giabbanelli 
      <email>giabbanelli@gmail.com</email></corresp>
    </author-notes>
    <pub-date pub-type="collection"><season>Apr-Jun</season><year>2018</year></pub-date>
    <pub-date pub-type="epub">
      <day>30</day>
      <month>05</month>
      <year>2018</year>
    </pub-date>
    <volume>4</volume>
    <issue>2</issue>
    <elocation-id>e56</elocation-id>
    <!--history from ojs - api-xml-->
    <history>
      <date date-type="received">
        <day>30</day>
        <month>11</month>
        <year>2017</year>
      </date>
      <date date-type="rev-request">
        <day>12</day>
        <month>1</month>
        <year>2018</year>
      </date>
      <date date-type="rev-recd">
        <day>7</day>
        <month>3</month>
        <year>2018</year>
      </date>
      <date date-type="accepted">
        <day>13</day>
        <month>4</month>
        <year>2018</year>
      </date>
    </history>
    <!--(c) the authors - correct author names and publication date here if necessary. Date in form ', dd.mm.yyyy' after jmir.org-->
    <copyright-statement>©Nicholas Rosso, Philippe Giabbanelli. Originally published in JMIR Public Health and Surveillance (http://publichealth.jmir.org), 30.05.2018.</copyright-statement>
    <copyright-year>2018</copyright-year>
    <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
      <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Public Health and Surveillance, is properly cited. The complete bibliographic information, a link to the original publication on http://publichealth.jmir.org, as well as this copyright and license information must be included.</p>
    </license>  
    <self-uri xlink:href="http://publichealth.jmir.org/2018/2/e56/" xlink:type="simple"/>
    <abstract>
      <sec sec-type="background">
        <title>Background</title>
        <p>National surveys in public health nutrition commonly record the weight of every food consumed by an individual. However, if the goal is to identify whether individuals are in compliance with the 5 main national nutritional guidelines (sodium, saturated fats, sugars, fruit and vegetables, and fats), much less information may be needed. A previous study showed that tracking only 2.89% of all foods (113/3911) was sufficient to accurately identify compliance. Further reducing the data needs could lower participation burden, thus decreasing the costs for monitoring national compliance with key guidelines.</p>
      </sec>
      <sec sec-type="objective">
        <title>Objective</title>
        <p>This study aimed to assess whether national public health nutrition surveys can be further simplified by only recording whether a food was consumed, rather than having to weigh it.</p>
      </sec>
      <sec sec-type="methods">
        <title>Methods</title>
        <p>Our dataset came from a generalized sample of inhabitants in the United Kingdom, more specifically from the National Diet and Nutrition Survey 2008-2012. After simplifying food consumptions to a binary value (1 if an individual consumed a food and 0 otherwise), we built and optimized decision trees to find whether the foods could accurately predict compliance with the major 5 nutritional guidelines.</p>
      </sec>
      <sec sec-type="results">
        <title>Results</title>
        <p>When using decision trees of a similar size to previous studies (ie, involving as many foods), we were able to correctly infer compliance for the 5 guidelines with an average accuracy of 80.1%. This is an average increase of 2.5 percentage points over a previous study, showing that further simplifying the surveys can actually yield more robust estimates. When we allowed the new decision trees to use slightly more foods than in previous studies, we were able to optimize the performance with an average increase of 3.1 percentage points.</p>
      </sec>
      <sec sec-type="conclusions">
        <title>Conclusions</title>
        <p>Although one may expect a further simplification of surveys to decrease accuracy, our study found that public health dietary surveys can be simplified (from accurately weighing items to simply checking whether they were consumed) while improving accuracy. One possibility is that the simplification reduced noise and made it easier for patterns to emerge. Using simplified surveys will make it possible to monitor public health nutrition in a more cost-effective manner and possibly decrease the number of errors as participation burden is reduced.</p>
      </sec>
    </abstract>
    <kwd-group>
      <kwd>diet, food, and nutrition</kwd>
      <kwd>public health informatics</kwd>
      <kwd>supervised machine learning</kwd>
    </kwd-group></article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Insufficient compliance with dietary guidelines can lead to several health problems, whereas following guidelines can have protective effects. Systematic reviews have linked excess salt consumption with increased blood pressure, which raises the risk for cardiovascular diseases [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Furthermore, other meta-reviews have found that a higher consumption of fruit and vegetables “was significantly associated with a lower risk of all-cause mortality” [<xref ref-type="bibr" rid="ref3">3</xref>]. It is, thus, essential to monitor compliance with such guidelines to understand and improve a population’s health. To assess whether guidelines are followed, data on nutritional intake must be compiled. A comprehensive assessment of nutritional intake can be burdening (as individuals need to record the exact amount and type of foods consumed), which may in part cause the inaccuracies found when individuals provide such reports [<xref ref-type="bibr" rid="ref4">4</xref>].</p>
      <p>Data mining is a computational technique (often equated with machine learning), which offers significant potential to alleviate that burden by finding key patterns in data. Among the different tasks performed in data mining, our focus is on <italic>classification</italic>, which consists of automatically relating a set of feature variables (eg, age, gender, food consumed) to an outcome (eg, being in compliance with guidelines on salt). Classification has been increasingly used in recent years for research on several weight-related outcomes, such as obesity [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref7">7</xref>], nutrition [<xref ref-type="bibr" rid="ref8">8</xref>], and physical activity [<xref ref-type="bibr" rid="ref9">9</xref>]. Classification has demonstrated its potential to complement statistical regressions, particularly for nonlinear phenomena (as is often the case with human behaviors [<xref ref-type="bibr" rid="ref10">10</xref>] such as eating behaviors), and without requiring a priori assumptions on the relationship between patterns and outcomes [<xref ref-type="bibr" rid="ref11">11</xref>]. In particular, classification has been applied on several occasions to find the key questions that surveys need to infer a target behavior. For instance, in the case of adolescent binge drinking, researchers showed that rules in a household were strongly linked with the outcome, whereas other dimensions (eg, communication) were not as salient [<xref ref-type="bibr" rid="ref12">12</xref>]. Similarly, previous research in public health nutrition found that only 2.89% (113/3911) of the food items were required to infer compliance to the 5 major national guidelines [<xref ref-type="bibr" rid="ref8">8</xref>].</p>
      <p>There are many algorithms to choose from when performing classification. Decision trees in particular have proven to be a popular approach [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>] for at least 2 reasons. First, they can then be used as a visual tool: instead of being a black-box model (such as a deep neural network or a support vector machine), they clearly articulate the rules that transform the description of a new participant’s case into an outcome (<xref ref-type="fig" rid="figure1">Figure 1</xref>). Second, these rules can also be used as flowcharts in public health, or clinical settings, to support decision-making activities (eg, triage) [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. In line with these studies, this paper employs the classification technique of decision trees.</p>
      <p>Our overarching goal is to further simplify public health nutrition surveys. Building on previous work showing that only 2.89% (113/3911) of the items were necessary [<xref ref-type="bibr" rid="ref8">8</xref>] to infer compliance with major food guidelines, we will assess whether survey items can be reduced to binaries (was a food eaten or not?) rather than requiring an accurate weight. To identify <italic>success</italic> in adequately simplifying surveys, we will compute whether decision trees can still accurately infer compliance with guidelines using the simplified surveys. Specifically, we will simplify items in the National Diet and Nutrition Survey (NDNS) 2008-2012 to binary and assess whether decision trees built on the simplified dataset are about as accurate as decision trees built on the initial dataset.</p>
      
      

      
      
      
      <p>The principal contributions of this study can be summarized as follows:</p>
      <list list-type="bullet">
        <list-item>
          <p>We demonstrate that simplifying the information recorded in a specific dietary survey is not necessarily detrimental to identifying key public health outcomes.</p>
        </list-item>
        <list-item>
          <p>The application of our work to dietary public health suggests that nutritional surveys may be simplified when the aim is to predict compliance with major nutritional guidelines. This simplification may reduce participation burden, lower study costs, or increase the sample size at the same cost.</p>
        </list-item>
        <list-item>
          <p>The methodological part of our work illustrates the potential for data mining to contribute to public health not only by making predictions, but by identifying what part of the data is truly needed to form these predictions.</p>
        </list-item>
      </list>
      
                  <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>A decision tree starts at a root (top). For a given individual, we repeatedly compare the individual’s data with the questions in the tree. In this example, if the individual did not consume food 1, then the follow-up question is whether food 2 was consumed. Eventually, we reach a conclusion: whether the individual was in compliance with the guideline or not. Such trees are automatically built from the data.</p>
        </caption>
        <graphic xlink:href="publichealth_v4i2e56_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      

    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Used</title>
        <p>Our dataset came from a generalized sample of inhabitants from the United Kingdom: the National Diet and Nutrition Survey (NDNS) 2008-2012. The NDNS data were obtained from the UK Data Archive [<xref ref-type="bibr" rid="ref16">16</xref>]. The NDNS is a cross-sectional survey that records the nutrient intake as well as the nutritional status of the population within the United Kingdom [<xref ref-type="bibr" rid="ref17">17</xref>]. To allow for comparison with previous studies [<xref ref-type="bibr" rid="ref8">8</xref>], we used data from years 1-4 of this program, collected in 2008-2012. The NDNS collected data from a sample of 1000 respondents per year, consisting of adults and children aged 18 months and above. Households across the United Kingdom were selected to take part in the NDNS using a multistage probability design. During each wave, a random sample of primary sampling units was selected for inclusion. These are small geographical areas that allow for more efficient data collection by being geographically focused.</p>
        <p>Within the dataset, food consumption at a daily level is recorded for participants over several days. To record portion sizes, common household measures (eg, one tablespoon, one cup) and weight in grams were used for the foods consumed throughout the study, including the consumption of liquids. Foods are described specifically and can be related to other foods in a subgroup or a group. For instance, the consumption of bananas would be entered with 3 different levels of detail: as individual foods (eg, bananas raw flesh only), as subfood groups (eg, bananas), or as food groups (eg, fruit and vegetables).</p>
        <p>The NDNS dataset only contains the foods consumed, their composition, and demographical information. It does not make any conclusion in regard to nutritional guidelines. The dataset was expanded in a previous study to include this information [<xref ref-type="bibr" rid="ref8">8</xref>]. This was realized via the following process: (1) compute how much each individual consumed with respect to the 5 key dietary guidelines, then (2) compare this consumption with nutritional recommendations (which may be age-dependent), and (3) record the result as “Yes” when the participant was in compliance for a specific guideline or “No” when the participant was not in compliance. The detailed process is as follows.</p>
        <p>The NDNS dataset has 4156 participants including 1189 children younger than 11 years. First, for each of the 4156 participants, compute the mean daily intake of fruit and vegetables and sodium, as well as the mean daily percentage of energy derived from fat, saturated fat, and free sugars. Then, compare each individual's numbers with the corresponding nutritional recommendations to determine whether the individual is in compliance with the recommendation. UK recommendations on <italic>fruit and vegetables</italic> apply only to those aged 11 years or older, thus 1189 participants were excluded for this specific comparison. To be recorded as “Yes,” those retained needed to consume at least five 80-g portions of fruit and vegetables daily, allowing for at most 1 portion of juice. Although UK recommendations on <italic>sodium</italic> are also dependent on the age category, they adjust the comparison rather than excluding participants. A participant would be labeled as “Yes” if the sodium intake does not exceed [<xref ref-type="bibr" rid="ref18">18</xref>] 2400 mg/d for those aged 11 years and older, 2000 mg/d for those aged 7-10 years, 1200 mg/d for those aged 4-6 years, and 800 mg/d for those aged 1-3 years.</p>
        <p>The World Health Organization (WHO) recommends limitations on how much energy can be derived from each of the following categories: at most 30% from fat, at most 10% from saturated fat, and at most 10% from free sugars (sixth table in [<xref ref-type="bibr" rid="ref19">19</xref>]). We then computed how much energy a participant derived from each category. If the energy derived from fat, saturated fat, and free sugars were under the WHO threshold, then we set the corresponding guideline to “Yes.”</p>
        <p>For each participant, our final dataset includes selected data from the NDNS survey (age, gender, and consumption for all of the 3911 individual foods) and additional data computed through the process above (whether or not they were in compliance for each of the 5 nutritional guidelines).</p>
      </sec>
      <sec>
        <title>Methods Employed: Classification Using Decision Trees</title>
        <p>A <italic>classifier</italic> is a model automatically built from a subset of the data (called <italic>training set</italic>) in which we know both the predictor variables (ie, age, gender, and foods eaten) and the class outcomes (ie, whether or not each of the 5 guidelines was met). The intention is to build “good” classifiers, that is, models that learn and generalize from the training set so that they can accurately predict the outcomes when presented with new cases [<xref ref-type="bibr" rid="ref20">20</xref>]. Numerous methods build classifiers, such as support vector machines, decision trees, and rulesets [<xref ref-type="bibr" rid="ref21">21</xref>]. As detailed in the Introduction, our study uses decision trees, which are a commonly used approach [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>] that provides a usable visual tool (<xref ref-type="fig" rid="figure1">Figure 1</xref>) to support decision-making activities such as triage.</p>
        <p>There are 2 types of classifications: binary and multi-class. In a binary situation, the outcome we seek to predict can only have 2 different values. Conversely, in a multi-classification problem, the outcome has 3 or more values. Our study focuses on a binary classification problem: for each of the 5 guidelines, we want to know whether or not the guideline is met.</p>
        <p>The process to create a decision tree for binary classification has been detailed in numerous reference material such as Maimon and Rokach [<xref ref-type="bibr" rid="ref22">22</xref>]; thus, we provide only a brief overview of this process. The dataset (detailed in the previous section) comes in as a spreadsheet, where rows correspond to individuals and columns represent their features (ie, their age, gender, diet, and whether or not each of the 5 guidelines was met). The goal is to train a decision tree so it automatically identifies the combination of predictor variables (age, gender, individual foods) to determine the class outcome (for each of the five guidelines). A small portion of the rows are used as the <italic>training set</italic> to guide the decision tree algorithms to produce specific trees. The algorithm will repeatedly subdivide the data, where the variable used to subdivide is represented as a node in the tree (<xref ref-type="fig" rid="figure1">Figure 1</xref>), and the subdivisions corresponding to different values are shown as branches leaving this node. For instance, <xref ref-type="fig" rid="figure1">Figure 1</xref> shows that the first division is based on the hypothetical “food 1”: one subdivision is produced when the food was not consumed (left branch), and the other subdivision corresponds to consuming this food (right branch).</p>
        <p>A portion of the data is not provided to the algorithm for building the tree and is instead held to evaluate the quality of the generated tree [<xref ref-type="bibr" rid="ref20">20</xref>]. This portion is called the <italic>testing set</italic>. To avoid basing our evaluation from one specific portion of the data that may not be representative, a process known as <italic>cross-fold validation</italic> divides the dataset into multiple portions, building the tree on one (training) and evaluating it on the others (testing) before repeating the division until all parts have been used for training and testing. This common process to evaluate classification accuracy helps prevent overfitting, where performances on the training set are very good but its generalization on the testing set performs poorly [<xref ref-type="bibr" rid="ref20">20</xref>]. The evaluation consists of presenting the tree with individuals from the testing set and asking what the classes should be. Then, the tree predicts a class outcome, which we compare with the real outcome from the dataset. The extent to which these outcomes match is called the <italic>accuracy</italic>. When the outcomes are binary, the percentage of “Yes” instances correctly classified is known as <italic>recall</italic>, and the percentage of “No” instances correctly classified is known as <italic>specificity</italic>. Intuitively, accuracy is the performance of the model across class outcomes, whereas recall and specificity are performances for one outcome in particular.</p>
        <p>Highlighting recall and specificity is useful when the costs of making mistakes may be different: in health studies, giving someone an intervention that they do not need may be a very different issue from initially suggesting not to give them the intervention that they need. In addition, datasets are frequently imbalanced, that is, there can be many more cases for one outcome than the other. In this case, a high accuracy may be misleading as the tree may do well for the most common case, while being very inaccurate for the less common case. By providing the recall and specificity, our study supports public health officials in evaluating our performance by giving more or less weight to specific outcomes. As in previous work, our overall accuracy assumes that the error costs are similar [<xref ref-type="bibr" rid="ref8">8</xref>], that is, concluding that someone does not follow guidelines while they do is no worse than concluding that they follow guidelines while they do not. Assuming different error costs would need additional evidence, and it would also lead to different methods as relatively few approaches can mine data under differential error costs [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>].</p>
        <p>In general, class imbalance can be addressed by eliminating cases of the majority class (undersampling), creating new cases for the minority class (oversampling), or biasing the classification algorithm (eg, using nonuniform error costs on the classes) [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. For this study, we use sampling techniques. Specifically, we used Synthetic Minority Over-Sampling Technique, or SMOTE for short. As concluded by Batista et al, “over-sampling methods in general, and SMOTE-based methods in particular” were very efficient to address class imbalances [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. Although a comprehensive discussion on class balancing is beyond the scope of this study, we note that finding good approaches for synthetic over-sampling remains a very active area of research, as even popular methods such as SMOTE have weaknesses. However, such weaknesses are particularly encountered when dealing with very high-dimensional datasets such as text [<xref ref-type="bibr" rid="ref26">26</xref>], which is not the case here.</p>
      </sec>
      <sec>
        <title>Overall Process</title>
        <p>Our process is summarized in <xref ref-type="fig" rid="figure2">Figure 2</xref>. We start with the same dataset as used in our previous study: the NDNS 2008-2012 data expanded with compliance to each guideline [<xref ref-type="bibr" rid="ref8">8</xref>]. We departed from the previous study [<xref ref-type="bibr" rid="ref8">8</xref>] by simplifying the dataset: we only recorded whether an item was consumed (1) or not (0). These data are given as input to the classification process, which was performed 5 times, for each of the guidelines. For a given guideline, we removed the compliance of the 4 other guidelines from the dataset. We do not want the algorithm to use compliance on fat to infer compliance on saturated fat: instead, compliance should be inferred from the foods, age, and gender only. As discussed in the previous subsection, balancing needs to be performed to avoid biasing the algorithm in favor of the most common outcome. We used SMOTE to ensure that both outcomes (meeting or not meeting a guideline) occur with the same prevalence. The balanced dataset was then fed into the Weka software version 3.7, maintained by the Machine Learning Group at the University of Waikato. We used the J48 decision tree algorithm, which implements the highly cited C4.5 algorithm by Ross Quinlan [<xref ref-type="bibr" rid="ref27">27</xref>].</p>
        <p>Like most classification algorithms, C4.5 (and its J48 implementation) takes parameters that can impose further constraints on the resulting tree. We tested different parameter values to either (1) find the most accurate decision tree with a similar structure (ie, number of foods) to the trees generated in the previous study using the exact weights of foods, or (2) identify the most accurate tree without consideration for the number of foods involved [<xref ref-type="bibr" rid="ref8">8</xref>]. These allow us to perform two operations. First, we can <italic>compare</italic> with the previous study [<xref ref-type="bibr" rid="ref8">8</xref>], in which the tree built for each guideline used a very small number of foods. Our objective was to constrain our new tree in using a similar number of foods, such that we can observe how accuracy changes when foods are simplified (in this study) instead of being recorded exactly (in the previous study). To lower the number of foods used by the algorithm, we increased the minimum number of cases required to further cut the data (ie, add a decision node to the tree). Second, we seek to <italic>optimize</italic>, by identifying how accurate we can be using our simplified foods, possibly at the expense of using more foods.</p>
        <p>After each tree was built, we used 10-fold cross-validation [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref28">28</xref>]. This method for evaluation divides the dataset into 10 equal parts. Nine parts were used for the training set, and one for the testing set. After the process was repeated 10 times, the evaluation was conducted on all of the data, and the average results were reported. For full disclosure, all of our decision trees are available on the Open Science Framework platform (see [<xref ref-type="bibr" rid="ref29">29</xref>]).</p>
        <p>A sample of our approach to explore the trade-off between the number of foods and accuracy is illustrated in <xref ref-type="table" rid="table1">Table 1</xref>, showing a parameter sweep by increasing the minimum number of instances to (nonmonotonically) reduce the number of foods used. The rationale for this process is as follows. For the decision tree algorithm to create a new branch, it needs to find where to “cut” in the dataset. If there are not enough instances to cut, then a new branch will not be made. When this new branch <italic>would</italic> have been based on a factor not previously used in the tree, then preventing its creation limits the number of foods used. However, the branch may have involved an already existing factor. Raising the minimum number of instances thus limits opportunities for the algorithm to involve additional factors. <xref ref-type="table" rid="table1">Table 1</xref> exemplifies how the number of factors tends to decrease as the minimum number of instances increases.</p>
        <p>In the guiding example of <xref ref-type="table" rid="table1">Table 1</xref>, we predict adherence to the guideline on free sugars. The previous study used 28 foods for this class [<xref ref-type="bibr" rid="ref8">8</xref>], thus we seek the highest accuracy that we can achieve with 28 foods or less. The best trade-off is found using a minimum of 95 instances, leading to 25 foods and an accuracy of 77.9%, which is higher than the 76.5% previously found. This trade-off would thus be reported in our results.</p>
        <p><xref ref-type="table" rid="table1">Table 1</xref> exemplifies our methodology on choosing a decision tree comparable with the previous study [<xref ref-type="bibr" rid="ref8">8</xref>], by changing the minimum number of instances. We observe that, as this number increases, the number of factors tends to decrease. The goal is to find the result with the highest accuracy while using no more foods than in the previous study.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Flow diagram of our methodology, showing the acquisition, preprocessing, and mining of the data. NDNS: National Diet and Nutrition Survey; SMOTE: Synthetic Minority Over-Sampling Technique.</p>
          </caption>
          <graphic xlink:href="publichealth_v4i2e56_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Sample outcome for the decision tree classifier on free sugars.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="130"/>
            <col width="240"/>
            <col width="190"/>
            <col width="140"/>
            <col width="140"/>
            <col width="160"/>
            <thead>
              <tr valign="top">
                <td>Study</td>
                <td>Minimum number of instances</td>
                <td>Accuracy (average)</td>
                <td>Recall</td>
                <td>Specificity</td>
                <td>Number of factors</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Previous</td>
                <td>60</td>
                <td>76.5</td>
                <td>76.1</td>
                <td>76.9</td>
                <td>28</td>
              </tr>
              <tr valign="top">
                <td>Current</td>
                <td>60</td>
                <td>78.2</td>
                <td>73.6</td>
                <td>82.9</td>
                <td>31</td>
              </tr>
              <tr valign="top">
                <td>Current</td>
                <td>70</td>
                <td>78.1</td>
                <td>74.7</td>
                <td>77.3</td>
                <td>31</td>
              </tr>
              <tr valign="top">
                <td>Current</td>
                <td>80</td>
                <td>78.3</td>
                <td>74.7</td>
                <td>78.3</td>
                <td>30</td>
              </tr>
              <tr valign="top">
                <td>Current</td>
                <td>90</td>
                <td>77.9</td>
                <td>75.1</td>
                <td>80.8</td>
                <td>30</td>
              </tr>
              <tr valign="top">
                <td>Current</td>
                <td>95</td>
                <td>77.9</td>
                <td>75.1</td>
                <td>80.7</td>
                <td>25</td>
              </tr>
              <tr valign="top">
                <td>Current</td>
                <td>100</td>
                <td>77.3</td>
                <td>75.7</td>
                <td>78.9</td>
                <td>26</td>
              </tr>
              <tr valign="top">
                <td>Current</td>
                <td>115</td>
                <td>77.2</td>
                <td>75.5</td>
                <td>78.8</td>
                <td>22</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>Our dataset can broadly be understood as consisting of participants (the rows) and their food consumptions (the columns). Demographic characteristics of the participants (regardless of food consumptions) are summarized in <xref ref-type="table" rid="table2">Table 2</xref> including gender, nationality, marital status, and economic status. Participants were on average 30.5 (SD 20.9) years old. Patterns of food consumption are shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>. As will be shown in our results, a food being common does not mean that it should be included to identify whether participants meet a dietary guideline.</p>
      <p>The methods introduced in the previous section select a food if it helps to separate individuals in compliance versus those who are not. For instance, if eating bananas is highly prevalent in the population, then knowing whether a person ate bananas may not be useful to predict dietary compliance. Conversely, if a food was clearly associated with a healthier diet for a handful of individuals, the frequency may be too low to warrant its inclusion at the population level.</p>
      <p>Our new decision trees, built on simplified reporting of foods, were slightly more accurate than previous trees built using the exact weighted foods. This was found across all guidelines (<xref ref-type="table" rid="table3">Table 3</xref>). In 4 out of the 5 guidelines, the increase in accuracy was particularly noticeable when inferring that someone did not meet a guideline. For instance, the previously reported accuracy of 78.4% [<xref ref-type="bibr" rid="ref8">8</xref>] on finding noncompliance with fat has now increased to 88.5%. The increase in finding noncompliant cases was accompanied in 2 guidelines (salt, free sugars) by a small decrease in accuracy for compliant cases, whereas it was similar in a third guideline (fat).</p>
      <p>Across the 5 guidelines, our new decision trees had an accuracy of 80.1%. That is, in 4 out of 5 cases, by only knowing whether foods were consumed, and using at most a few dozen foods, we can successfully conclude whether nutritional guidelines are met. This accuracy is 2.6 percentage points higher than the average on previous decision trees (77.5%). That is, without asking individuals to weigh foods, we are better able to tell whether they meet guidelines.</p>
      <p>The optimized classifiers performed slightly better with an average accuracy of 80.6% on classified classes (<xref ref-type="table" rid="table4">Table 4</xref>). The optimized trees also had an average increase of 3.1 percentage points from the previous classifiers (see <xref ref-type="fig" rid="figure4">Figure 4</xref>). In all guidelines but one (salt), the increase in performance was obtained at the expense of using more foods. Although the number of foods used can increase by up to 50% (for saturated fat, fruits and vegetables), the absolute number of foods remains very small compared with the initial NDNS data and its 3911 foods.</p>
      <p>To better contrast optimized decision trees versus those limited in the number of foods, <xref ref-type="fig" rid="figure4">Figure 4</xref> shows where they led to either better (green) or lower (red) accuracy compared with the previous study [<xref ref-type="bibr" rid="ref8">8</xref>]. Both methods generally underperformed on finding noncompliance to fruit and vegetables, and on finding compliance on salt and free sugars. They overperformed on fat and saturated fat. In summary, the consequences of simplifying dietary surveys are not uniform across guidelines, as some will see a small reduction in accuracy, whereas others may see a large improvement, resulting in the average accuracy (across all guidelines) being improved.</p>
      <p>In <xref ref-type="table" rid="table5">Table 5</xref>, we list all individual foods used at least 5 times in predicting compliance with the guidelines, using either decision trees similar to the previous study [<xref ref-type="bibr" rid="ref8">8</xref>], or the optimized trees. The expanded list of foods used one or more times is provided as supplementary material online [<xref ref-type="bibr" rid="ref29">29</xref>]. Note that foods used to predict compliance with a guideline may not be part of what counts within this guideline. For instance, sausage rolls are neither fruit nor vegetables, yet they are used to predict fruit and vegetable consumption. We also observe that these foods are not necessarily the “common” ones shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>.</p>
      <table-wrap position="float" id="table2">
        <label>Table 2</label>
        <caption>
          <p>Key characteristics of the National Diet and Nutrition Survey (NDNS) household dataset. All participants in the study were within the United Kingdom. There were several study waves, with around 1000 respondents per year.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="30"/>
          <col width="470"/>
          <col width="500"/>
          <thead>
            <tr valign="top">
              <td colspan="2">Characteristics</td>
              <td>Categorical count, n (%)</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td colspan="2"><bold>Gender</bold></td>
              <td><break/></td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Male</td>
              <td>5034 (47.41)</td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Female</td>
              <td>5439 (52.57)</td>
            </tr>
            <tr valign="top">
              <td colspan="2"><bold>Within compliance</bold></td>
              <td><break/></td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Free sugars</td>
              <td>1472 (35.41)</td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Salt</td>
              <td>2524 (60.73)</td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Fat</td>
              <td>1045 (25.14)</td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Saturated fat</td>
              <td>795 (19.13)</td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Fruits and vegetables</td>
              <td>656 (15.78)</td>
            </tr>
            <tr valign="top">
              <td colspan="2"><bold>Nationality</bold></td>
              <td><break/></td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>English</td>
              <td>5036 (48.08)</td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Northern Irish</td>
              <td>3442 (32.86)</td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Scottish</td>
              <td>684 (6.53)</td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Welsh</td>
              <td>398 (3.80)</td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Irish</td>
              <td>194 (1.85)</td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Other</td>
              <td>719 (6.88)</td>
            </tr>
            <tr valign="top">
              <td colspan="2"><bold>Marital status</bold></td>
              <td><break/></td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Single (never married)</td>
              <td>6240 (59.57)</td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Married (living with partner)</td>
              <td>1960 (18.71)</td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Divorced</td>
              <td>261 (2.49)</td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Married (living separate)</td>
              <td>3 (0.06)</td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Widowed</td>
              <td>139 (1.32)</td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Other</td>
              <td>1870 (17.85)</td>
            </tr>
            <tr valign="top">
              <td colspan="2"><bold>Economic status</bold></td>
              <td><break/></td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Going to school full-time</td>
              <td>2974 (28.39)</td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Full or part time employment</td>
              <td>4440 (42.39)</td>
            </tr>
            <tr valign="top">
              <td><break/></td>
              <td>Not working presently</td>
              <td>3039 (29.02)</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
      <fig id="figure3" position="float">
        <label>Figure 3</label>
        <caption>
          <p>Main foods either by (a) contribution to caloric intake, or (b) prevalence among individuals.</p>
        </caption>
        <graphic xlink:href="publichealth_v4i2e56_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <table-wrap position="float" id="table3">
        <label>Table 3</label>
        <caption>
          <p>Comparison of the best decision tree using the weight of foods (previous study, Giabbanelli and Adams, 2016 [<xref ref-type="bibr" rid="ref8">8</xref>]) or simplified foods (this study), while keeping the number of foods similar.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="100"/>
          <col width="180"/>
          <col width="170"/>
          <col width="150"/>
          <col width="130"/>
          <col width="130"/>
          <col width="140"/>
          <thead>
            <tr valign="top">
              <td>Study</td>
              <td>Guidelines</td>
              <td>Number of instances</td>
              <td>Accuracy (%)</td>
              <td>Recall</td>
              <td>Specificity</td>
              <td>Number of factors</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Previous</td>
              <td>Free sugars</td>
              <td>60</td>
              <td>76.5</td>
              <td>76.1</td>
              <td>76.9</td>
              <td>28</td>
            </tr>
            <tr valign="top">
              <td>Current</td>
              <td>Free sugars</td>
              <td>95</td>
              <td>77.9</td>
              <td>75.1</td>
              <td>80.7</td>
              <td>25</td>
            </tr>
            <tr valign="top">
              <td>Previous</td>
              <td>Fat</td>
              <td>70</td>
              <td>72.4</td>
              <td>66.3</td>
              <td>78.4</td>
              <td>33</td>
            </tr>
            <tr valign="top">
              <td>Current</td>
              <td>Fat</td>
              <td>90</td>
              <td>79.4</td>
              <td>70.4</td>
              <td>88.5</td>
              <td>33</td>
            </tr>
            <tr valign="top">
              <td>Previous</td>
              <td>Fruits and vegetables</td>
              <td>50</td>
              <td>83.1</td>
              <td>82.5</td>
              <td>83.8</td>
              <td>11</td>
            </tr>
            <tr valign="top">
              <td>Current</td>
              <td>Fruits and vegetables</td>
              <td>90</td>
              <td>82.2</td>
              <td>82.3</td>
              <td>82.2</td>
              <td>10</td>
            </tr>
            <tr valign="top">
              <td>Previous</td>
              <td>Saturated fat</td>
              <td>20</td>
              <td>79.7</td>
              <td>75.8</td>
              <td>83.6</td>
              <td>28</td>
            </tr>
            <tr valign="top">
              <td>Current</td>
              <td>Saturated fat</td>
              <td>90</td>
              <td>84.6</td>
              <td>77.4</td>
              <td>91.8</td>
              <td>27</td>
            </tr>
            <tr valign="top">
              <td>Previous</td>
              <td>Salt</td>
              <td>15</td>
              <td>75.8</td>
              <td>81.9</td>
              <td>69.8</td>
              <td>28</td>
            </tr>
            <tr valign="top">
              <td>Current</td>
              <td>Salt</td>
              <td>55</td>
              <td>76.3</td>
              <td>79.5</td>
              <td>73.2</td>
              <td>26</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
      <table-wrap position="float" id="table4">
        <label>Table 4</label>
        <caption>
          <p>Comparison of the best decision tree using the weight of foods (previous study, Giabbanelli and Adams, 2016 [<xref ref-type="bibr" rid="ref8">8</xref>]) or simplified foods (this study), without being limited by the number of foods.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="100"/>
          <col width="180"/>
          <col width="170"/>
          <col width="150"/>
          <col width="130"/>
          <col width="130"/>
          <col width="140"/>
          <thead>
            <tr valign="top">
              <td>Study</td>
              <td>Guidelines</td>
              <td>Number of instances</td>
              <td>Accuracy (%)</td>
              <td>Recall</td>
              <td>Specificity</td>
              <td>Number of factors</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Previous</td>
              <td>Free sugars</td>
              <td>60</td>
              <td>76.5</td>
              <td>76.1</td>
              <td>76.9</td>
              <td>28</td>
            </tr>
            <tr valign="top">
              <td>Current</td>
              <td>Free sugars</td>
              <td>60</td>
              <td>78.2</td>
              <td>73.6</td>
              <td>82.9</td>
              <td>31</td>
            </tr>
            <tr valign="top">
              <td>Previous</td>
              <td>Fat</td>
              <td>70</td>
              <td>72.4</td>
              <td>66.3</td>
              <td>78.4</td>
              <td>33</td>
            </tr>
            <tr valign="top">
              <td>Current</td>
              <td>Fat</td>
              <td>70</td>
              <td>79.9</td>
              <td>72.3</td>
              <td>87.7</td>
              <td>43</td>
            </tr>
            <tr valign="top">
              <td>Previous</td>
              <td>Fruits and vegetables</td>
              <td>50</td>
              <td>83.1</td>
              <td>82.5</td>
              <td>83.8</td>
              <td>11</td>
            </tr>
            <tr valign="top">
              <td>Current</td>
              <td>Fruits and vegetables</td>
              <td>50</td>
              <td>83.5</td>
              <td>84.9</td>
              <td>82.2</td>
              <td>16</td>
            </tr>
            <tr valign="top">
              <td>Previous</td>
              <td>Saturated fat</td>
              <td>20</td>
              <td>79.7</td>
              <td>75.8</td>
              <td>83.6</td>
              <td>28</td>
            </tr>
            <tr valign="top">
              <td>Current</td>
              <td>Saturated fat</td>
              <td>20</td>
              <td>84.7</td>
              <td>79.3</td>
              <td>90.1</td>
              <td>42</td>
            </tr>
            <tr valign="top">
              <td>Previous</td>
              <td>Salt</td>
              <td>15</td>
              <td>75.8</td>
              <td>81.9</td>
              <td>69.8</td>
              <td>28</td>
            </tr>
            <tr valign="top">
              <td>Current</td>
              <td>Salt</td>
              <td>50</td>
              <td>76.6</td>
              <td>79.9</td>
              <td>73.2</td>
              <td>25</td>
            </tr>
          </tbody>
        </table>
      </table-wrap>
      <fig id="figure4" position="float">
        <label>Figure 4</label>
        <caption>
          <p>Accuracy, recall (“Yes”), and specificity (“No”) when (a) limiting the number of foods as in a previous study (Giabbanelli and Adams, 2016 [<xref ref-type="bibr" rid="ref8">8</xref>]), or (b) using any number of foods to build the decision trees, giving us the optimized decision trees.</p>
        </caption>
        <graphic xlink:href="publichealth_v4i2e56_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <table-wrap position="float" id="table5">
        <label>Table 5</label>
        <caption>
          <p>Individual foods used as predictors at least 5 times in the trees generated using our 2 processes (similar/optimized) and for the 5 guidelines: Fruit and Vegetables, Fat, Saturated Fat, Salt, and Free Sugars. The frequency is the number of times that a food is used as a decision node across all trees (eg, if used 3 times in 5 trees each, it would be 15).</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="30"/>
          <col width="220"/>
          <col width="60"/>
          <col width="60"/>
          <col width="70"/>
          <col width="60"/>
          <col width="60"/>
          <col width="60"/>
          <col width="60"/>
          <col width="60"/>
          <col width="60"/>
          <col width="60"/>
          <col width="140"/>
          <thead>
            <tr valign="top">
              <td colspan="2">Variables</td>
              <td colspan="5">Similar decision tree</td>
              <td colspan="5">Optimized decision tree</td>
              <td>Total frequency</td>
            </tr>
            <tr valign="bottom">
            <td><break/></td>
            <td><break/></td>
              <td>FV<sup>b</sup></td>
              <td>Fat</td>
              <td>SatFat<sup>c</sup></td>
              <td>Salt</td>
              <td>Sug<sup>d</sup></td>
              <td>FV</td>
              <td>Fat</td>
              <td>SatFat</td>
              <td>Salt</td>
              <td>Sug</td>
              <td><break/></td>
            </tr>
          </thead>
          <tbody>
                      <tr valign="top">
              <td colspan="13"><bold>Individual food</bold></td>
              </tr>
           
            <tr valign="top">
            <td><break/></td>
              <td>Sausages</td>
              <td><break/></td>
              <td>✓</td>
              <td>✓</td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td>✓</td>
              <td>✓</td>
              <td><break/></td>
              <td>20</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Bananas raw</td>
              <td>✓</td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td>✓</td>
              <td>✓</td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td>19</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Sausage roll</td>
              <td>✓</td>
              <td>✓</td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td>✓</td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td>16</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Cheese cheddar</td>
              <td><break/></td>
              <td>✓</td>
              <td>✓</td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td>✓</td>
              <td>✓</td>
              <td><break/></td>
              <td>14</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Milk chocolate</td>
              <td><break/></td>
              <td>✓</td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td>12</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Butter salted</td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td>✓</td>
              <td>10</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Cheese spreads</td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td>8</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Ice cream</td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td>✓</td>
              <td>8</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Fruit drink</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td>✓</td>
              <td>8</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Chicken pieces</td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>8</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Sex</td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td>7</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Potato crisps</td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>6</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Apples</td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>6</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Milk whole</td>
              <td><break/></td>
              <td>✓</td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td>6</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Beans baked</td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td>6</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Onions</td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>6</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Cola</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td>6</td>
            </tr>
            <tr valign="bottom">
            <td><break/></td>
              <td>Apple juice unsweetened UHT<sup>a</sup></td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td>6</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Olive oil</td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>6</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Orange juice unsweetened</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td>6</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Orange juice unsweetened UHT</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td>6</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Bacon</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>6</td>
            </tr>
            <tr valign="top">
            <td><break/></td>
              <td>Apple juice unsweetened</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td>5</td>
            </tr>
                                  <tr valign="top">
              <td colspan="13"><bold>Demographic</bold></td>
              </tr>
               <tr valign="top">
            <td><break/></td>
              <td>Sex</td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td><break/></td>
              <td>✓</td>
              <td><break/></td>
              <td>7</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table5fn1">
            <p><sup>a</sup>UHT: ultra-high-temperature processing.</p>
          </fn>
          <fn id="table5fn2">
            <p><sup>b</sup>FV: fruits and vegetables.</p>
          </fn>
          <fn id="table5fn3">
            <p><sup>c</sup>SatFat: saturated fat.</p>
          </fn>
          <fn id="table5fn4">
            <p><sup>d</sup>Sug: free sugars.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Monitoring at the national level whether the population is in compliance with an array of nutritional guidelines currently requires an extensive data collection process, in which individuals report and weigh the exact foods that they consumed. Our previous study demonstrated that only 2.89% (113/3911) of the foods needed to be reported to predict with 77.5% accuracy (72%-83% across guidelines) whether individuals achieve key dietary recommendations regarding sodium, saturated fats, sugars, fruit/vegetables, and fats [<xref ref-type="bibr" rid="ref8">8</xref>]. In this study, we investigated the consequences of further simplifying reporting by only asking participants whether they ate a specific food rather than having to weigh it.</p>
        <p>Although we may have expected a decreased accuracy as a consequence of removing information, our results paradoxically indicate that accuracy has improved to 80%. We observed that results were particularly improved when inferring compliance to the guidelines on fat and saturated fat, but a trade-off occurred for free sugars and salt where a decrease in recall was counter-balanced by a larger increase in specificity. Results were more nuanced on fruit and vegetables, where optimized decision trees were able to offset a loss of specificity with a higher gain in recall (thus resulting in higher accuracy), but nonoptimized decision trees resulted in a small loss of accuracy. Overall, these findings suggest that foods may not have to be weighed, but this may depend on (1) which food guidelines need to be monitored and (2) whether public health officials decide that recall is more important than specificity (or vice versa) instead of giving them equal weight.</p>
        <p>The main applications of our results are twofold. First, we may simplify surveys not only by asking for few foods in adaptive questionnaires (as shown in [<xref ref-type="bibr" rid="ref8">8</xref>]) but also by asking binary questions “Did you consume this food?” rather than requiring participants to provide an exact weight. This contribution will result in more time-effective assessments and may lower the cognitive effort required from participants, which in turn can decrease the error rate. Second, identifying a few questions yielding an accuracy of 80% is most applicable when a trade-off has to be found between accuracy and participation burden. For instance, a doctor may have many tools and physiological measures as part of the treatment process (eg, blood pressure, HbA1c), and including a few dietary questions with an accuracy of 80% may be more feasible than a more thorough survey. For population health, our work is particularly applicable in large studies where only a limited number of questions can be used to investigate a <italic>subgroup within arms</italic>. For instance, in the Netherlands, the nationwide Longitudinal Internet Studies for the Social sciences (LISS) panel sends questionnaires each month, dealing with many topics ranging from alcohol [<xref ref-type="bibr" rid="ref30">30</xref>] to happiness. Nutrition would only be one part, and a reduced measurement approach would be necessary.</p>
      </sec>
      <sec>
        <title>Comparison With Other Dietary Methods</title>
        <p>There are several alternatives to the analysis conducted here. First, an index-based analysis consists of a scoring system based on a priori knowledge that researchers have about (1) dietary guidance and (2) the scores to assign for sets of dietary components based on the guidance. This analysis can be used to assess adherence to guidelines [<xref ref-type="bibr" rid="ref31">31</xref>-<xref ref-type="bibr" rid="ref33">33</xref>] or summarize an individual's diet quality [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. Within epidemiology, indices are used to identify an individual's risk of certain diseases based on a combination of foods [<xref ref-type="bibr" rid="ref31">31</xref>]. Although the reliance of indices on a priori knowledge makes them less sensitive to variations in the sample than our method, they may (depending on their design and structure) require more foods and accuracy in portion sizes. Considering the trade-off or “continuum” from few simple questions to favoring high accuracy, indices can lead to a higher accuracy than the method presented here but may not be as amenable to a “reduced” form as a short addendum to a large panel study such as LISS [<xref ref-type="bibr" rid="ref30">30</xref>]. We also note that the transparency and simplicity of decision trees can support practitioners in interpreting the rules (eg, for triage) with little to no training, whereas dietary indices can produce summary scores where expertise is still important for interpretation.</p>
        <p>Second, one could perform a cluster analysis. As summarized by Reedy et al, “clusters are driven by the sample from which they are derived, so their applicability as a standard for evaluating diets of different populations is limited because of the number of factors that determine food selection” [<xref ref-type="bibr" rid="ref34">34</xref>]. Cluster analysis is an <italic>unsupervised</italic> data mining technique that identifies similarities between groups based on their patterns of food consumption: for instance, “fatty meats” may be an important similarity between men [<xref ref-type="bibr" rid="ref34">34</xref>]. This is different from the classification approach taken here, which is a <italic>supervised</italic> data mining technique that seeks to predict an outcome.</p>
        <p>Finally, Food Frequency Questionnaires (FFQs) can provide a cost-effective approach to monitoring the health of a large population. Molag et al [<xref ref-type="bibr" rid="ref35">35</xref>], as well as Noethlings et al, suggested that portion sizes may not be necessary [<xref ref-type="bibr" rid="ref36">36</xref>]: “We conclude that the omission of individual portion size information would probably result in a notable reduction of interindividual variance. However, to reduce the respondents' burden and to increase data completeness in self-administration in large epidemiologic studies, the assignment of a constant portion size seems to be adequate.” Our study confirms this finding while pointing out that accuracy may even increase; however, the effect depends on which guideline we monitor.</p>
      </sec>
      <sec>
        <title>Strengths and Limitations</title>
        <p>Our study aimed to determine the effects of reducing the level of details employed by a national dietary survey. The NDNS survey used here has been the subject of many publications and provides a wealth of high-quality data. However, several limitations stem from using this survey. First, the NDNS survey relies on self-reported food intake. Individuals may consciously, or unconsciously, misreport their consumption within a 24-hour time frame [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]. Using the exact weight of foods is thus sensitive to misreporting, which was a limitation of our previous study [<xref ref-type="bibr" rid="ref8">8</xref>]. In contrast, this study is not sensitive to misreporting how much of a food was consumed: it only takes into account whether <italic>any</italic> consumption of this food occurred. Reporting errors affecting our study would thus be to entirely ignore a specific food that was consumed or to report a food that was not consumed.</p>
        <p>Second, this survey was specific to the population of the United Kingdom, as can be seen in the specific foods used as predictors. This limitation of the data entails that our conclusion may not be generalized to populations that have important differences in eating behaviors. In this case, our approach can be replicated by collecting the complete dataset (in the first study wave) and then using data mining to investigate the consequences of simplifying it (for future study waves). Replicating results across target populations is necessary before concluding that monitoring compliance to nutritional guidelines may generally be simplified.</p>
        <p>Our study used the data mining technique of decision trees to automatically relate individual food consumption to meeting specific guidelines. This is a well-researched technique, which has been applied to problems arising in health on multiple occasions. One specific advantage of decision trees lies in their ability to produce a model that can easily be interpreted and used with limited training. For instance, in triage, decision trees provide a “flowchart” that lay participants as well as field specialists can use intuitively. That is, an adaptive questionnaire can be formed by following the rules induced by a tree (<xref ref-type="fig" rid="figure1">Figure 1</xref>), which can be done using a computer program or by individuals. In contrast, many other techniques (eg, Support Vector Machines, Neural Networks) produce “black box” models, which are meant to be executed by machines rather than being read by humans. Future studies primarily concerned with accuracy (rather than transparency/readability of the model) may explore using such techniques. Contrasting the use of neural networks to decision trees over the same dataset would provide valuable insight on how accurate we can be without restrictions, which would help to better situate the results from this study.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>We sought to determine whether identifying individual dietary compliance can be further simplified while remaining as informative and accurate. We found that reporting very few foods and only whether they were consumed was sufficient to correctly identify compliance to 5 major nutritional guidelines. Being able to reduce the detail of a dataset for national monitoring can make it easier to increase monitoring frequency or monitor more participants, thus increasing research participation without increasing study costs.</p>
      </sec>
    </sec>
  </body>
  <back>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">LISS</term>
          <def>
            <p>Longitudinal Internet Studies for the Social sciences</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">NDNS</term>
          <def>
            <p>National Diet and Nutrition Survey</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">SMOTE</term>
          <def>
            <p>Synthetic Minority Over-Sampling Technique</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">WHO</term>
          <def>
            <p>World Health Organization</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors wish to thank the Department of Computer Science and the College of Liberal Arts and Sciences at Northern Illinois University for funding this study via start-up funds to PG. NR was also financially supported by the Office of Student Engagement and Experiential Learning at Northern Illinois University.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Trieu</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>McLean</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Johnson</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Santos</surname>
            <given-names>JA</given-names>
          </name>
          <name name-style="western">
            <surname>Raj</surname>
            <given-names>TS</given-names>
          </name>
          <name name-style="western">
            <surname>Campbell</surname>
            <given-names>NR</given-names>
          </name>
          <name name-style="western">
            <surname>Webster</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>The Science of Salt: A Regularly Updated Systematic Review of the Implementation of Salt Reduction Interventions (November 2015 to February 2016)</article-title>
        <source>J Clin Hypertens (Greenwich)</source>  
        <year>2016</year>  
        <month>12</month>  
        <volume>18</volume>  
        <issue>12</issue>  
        <fpage>1194</fpage>  
        <lpage>1204</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dx.doi.org/10.1111/jch.12909"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1111/jch.12909</pub-id>
        <pub-id pub-id-type="medline">27704719</pub-id></nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Owen</surname>
            <given-names>AJ</given-names>
          </name>
          <name name-style="western">
            <surname>Retegan</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Rockell</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Jennings</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Reid</surname>
            <given-names>CM</given-names>
          </name>
        </person-group>
        <article-title>Inertia or inaction? Blood pressure management and cardiovascular risk in diabetes</article-title>
        <source>Clin Exp Pharmacol Physiol</source>  
        <year>2009</year>  
        <month>07</month>  
        <volume>36</volume>  
        <issue>7</issue>  
        <fpage>643</fpage>  
        <lpage>7</lpage>  
        <pub-id pub-id-type="doi">10.1111/j.1440-1681.2008.05125.x</pub-id>
        <pub-id pub-id-type="medline">19076166</pub-id>
        <pub-id pub-id-type="pii">CEP5125</pub-id></nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>X</given-names>
          </name>
          <name name-style="western">
            <surname>Ouyang</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Zhu</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Zhao</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Bao</surname>
            <given-names>W</given-names>
          </name>
          <name name-style="western">
            <surname>Hu</surname>
            <given-names>FB</given-names>
          </name>
        </person-group>
        <article-title>Fruit and vegetable consumption and mortality from all causes, cardiovascular disease, and cancer: systematic review and dose-response meta-analysis of prospective cohort studies</article-title>
        <source>BMJ</source>  
        <year>2014</year>  
        <volume>349</volume>  
        <fpage>g4490</fpage> </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Poslusna</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Ruprich</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>de Vries</surname>
            <given-names>JH</given-names>
          </name>
          <name name-style="western">
            <surname>Jakubikova</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>van't Veer</surname>
            <given-names>P</given-names>
          </name>
        </person-group>
        <article-title>Misreporting of energy and micronutrient intake estimated by food records and 24 hour recalls, control and adjustment methods in practice</article-title>
        <source>Br J Nutr</source>  
        <year>2009</year>  
        <month>07</month>  
        <volume>101</volume>  
        <issue>Suppl 2</issue>  
        <fpage>S73</fpage>  
        <lpage>85</lpage>  
        <pub-id pub-id-type="doi">10.1017/S0007114509990602</pub-id>
        <pub-id pub-id-type="medline">19594967</pub-id>
        <pub-id pub-id-type="pii">S0007114509990602</pub-id></nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Batista</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Prati</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Monard</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <person-group person-group-type="editor">
          <name name-style="western">
            <surname>Famili</surname>
            <given-names>AF</given-names>
          </name>
          <name name-style="western">
            <surname>Kok</surname>
            <given-names>JN</given-names>
          </name>
          <name name-style="western">
            <surname>Peña</surname>
            <given-names>JM</given-names>
          </name>
          <name name-style="western">
            <surname>Siebes</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Feelders</surname>
            <given-names>A</given-names>
          </name>
        </person-group>
        <article-title>Balancing Strategies and Class Overlapping</article-title>
        <source>Advances in Intelligent Data Analysis VI. Lecture Notes in Computer Science</source>  
        <year>2005</year>  
        <publisher-loc>Berlin, Heidelberg</publisher-loc>
        <publisher-name>Springer</publisher-name>
        <fpage>24</fpage>  
        <lpage>35</lpage> </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Nau</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Ellis</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Huang</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Schwartz</surname>
            <given-names>BS</given-names>
          </name>
          <name name-style="western">
            <surname>Hirsch</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Bailey-Davis</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Kress</surname>
            <given-names>AM</given-names>
          </name>
          <name name-style="western">
            <surname>Pollak</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Glass</surname>
            <given-names>TA</given-names>
          </name>
        </person-group>
        <article-title>Exploring the forest instead of the trees: An innovative method for defining obesogenic and obesoprotective environments</article-title>
        <source>Health Place</source>  
        <year>2015</year>  
        <month>09</month>  
        <volume>35</volume>  
        <fpage>136</fpage>  
        <lpage>46</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/26398219"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1016/j.healthplace.2015.08.002</pub-id>
        <pub-id pub-id-type="medline">26398219</pub-id>
        <pub-id pub-id-type="pii">S1353-8292(15)00111-2</pub-id>
        <pub-id pub-id-type="pmcid">PMC4756636</pub-id></nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Seyednasrollah</surname>
            <given-names>F</given-names>
          </name>
          <name name-style="western">
            <surname>Mäkelä</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Pitkänen</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Juonala</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Hutri-Kähönen</surname>
            <given-names>N</given-names>
          </name>
          <name name-style="western">
            <surname>Lehtimäki</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Viikari</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Kelly</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Li</surname>
            <given-names>C</given-names>
          </name>
          <name name-style="western">
            <surname>Bazzano</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Elo</surname>
            <given-names>LL</given-names>
          </name>
          <name name-style="western">
            <surname>Raitakari</surname>
            <given-names>OT</given-names>
          </name>
        </person-group>
        <article-title>Prediction of Adulthood Obesity Using Genetic and Childhood Clinical Risk Factors in the Cardiovascular Risk in Young Finns Study</article-title>
        <source>Circ Cardiovasc Genet</source>  
        <year>2017</year>  
        <month>06</month>  
        <volume>10</volume>  
        <issue>3</issue>  
        <fpage>e001554</fpage>  
        <pub-id pub-id-type="doi">10.1161/CIRCGENETICS.116.001554</pub-id>
        <pub-id pub-id-type="medline">28620069</pub-id>
        <pub-id pub-id-type="pii">CIRCGENETICS.116.001554</pub-id></nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Giabbanelli</surname>
            <given-names>PJ</given-names>
          </name>
          <name name-style="western">
            <surname>Adams</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Identifying small groups of foods that can predict achievement of key dietary recommendations: data mining of the UK National Diet and Nutrition Survey, 2008-12</article-title>
        <source>Public Health Nutr</source>  
        <year>2016</year>  
        <month>06</month>  
        <volume>19</volume>  
        <issue>9</issue>  
        <fpage>1543</fpage>  
        <lpage>51</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://journals.cambridge.org/abstract_S1368980016000185"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1017/S1368980016000185</pub-id>
        <pub-id pub-id-type="medline">26879185</pub-id>
        <pub-id pub-id-type="pii">S1368980016000185</pub-id>
        <pub-id pub-id-type="pmcid">PMC4873899</pub-id></nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Kerr</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Patterson</surname>
            <given-names>RE</given-names>
          </name>
          <name name-style="western">
            <surname>Ellis</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Godbole</surname>
            <given-names>S</given-names>
          </name>
          <name name-style="western">
            <surname>Johnson</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Lanckriet</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>Staudenmayer</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>Objective Assessment of Physical Activity: Classifiers for Public Health</article-title>
        <source>Med Sci Sports Exerc</source>  
        <year>2016</year>  
        <month>05</month>  
        <volume>48</volume>  
        <issue>5</issue>  
        <fpage>951</fpage>  
        <lpage>7</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/27089222"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1249/MSS.0000000000000841</pub-id>
        <pub-id pub-id-type="medline">27089222</pub-id>
        <pub-id pub-id-type="pii">00005768-201605000-00024</pub-id>
        <pub-id pub-id-type="pmcid">PMC4837464</pub-id></nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Huys</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Jirsa</surname>
            <given-names>VK (editors)</given-names>
          </name>
        </person-group>
        <source>Nonlinear Dynamics in Human Behavior. Studies in Computational Intelligence 328</source>  
        <year>2011</year>  
        <publisher-loc>Berlin Heidelberg</publisher-loc>
        <publisher-name>Springer-Verlag</publisher-name></nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Crutzen</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Giabbanelli</surname>
            <given-names>P</given-names>
          </name>
        </person-group>
        <article-title>Using Classifiers to Identify Binge Drinkers Based on Drinking Motives</article-title>
        <source>Subst Use Misuse</source>  
        <year>2015</year>  
        <volume>49</volume>  
        <issue>1-2</issue>  
        <fpage>110</fpage>  
        <pub-id pub-id-type="doi">10.3109/10826084.2013.824467</pub-id>
        <pub-id pub-id-type="medline">23964957</pub-id></nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Garcia</surname>
            <given-names>VL</given-names>
          </name>
          <name name-style="western">
            <surname>Sánchez</surname>
            <given-names>JS</given-names>
          </name>
          <name name-style="western">
            <surname>Mollineda</surname>
            <given-names>RA</given-names>
          </name>
          <name name-style="western">
            <surname>Alejo</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Sotoca</surname>
            <given-names>JM</given-names>
          </name>
        </person-group>
        <article-title>The class imbalance problem in pattern classification and learning</article-title>
        <year>2007</year>  
        <conf-name>the Congreso Español de Informática (CEDI)</conf-name>
        <conf-date>2007</conf-date>
        <conf-loc>Zaragoza, Spain</conf-loc></nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Crutzen</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Giabbanelli</surname>
            <given-names>PJ</given-names>
          </name>
          <name name-style="western">
            <surname>Jander</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Mercken</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>de Vries</surname>
            <given-names>H</given-names>
          </name>
        </person-group>
        <article-title>Identifying binge drinkers based on parenting dimensions and alcohol-specific parenting practices: building classifiers on adolescent-parent paired data</article-title>
        <source>BMC Public Health</source>  
        <year>2015</year>  
        <month>08</month>  
        <day>05</day>  
        <volume>15</volume>  
        <fpage>747</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="https://bmcpublichealth.biomedcentral.com/articles/10.1186/s12889-015-2092-8"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1186/s12889-015-2092-8</pub-id>
        <pub-id pub-id-type="medline">26243154</pub-id>
        <pub-id pub-id-type="pii">10.1186/s12889-015-2092-8</pub-id>
        <pub-id pub-id-type="pmcid">PMC4526422</pub-id></nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Tanner</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Schreiber</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Low</surname>
            <given-names>JG</given-names>
          </name>
          <name name-style="western">
            <surname>Ong</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Tolfvenstam</surname>
            <given-names>T</given-names>
          </name>
          <name name-style="western">
            <surname>Lai</surname>
            <given-names>Y</given-names>
          </name>
          <name name-style="western">
            <surname>Ng</surname>
            <given-names>LC</given-names>
          </name>
          <name name-style="western">
            <surname>Leo</surname>
            <given-names>YS</given-names>
          </name>
          <name name-style="western">
            <surname>Thi Puong</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Vasudevan</surname>
            <given-names>SG</given-names>
          </name>
          <name name-style="western">
            <surname>Simmons</surname>
            <given-names>CP</given-names>
          </name>
          <name name-style="western">
            <surname>Hibberd</surname>
            <given-names>ML</given-names>
          </name>
          <name name-style="western">
            <surname>Ooi</surname>
            <given-names>EE</given-names>
          </name>
        </person-group>
        <article-title>Decision tree algorithms predict the diagnosis and outcome of dengue fever in the early phase of illness</article-title>
        <source>PLoS Negl Trop Dis</source>  
        <year>2008</year>  
        <month>03</month>  
        <day>12</day>  
        <volume>2</volume>  
        <issue>3</issue>  
        <fpage>e196</fpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://dx.plos.org/10.1371/journal.pntd.0000196"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1371/journal.pntd.0000196</pub-id>
        <pub-id pub-id-type="medline">18335069</pub-id>
        <pub-id pub-id-type="pmcid">PMC2263124</pub-id></nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Steele</surname>
            <given-names>R</given-names>
          </name>
          <name name-style="western">
            <surname>Green</surname>
            <given-names>SM</given-names>
          </name>
          <name name-style="western">
            <surname>Gill</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Coba</surname>
            <given-names>V</given-names>
          </name>
          <name name-style="western">
            <surname>Oh</surname>
            <given-names>B</given-names>
          </name>
        </person-group>
        <article-title>Clinical decision rules for secondary trauma triage: predictors of emergency operative management</article-title>
        <source>Ann Emerg Med</source>  
        <year>2006</year>  
        <month>02</month>  
        <volume>47</volume>  
        <issue>2</issue>  
        <fpage>135</fpage>  
        <pub-id pub-id-type="doi">10.1016/j.annemergmed.2005.10.018</pub-id>
        <pub-id pub-id-type="medline">16431223</pub-id>
        <pub-id pub-id-type="pii">S0196-0644(05)01860-3</pub-id></nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="web">
        <source>Gov.UK</source>  
        <year>2014</year>  
        <month>05</month>  
        <access-date>2017-09-25</access-date>
        <comment>NDNS: results from Years 1 to 4 (combined) 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="https://www.gov.uk/government/statistics/national-diet-and-nutrition-survey-results-from-years-1-to-4-combined-of-the-rolling-programme-for-2008-and-2009-to-2011-and-2012">https://www.gov.uk/government/statistics/national-diet-and-nutrition-survey-results-from-years-1-to-4-combined-of-the-rolling-programme-for-2008-and-2009-to-2011-and-2012</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6tk7fopKD"/></comment> </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="web">
        <source>Webarchive.nationalarchives.gov.uk</source>  
        <year>2010</year>  
        <access-date>2018-05-13</access-date>
        <comment>National Diet and Nutrition Survey: Headline Results from Year 1 of the Rolling Programme and comparison with previous surveys London: Food Standards Agency and Department of Health 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://webarchive.nationalarchives.gov.uk/20101210143146/http://www.food.gov.uk/multimedia/pdfs/publication/ndnsreport0809.pdf">http://webarchive.nationalarchives.gov.uk/20101210143146/http://www.food.gov.uk/multimedia/pdfs/publication/ndnsreport0809.pdf</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6zOfNs9aU"/></comment> </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <collab>Scientific Advisory Committee on Nutrition</collab>
        </person-group>
        <source>Salt and Health</source>  
        <year>2003</year>  
        <publisher-loc>London, UK</publisher-loc>
        <publisher-name>The Stationery Office</publisher-name></nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
        <source>Apps.WHO</source>  
        <year>2003</year>  
        <access-date>2018-05-04</access-date>
        <comment>Diet, Nutrition and the Prevention of Chronic Diseases. Report of a Joint WHO/FAO Expert Consultation. WHO Technical Report Series no 916 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://apps.who.int/iris/bitstream/10665/42665/1/WHO_TRS_916.pdf">http://apps.who.int/iris/bitstream/10665/42665/1/WHO_TRS_916.pdf</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6zAVQNyTn"/></comment> </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Bramer</surname>
            <given-names>M</given-names>
          </name>
        </person-group>
        <article-title>Estimating the Predictive Accuracy of a Classifier</article-title>
        <source>Principles of Data Mining. Undergraduate Topics in Computer Science</source>  
        <year>2013</year>  
        <publisher-loc>London</publisher-loc>
        <publisher-name>Springer</publisher-name>
        <fpage>79</fpage>  
        <lpage>92</lpage> </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Giabbanelli</surname>
            <given-names>PJ</given-names>
          </name>
          <name name-style="western">
            <surname>Peters</surname>
            <given-names>JG</given-names>
          </name>
        </person-group>
        <article-title>An Algebraic Approach to Combining Classifiers</article-title>
        <source>Procedia Comput Sci</source>  
        <year>2015</year>  
        <volume>51</volume>  
        <fpage>1545</fpage>  
        <lpage>1554</lpage> </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Rokach</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Maimon</surname>
            <given-names>O</given-names>
          </name>
        </person-group>
        <person-group person-group-type="editor">
          <name name-style="western">
            <surname>Bunke</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>PSP</given-names>
          </name>
        </person-group>
        <source>Data Mining with Decision Trees: Theory and Applications. 2nd Edition. Machine Perception and Artificial Intelligence Series. Vol 81</source>  
        <year>2014</year>  
        <month>10</month>  
        <publisher-loc>Singapore</publisher-loc>
        <publisher-name>World Scientific Publishing Co. Pte. Ltd</publisher-name></nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Weiss</surname>
            <given-names>G</given-names>
          </name>
          <name name-style="western">
            <surname>McCarthy</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Zabar</surname>
            <given-names>B</given-names>
          </name>
        </person-group>
        <article-title>Cost-Sensitive Learning vs Sampling: Which is Best for Handling Unbalanced Classes with Unequal Error Costs?</article-title>
        <year>2007</year>  
        <conf-name>Proceedings of the International conference on Data Mining</conf-name>
        <conf-date>2007</conf-date>
        <conf-loc>Washington DC, USA</conf-loc></nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Maloof</surname>
            <given-names>MA</given-names>
          </name>
        </person-group>
        <article-title>Learning When Data Sets are Imbalanced and When Costs are Unequal and Unknown</article-title>
        <year>2003</year>  
        <conf-name>International Conference on Machine Learning</conf-name>
        <conf-date>2003</conf-date>
        <conf-loc>Washington DC, USA</conf-loc></nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Chawla</surname>
            <given-names>NV</given-names>
          </name>
          <name name-style="western">
            <surname>Bowyer</surname>
            <given-names>KW</given-names>
          </name>
          <name name-style="western">
            <surname>Hall</surname>
            <given-names>LO</given-names>
          </name>
          <name name-style="western">
            <surname>Kegelmeyer</surname>
            <given-names>WP</given-names>
          </name>
        </person-group>
        <article-title>SMOTE: Synthetic Minority Over-sampling Technique</article-title>
        <source>J Artif Intell Res</source>  
        <year>2002</year>  
        <month>06</month>  
        <day>01</day>  
        <volume>16</volume>  
        <fpage>321</fpage>  
        <lpage>357</lpage> </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="web">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Bellinger</surname>
            <given-names>C</given-names>
          </name>
        </person-group>
        <source>Ruor.uottawa</source>  
        <year>2016</year>  
        <access-date>2018-05-04</access-date>
        <comment>Beyond the Boundaries of SMOTE: A Framework for Manifold-Based Synthetically Oversampling 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="https://ruor.uottawa.ca/handle/10393/34643">https://ruor.uottawa.ca/handle/10393/34643</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6zAVjiALQ"/></comment> </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="book">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Quinlan</surname>
            <given-names>JR</given-names>
          </name>
        </person-group>
        <source>C4.5: programs for machine learning</source>  
        <year>1993</year>  
        <publisher-loc>San Francisco</publisher-loc>
        <publisher-name>Morgan Kaufmann</publisher-name></nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Kohavi</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <article-title>A study of cross-validation and bootstrap for accuracy estimation and model selection</article-title>
        <year>1995</year>  
        <conf-name>14th international joint conference on Artificial Intelligence</conf-name>
        <conf-date>August 20 - 25, 1995</conf-date>
        <conf-loc>Montreal QC, Canada</conf-loc>
        <fpage>1137</fpage>  
        <lpage>1143</lpage> </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="web">
        <source>Osf.io</source>  
        <access-date>2018-05-11</access-date>
        <comment>OSF: Data Mining on Foods 
        <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="https://osf.io/znv82/">https://osf.io/znv82/</ext-link>
        <ext-link ext-link-type="webcite" xlink:href="6zKonPcvj"/></comment> </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Giabbanelli</surname>
            <given-names>PJ</given-names>
          </name>
          <name name-style="western">
            <surname>Crutzen</surname>
            <given-names>R</given-names>
          </name>
        </person-group>
        <article-title>An agent-based social network model of binge drinking among Dutch adults</article-title>
        <source>J Artif Soc Soc Simul</source>  
        <year>2013</year>  
        <volume>16</volume>  
        <issue>2</issue>  
        <fpage>10</fpage> </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Kant</surname>
            <given-names>AK</given-names>
          </name>
        </person-group>
        <article-title>Indexes of overall diet quality: a review</article-title>
        <source>J Am Diet Assoc</source>  
        <year>1996</year>  
        <month>08</month>  
        <volume>96</volume>  
        <issue>8</issue>  
        <fpage>785</fpage>  
        <lpage>91</lpage>  
        <pub-id pub-id-type="doi">10.1016/S0002-8223(96)00217-9</pub-id>
        <pub-id pub-id-type="medline">8683010</pub-id>
        <pub-id pub-id-type="pii">S0002-8223(96)00217-9</pub-id></nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Hu</surname>
            <given-names>FB</given-names>
          </name>
        </person-group>
        <article-title>Dietary pattern analysis: a new direction in nutritional epidemiology</article-title>
        <source>Curr Opin Lipidol</source>  
        <year>2002</year>  
        <month>02</month>  
        <volume>13</volume>  
        <issue>1</issue>  
        <fpage>3</fpage>  
        <lpage>9</lpage>  
        <pub-id pub-id-type="medline">11790957</pub-id></nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Schwingshackl</surname>
            <given-names>L</given-names>
          </name>
          <name name-style="western">
            <surname>Hoffmann</surname>
            <given-names>G</given-names>
          </name>
        </person-group>
        <article-title>Diet quality as assessed by the Healthy Eating Index, the Alternate Healthy Eating Index, the Dietary Approaches to Stop Hypertension score, and health outcomes: a systematic review and meta-analysis of cohort studies</article-title>
        <source>J Acad Nutr Diet</source>  
        <year>2015</year>  
        <month>05</month>  
        <volume>115</volume>  
        <issue>5</issue>  
        <fpage>780</fpage>  
        <lpage>800.e5</lpage>  
        <pub-id pub-id-type="doi">10.1016/j.jand.2014.12.009</pub-id>
        <pub-id pub-id-type="medline">25680825</pub-id>
        <pub-id pub-id-type="pii">S2212-2672(14)01871-1</pub-id></nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Reedy</surname>
            <given-names>J</given-names>
          </name>
          <name name-style="western">
            <surname>Wirfält</surname>
            <given-names>E</given-names>
          </name>
          <name name-style="western">
            <surname>Flood</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Mitrou</surname>
            <given-names>PN</given-names>
          </name>
          <name name-style="western">
            <surname>Krebs-Smith</surname>
            <given-names>SM</given-names>
          </name>
          <name name-style="western">
            <surname>Kipnis</surname>
            <given-names>V</given-names>
          </name>
          <name name-style="western">
            <surname>Midthune</surname>
            <given-names>D</given-names>
          </name>
          <name name-style="western">
            <surname>Leitzmann</surname>
            <given-names>M</given-names>
          </name>
          <name name-style="western">
            <surname>Hollenbeck</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Schatzkin</surname>
            <given-names>A</given-names>
          </name>
          <name name-style="western">
            <surname>Subar</surname>
            <given-names>AF</given-names>
          </name>
        </person-group>
        <article-title>Comparing 3 dietary pattern methods--cluster analysis, factor analysis, and index analysis--With colorectal cancer risk: The NIH-AARP Diet and Health Study</article-title>
        <source>Am J Epidemiol</source>  
        <year>2010</year>  
        <month>02</month>  
        <day>15</day>  
        <volume>171</volume>  
        <issue>4</issue>  
        <fpage>479</fpage>  
        <lpage>87</lpage>  
        <comment>
          <ext-link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/20026579"/>
        </comment>  
        <pub-id pub-id-type="doi">10.1093/aje/kwp393</pub-id>
        <pub-id pub-id-type="medline">20026579</pub-id>
        <pub-id pub-id-type="pii">kwp393</pub-id>
        <pub-id pub-id-type="pmcid">PMC2842201</pub-id></nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Molag</surname>
            <given-names>ML</given-names>
          </name>
          <name name-style="western">
            <surname>de Vries</surname>
            <given-names>JH</given-names>
          </name>
          <name name-style="western">
            <surname>Ocké</surname>
            <given-names>MC</given-names>
          </name>
          <name name-style="western">
            <surname>Dagnelie</surname>
            <given-names>PC</given-names>
          </name>
          <name name-style="western">
            <surname>van den Brandt</surname>
            <given-names>PA</given-names>
          </name>
          <name name-style="western">
            <surname>Jansen</surname>
            <given-names>MC</given-names>
          </name>
          <name name-style="western">
            <surname>van Staveren</surname>
            <given-names>WA</given-names>
          </name>
          <name name-style="western">
            <surname>van't Veer</surname>
            <given-names>P</given-names>
          </name>
        </person-group>
        <article-title>Design characteristics of food frequency questionnaires in relation to their validity</article-title>
        <source>Am J Epidemiol</source>  
        <year>2007</year>  
        <month>12</month>  
        <day>15</day>  
        <volume>166</volume>  
        <issue>12</issue>  
        <fpage>1468</fpage>  
        <lpage>78</lpage>  
        <pub-id pub-id-type="doi">10.1093/aje/kwm236</pub-id>
        <pub-id pub-id-type="medline">17881382</pub-id>
        <pub-id pub-id-type="pii">kwm236</pub-id></nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Noethlings</surname>
            <given-names>U</given-names>
          </name>
          <name name-style="western">
            <surname>Hoffmann</surname>
            <given-names>K</given-names>
          </name>
          <name name-style="western">
            <surname>Bergmann</surname>
            <given-names>MM</given-names>
          </name>
          <name name-style="western">
            <surname>Boeing</surname>
            <given-names>H</given-names>
          </name>
          <collab>European Investigation into Cancer and Nutrition</collab>
        </person-group>
        <article-title>Portion size adds limited information on variance in food intake of participants in the EPIC-Potsdam study</article-title>
        <source>J Nutr</source>  
        <year>2003</year>  
        <month>02</month>  
        <volume>133</volume>  
        <issue>2</issue>  
        <fpage>510</fpage>  
        <lpage>5</lpage>  
        <pub-id pub-id-type="doi">10.1093/jn/133.2.510</pub-id>
        <pub-id pub-id-type="medline">12566492</pub-id></nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Wehling</surname>
            <given-names>H</given-names>
          </name>
          <name name-style="western">
            <surname>Lusher</surname>
            <given-names>J</given-names>
          </name>
        </person-group>
        <article-title>People with a body mass index ⩾30 under-report their dietary intake: A systematic review</article-title>
        <source>J Health Psychol</source>  
        <year>2017</year>  
        <month>07</month>  
        <day>01</day>  
        <fpage>1359105317714318</fpage>  
        <pub-id pub-id-type="doi">10.1177/1359105317714318</pub-id>
        <pub-id pub-id-type="medline">28810493</pub-id></nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
        <person-group person-group-type="author">
          <name name-style="western">
            <surname>Gibson</surname>
            <given-names>RS</given-names>
          </name>
          <name name-style="western">
            <surname>Charrondiere</surname>
            <given-names>UR</given-names>
          </name>
          <name name-style="western">
            <surname>Bell</surname>
            <given-names>W</given-names>
          </name>
        </person-group>
        <article-title>Measurement Errors in Dietary Assessment Using Self-Reported 24-Hour Recalls in Low-Income Countries and Strategies for Their Prevention</article-title>
        <source>Adv Nutr</source>  
        <year>2017</year>  
        <month>11</month>  
        <day>15</day>  
        <volume>8</volume>  
        <issue>6</issue>  
        <fpage>980</fpage>  
        <lpage>991</lpage>  
        <pub-id pub-id-type="doi">10.3945/an.117.016980</pub-id>
        <pub-id pub-id-type="medline">29141979</pub-id>
        <pub-id pub-id-type="pii">8/6/980</pub-id>
        <pub-id pub-id-type="pmcid">PMC5683000</pub-id></nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
