<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.1 20151215//EN" "http://jats.nlm.nih.gov/publishing/1.1/JATS-journalpublishing1.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="research-article" dtd-version="1.1">
   <front>
      <journal-meta>
         <journal-id journal-id-type="publisher-id">peerj</journal-id>
         <journal-id journal-id-type="pmc">peerj</journal-id>
         <journal-id journal-id-type="nlm-ta">PeerJ</journal-id>
         <journal-title-group>
            <journal-title>PeerJ</journal-title>
            <abbrev-journal-title abbrev-type="publisher">PeerJ</abbrev-journal-title>
         </journal-title-group>
         <issn pub-type="epub">2167-8359</issn>
         <publisher>
            <publisher-name>PeerJ Inc.</publisher-name>
            <publisher-loc>San Francisco, USA</publisher-loc>
         </publisher>
      </journal-meta>
      <article-meta>
         <article-id pub-id-type="publisher-id">3817</article-id>
         <article-id pub-id-type="doi">10.7717/peerj.3817</article-id>
         <article-categories>
            <subj-group subj-group-type="categories">
               <subject>Bioinformatics</subject>
               <subject>Ecology</subject>
               <subject>Genomics</subject>
               <subject>Microbiology</subject>
            </subj-group>
         </article-categories>
         <title-group>
            <article-title>Benchmarking viromics: an <italic>in silico</italic> evaluation of metagenome-enabled estimates of viral community composition and diversity</article-title>
         </title-group>
         <contrib-group content-type="authors">
            <contrib id="author-1" contrib-type="author" corresp="yes">
               <name>
                  <surname>Roux</surname>
                  <given-names>Simon</given-names>
               </name>
               <email>sroux@lbl.gov</email><xref ref-type="aff" rid="aff-1">1</xref></contrib>
            <contrib id="author-2" contrib-type="author">
               <name>
                  <surname>Emerson</surname>
                  <given-names>Joanne B.</given-names>
               </name><xref ref-type="aff" rid="aff-1">1</xref></contrib>
            <contrib id="author-3" contrib-type="author">
               <name>
                  <surname>Eloe-Fadrosh</surname>
                  <given-names>Emiley A.</given-names>
               </name><xref ref-type="aff" rid="aff-2">2</xref></contrib>
            <contrib id="author-4" contrib-type="author" corresp="yes">
               <name>
                  <surname>Sullivan</surname>
                  <given-names>Matthew B.</given-names>
               </name>
               <email>mbsulli@gmail.com</email><xref ref-type="aff" rid="aff-1">1</xref><xref ref-type="aff" rid="aff-3">3</xref></contrib>
            <aff id="aff-1"><label>1</label><institution>Department of Microbiology, Ohio State University</institution>, <city>Columbus</city>, <state>OH</state>, <country>United States of America</country></aff>
            <aff id="aff-2"><label>2</label><institution>Joint Genome Institute, Department of Energy</institution>, <city>Walnut Creek</city>, <state>CA</state>, <country>United States of America</country></aff>
            <aff id="aff-3"><label>3</label><institution>Department of Civil, Environmental and Geodetic Engineering, Ohio State University</institution>, <city>Columbus</city>, <state>OH</state>, <country>United States of America</country></aff>
         </contrib-group>
         <contrib-group content-type="editors">
            <contrib contrib-type="editor">
               <name>
                  <surname>Hallett</surname>
                  <given-names>Michael</given-names>
               </name>
            </contrib>
         </contrib-group>
         <pub-date pub-type="epub" date-type="pub" iso-8601-date="2017-09-21">
            <day>21</day>
            <month>9</month>
            <year iso-8601-date="2017">2017</year>
         </pub-date>
         <volume>5</volume>
         <elocation-id>e3817</elocation-id>
         <history>
            <date date-type="received" iso-8601-date="2017-06-24">
               <day>24</day>
               <month>6</month>
               <year iso-8601-date="2017">2017</year>
            </date>
            <date date-type="accepted" iso-8601-date="2017-08-26">
               <day>26</day>
               <month>8</month>
               <year iso-8601-date="2017">2017</year>
            </date>
         </history>
         <permissions>
            <copyright-statement>©2017 Roux et al.</copyright-statement>
            <copyright-year>2017</copyright-year>
            <copyright-holder>Roux et al.</copyright-holder>
            <license xlink:href="http://creativecommons.org/licenses/by/4.0/">
               <license-p>This is an open access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License</ext-link>, which permits unrestricted use, distribution, reproduction and adaptation in any medium and for any purpose provided that it is properly attributed. For attribution, the original author(s), title, publication source (PeerJ) and either DOI or URL of the article must be cited.</license-p>
            </license>
         </permissions>
         <self-uri xlink:href="https://peerj.com/articles/3817"/>
         <abstract>
            <sec>
               <title>Background</title>
               <p>Viral metagenomics (viromics) is increasingly used to obtain uncultivated viral genomes, evaluate community diversity, and assess ecological hypotheses. While viromic experimental methods are relatively mature and widely accepted by the research community, robust bioinformatics standards remain to be established. Here we used <italic>in silico</italic> mock viral communities to evaluate the viromic sequence-to-ecological-inference pipeline, including (i) read pre-processing and metagenome assembly, (ii) thresholds applied to estimate viral relative abundances based on read mapping to assembled contigs, and (iii) normalization methods applied to the matrix of viral relative abundances for alpha and beta diversity estimates.</p>
            </sec>
            <sec>
               <title>Results</title>
               <p>Tools specifically designed for metagenomes, specifically metaSPAdes, MEGAHIT, and IDBA-UD, were the most effective at assembling viromes. Read pre-processing, such as partitioning, had virtually no impact on assembly output, but may be useful when hardware is limited. Viral populations with 2–5 × coverage typically assembled well, whereas lesser coverage led to fragmented assembly. Strain heterogeneity within populations hampered assembly, especially when strains were closely related (average nucleotide identity, or ANI ≥97%) and when the most abundant strain represented &lt;50% of the population. Viral community composition assessments based on read recruitment were generally accurate when the following thresholds for detection were applied: (i) ≥10 kb contig lengths to define populations, (ii) coverage defined from reads mapping at ≥90% identity, and (iii) ≥75% of contig length with ≥1 × coverage. Finally, although data are limited to the most abundant viruses in a community, alpha and beta diversity patterns were robustly estimated (±10%) when comparing samples of similar sequencing depth, but more divergent (up to 80%) when sequencing depth was uneven across the dataset. In the latter cases, the use of normalization methods specifically developed for metagenomes provided the best estimates.</p>
            </sec>
            <sec>
               <title>Conclusions</title>
               <p>These simulations provide benchmarks for selecting analysis cut-offs and establish that an optimized sample-to-ecological-inference viromics pipeline is robust for making ecological inferences from natural viral communities. Continued development to better access RNA, rare, and/or diverse viral populations and improved reference viral genome availability will alleviate many of viromics’ remaining limitations.</p>
            </sec>
         </abstract>
         <kwd-group kwd-group-type="author">
            <kwd>Virus</kwd>
            <kwd>Virome</kwd>
            <kwd>Viral ecology</kwd>
            <kwd>Metagenome</kwd>
            <kwd>Assembly</kwd>
            <kwd>Benchmarks</kwd>
         </kwd-group>
         <funding-group>
            <award-group id="fund-1">
               <funding-source>Gordon and Betty Moore Foundation</funding-source>
               <award-id>GBMF #3790</award-id>
            </award-group>
            <award-group id="fund-2">
               <funding-source>NSF Biological Oceanography</funding-source>
               <award-id>OCE-1536989</award-id>
            </award-group>
            <award-group id="fund-3">
               <funding-source>US Department of Energy, Office of Science, Office of Biological and Environmental Research</funding-source>
               <award-id>DE-SC0010580</award-id>
               <award-id>DE-SC0016440</award-id>
            </award-group>
            <award-group id="fund-4">
               <funding-source>US Department of Energy Joint Genome Institute</funding-source>
               <award-id>DE-AC02-05CH11231</award-id>
            </award-group>
            <funding-statement>Matthew B. Sullivan and Simon Roux were supported by grants from the Gordon and Betty Moore Foundation (GBMF #3790) and NSF Biological Oceanography (OCE-1536989) awarded to Matthew B. Sullivan. Joanne B. Emerson was supported by the US Department of Energy, Office of Science, Office of Biological and Environmental Research, under the Genomic Science program (Awards DE-SC0010580 and DE-SC0016440). The work conducted by the US Department of Energy Joint Genome Institute, a DOE Office of Science User Facility, is supported under Contract No. DE-AC02-05CH11231. The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</funding-statement>
         </funding-group>
      </article-meta>
   </front>
   <body>
      <sec>
         <title>Background</title>
         <p>Microbial communities and their associated viruses are abundant, diverse, and play key roles in Earth’s ecosystems and processes (<xref ref-type="bibr" rid="ref-19">Falkowski, Fenchel &amp; Delong, 2008</xref>; <xref ref-type="bibr" rid="ref-13">Cobián Güemes et al., 2016</xref>). However, because most microbes and viruses remain uncultivated, and because viruses do not harbor a universal marker gene, viral ecology studies remain challenging to perform (<xref ref-type="bibr" rid="ref-10">Brum &amp; Sullivan, 2015</xref>; <xref ref-type="bibr" rid="ref-61">Solden, Lloyd &amp; Wrighton, 2016</xref>). Viral metagenomics (viromics) is a uniquely powerful tool for high-throughput analysis of uncultivated viruses (<xref ref-type="bibr" rid="ref-10">Brum &amp; Sullivan, 2015</xref>; <xref ref-type="bibr" rid="ref-13">Cobián Güemes et al., 2016</xref>). Initial viromics studies, despite being limited to gene-level analyses, revealed the large diversity of viral-encoded genes (<xref ref-type="bibr" rid="ref-18">Edwards &amp; Rohwer, 2005</xref>; <xref ref-type="bibr" rid="ref-57">Schoenfeld et al., 2008</xref>), provided first estimates of richness and functional diversity across natural viral communities (<xref ref-type="bibr" rid="ref-31">Hurwitz, Hallam &amp; Sullivan, 2013</xref>; <xref ref-type="bibr" rid="ref-30">Hurwitz, Brum &amp; Sullivan, 2015</xref>), and suggested the existence of biome-specific viral communities distributed worldwide (<xref ref-type="bibr" rid="ref-50">Rodriguez-Brito et al., 2010</xref>; <xref ref-type="bibr" rid="ref-53">Roux et al., 2012</xref>).</p>
         <p>Thanks to recent improvements in high-throughput sequencing technologies and genome assembly, viromes now also provide the opportunity to assemble large genome fragments (and even complete genomes) of uncultivated viruses (reviewed in <xref ref-type="bibr" rid="ref-10">Brum &amp; Sullivan, 2015</xref>; <xref ref-type="bibr" rid="ref-52">Rose et al., 2016</xref>). Historically, <italic>in silico</italic> benchmarks of the assembly process for microbial metagenomes indicated that accurate bacterial and archaeal genomes (complete or partial) could be recovered for relatively abundant lineages given sufficient sequencing depth, but revealed potential issues including misassemblies deriving from the presence of very closely related organisms (<xref ref-type="bibr" rid="ref-39">Mavromatis et al., 2007</xref>; <xref ref-type="bibr" rid="ref-41">Mende et al., 2012</xref>; <xref ref-type="bibr" rid="ref-25">Greenwald et al., 2017</xref>; <xref ref-type="bibr" rid="ref-58">Sczyrba et al., 2017</xref>). Viral community datasets are typically processed using the same methodologies, and viral-specific benchmarks came to a similar conclusion: viral genomes can be assembled from metagenomes, but the presence of co-existing viruses with highly similar regions in their genome can lead to reduced contig size and/or chimeric contigs (<xref ref-type="bibr" rid="ref-1">Aguirre de Cárcer, Angly &amp; Alcamí, 2014</xref>; <xref ref-type="bibr" rid="ref-64">Vázquez-Castellanos et al., 2014</xref>; <xref ref-type="bibr" rid="ref-23">García-López, Vázquez-Castellanos &amp; Moya, 2015</xref>; <xref ref-type="bibr" rid="ref-38">Martinez-Hernandez et al., 2017</xref>; <xref ref-type="bibr" rid="ref-67">White, Wang &amp; Hall, 2017</xref>). 
However, new metagenome assembly software (e.g., metaSPAdes, <xref ref-type="bibr" rid="ref-43">Nurk et al., 2017</xref>) and methods for read filtering and/or partitioning prior to assembly (e.g., khmer, <xref ref-type="bibr" rid="ref-14">Crusoe et al., 2015</xref>) that might improve assembly quality have yet to be evaluated with viral data.</p>
         <p>For bacteria and archaea, advances in genome binning and genome validation approaches (e.g., <xref ref-type="bibr" rid="ref-46">Parks et al., 2015</xref>) have significantly improved the recovery of accurately reconstructed genomes from increasingly complex environments (<xref ref-type="bibr" rid="ref-69">Wrighton et al., 2012</xref>; <xref ref-type="bibr" rid="ref-60">Sharon et al., 2013</xref>; <xref ref-type="bibr" rid="ref-66">Waldor et al., 2015</xref>; <xref ref-type="bibr" rid="ref-56">Sangwan, Xia &amp; Gilbert, 2016</xref>; <xref ref-type="bibr" rid="ref-58">Sczyrba et al., 2017</xref>). These methods rely on single-copy marker genes to assess genome bin completeness and “contamination” (i.e., multiple genomes in the same genome bin), two metrics critical to guide the optimization of genome binning parameters and curate the final dataset (<xref ref-type="bibr" rid="ref-46">Parks et al., 2015</xref>; <xref ref-type="bibr" rid="ref-8">Bowers et al., 2017</xref>). Unfortunately, because of the absence of a universal single-copy viral marker gene, viral genome bins are much more challenging to interpret and analyze. Since viral genomes are also smaller than microbial ones and thus more frequently assembled in a single contig, viromics studies usually rely on the assembled contigs without applying any genome binning step.</p>
         <p>For ecological analyses, a community abundance matrix of microbial OTU counts across samples is the typical starting point, and this “OTU table” is often derived from 16S rRNA gene abundances in amplicon sequencing datasets or metagenomes (<xref ref-type="bibr" rid="ref-29">Hill et al., 2003</xref>; <xref ref-type="bibr" rid="ref-51">Roesch et al., 2007</xref>; <xref ref-type="bibr" rid="ref-22">Fulthorpe et al., 2008</xref>; <xref ref-type="bibr" rid="ref-20">Fierer et al., 2011</xref>; <xref ref-type="bibr" rid="ref-34">Logares et al., 2014</xref>). Even for these relatively established microbial ecological analyses, appropriate normalization methods that account for different sequencing throughput across samples are still debated, and rarely are results compared across multiple normalization methods to establish best practices (<xref ref-type="bibr" rid="ref-16">Doll et al., 2013</xref>; <xref ref-type="bibr" rid="ref-47">Paulson et al., 2013</xref>; <xref ref-type="bibr" rid="ref-40">McMurdie &amp; Holmes, 2014</xref>). This microbial ecology pipeline also needs adjustment when applied to viruses because viruses lack a universal marker gene, precluding amplicon-based viral population abundance estimates at the community scale (although amplicon-based studies have been successful for ecological analyses of specific viral lineages, e.g., <xref ref-type="bibr" rid="ref-21">Filée et al., 2005</xref>; <xref ref-type="bibr" rid="ref-24">Goldsmith et al., 2011</xref>; <xref ref-type="bibr" rid="ref-12">Chow &amp; Fuhrman, 2012</xref>). Notably, comparative genomic and ecological analysis of model systems enabled the identification of sequence-discrete populations, which represent stable ecotypes in natural viral communities (<xref ref-type="bibr" rid="ref-35">Marston &amp; Amrich, 2009</xref>; <xref ref-type="bibr" rid="ref-26">Gregory et al., 2016</xref>; <xref ref-type="bibr" rid="ref-36">Marston &amp; Martiny, 2016</xref>). 
Thus, in the absence of a universal viral marker gene, these genome-based populations have been proposed to be used as viral population units (akin to a microbial operational taxonomic unit, OTU) in ecological analysis (<xref ref-type="bibr" rid="ref-9">Brum et al., 2015</xref>). Pragmatically, viral populations are derived from <italic>de novo</italic> metagenomic assemblies, with abundances estimated by metagenomic read recruitment. Ecological analyses of these contig-derived abundance matrices still have to be comprehensively evaluated, although one bias specific to this approach has already been identified: counting each assembled contig as a separate OTU can lead to over-estimates of the number of different viruses in the community (<xref ref-type="bibr" rid="ref-5">Aziz et al., 2015</xref>; <xref ref-type="bibr" rid="ref-23">García-López, Vázquez-Castellanos &amp; Moya, 2015</xref>).</p>
         <p>Here we used 14 <italic>in silico</italic> simulated viral metagenomes to (i) compare the assembly results across different read pre-processing methods and assemblers, both in terms of the overall genome recovery and the number and type of errors observed, (ii) assess potential biases and identify optimal thresholds for identification and quantification of viral populations from metagenomic contigs, and (iii) determine if virome population abundance matrices can provide reliable estimates of alpha diversity (i.e., diversity within a community) and beta diversity (i.e., differentiation between communities), even in cases where sequencing depth varies widely (up to two orders of magnitude) between samples.</p>
      </sec>
      <sec sec-type="methods">
         <title>Methods</title>
         <sec>
            <title>Mock community design</title>
            <p>Viral genomes were randomly selected among the complete genomes of viruses infecting bacteria or archaea in the NCBI RefSeq database (v69, 2015-02). For each mock community, the total number of viruses randomly selected (between 500 and 1,000, <xref ref-type="supplementary-material" rid="supp-2">Table S1</xref>, <xref ref-type="supplementary-material" rid="supp-1">Fig. S1A</xref>), as well as the parameter of the power law distribution used to model relative abundances (between 1 and 50) were varied (<xref ref-type="supplementary-material" rid="supp-1">Figs. S1B</xref>–<xref ref-type="supplementary-material" rid="supp-1">S1D</xref>). To create patterns of beta diversity across samples, the 50 most abundant viruses were homogenized within each of four sample groups, i.e., samples within a group shared 30 to 50 of their most abundant viruses, and samples between groups did not share any of their most abundant 50 viruses. This led to a clear beta diversity pattern with the mock communities clustering into four groups (<xref ref-type="supplementary-material" rid="supp-1">Figs. S1E</xref> &amp; <xref ref-type="supplementary-material" rid="supp-1">S1F</xref>, a PerMANOVA was performed in R with the package vegan (<xref ref-type="bibr" rid="ref-44">Oksanen et al., 2017</xref>) to verify that the sample groups were significantly different).</p>
         </sec>
         <sec>
            <title>Virome simulations</title>
            <p>To simulate virome sequencing for each mock community, the number of reads derived from each genome was first calculated based on the relative abundance of the genome in the mock community and the total number of reads sequenced in the virome (10 million paired-end reads in the initial viromes, 1 million and 100,000 paired-end reads for the subsets at 10% and 1% respectively). Then, NeSSM (<xref ref-type="bibr" rid="ref-32">Jia et al., 2013</xref>) was used to generate random reads (2 × 100 bp) at the prescribed abundances with simulated Illumina HiSeq errors.</p>
         </sec>
         <sec>
            <title>Reads processing</title>
            <p>Reads generated by NeSSM were first quality-controlled with Trimmomatic (<xref ref-type="bibr" rid="ref-7">Bolger, Lohse &amp; Usadel, 2014</xref>) with a minimum base quality threshold of 30 evaluated on sliding windows of 4 bases, and minimum read length of 50. We opted not to evaluate different error correction software or to compare raw reads to quality-controlled (QC) reads, as previous studies have already provided such benchmarks for genomic assembly, which should be applicable to metagenomic assembly as well (e.g., <xref ref-type="bibr" rid="ref-70">Yang, Chockalingam &amp; Aluru, 2013</xref>).</p>
            <p>All sets of additionally pre-processed reads were generated from these QC reads using khmer v1.4.1 (<xref ref-type="bibr" rid="ref-14">Crusoe et al., 2015</xref>), following the online protocols (<ext-link ext-link-type="uri" xlink:href="http://khmer-protocols.readthedocs.io/">http://khmer-protocols.readthedocs.io/</ext-link>, <xref ref-type="supplementary-material" rid="supp-1">Fig. S2</xref>). First, a dataset of digitally normalized reads was generated, i.e., a dataset in which all reads with median k-mer abundance higher than a specified threshold were eliminated. This was done in two steps by normalizing k-mer coverage first to 20 × then to 5 × (script “normalize-by-median”, dataset “Digital normalization”). The script “do-partition” was then used to partition these digitally normalized datasets, i.e., separate reads that did not connect to each other in the k-mer graphs into different bins (dataset “Partitioned reads (normalized)”). These read partitions were then re-inflated, i.e., the original abundance of reads was restored to its value prior to digital normalization, with the script “sweep-reads” (dataset “Partitioned reads (inflated)”). Finally, three sets of reads were generated by trimming all low-abundance k-mers for highly covered reads, i.e., highly covered reads (in this case, ≥20 ×) were truncated at the first occurrence of a k-mer below a given abundance cutoff (here ≤2 ×, ≤5 ×, and ≤20 × for the three datasets “Low k-mer filter (2 ×)”, “Low k-mer filter (5 ×)”, “Low k-mer filter (20 ×)”, respectively). This was done with the script “filter-abund”, with option “variable-coverage” as recommended for metagenomes.</p>
         </sec>
         <sec>
            <title>Assembly and comparison to input genomes</title>
            <p>The different read sets were assembled with five different assembly software tools, using metagenomic-optimized parameters (when available, <xref ref-type="supplementary-material" rid="supp-1">Fig. S2</xref>). IDBA-UD v.1.1.1 (<xref ref-type="bibr" rid="ref-48">Peng et al., 2012</xref>) was used with the option “pre-correction” and from fasta reads (converted from fastq reads with the tool “fq2fa”). MetaSPAdes assemblies (<xref ref-type="bibr" rid="ref-43">Nurk et al., 2017</xref>) were computed from the software version 3.10.0, with the option “metagenomic” (all other options default). MEGAHIT assemblies (<xref ref-type="bibr" rid="ref-33">Li et al., 2016</xref>) were computed from version v1.0.6 with presets “meta” (all other options default). MetaVelvet assemblies (<xref ref-type="bibr" rid="ref-42">Namiki et al., 2012</xref>) were computed with software version 1.2.07 with the “discard_chimera” option selected, default parameters otherwise. Omega assemblies (<xref ref-type="bibr" rid="ref-28">Haider et al., 2014</xref>) were computed with software version 1.0.2 and minimum overlap length of 60. Each assembler was applied to each read pool from each sample (7 read pools × 14 samples = 98 assemblies, <xref ref-type="supplementary-material" rid="supp-1">Fig. S2</xref>), retaining all contigs ≥500 bp for each assembly (<xref ref-type="supplementary-material" rid="supp-2">Table S4</xref>).</p>
            <p>Contigs were compared to the input genomes with nucmer (<xref ref-type="bibr" rid="ref-15">Delcher, Salzberg &amp; Phillippy, 2003</xref>) (default options). When ≥95% of a contig’s length matched an input genome at ≥90% nucleotide identity, that contig was considered to be a genuine assembly of the input genome. Otherwise, if a contig was similar to multiple genomes but to none over ≥95% of its length, it was considered a chimera. Circular contigs were detected based on identical 5′ and 3′ ends, as in (<xref ref-type="bibr" rid="ref-55">Roux et al., 2014</xref>). A circular contig with a length corresponding to ≥95% of the original genome length was considered a genuine complete genome assembly, while circular contigs covering less than 95% of the original genomes were considered false positives (i.e., incomplete contigs incorrectly predicted as complete genome assemblies). R was used to conduct <italic>t</italic>-tests when comparing rates of chimeric contigs across assemblers and read pre-processing methods, using the assembly of QC reads with MEGAHIT as the control (the set of contigs with the lowest number of chimeras).</p>
         </sec>
         <sec>
            <title>Generation of the non-redundant pool of population contigs and coverage estimation</title>
            <p>Based on the previous benchmarks, the assemblies obtained with metaSPAdes from the QC reads were considered to be the most optimal assemblies and were used in all subsequent benchmarking analyses. Contigs from all samples were clustered with nucmer (<xref ref-type="bibr" rid="ref-15">Delcher, Salzberg &amp; Phillippy, 2003</xref>) at ≥95% ANI across ≥80% of their lengths, as in (<xref ref-type="bibr" rid="ref-9">Brum et al., 2015</xref>; <xref ref-type="bibr" rid="ref-26">Gregory et al., 2016</xref>), to generate a pool of non-redundant “population contigs”. QC reads from each sample were then mapped to these population contigs with bbmap (<ext-link ext-link-type="uri" xlink:href="http://bit.ly/bbMap">http://bit.ly/bbMap</ext-link>), with ambiguous mapping assigned to contigs at random (option ambiguous=random). A custom python script was then used to estimate the number of reads and coverage of each contig.</p>
         </sec>
         <sec>
            <title>Alpha and beta diversity estimates</title>
            <p>The abundance of each population contig in a given sample was estimated based on the number of reads mapping to that contig, normalized by the contig length (to account for differences in contig / genome size). Beyond the raw read counts (normalized by contig length), five abundance matrices were generated with different library size normalization methods as follows (summarized in <xref ref-type="supplementary-material" rid="supp-1">Fig. S2</xref>):</p>
            <list id="list-1" list-type="bullet">
               <list-item><label> •</label><p>“Normalized”: counts were divided by the total library size, i.e., the total number of QC reads in the sample, as used for example in <xref ref-type="bibr" rid="ref-9">Brum et al. (2015)</xref>. This approach is also known as “total-sum scaling”.</p>
               </list-item>
               <list-item><label> •</label><p>“MGSeq”: counts were normalized through cumulative-sum scaling with the metagenomeSeq R package (<xref ref-type="bibr" rid="ref-47">Paulson et al., 2013</xref>). This method was specifically designed for metagenomes in which communities are under-sampled (as is the case in most viral metagenome studies), and will divide counts by a cumulative sum of counts up to a given percentile (as opposed to dividing by total counts as in “Normalized”). This will minimize the effects of the few highly abundant viruses potentially dominating the community, and introducing biases in relative abundances (<xref ref-type="bibr" rid="ref-47">Paulson et al., 2013</xref>).</p>
               </list-item>
               <list-item><label> •</label><p>“EdgeR”: counts were normalized using scaling factors for libraries designed to minimize the log-fold change between samples for most of the populations, computed with the edgeR R package (<xref ref-type="bibr" rid="ref-49">Robinson, McCarthy &amp; Smyth, 2009</xref>). This method was initially developed for count-based expression data and assumes that the relative abundances of most features (here populations) will not vary between two samples.</p>
               </list-item>
               <list-item><label> •</label><p>“DeSeq”: as with EdgeR, counts were normalized to minimize variations between samples for most populations but with a different underlying model, computed with the DESeq R package (<xref ref-type="bibr" rid="ref-3">Anders &amp; Huber, 2010</xref>). As with EdgeR, this method was initially developed for the detection of differentially expressed features in sequence count data analysis.</p>
               </list-item>
               <list-item><label> •</label><p>“Rarefied”: new counts were generated based on rarefied sets of reads, i.e., quality-controlled reads are subsampled (without replacement) to the smallest number of quality-controlled reads across all samples. Thus, all of the libraries are artificially set to the same size, however some data are “wasted” in the process, i.e., for the more deeply sequenced samples, some observations will not be included in the rarefied counts (<xref ref-type="bibr" rid="ref-40">McMurdie &amp; Holmes, 2014</xref>).</p>
               </list-item>
            </list>
            <p>Each abundance matrix was then used to calculate alpha and beta diversity indices, namely the Shannon index, Simpson index, and pairwise Bray–Curtis dissimilarities between samples with a custom perl script. R was used to generate all plots using the ggplot2 package (<xref ref-type="bibr" rid="ref-68">Wickham, 2009</xref>), as well as the NMDS and PerMANOVA analyses, computed with the vegan package (<xref ref-type="bibr" rid="ref-44">Oksanen et al., 2017</xref>). For alpha diversity, we opted to only test indices reflecting community structure (Shannon and Simpson indexes) and not indices predicting sample richness (e.g., Chao estimators (<xref ref-type="bibr" rid="ref-11">Chao, 1984</xref>)), since the latter have been highlighted as not suitable for cases in which rare members of the community are not adequately sampled (<xref ref-type="bibr" rid="ref-27">Haegeman et al., 2013</xref>).</p>
         </sec>
         <sec>
            <title>Under-sequencing and strain heterogeneity benchmarks</title>
            <p>To evaluate the impact of under-sequencing on alpha and beta diversity estimates, the same pipeline (assembly with metaSPAdes from QC Reads, selection of population contigs, and estimation of alpha and beta diversity) was applied to datasets in which seven of the 14 samples were under-sequenced. Two levels of under-sequencing were tested, one in which under-sequenced samples were set at 10% of the initial library size (i.e., 1,000,000 reads) and another at 1% of the initial library size (100,000 reads, <xref ref-type="supplementary-material" rid="supp-2">Table S1</xref>).</p>
            <p>To evaluate the impact of strain heterogeneity (within-population genomic diversity) on assembly success, a custom perl script was used to simulate strain variations as observed in natural populations of T4-like cyanophages (<xref ref-type="bibr" rid="ref-26">Gregory et al., 2016</xref>), i.e., a set of potentially mutated positions was determined for each new simulated strain, gathering all intergenic positions, all third codon positions in protein-coding genes, and all positions in two randomly selected genes (to simulate genes undergoing diversifying selection). These simulations were based on the mock community “Sample_1”, for which every genome was transformed into a population composed of a set of related strains.</p>
            <p>For each population, three parameters were selected randomly and independently:</p>
            <list id="list-2" list-type="bullet">
               <list-item><label> •</label><p>The total number of strains was set at 10, 50, or 100 strains simulated.</p>
               </list-item>
               <list-item><label> •</label><p>The strain divergence, controlled by a “mutation rate”, i.e., the ratio of positions mutated within the set of positions identified as “potentially mutated” (see above). The other positions in the genome, not selected as potentially mutated, were mutated at a rate 100 times lower. This “mutation rate” was set at 5%, 10%, or 20%. This led to ANI between the generated strains and the original reference genomes of 97–100%, 95–97%, and 90–95%, respectively.</p>
               </list-item>
               <list-item><label> •</label><p>The relative abundance of individual strains within the population, sampled from a power-law distribution. The shape of the distribution was controlled by the power-law parameter, set at 0.1, 1, 10, 100, or 1,000. This led to the dominant (i.e., most abundant) strain representing from 1% to 100% of the population.</p>
               </list-item>
            </list>
            <p>For each population, reads were then simulated with NeSSM (<xref ref-type="bibr" rid="ref-32">Jia et al., 2013</xref>), with the total reads generated for each population calculated based on the input coverage (as for previous simulations), and the number of reads generated from each strain calculated from the strain’s relative abundance. Reads were then processed as previously, i.e., quality-controlled, partitioned, or filtered, and assembled with the five assemblers tested using the same options as for the simulated viromes. Finally, the size of the largest contig recovered for each population was compared to the size of the largest contig recovered for the same genome without strain heterogeneity, to evaluate the impact of strain heterogeneity independently from differences in assembly efficiency between coverage levels, read processing methods, and assemblers.</p>
         </sec>
      </sec>
      <sec>
         <title>Results and Discussion</title>
         <sec>
            <title>Mock communities design</title>
            <p>A set of 14 viral communities was designed to provide a gradient of alpha diversity and clear beta diversity patterns (<xref ref-type="supplementary-material" rid="supp-1">Fig. S1</xref>, <xref ref-type="supplementary-material" rid="supp-2">Tables S1</xref> &amp; <xref ref-type="supplementary-material" rid="supp-2">S2</xref>). These communities were composed of 500 to 1,000 genomes (randomly sampled within bacteriophages and archaeal viruses available in NCBI RefSeq v69), with the relative abundance of individual genomes based on power law distributions with varying exponents. These simulations are thus designed to reflect a diverse viral community, as is usually observed in environmental samples (e.g., oceans, lakes, soils, or human gut), but would not correspond to viral communities dominated by a single type of virus, e.g., clinical samples associated with a specific host or epidemiological samples targeting a specific type of virus. Beyond differences in alpha diversity, these communities were also designed to organize into four “ecological” clusters, i.e., four groups of mock communities sharing more genomes within than between groups (<xref ref-type="supplementary-material" rid="supp-1">Fig. S1</xref>). Thus, this simulated dataset allows us to evaluate the ability of virome-based population ecology approaches to recover absolute values of alpha diversity, as well as trends in alpha diversity and beta diversity patterns across samples.</p>
            <p>Virome reads were simulated <italic>in silico</italic> with NeSSM (<xref ref-type="bibr" rid="ref-32">Jia et al., 2013</xref>) for each mock community (10,000,000 paired-end Illumina HiSeq reads, 2 × 100 bp). Since the number of reads derived from each genome was based on its prescribed relative abundance in the community, 29.1% to 75.2% of the viral genomes in each mock community did not get “sequenced” at all (i.e., did not yield any reads). This was by design to mimic the lack of sampling for rare viruses by current sequencing efforts of environmental samples.</p>
            <sec>
               <title>Testing the capacity and accuracy of assembly tools</title>
               <p>Given metagenomic sequence data from these 14 mock communities, we first evaluated currently available assembly algorithms. To this end, five assemblers (IDBA-UD (<xref ref-type="bibr" rid="ref-48">Peng et al., 2012</xref>), MEGAHIT (<xref ref-type="bibr" rid="ref-33">Li et al., 2016</xref>), MetaVelvet (<xref ref-type="bibr" rid="ref-42">Namiki et al., 2012</xref>), Omega (<xref ref-type="bibr" rid="ref-28">Haider et al., 2014</xref>), and metaSPAdes (<xref ref-type="bibr" rid="ref-43">Nurk et al., 2017</xref>), all adapted to assemble metagenomic data) were compared to assess their ability to accurately assemble genomes of bacterial and archaeal viruses from viromes (<xref ref-type="supplementary-material" rid="supp-1">Fig. S2</xref>). As expected, each of the assemblers successfully assembled highly covered genomes (10 × or higher) and failed to assemble most low-coverage genomes (2 × and lower, <xref ref-type="fig" rid="fig-1">Fig. 1A</xref>, <xref ref-type="supplementary-material" rid="supp-1">Fig. S3A</xref>). However, MetaVelvet and Omega required higher coverage to assemble viral genomes (∼5 − 10 ×), while IDBA-UD, MEGAHIT, and metaSPAdes routinely assembled genomes at ∼2 − 5 × coverage (<xref ref-type="fig" rid="fig-1">Fig. 1A</xref>, <xref ref-type="supplementary-material" rid="supp-1">Fig. S3A</xref>). A similar trend was found when observing genome recovery in a single contig (i.e., the percentage of a genome assembled in a single contig, as opposed to the percentage of a genome assembled when cumulating all contigs). Again, IDBA-UD, MEGAHIT, and metaSPAdes were more efficient than MetaVelvet and Omega for assembling large genome fragments at lower read coverage (∼2 − 20 ×), and metaSPAdes was also better than IDBA-UD and MEGAHIT for assembling low-coverage genomes in a single large contig (<xref ref-type="fig" rid="fig-1">Fig. 1B</xref>, <xref ref-type="supplementary-material" rid="supp-1">Fig. S3B</xref>).</p>
               <fig id="fig-1">
                  <object-id pub-id-type="doi">10.7717/peerj.3817/fig-1</object-id><label>Figure 1</label><caption>
                     <title>Influence of assembly software and read curation on genome recovery.</title>
                     <p>All plots display the input coverage on the <italic>x</italic>-axis, and either the cumulated genome recovery across all contigs (A &amp; C) or the highest genome recovery by a single contig (B &amp; D) on the <italic>y</italic>-axis. (A &amp; B) display a comparison of assemblers applied to quality-controlled (QC) reads. (C &amp; D) present a comparison of read pre-processing methods, all assembled with metaSPAdes. Comparable plots for reads assembled with the other assemblers are available in <xref ref-type="supplementary-material" rid="supp-1">Fig. S5</xref>.</p>
                  </caption>
                  <graphic mimetype="image" mime-subtype="png" xlink:href="https://peerj.com/articles/3817/fig-1.png"/>
               </fig>
               <p>When comparing individual genome assemblies across the three best assemblers (metaSPAdes, IDBA-UD, and MEGAHIT), no clear differences could be observed in the genome recovery (<xref ref-type="supplementary-material" rid="supp-1">Fig. S4</xref>, correlation coefficients between assemblers &gt; 0.99). However, the percentage of each genome recovered in a single contig was more variable among assemblers (<xref ref-type="supplementary-material" rid="supp-1">Fig. S4</xref>, correlation coefficients: 0.88–0.98). This comparison did not indicate that one assembler would be systematically better than another, but rather that the best assembly for a given genome could come from any of these three assemblers.</p>
               <p>Together these comparisons suggest that: (i) IDBA-UD, MEGAHIT, and metaSPAdes are currently the best available choices for maximizing assembly of viral contigs from short-read (100 bp) viromes (assembly accuracy discussed below), (ii) regardless of the choice of assembly tool, low coverage genomes (&lt;2 ×) are under-assembled, and (iii) because assembly success varies across genomes and assemblers, multiple tools should be compared to optimally assemble desired target genomes from viromes. Overall, these results are consistent with microbial metagenomic benchmarks, which also indicated that assemblers designed specifically for metagenomes, especially metaSPAdes, MEGAHIT, and IDBA-UD, provided the best assemblies (<xref ref-type="bibr" rid="ref-58">Sczyrba et al., 2017</xref>; <xref ref-type="bibr" rid="ref-65">Vollmers, Wiegand &amp; Kaster, 2017</xref>).</p>
            </sec>
            <sec>
               <title>Impact of k-mer-based read filtering and partitioning on assembly</title>
               <p>Next, we evaluated how available read pre-processing approaches impacted genome assembly (using approaches from the khmer package and summarized in <xref ref-type="supplementary-material" rid="supp-2">Table S3</xref> and <xref ref-type="supplementary-material" rid="supp-1">Fig. S2</xref>) (<xref ref-type="bibr" rid="ref-14">Crusoe et al., 2015</xref>). Briefly, beyond the reference dataset of quality controlled reads, the different methods tested were (i) trimming of reads based on low-abundance k-mers, i.e., reads are truncated at the first occurrence of a low-abundance k-mer likely originating from sequencing error, (ii) digital normalization, i.e., the removal of redundant sequences to normalize genome coverage at or under a specific value (here 5 ×), and (iii) read partitioning, i.e., separate assembly of the disconnected components of the k-mer graph.</p>
               <p>Overall, and compared with the effect of the different assembly algorithms, the read pre-processing had a minimal impact on the assembly output (<xref ref-type="fig" rid="fig-1">Figs. 1C</xref> and <xref ref-type="fig" rid="fig-1">1D</xref>, <xref ref-type="supplementary-material" rid="supp-1">Figs. S3C</xref> &amp; <xref ref-type="supplementary-material" rid="supp-1">S3D</xref> with metaSPAdes; the same observations were made with different assemblers in <xref ref-type="supplementary-material" rid="supp-1">Fig. S5</xref>). The main effects observed were that (i) digital normalization (treatments “Digital normalization” and “Partitioned reads (normalized)”) led to sub-optimal assemblies, likely because differences in coverage above 5 × are useful for assemblers to distinguish between related genomes, and (ii) trimming of low-abundance k-mers led to sub-optimal assemblies when the threshold used to define low abundance k-mers was close to the threshold used to define “abundant” reads to be trimmed (effect especially noticeable for the 20 × filter, <xref ref-type="fig" rid="fig-1">Figs. 1C</xref> &amp; <xref ref-type="fig" rid="fig-1">1D</xref>). Conversely, partitioning reads and keeping their coverage information (treatment “Partitioned reads (inflated)”) or trimming low-abundance k-mers from high coverage reads (with thresholds of 2 × and 5 ×) had little effect on the assembly output, except on low-coverage genomes (&lt;5 ×). These observations are consistent with the initial expectations of khmer’s performance (<xref ref-type="bibr" rid="ref-14">Crusoe et al., 2015</xref>), although these simulations illustrate that digital normalization alone (i.e., without read partitioning and restoration of original read coverage) can lead to a sub-optimal metagenomic assembly.</p>
            </sec>
            <sec>
               <title>Errors and limitations of genome assembly from viromes</title>
               <p>Beyond the assembly of low-coverage genomes, which was found to be challenging for all assemblers tested, other errors are known to occur during the <italic>de novo</italic> assembly of viromes.</p>
               <p>First, chimeric contigs (i.e., contigs representing artificial constructs assembled from two or more distinct genomes) were generated in each assembly, as previously noted (<xref ref-type="bibr" rid="ref-1">Aguirre de Cárcer, Angly &amp; Alcamí, 2014</xref>; <xref ref-type="bibr" rid="ref-64">Vázquez-Castellanos et al., 2014</xref>; <xref ref-type="bibr" rid="ref-23">García-López, Vázquez-Castellanos &amp; Moya, 2015</xref>). In our simulated data, these usually represented less than 2.5% of the assembled datasets, and less than 5% of the large contigs (≥10 kb), but these numbers varied between assemblers and read curation methods (<xref ref-type="fig" rid="fig-2">Figs. 2A</xref> &amp; <xref ref-type="fig" rid="fig-2">2B</xref>). This low number of chimeric contigs is in accordance with benchmarks of microbial metagenomes, and suggests that metagenome assemblers in general can correctly reconstruct microbial and/or viral genomes (<xref ref-type="bibr" rid="ref-41">Mende et al., 2012</xref>). For all assemblers, reads after digital normalization always yielded more chimeric contigs, which confirmed that the digital normalization step led to sub-optimal assemblies (<italic>p</italic>-value &lt;0.01). MEGAHIT systematically produced fewer chimeric contigs than IDBA-UD and metaSPAdes, especially for large (≥10 kb) contigs (<xref ref-type="fig" rid="fig-2">Fig. 2B</xref>, <italic>p</italic>-value &lt;0.01). Hence, although MEGAHIT did not assemble as many large genome fragments, the fragments that were assembled contained fewer chimeras.</p>
               <fig id="fig-2">
                  <object-id pub-id-type="doi">10.7717/peerj.3817/fig-2</object-id><label>Figure 2</label><caption>
                     <title>Types and frequency of errors observed in genome assembly from viral metagenomes.</title>
                     <p>(A) Percentage of chimeric contigs (i.e., contigs originating from two distinct genomes) across all assembled sequences, by assembler (<italic>x</italic>-axis) and read curation method (colors). (B) Percentage of chimeric contigs among large (≥10 kb) contigs, by assembler (<italic>x</italic>-axis) and read curation method (colors). (C) Percentage of false-positive circular contigs, i.e., contigs identified as circular (matching 5′ and 3′ ends) but representing 95% or less of the original genome, by assembler (<italic>x</italic>-axis) and read curation method (color). (D) Impact of strain heterogeneity (i.e., presence of multiple strains from the same population) on the assembly efficiency. These tests were computed on one mock community (Sample_1), for which each reference genome was replaced with a set of related strains with varying divergence and relative abundances. The <italic>y</italic>-axis represents the ratio between the largest contig assembled for a genome when strain heterogeneity is introduced and the same parameter without strain heterogeneity (i.e., previous assemblies of the same Sample_1). Populations are grouped based on the two main parameters explaining assembly inefficiency: proportion of the most abundant strain in the population (C, D) and divergence of strains in the population (A, B). Data presented here include assemblies from QC reads with IDBA-UD, MEGAHIT, and metaSPAdes, while the full set of parameters and approaches tested are presented in <xref ref-type="supplementary-material" rid="supp-1">Fig. S6</xref>.</p>
                  </caption>
                  <graphic mimetype="image" mime-subtype="png" xlink:href="https://peerj.com/articles/3817/fig-2.png"/>
               </fig>
               <p>Next, we investigated whether finished and closed viral genome assemblies could be robustly identified as “circular” contigs, i.e., contigs with matching 5′ and 3′ ends, as previously suggested (<xref ref-type="bibr" rid="ref-55">Roux et al., 2014</xref>). The ratio of false-positive circular contigs, i.e., circular contigs that represented less than 95% of the original genome and thus likely arose from repeat regions within a genome, was not modified by read pre-processing but was different among assemblers (<xref ref-type="fig" rid="fig-2">Fig. 2C</xref>). Specifically, 10 to 30% of the circular contigs generated by MEGAHIT and IDBA-UD did not correspond to a complete genome, while metaSPAdes assemblies rarely included any false positive (4 contigs, or &lt;2%, for metaSPAdes assemblies of quality-controlled reads). This suggests that metaSPAdes circular contigs are more likely to correspond to complete genomes and that the “circularization” of a contig cannot be considered as proof of completeness for MEGAHIT and IDBA-UD contigs.</p>
               <p>Finally, we evaluated the impact of population strain heterogeneity— i.e., the co-existence of closely related strains with distinct genomes from the same population—on virome assembly. In microbial communities, strain heterogeneity is known to considerably hamper the assembly of the corresponding genomes (<xref ref-type="bibr" rid="ref-59">Sharon et al., 2015</xref>; <xref ref-type="bibr" rid="ref-38">Martinez-Hernandez et al., 2017</xref>; <xref ref-type="bibr" rid="ref-58">Sczyrba et al., 2017</xref>). Population genetic studies of natural viral communities are however challenged by the paucity of cultivated systems that include multiple viral genomic representatives from a single population. Pragmatically, this means that although strain heterogeneity has been observed for specific model systems (<xref ref-type="bibr" rid="ref-26">Gregory et al., 2016</xref>; <xref ref-type="bibr" rid="ref-36">Marston &amp; Martiny, 2016</xref>), community-wide strain variations that would accurately reflect natural viral communities cannot be pulled from these data. Hence, we opted to generate a mock community using the same populations and relative abundances as Sample 1 above, but introduced some level of strain heterogeneity for each population by varying a combination of three parameters: (i) the number of strains in the population, either low (<italic>n</italic> = 10), medium (<italic>n</italic> = 50), or high (<italic>n</italic> = 100), (ii) the diversity of these strains, presented as the average ANI of strains compared to the consensus population genome, either low (90–95%), medium (95–97%), or high (97–100%), and (iii) the evenness of the power-law distribution of strain frequency in the population, either low (dominant variant represents 75–100% of the population), medium (dominant variant 50–75%), or high (dominant variant &lt; 25%). 
For each genome, reads were thus not generated from the reference genome sequence as before, but from a set of strains generated and sampled using a random combination of these 3 parameters. Then, the same pipeline of read processing and assembly was applied, and the size of the largest contig obtained for each population was compared to the size of the largest contig obtained in the previous mock community assembly (i.e., without strain heterogeneity, <xref ref-type="fig" rid="fig-2">Fig. 2D</xref> and <xref ref-type="supplementary-material" rid="supp-1">Fig. S6</xref>).</p>
               <p>An ANOVA was performed on the complete dataset (i.e., all combinations of assemblers and read processing) to evaluate which component of strain heterogeneity impacted the assembly process (see ‘Methods’). The three parameters (number of strains, strain diversity, and evenness of strain distribution) significantly but differently impacted the assembly: population shape (i.e., strain distribution) was the main explanatory variable of suboptimal assemblies (<italic>F</italic>-value 149.8, <italic>p</italic>-value &lt; 1<italic>e</italic> − 16), strain diversity was also a strong driver of assembly failures (<italic>F</italic>-value 70.4, <italic>p</italic>-value &lt; 1<italic>e</italic> − 16), while the number of strains in the populations had a more marginal effect (<italic>F</italic>-value 2.8, <italic>p</italic>-value 0.06). Overall, when compared to the assemblies generated without strain heterogeneity, contigs were shorter for populations with an even strain distribution (i.e., dominant strain ≤ 50% of the population) and/or when strains were more similar to the consensus genome (i.e., average ANI to consensus ≥ 97%) and to each other, with the combination of both leading to the greatest reduction in contig length (<xref ref-type="fig" rid="fig-2">Fig. 2D</xref>). These results indicate that strain heterogeneity within natural viral populations will likely be a key factor contributing to assembly success and failure, and populations of evenly distributed closely related strains will be the most likely to fail to assemble in virome studies. 
A similar trend was observed for microbial genomes in the Critical Assessment of Metagenome Interpretation benchmarks, where the assembly of closely related genomes (i.e., those with strain-level heterogeneity) was found to be challenging for all assemblers tested, although the experimental design did not allow the evaluation of which level and parameter of strain heterogeneity were most impactful (<xref ref-type="bibr" rid="ref-58">Sczyrba et al., 2017</xref>).</p>
            </sec>
            <sec>
               <title>Population identification and quantification</title>
               <p>In viral ecological studies, the next step after assembly often consists of identifying viral populations (i.e., contigs representing individual populations) and quantifying their relative abundances in each sample. We opted to use the contigs assembled with metaSPAdes from quality-controlled reads, as they represented the largest contigs overall across the different samples (despite ∼1% chimerism). We pooled contigs generated from all samples into a single non-redundant database (contigs were clustered at ≥95% of nucleotide identity across ≥80% of the contig length, in accordance with population genome analysis (<xref ref-type="bibr" rid="ref-26">Gregory et al., 2016</xref>)). Quality-controlled reads were then mapped to this database to estimate contig coverage across the 14 samples. Two types of thresholds were evaluated in this mapping step: (i) minimum nucleotide identity for a given read to be considered mapped to a given contig, and (ii) minimum length of the contig covered to consider a contig as “detected” in a sample (<xref ref-type="supplementary-material" rid="supp-1">Fig. S2</xref>). Reads not meeting the threshold were removed from abundance counts, and contigs not meeting the detection threshold in a given sample were given abundance values of zero for that sample in the resulting coverage table.</p>
               <p>Considering all non-redundant contigs ≥500 bp as different populations, we observed that increasing the two thresholds (read mapping identity percentage and length of contig covered) progressively decreased the sensitivity of the analysis (evaluated here as the percentage of genomes recovered among genomes which were covered ≥1 × in the sample, <xref ref-type="fig" rid="fig-3">Fig. 3A</xref>) and the false discovery rate (or FDR, which is the percentage of contigs recovered that were not part of the initial community, i.e., these genomes did not provide any reads to the simulated metagenome, <xref ref-type="fig" rid="fig-3">Fig. 3B</xref>). However, because FDR decreased more precipitously than sensitivity, there is an optimal combination of thresholds for which FDR can be minimized and sensitivity maximized. In these simulations, that optimal threshold was ≥75% on the contig length coverage associated with ≥90% nucleotide identity for the read mapping, which led to a 3% decrease in sensitivity (compared to the most permissive thresholds), but only 13% FDR (compared to 49% for the most permissive thresholds).</p>
               <fig id="fig-3">
                  <object-id pub-id-type="doi">10.7717/peerj.3817/fig-3</object-id><label>Figure 3</label><caption>
                     <title>Impact of read mapping thresholds on accuracy of viral population detection.</title>
                     <p>Two parameters were investigated when parsing the mapping of individual virome reads to the population contigs pool: (i) the percentage of a contig covered by a sample to consider the contig as detected (<italic>x</italic>-axis), and (ii) the percentage of identity of reads mapping to the contig (color scale). Two pools of population contigs were tested: all non-redundant contigs of ≥500 bp (A–C), and all non-redundant contigs ≥10 kb (D–F). Three metrics were calculated to evaluate the impact of mapping reads thresholds. The detection sensitivity is estimated as the percentage of “expected” genomes (i.e., genomes covered ≥1 × in the sample) that were detected through mapping to population contigs (A and D). The false-discovery rate corresponds to the percentage of contigs detected in a sample through mapping to population contigs, but that were not associated with any genomes from the initial sample (i.e., these genomes did not provide any reads to the simulated virome, so these contigs should not be detected, B and E). Finally, the average number of distinct population contigs detected is calculated for each individual genome initially covered ≥1 ×, and corresponds to the number of times a single genome is “counted” (i.e., multiple contigs suggest multiple populations, even though it is really just one population, C and F).</p>
                  </caption>
                  <graphic mimetype="image" mime-subtype="png" xlink:href="https://peerj.com/articles/3817/fig-3.png"/>
               </fig>
               <p>As noted by previous studies (<xref ref-type="bibr" rid="ref-5">Aziz et al., 2015</xref>; <xref ref-type="bibr" rid="ref-23">García-López, Vázquez-Castellanos &amp; Moya, 2015</xref>), considering all non-redundant contigs as distinct populations strongly over-estimated the total number of populations (on average, two to three contigs were counted for each individual genome, <xref ref-type="fig" rid="fig-3">Fig. 3C</xref>). Thus, we re-analyzed our dataset using only non-redundant contigs ≥10 kb or circular as was proposed previously, and as required for taxonomic classification by gene content network-based analysis (<xref ref-type="bibr" rid="ref-6">Bolduc et al., 2017</xref>). Again, the optimal threshold combination was ≥75% of the contig length covered and ≥90% read mapping identity (<xref ref-type="fig" rid="fig-3">Figs. 3D</xref>–<xref ref-type="fig" rid="fig-3">3F</xref>). However, while sensitivity declined slightly (∼15%) compared to the dataset including all contigs ≥500 bp, FDR improved drastically to 0.2%, compared to 13% observed in the above analyses. Further, by increasing the stringency of the population definition, the number of contigs per genome that were counted as a population was 1.2, which is much closer to the correct number of 1 contig per genome. More generally, increasing this contig size threshold quickly decreased the number of contigs observed per genome, and most of the over-estimation observed earlier seemed to arise from contigs &lt;5 kb (<xref ref-type="supplementary-material" rid="supp-1">Fig. S7</xref>).</p>
               <p>In summary, we recommend that viral populations (as an operational taxonomic unit) be defined and analyzed in viromes using contigs that are ≥10 kb or circular, and only considered “detected” when the contig is covered over ≥75% of its length by read mapping at ≥90% nucleotide identity. However, we also anticipate that the data from these sensitivity analyses will help researchers tune these thresholds to match a given study’s need for high sensitivity or low FDR. Importantly though, these suggestions are specific to viromes, since microbial metagenomic studies can rely on genome binning and universally conserved, single-copy marker genes to estimate more robustly the global number and completeness of the different genomes assembled (<xref ref-type="bibr" rid="ref-58">Sczyrba et al., 2017</xref>).</p>
            </sec>
            <sec>
               <title>Alpha and beta diversity estimation from virome-derived populations</title>
               <p>We next sought to evaluate how the variation in community structure of our 14 mock community metagenomes impacted diversity estimations, and did so using our recommended optimized population cut-offs for identifying populations and then estimating their abundances by read mapping. These population count matrices (counting either base pairs or reads mapped to each population contig) were used as input for alpha and beta diversity estimations and compared across the dataset. Notably, these matrices included only a fraction (10–33%) of the original genomes in the dataset, as rare viral genomes were not “sequenced”, and low-coverage genomes produced only small (&lt;10 kb) contigs (<xref ref-type="fig" rid="fig-4">Fig. 4A</xref>).</p>
               <fig id="fig-4">
                  <object-id pub-id-type="doi">10.7717/peerj.3817/fig-4</object-id><label>Figure 4</label><caption>
                     <title>Estimation of alpha and beta diversity from virome-derived viral populations.</title>
                     <p>To evaluate the impact of varying sequencing depth, six viromes (highlighted in bold in A–C) were sub-sampled at 10% (long dash) or 1% (short dash) of the original read number (“Initial” corresponds to the assemblies presented in <xref ref-type="fig" rid="fig-1">Figs. 1</xref>–<xref ref-type="fig" rid="fig-3">3</xref>, for which all viromes had the same initial number of reads). (A) Number of genomes observed from the read mapping to viral populations. The actual number of genomes in the initial simulated community is indicated with black dots, while estimates based on viromes are colored in red. (B) Comparison of Shannon diversity index from the true community composition (black dots) and estimated from the viromes (colored dots). The different estimations are based on 3 different normalization methods: counts divided by the total number of reads sequenced in the virome and the contig size (“Normalized”), counts after rarefying all viromes to the smallest dataset and normalized by contig size (“Rarefied”), and counts normalized via DESeq (“DESeq”). (C) Comparison of Simpson diversity index from the true community composition and estimated from the viromes (color codes are the same as in B). (D) Distribution of differences in Bray–Curtis dissimilarities between samples calculated from true community composition and the same dissimilarities estimated from the viromes analysis. The different normalization methods (<italic>x</italic>-axis) are as follows: counts divided by genome size (“Counts”), counts rarefied to the smallest dataset and normalized by contig size (“Rarefied”), counts divided by the total number of reads sequenced in the library and the contig size (“Normalized”), counts normalized by metagenomeSeq (“MGSeq”), EdgeR (“RPKM”), and DESeq (“DESeq”). 
(E) Distribution of differences in Bray–Curtis dissimilarities between samples calculated from true community composition and the same dissimilarities estimated from virome analysis, including 6 samples sequenced at 10%. Methods are similar as in (D). (F) Distribution of differences in Bray–Curtis dissimilarities between samples calculated from true community composition and from virome analysis, including 6 samples sequenced at 1%. Methods are similar as in (D).</p>
                  </caption>
                  <graphic mimetype="image" mime-subtype="png" xlink:href="https://peerj.com/articles/3817/fig-4.png"/>
               </fig>
               <p>Before calculating any index, the read counts were first normalized by the contig length, since viral genome lengths can be highly variable (∼2 orders of magnitude, <xref ref-type="bibr" rid="ref-4">Angly et al., 2009</xref>). Then, to account for potential differences in library sizes, we compared five different methods: (i) a simple normalization in which counts are divided by the library size, “Normalized”; (ii) a method specifically designed to account for under-sampling of metagenomes, from the metagenomeSeq R package, “MGSeq”; (iii and iv) two methods designed to minimize log-fold changes between samples for most of the populations, from the edgeR R package, “edgeR”, and the DESeq R package, “DESeq”; and (v) a rarefaction approach whereby all libraries get randomly down-sampled without replacement to the size of the smallest library, “Rarefied” (<xref ref-type="supplementary-material" rid="supp-1">Fig. S2</xref>).</p>
               <p>For both Shannon and Simpson alpha diversity indices, the values calculated from normalized count matrices were within 10% of the actual value calculated from the whole community (<xref ref-type="fig" rid="fig-4">Figs. 4B</xref> &amp; <xref ref-type="fig" rid="fig-4">4C</xref>). Hence, the recovery of abundant members of the community seems to be enough to estimate alpha diversity values. Since both Shannon and Simpson indices are based on the relative abundance of individual members of the community, the three methods that applied a sample-wide correction factor (normalization by library size, MGSeq, edgeR) all led to the same estimations, while rarefied count matrices and DESeq, which can (slightly) modify relative abundance of populations within communities, provided statistically indistinguishable estimates (<xref ref-type="fig" rid="fig-4">Figs. 4B</xref> &amp; <xref ref-type="fig" rid="fig-4">4C</xref>). Similarly, for beta diversity estimates, pairwise Bray–Curtis dissimilarities between samples calculated from normalized counts matrices were highly similar to the dissimilarities calculated from the whole communities for all normalization methods (within 15% of actual values, <italic>p</italic>-value ≤0.001 for Mantel test comparing true and estimated dissimilarity matrices, <xref ref-type="fig" rid="fig-4">Fig. 4D</xref>). Thus, as long as the count matrices were normalized to account for different contig lengths and library sizes, each of the five methods tested here provided reliable estimates of alpha and beta diversity.</p>
            </sec>
            <sec>
               <title>Impact of under-sequencing and possible corrections</title>
               <p>Finally, to help guide researchers in making decisions about under-sequenced samples, we evaluated how alpha and beta diversity estimates were impacted by such samples in a dataset. Specifically, we performed the same computations (assembly with metaSPAdes from quality-controlled reads, generation of a pool of dereplicated population contigs, mapping of quality-controlled reads and computation of normalized count matrices), but we did so with a dataset in which half of the samples were drastically under-sequenced either at 10% (subset_10) or 1% (subset_1) of the original sequencing depth, respectively (<xref ref-type="supplementary-material" rid="supp-2">Table S1</xref>, <xref ref-type="supplementary-material" rid="supp-1">Fig. S2</xref>).</p>
               <p>Not surprisingly, under-sequenced samples resulted in fewer genomes detected (<italic>t</italic>-test, <italic>p</italic>-value &lt; 1 × 10<sup>−5</sup>, <xref ref-type="fig" rid="fig-4">Fig. 4A</xref>). Using the same five normalization methods to account for these differences in sequencing depth, we found that the diversity estimations were impacted. The subset_10 samples resulted in Shannon and Simpson estimations that were close (within 16%) to the initial estimates, but the diversity estimates in the subset_1 samples varied as much as 30% (<xref ref-type="fig" rid="fig-4">Figs. 4B</xref> &amp; <xref ref-type="fig" rid="fig-4">4C</xref>). Hence, although the different normalization methods tested here helped to compensate for some degree of under-sequencing, none was able to recover the correct values of alpha diversity when sequencing depth was highly variable and/or when some samples were significantly under-sequenced.</p>
               <p>Similarly, beta diversity patterns (evaluated as pairwise Bray–Curtis dissimilarities) were not estimated as accurately with the under-sequenced samples as with the initial samples: dissimilarities estimated from subset_10 samples varied as much as 61% compared with the true dissimilarities (mean: 5.9%), and the ones estimated from subset_1 samples varied as much as 77% (mean: 4.4%; <xref ref-type="fig" rid="fig-4">Figs. 4E</xref> &amp; <xref ref-type="fig" rid="fig-4">4F</xref>). Rarefaction and MGSeq were the two normalization methods most efficient at limiting these biases, as they led to maximum variations of 11.5% and 11.3% for subset_10, and 10.9% and 52.7% for subset_1, respectively. Moreover, even with the subset_1 samples, the results of an NMDS based on these normalized count matrices were still strongly correlated with the results of an NMDS based on true relative abundances (<xref ref-type="supplementary-material" rid="supp-1">Fig. S8</xref>, <italic>r</italic><sup>2</sup>&gt;0.9 for all normalization methods but “rarefied”, for which the positions of two groups are switched leading to a lower <italic>r</italic><sup>2</sup> of 0.64). Hence, beta diversity trends can be recovered even when sequencing depth was highly variable.</p>
               <p>Although not formally evaluated through <italic>in silico</italic> benchmarks, it is very likely that microbial metagenomes with highly uneven sequencing depth would be subjected to similar biases, and the tools tested here would be expected to perform comparably on viral and microbial metagenomes, since the input data (i.e., coverage matrix) is essentially identical. Hence, the information and guidelines provided here can in all likelihood be considered relevant for microbial metagenomes as well.</p>
            </sec>
            <sec>
               <title>Current limitations of the sample-to-ecological-inference pipeline</title>
               <p>Overall, these benchmarks confirmed that virome-derived abundance matrices can be used in ecological studies, with two main caveats. First, absolute viral richness will likely be under-estimated, because the assembly will only yield large contigs for abundant viral genotypes without evenly distributed and/or closely related strains. Hence, absolute values of richness and diversity should be interpreted with care, although once normalized, sample comparisons of these richness and diversity metrics are generally robust to differences in community complexity and sequencing depth. Second, because this approach relies on coverage as a proxy for relative abundance, only quantitative (or near-quantitative) datasets can be used as input (<xref ref-type="bibr" rid="ref-17">Duhaime et al., 2012</xref>). Notably, protocols to generate these quantitative viromes are currently available only for dsDNA and/or ssDNA viruses (<xref ref-type="bibr" rid="ref-17">Duhaime et al., 2012</xref>; <xref ref-type="bibr" rid="ref-54">Roux et al., 2016</xref>), and still remain to be developed for their RNA counterparts, although these RNA viruses might represent up to half of the viral particles in some environments (<xref ref-type="bibr" rid="ref-62">Steward et al., 2013</xref>). Thus, when interpreting viromics-based ecological studies, it is important to remember and clearly state that these reflect only the sub-part of viral communities with (ds)DNA genomes.</p>
            </sec>
         </sec>
      </sec>
      <sec sec-type="conclusions">
         <title>Conclusions</title>
         <p>Our comparative analysis of 14 simulated viromes showed that the genome-assembly-to-ecological-inference viromics pipeline can efficiently and robustly identify abundant viruses and recover trends in alpha and beta diversity. As viromics becomes routine in viral ecology, the approaches outlined here (both the tools and thresholds used) offer an initial set of “best practices” for data analysis.</p>
         <p>Moving forward, increased library size and number associated with improved genome recovery from metagenomes will undoubtedly lead to an unprecedented catalog of uncultivated viral genomes (e.g., 125,000 released in a single study; <xref ref-type="bibr" rid="ref-45">Paez-Espino et al., 2016</xref>). These will be complemented by viral genomes obtained from other methods, such as single-virus sequencing, which can access less dominant viruses and those with high strain heterogeneity (<xref ref-type="bibr" rid="ref-38">Martinez-Hernandez et al., 2017</xref>). As standards emerge, such uncultivated viral genomes will migrate toward specifically-designed databases (e.g., IMG/VR, <xref ref-type="bibr" rid="ref-45">Paez-Espino et al., 2016</xref>), and viral ecological studies will be greatly improved by these centralized reference genome data. Beyond improved references (which will also need to include uncultivated RNA viruses), viromics will need to advance from relative abundance estimations to absolute quantification of viral populations, likely coupled with “ground-truthing” provided by quantitative, lineage-specific molecular methods such as phageFISH, polonies, microarrays, or microfluidic PCR (<xref ref-type="bibr" rid="ref-63">Tadmor et al., 2011</xref>; <xref ref-type="bibr" rid="ref-2">Allers et al., 2013</xref>; <xref ref-type="bibr" rid="ref-37">Martínez-García et al., 2014</xref>). Once in-hand, such approaches should enable researchers to address long-standing questions in the viral ecology field, and more fully bring viruses into predictive ecological models across Earth’s ecosystems.</p>
      </sec>
      <sec sec-type="supplementary-material" id="supplemental-information">
         <title>Supplemental Information</title>
         <supplementary-material id="supp-1" mimetype="application" mime-subtype="pdf" xlink:href="https://peerj.com/articles/3817/Metagenome_simulations_Supplementary_Figures_R1.pdf">
            <object-id pub-id-type="doi">10.7717/peerj.3817/supp-1</object-id><label>Supplemental Information 1</label><caption>
               <title>Supplementary Figures</title>
               <p><bold>Supplementary Figure 1.</bold> <bold>Characteristics of mock viral communities.</bold> Mock communities generated had different numbers of genomes (A) and population distribution (B). This led to a range of alpha diversity, as illustrated with Shannon diversity index (C) and Simpson index (D). Mock communities were also designed to display a beta diversity pattern with 4 groups of samples (E and F, BC: Bray–Curtis). The 14 mock communities were designed to cluster into 4 distinct groups, and are colored across all panels according to these 4 groups of significantly similar communities (F, PerMANOVA <italic>p</italic>-value &lt;0.001).</p>
               <p><bold>Supplementary Figure 2.</bold> <bold>Schematic of the methods evaluated in this study.</bold> (A) Benchmarking of the assemblers, read pre-processing methods, and thresholds on genome coverage and read mapping identity used to calculate abundance matrices. (B) Estimation of the impact of strain heterogeneity on the assembly efficiency. Reference genomes were replaced by populations composed of a set of related strains controlled by 3 parameters. (C) Evaluation of the different normalization methods across the three types of datasets, with varying differences in sequencing depth across samples. For all panels, the different methods tested are indicated for each step, and the method and/or threshold chosen or optimal are highlighted in blue (other tests are colored in gray). The metrics used to identify the optimal methods/thresholds are indicated on the left, in green for metrics to maximize, red for metrics to minimize, and black for metrics to compare to “true” data based on whole communities. QC: quality-controlled.</p>
               <p><bold>Supplementary Figure 3.</bold> <bold>Influence of assembly software and read curation on genome recovery – dotplots (underlying data for boxplots presented in Figure 1).</bold> In these plots, each dot represents the assembly of a single genome in a single sample. (A) Genome recovery (i.e. genome coverage by all contigs, <italic>y</italic>-axis) by genome coverage (<italic>x</italic>-axis) for different assemblers (colors). (B) Genome recovery in a single contig (i.e. genome coverage by the largest assembled contig, <italic>y</italic>-axis) by genome coverage (<italic>x</italic>-axis) for different assemblers (colors). (C) Genome recovery (i.e. genome coverage by all contigs, <italic>y</italic>-axis) by genome coverage (<italic>x</italic>-axis) for different read curation methods (colors).</p>
               <p><bold>Supplementary Figure 4. Correlation between assembly results of different assemblers.</bold> Top panels display the correlations of genome recovery (i.e. genome coverage by all contigs) for each genome between MEGAHIT and IDBA-UD (A), metaSPAdes and IDBA-UD (B) and metaSPAdes and MEGAHIT (C). Bottom panels display the correlations of genome recovery in a single contig (i.e. genome coverage by the largest assembled contig) for each genome between MEGAHIT and IDBA-UD (D), metaSPAdes and IDBA-UD (E) and metaSPAdes and MEGAHIT (F).</p>
               <p><bold>Supplementary Figure 5. Influence of read curation on genome recovery for different assemblers.</bold> The assemblers used here are MEGAHIT (A, B), IDBA-UD (C, D), MetaVelvet (E, F), and Omega (G, H). For each assembler, the genome recovery (i.e. genome coverage by all contigs, <italic>y</italic>-axis) by genome coverage (<italic>x</italic>-axis) for different read curation methods (A, C, E, and G) as well as genome recovery in a single contig (i.e. genome coverage by the largest assembled contig, <italic>y</italic>-axis) by genome coverage (<italic>x</italic>-axis) for different read curation methods (B, D, F, and H) are displayed. Similar data are displayed for metaSPAdes in <xref ref-type="fig" rid="fig-1">Figs. 1C</xref> and <xref ref-type="fig" rid="fig-1">1D</xref>.</p>
               <p><bold>Supplementary Figure 6.</bold> <bold>Influence of strain-level diversity on assembly efficiency.</bold> These tests were computed on one mock community (Sample_1), in which each reference genome was replaced with a set of related strains with varying divergence and relative abundances. In each plot, the <italic>y</italic>-axis represents the ratio between the largest contig assembled for a genome when strain heterogeneity is introduced and the same parameter without strain heterogeneity (i.e. previous assemblies of the same Sample_1). Plots on the top row display the differences in QC reads assemblies between assemblers, while plots on the bottom row show differences between different reads processing for metaSPAdes assemblies. Populations are grouped based on the different parameters controlling strain heterogeneity, i.e. relative abundance of the dominant strain (left), divergence of the strains (middle), and number of strains in the population (right).</p>
               <p><bold>Supplementary Figure 7. Number of population contigs detected for each input genome depending on the minimum contig size threshold for inclusion in population pool.</bold> The threshold on contig size used for inclusion in population contigs pools (in bp) is displayed on the <italic>x</italic>-axis. The distribution of average number of contigs for a given genome (across the samples in which this genome was covered ≥1×) is displayed on the <italic>y</italic>-axis.</p>
               <p><bold>Supplementary Figure 8. Comparison of NMDS based on viral population counts and the reference mock community composition (reference in top left panel).</bold> The different NMDS were computed from the viral population count matrices normalized with the different methods tested in the manuscript (in bold), from the dataset including 6 samples strongly under-sequenced (subset_1, i.e. half of the datasets subsampled at 1%). For each NMDS, the sum of squared differences, scaling factor, and significance of the correlation to the reference NMDS are indicated in the plot title (calculated with the function protest from the R package vegan). Samples are colored as in <xref ref-type="supplementary-material" rid="supp-1">Fig. S1</xref>, and an arrow is used to illustrate the difference between the original sample placement in the reference NMDS and the new placement in the NMDS derived from population contigs.</p>
            </caption>
         </supplementary-material>
         <supplementary-material id="supp-2" mimetype="application" mime-subtype="vnd.ms-excel" xlink:href="https://peerj.com/articles/3817/Supplementary_Tables.xls">
            <object-id pub-id-type="doi">10.7717/peerj.3817/supp-2</object-id><label>Supplemental Information 2</label><caption>
               <title>Supplementary Tables</title>
               <p><bold>Supplementary Table 1. Mock community design and simulated viromes.</bold> Each sample (1 to 14) represents a different mock community, with varying compositions and population structures (<xref ref-type="supplementary-material" rid="supp-1">Fig. S1</xref>). A first virome was simulated for all samples with 20 million reads, and subsets at 2 million and 200,000 reads were also generated for half of the samples.</p>
               <p><bold>Supplementary Table 2. Mock community composition.</bold> The relative abundance of each genome across the 14 samples is indicated. Viral genomes are identified through their NCBI gi number.</p>
               <p><bold>Supplementary Table 3. Number of reads retained after each read treatment for initial samples (20 million raw reads).</bold> Treatments are indicated by their code in the first column and detailed in the second column. For read partitioning, the size of each partition (in number of reads) is indicated.</p>
               <p><bold>Supplementary Table 4.</bold> <bold>Assembly statistics for each assembler and each treatment for initial samples.</bold> For each assembly (combination of one sample, one assembler, and one read treatment), the number of contigs, N50, and N80 are indicated.</p>
            </caption>
         </supplementary-material>
      </sec>
   </body>
   <back>
      <glossary content-type="abbreviations" id="glossary-1">
         <title>List of abbreviations</title>
         <def-list id="dl1">
            <def-item>
               <term>ANI</term>
               <def>
                  <p>Average Nucleotide Identity</p>
               </def>
            </def-item>
            <def-item>
               <term>ANOVA</term>
               <def>
                  <p>ANalysis Of Variance</p>
               </def>
            </def-item>
            <def-item>
               <term>FDR</term>
               <def>
                  <p>False Discovery Rate</p>
               </def>
            </def-item>
            <def-item>
               <term>NMDS</term>
               <def>
                  <p>Non-metric MultiDimensional Scaling</p>
               </def>
            </def-item>
            <def-item>
               <term>OTU</term>
               <def>
                  <p>Operational Taxonomic Unit</p>
               </def>
            </def-item>
            <def-item>
               <term>QC</term>
               <def>
                  <p>Quality-controlled (for reads)</p>
               </def>
            </def-item>
         </def-list>
      </glossary>
      <ack>
         <p>High performance computing resources were provided by the Ohio Supercomputer Center, and the National Energy Research Scientific Computing Center supported by the Office of Science of the US Department of Energy.</p>
      </ack>
      <sec sec-type="additional-information">
         <title>Additional Information and Declarations</title>
         <fn-group content-type="competing-interests">
            <title>Competing Interests</title><fn id="conflict-1" fn-type="conflict"><p>The authors declare there are no competing interests.</p></fn></fn-group>
         <fn-group content-type="author-contributions">
            <title>Author Contributions</title><fn id="contribution-1" fn-type="con"><p><xref ref-type="contrib" rid="author-1">Simon Roux</xref> conceived and designed the experiments, performed the experiments, analyzed the data, wrote the paper, prepared figures and/or tables, reviewed drafts of the paper.</p></fn><fn id="contribution-2" fn-type="con"><p><xref ref-type="contrib" rid="author-2">Joanne B. Emerson</xref>, <xref ref-type="contrib" rid="author-3">Emiley A. Eloe-Fadrosh</xref> and <xref ref-type="contrib" rid="author-4">Matthew B. Sullivan</xref> conceived and designed the experiments, analyzed the data, wrote the paper, prepared figures and/or tables, reviewed drafts of the paper.</p></fn></fn-group>
         <fn-group content-type="other">
            <title>Data Availability</title><fn id="addinfo-1"><p>The following information was supplied regarding data availability:</p>
            <p>The scripts used in this study are available at <ext-link ext-link-type="uri" xlink:href="https://bitbucket.org/MAVERICLab/benchmarking_viromics">https://bitbucket.org/MAVERICLab/benchmarking_viromics</ext-link>.</p>
            <p>The datasets generated for this study are available at <ext-link ext-link-type="uri" xlink:href="http://datacommons.cyverse.org/browse/iplant/home/shared/iVirus/Virome_pipeline_benchmark">http://datacommons.cyverse.org/browse/iplant/home/shared/iVirus/Virome_pipeline_benchmark</ext-link>.</p></fn></fn-group>
      </sec>
      <ref-list content-type="authoryear">
         <title>References</title>
         <ref id="ref-1"><label>Aguirre de Cárcer, Angly &amp; Alcamí (2014)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Aguirre de Cárcer</surname>
                     <given-names>D</given-names>
                  </name>
                  <name>
                     <surname>Angly</surname>
                     <given-names>FE</given-names>
                  </name>
                  <name>
                     <surname>Alcamí</surname>
                     <given-names>A</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2014">2014</year>
               <article-title>Evaluation of viral genome assembly and diversity estimation in deep metagenomes</article-title>
               <source>BMC Genomics</source>
               <volume>15</volume>
               <issue>1</issue>
               <elocation-id>e368</elocation-id>
               <pub-id pub-id-type="doi">10.1186/1471-2164-15-989</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-2"><label>Allers et al. (2013)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Allers</surname>
                     <given-names>E</given-names>
                  </name>
                  <name>
                     <surname>Moraru</surname>
                     <given-names>C</given-names>
                  </name>
                  <name>
                     <surname>Duhaime</surname>
                     <given-names>MB</given-names>
                  </name>
                  <name>
                     <surname>Beneze</surname>
                     <given-names>E</given-names>
                  </name>
                  <name>
                     <surname>Solonenko</surname>
                     <given-names>N</given-names>
                  </name>
                  <name>
                     <surname>Canosa</surname>
                     <given-names>JB</given-names>
                  </name>
                  <name>
                     <surname>Amann</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Sullivan</surname>
                     <given-names>MB</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2013">2013</year>
               <article-title>Single-cell and population level viral infection dynamics revealed by phageFISH, a method to visualize intracellular and free viruses</article-title>
               <source>Environmental Microbiology</source>
               <volume>15</volume>
               <fpage>2306</fpage>
               <lpage>2318</lpage>
               <pub-id pub-id-type="doi">10.1111/1462-2920.12100</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-3"><label>Anders &amp; Huber (2010)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Anders</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Huber</surname>
                     <given-names>W</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2010">2010</year>
               <article-title>Differential expression analysis for sequence count data</article-title>
               <source>Genome Biology</source>
               <volume>11</volume>
               <comment>Article R106</comment>
               <pub-id pub-id-type="doi">10.1186/gb-2010-11-10-r106</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-4"><label>Angly et al. (2009)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Angly</surname>
                     <given-names>FE</given-names>
                  </name>
                  <name>
                     <surname>Willner</surname>
                     <given-names>D</given-names>
                  </name>
                  <name>
                     <surname>Prieto-Davó</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Edwards</surname>
                     <given-names>RA</given-names>
                  </name>
                  <name>
                     <surname>Schmieder</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Vega-Thurber</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Antonopoulos</surname>
                     <given-names>DA</given-names>
                  </name>
                  <name>
                     <surname>Barott</surname>
                     <given-names>K</given-names>
                  </name>
                  <name>
                     <surname>Cottrell</surname>
                     <given-names>MT</given-names>
                  </name>
                  <name>
                     <surname>Desnues</surname>
                     <given-names>C</given-names>
                  </name>
                  <name>
                     <surname>Dinsdale</surname>
                     <given-names>EA</given-names>
                  </name>
                  <name>
                     <surname>Furlan</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Haynes</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Henn</surname>
                     <given-names>MR</given-names>
                  </name>
                  <name>
                     <surname>Hu</surname>
                     <given-names>Y</given-names>
                  </name>
                  <name>
                     <surname>Kirchman</surname>
                     <given-names>DL</given-names>
                  </name>
                  <name>
                     <surname>McDole</surname>
                     <given-names>T</given-names>
                  </name>
                  <name>
                     <surname>McPherson</surname>
                     <given-names>JD</given-names>
                  </name>
                  <name>
                     <surname>Meyer</surname>
                     <given-names>F</given-names>
                  </name>
                  <name>
                     <surname>Miller</surname>
                     <given-names>RM</given-names>
                  </name>
                  <name>
                     <surname>Mundt</surname>
                     <given-names>E</given-names>
                  </name>
                  <name>
                     <surname>Naviaux</surname>
                     <given-names>RK</given-names>
                  </name>
                  <name>
                     <surname>Rodriguez-Mueller</surname>
                     <given-names>B</given-names>
                  </name>
                  <name>
                     <surname>Stevens</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Wegley</surname>
                     <given-names>L</given-names>
                  </name>
                  <name>
                     <surname>Zhang</surname>
                     <given-names>L</given-names>
                  </name>
                  <name>
                     <surname>Zhu</surname>
                     <given-names>B</given-names>
                  </name>
                  <name>
                     <surname>Rohwer</surname>
                     <given-names>F</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2009">2009</year>
               <article-title>The GAAS metagenomic tool and its estimations of viral and microbial average genome size in four major biomes</article-title>
               <source>PLOS Computational Biology</source>
               <volume>5</volume>
               <elocation-id>e1000593</elocation-id>
               <pub-id pub-id-type="doi">10.1371/journal.pcbi.1000593</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-5"><label>Aziz et al. (2015)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Aziz</surname>
                     <given-names>RK</given-names>
                  </name>
                  <name>
                     <surname>Dwivedi</surname>
                     <given-names>B</given-names>
                  </name>
                  <name>
                     <surname>Akhter</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Breitbart</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Edwards</surname>
                     <given-names>RA</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2015">2015</year>
               <article-title>Multidimensional metrics for estimating phage abundance, distribution, gene density, and sequence coverage in metagenomes</article-title>
               <source>Frontiers in Microbiology</source>
               <volume>6</volume>
               <comment>Article 381</comment>
               <pub-id pub-id-type="doi">10.3389/fmicb.2015.00381</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-6"><label>Bolduc et al. (2017)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Bolduc</surname>
                     <given-names>B</given-names>
                  </name>
                  <name>
                     <surname>Jang</surname>
                     <given-names>HB</given-names>
                  </name>
                  <name>
                     <surname>Doulcier</surname>
                     <given-names>G</given-names>
                  </name>
                  <name>
                     <surname>You</surname>
                     <given-names>Z-Q</given-names>
                  </name>
                  <name>
                     <surname>Roux</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Sullivan</surname>
                     <given-names>MB</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2017">2017</year>
               <article-title>vConTACT: an iVirus tool to classify double-stranded DNA viruses that infect <italic>Archaea</italic> and <italic>Bacteria</italic></article-title>
               <source>PeerJ</source>
               <volume>5</volume>
               <elocation-id>e3243</elocation-id>
               <pub-id pub-id-type="doi">10.7717/peerj.3243</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-7"><label>Bolger, Lohse &amp; Usadel (2014)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Bolger</surname>
                     <given-names>AM</given-names>
                  </name>
                  <name>
                     <surname>Lohse</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Usadel</surname>
                     <given-names>B</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2014">2014</year>
               <article-title>Trimmomatic: a flexible trimmer for Illumina sequence data</article-title>
               <source>Bioinformatics</source>
               <volume>30</volume>
               <fpage>2114</fpage>
               <lpage>2120</lpage>
               <pub-id pub-id-type="doi">10.1093/bioinformatics/btu170</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-8"><label>Bowers et al. (2017)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Bowers</surname>
                     <given-names>RM</given-names>
                  </name>
                  <name>
                     <surname>Kyrpides</surname>
                     <given-names>NC</given-names>
                  </name>
                  <name>
                     <surname>Stepanauskas</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Harmon-Smith</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Schulz</surname>
                     <given-names>F</given-names>
                  </name>
                  <name>
                     <surname>Doud</surname>
                     <given-names>D</given-names>
                  </name>
                  <name>
                     <surname>Reddy</surname>
                     <given-names>TBK</given-names>
                  </name>
                  <name>
                     <surname>Jarett</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Rivers</surname>
                     <given-names>AR</given-names>
                  </name>
                  <name>
                     <surname>Eloe-Fadrosh</surname>
                     <given-names>EA</given-names>
                  </name>
                  <name>
                     <surname>Tringe</surname>
                     <given-names>SG</given-names>
                  </name>
                  <name>
                     <surname>Ivanova</surname>
                     <given-names>N</given-names>
                  </name>
                  <name>
                     <surname>Copeland</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Clum</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Becraft</surname>
                     <given-names>ED</given-names>
                  </name>
                  <name>
                     <surname>Malmstrom</surname>
                     <given-names>RR</given-names>
                  </name>
                  <name>
                     <surname>Birren</surname>
                     <given-names>B</given-names>
                  </name>
                  <name>
                     <surname>Schriml</surname>
                     <given-names>L</given-names>
                  </name>
                  <name>
                     <surname>Podar</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Bork</surname>
                     <given-names>P</given-names>
                  </name>
                  <name>
                     <surname>Weinstock</surname>
                     <given-names>GM</given-names>
                  </name>
                  <name>
                     <surname>Banfield</surname>
                     <given-names>JF</given-names>
                  </name>
                  <name>
                     <surname>Garrity</surname>
                     <given-names>GM</given-names>
                  </name>
                  <name>
                     <surname>Hugenholtz</surname>
                     <given-names>P</given-names>
                  </name>
                  <name>
                     <surname>Parks</surname>
                     <given-names>DH</given-names>
                  </name>
                  <name>
                     <surname>Tyson</surname>
                     <given-names>GW</given-names>
                  </name>
                  <name>
                     <surname>Rinke</surname>
                     <given-names>C</given-names>
                  </name>
                  <name>
                     <surname>Dodsworth</surname>
                     <given-names>JA</given-names>
                  </name>
                  <name>
                     <surname>Yooseph</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Sutton</surname>
                     <given-names>G</given-names>
                  </name>
                  <name>
                     <surname>Yilmaz</surname>
                     <given-names>P</given-names>
                  </name>
                  <name>
                     <surname>Glöckner</surname>
                     <given-names>FO</given-names>
                  </name>
                  <name>
                     <surname>Meyer</surname>
                     <given-names>F</given-names>
                  </name>
                  <name>
                     <surname>Gilbert</surname>
                     <given-names>JA</given-names>
                  </name>
                  <name>
                     <surname>Nelson</surname>
                     <given-names>WC</given-names>
                  </name>
                  <name>
                     <surname>Hallam</surname>
                     <given-names>SJ</given-names>
                  </name>
                  <name>
                     <surname>Jungbluth</surname>
                     <given-names>SP</given-names>
                  </name>
                  <name>
                     <surname>Ettema</surname>
                     <given-names>TJG</given-names>
                  </name>
                  <name>
                     <surname>Tighe</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Konstantinidis</surname>
                     <given-names>KT</given-names>
                  </name>
                  <name>
                     <surname>Liu</surname>
                     <given-names>W-T</given-names>
                  </name>
                  <name>
                     <surname>Baker</surname>
                     <given-names>BJ</given-names>
                  </name>
                  <name>
                     <surname>Rattei</surname>
                     <given-names>T</given-names>
                  </name>
                  <name>
                     <surname>Eisen</surname>
                     <given-names>JA</given-names>
                  </name>
                  <name>
                     <surname>Hedlund</surname>
                     <given-names>BP</given-names>
                  </name>
                  <name>
                     <surname>McMahon</surname>
                     <given-names>KD</given-names>
                  </name>
                  <name>
                     <surname>Fierer</surname>
                     <given-names>N</given-names>
                  </name>
                  <name>
                     <surname>Knight</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Finn</surname>
                     <given-names>RD</given-names>
                  </name>
                  <name>
                     <surname>Karsch-Mizrachi</surname>
                     <given-names>I</given-names>
                  </name>
                  <name>
                     <surname>Eren</surname>
                     <given-names>AM</given-names>
                  </name>
                  <name>
                     <surname>Woyke</surname>
                     <given-names>T</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2017">2017</year>
               <article-title>Minimum information about a single amplified genome (MISAG) and a metagenome-assembled genome (MIMAG) of bacteria and archaea</article-title>
               <source>Nature Biotechnology</source>
               <volume>35</volume>
               <fpage>725</fpage>
               <lpage>731</lpage>
               <pub-id pub-id-type="doi">10.1038/nbt.3893</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-9"><label>Brum et al. (2015)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Brum</surname>
                     <given-names>JR</given-names>
                  </name>
                  <name>
                     <surname>Ignacio-Espinoza</surname>
                     <given-names>JC</given-names>
                  </name>
                  <name>
                     <surname>Roux</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Doulcier</surname>
                     <given-names>G</given-names>
                  </name>
                  <name>
                     <surname>Acinas</surname>
                     <given-names>SG</given-names>
                  </name>
                  <name>
                     <surname>Alberti</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Chaffron</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Cruaud</surname>
                     <given-names>C</given-names>
                  </name>
                  <name>
                     <surname>De Vargas</surname>
                     <given-names>C</given-names>
                  </name>
                  <name>
                     <surname>Gasol</surname>
                     <given-names>JM</given-names>
                  </name>
                  <name>
                     <surname>Gorsky</surname>
                     <given-names>G</given-names>
                  </name>
                  <name>
                     <surname>Gregory</surname>
                     <given-names>AC</given-names>
                  </name>
                  <name>
                     <surname>Ogata</surname>
                     <given-names>H</given-names>
                  </name>
                  <name>
                     <surname>Pesant</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Poulos</surname>
                     <given-names>BT</given-names>
                  </name>
                  <name>
                     <surname>Schwenck</surname>
                     <given-names>SM</given-names>
                  </name>
                  <name>
                     <surname>Speich</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Dimier</surname>
                     <given-names>C</given-names>
                  </name>
                  <name>
                     <surname>Kandels-Lewis</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Picheral</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Searson</surname>
                     <given-names>S</given-names>
                  </name>
                  <collab>Tara Oceans Coordinators</collab>
                  <name>
                     <surname>Bork</surname>
                     <given-names>P</given-names>
                  </name>
                  <name>
                     <surname>Bowler</surname>
                     <given-names>C</given-names>
                  </name>
                  <name>
                     <surname>Sunagawa</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Wincker</surname>
                     <given-names>P</given-names>
                  </name>
                  <name>
                     <surname>Karsenti</surname>
                     <given-names>E</given-names>
                  </name>
                  <name>
                     <surname>Sullivan</surname>
                     <given-names>MB</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2015">2015</year>
               <article-title>Patterns and ecological drivers of ocean viral communities</article-title>
               <source>Science</source>
               <volume>348</volume>
               <comment>Article 1261498</comment>
               <pub-id pub-id-type="doi">10.1126/science.1261498</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-10"><label>Brum &amp; Sullivan (2015)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Brum</surname>
                     <given-names>JR</given-names>
                  </name>
                  <name>
                     <surname>Sullivan</surname>
                     <given-names>MB</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2015">2015</year>
               <article-title>Rising to the challenge: accelerated pace of discovery transforms marine virology</article-title>
               <source>Nature Reviews. Microbiology</source>
               <volume>13</volume>
               <fpage>147</fpage>
               <lpage>159</lpage>
               <pub-id pub-id-type="doi">10.1038/nrmicro3404</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-11"><label>Chao (1984)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Chao</surname>
                     <given-names>A</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="1984">1984</year>
               <article-title>Nonparametric estimation of the number of classes in a population</article-title>
               <source>Scandinavian Journal of Statistics</source>
               <volume>11</volume>
               <fpage>265</fpage>
               <lpage>270</lpage>
            </element-citation>
         </ref>
         <ref id="ref-12"><label>Chow &amp; Fuhrman (2012)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Chow</surname>
                     <given-names>CET</given-names>
                  </name>
                  <name>
                     <surname>Fuhrman</surname>
                     <given-names>JA</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2012">2012</year>
               <article-title>Seasonality and monthly dynamics of marine myovirus communities</article-title>
               <source>Environmental Microbiology</source>
               <volume>14</volume>
               <fpage>2171</fpage>
               <lpage>2183</lpage>
               <pub-id pub-id-type="doi">10.1111/j.1462-2920.2012.02744.x</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-13"><label>Cobián Güemes et al. (2016)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Cobián Güemes</surname>
                     <given-names>AG</given-names>
                  </name>
                  <name>
                     <surname>Youle</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Cantú</surname>
                     <given-names>VA</given-names>
                  </name>
                  <name>
                     <surname>Felts</surname>
                     <given-names>B</given-names>
                  </name>
                  <name>
                     <surname>Nulton</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Rohwer</surname>
                     <given-names>F</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2016">2016</year>
               <article-title>Viruses as winners in the game of life</article-title>
               <source>Annual Review of Virology</source>
               <volume>3</volume>
               <fpage>197</fpage>
               <lpage>214</lpage>
               <pub-id pub-id-type="doi">10.1146/annurev-virology-100114-054952</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-14"><label>Crusoe et al. (2015)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Crusoe</surname>
                     <given-names>MR</given-names>
                  </name>
                  <name>
                     <surname>Alameldin</surname>
                     <given-names>HF</given-names>
                  </name>
                  <name>
                     <surname>Awad</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Boucher</surname>
                     <given-names>E</given-names>
                  </name>
                  <name>
                     <surname>Caldwell</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Cartwright</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Charbonneau</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Constantinides</surname>
                     <given-names>B</given-names>
                  </name>
                  <name>
                     <surname>Edvenson</surname>
                     <given-names>G</given-names>
                  </name>
                  <name>
                     <surname>Fay</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Fenton</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Fenzl</surname>
                     <given-names>T</given-names>
                  </name>
                  <name>
                     <surname>Fish</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Garcia-Gutierrez</surname>
                     <given-names>L</given-names>
                  </name>
                  <name>
                     <surname>Garland</surname>
                     <given-names>P</given-names>
                  </name>
                  <name>
                     <surname>Gluck</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>González</surname>
                     <given-names>I</given-names>
                  </name>
                  <name>
                     <surname>Guermond</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Guo</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Gupta</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Herr</surname>
                     <given-names>JR</given-names>
                  </name>
                  <name>
                     <surname>Howe</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Hyer</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Härpfer</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Irber</surname>
                     <given-names>L</given-names>
                  </name>
                  <name>
                     <surname>Kidd</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Lin</surname>
                     <given-names>D</given-names>
                  </name>
                  <name>
                     <surname>Lippi</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Mansour</surname>
                     <given-names>T</given-names>
                  </name>
                  <name>
                     <surname>McA’Nulty</surname>
                     <given-names>P</given-names>
                  </name>
                  <name>
                     <surname>McDonald</surname>
                     <given-names>E</given-names>
                  </name>
                  <name>
                     <surname>Mizzi</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Murray</surname>
                     <given-names>KD</given-names>
                  </name>
                  <name>
                     <surname>Nahum</surname>
                     <given-names>JR</given-names>
                  </name>
                  <name>
                     <surname>Nanlohy</surname>
                     <given-names>K</given-names>
                  </name>
                  <name>
                     <surname>Nederbragt</surname>
                     <given-names>AJ</given-names>
                  </name>
                  <name>
                     <surname>Ortiz-Zuazaga</surname>
                     <given-names>H</given-names>
                  </name>
                  <name>
                     <surname>Ory</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Pell</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Pepe-Ranney</surname>
                     <given-names>C</given-names>
                  </name>
                  <name>
                     <surname>Russ</surname>
                     <given-names>ZN</given-names>
                  </name>
                  <name>
                     <surname>Schwarz</surname>
                     <given-names>E</given-names>
                  </name>
                  <name>
                     <surname>Scott</surname>
                     <given-names>C</given-names>
                  </name>
                  <name>
                     <surname>Seaman</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Sievert</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Simpson</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Skennerton</surname>
                     <given-names>CT</given-names>
                  </name>
                  <name>
                     <surname>Spencer</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Srinivasan</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Standage</surname>
                     <given-names>D</given-names>
                  </name>
                  <name>
                     <surname>Stapleton</surname>
                     <given-names>JA</given-names>
                  </name>
                  <name>
                     <surname>Steinman</surname>
                     <given-names>SR</given-names>
                  </name>
                  <name>
                     <surname>Stein</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Taylor</surname>
                     <given-names>B</given-names>
                  </name>
                  <name>
                     <surname>Trimble</surname>
                     <given-names>W</given-names>
                  </name>
                  <name>
                     <surname>Wiencko</surname>
                     <given-names>HL</given-names>
                  </name>
                  <name>
                     <surname>Wright</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Wyss</surname>
                     <given-names>B</given-names>
                  </name>
                  <name>
                     <surname>Zhang</surname>
                     <given-names>Q</given-names>
                  </name>
                  <name>
                     <surname>Zyme</surname>
                     <given-names>E</given-names>
                  </name>
                  <name>
                     <surname>Brown</surname>
                     <given-names>CT</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2015">2015</year>
               <article-title>The khmer software package: enabling efficient nucleotide sequence analysis</article-title>
               <source>F1000Research</source>
               <volume>4</volume>
               <comment>Article 900</comment>
               <pub-id pub-id-type="doi">10.12688/f1000research.6924.1</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-15"><label>Delcher, Salzberg &amp; Phillippy (2003)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Delcher</surname>
                     <given-names>AL</given-names>
                  </name>
                  <name>
                     <surname>Salzberg</surname>
                     <given-names>SL</given-names>
                  </name>
                  <name>
                     <surname>Phillippy</surname>
                     <given-names>AM</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2003">2003</year>
               <article-title>Using MUMmer to identify similar regions in large sequence sets</article-title>
               <source>Current Protocols in Bioinformatics</source>
               <volume>10.3</volume>
               <fpage>1</fpage>
               <lpage>18</lpage>
               <pub-id pub-id-type="doi">10.1002/0471250953.bi1003s00</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-16"><label>Doll et al. (2013)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Doll</surname>
                     <given-names>HM</given-names>
                  </name>
                  <name>
                     <surname>Armitage</surname>
                     <given-names>DW</given-names>
                  </name>
                  <name>
                     <surname>Daly</surname>
                     <given-names>RA</given-names>
                  </name>
                  <name>
                     <surname>Emerson</surname>
                     <given-names>JB</given-names>
                  </name>
                  <name>
                     <surname>Goltsman</surname>
                     <given-names>DSA</given-names>
                  </name>
                  <name>
                     <surname>Yelton</surname>
                     <given-names>AP</given-names>
                  </name>
                  <name>
                     <surname>Kerekes</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Firestone</surname>
                     <given-names>MK</given-names>
                  </name>
                  <name>
                     <surname>Potts</surname>
                     <given-names>MD</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2013">2013</year>
               <article-title>Utilizing novel diversity estimators to quantify multiple dimensions of microbial biodiversity across domains</article-title>
               <source>BMC Microbiology</source>
               <volume>13</volume>
               <fpage>259</fpage>
               <pub-id pub-id-type="doi">10.1186/1471-2180-13-259</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-17"><label>Duhaime et al. (2012)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Duhaime</surname>
                     <given-names>MB</given-names>
                  </name>
                  <name>
                     <surname>Deng</surname>
                     <given-names>L</given-names>
                  </name>
                  <name>
                     <surname>Poulos</surname>
                     <given-names>BT</given-names>
                  </name>
                  <name>
                     <surname>Sullivan</surname>
                     <given-names>MB</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2012">2012</year>
               <article-title>Towards quantitative metagenomics of wild viruses and other ultra-low concentration DNA samples: a rigorous assessment and optimization of the linker amplification method</article-title>
               <source>Environmental Microbiology</source>
               <volume>14</volume>
               <fpage>2526</fpage>
               <lpage>2537</lpage>
               <pub-id pub-id-type="doi">10.1111/j.1462-2920.2012.02791.x</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-18"><label>Edwards &amp; Rohwer (2005)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Edwards</surname>
                     <given-names>RA</given-names>
                  </name>
                  <name>
                     <surname>Rohwer</surname>
                     <given-names>F</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2005">2005</year>
               <article-title>Viral metagenomics</article-title>
               <source>Nature Reviews Microbiology</source>
               <volume>3</volume>
               <fpage>504</fpage>
               <lpage>510</lpage>
               <pub-id pub-id-type="doi">10.1038/nrmicro1163</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-19"><label>Falkowski, Fenchel &amp; Delong (2008)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Falkowski</surname>
                     <given-names>PG</given-names>
                  </name>
                  <name>
                     <surname>Fenchel</surname>
                     <given-names>T</given-names>
                  </name>
                  <name>
                     <surname>Delong</surname>
                     <given-names>EF</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2008">2008</year>
               <article-title>The microbial engines that drive Earth’s biogeochemical cycles</article-title>
               <source>Science</source>
               <volume>320</volume>
               <fpage>1034</fpage>
               <lpage>1039</lpage>
               <pub-id pub-id-type="doi">10.1126/science.1153213</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-20"><label>Fierer et al. (2011)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Fierer</surname>
                     <given-names>N</given-names>
                  </name>
                  <name>
                     <surname>McCain</surname>
                     <given-names>C</given-names>
                  </name>
                  <name>
                     <surname>Meir</surname>
                     <given-names>P</given-names>
                  </name>
                  <name>
                     <surname>Zimmerman</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Rapp</surname>
                     <given-names>JM</given-names>
                  </name>
                  <name>
                     <surname>Silman</surname>
                     <given-names>MR</given-names>
                  </name>
                  <name>
                     <surname>Knight</surname>
                     <given-names>R</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2011">2011</year>
               <article-title>Microbes do not follow the elevational diversity patterns of plants and animals</article-title>
               <source>Ecology</source>
               <volume>92</volume>
               <issue>4</issue>
               <fpage>797</fpage>
               <lpage>804</lpage>
            </element-citation>
         </ref>
         <ref id="ref-21"><label>Filée et al. (2005)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Filée</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Tétart</surname>
                     <given-names>F</given-names>
                  </name>
                  <name>
                     <surname>Suttle</surname>
                     <given-names>CA</given-names>
                  </name>
                  <name>
                     <surname>Krisch</surname>
                     <given-names>HM</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2005">2005</year>
               <article-title>Marine T4-type bacteriophages, a ubiquitous component of the dark matter of the biosphere</article-title>
               <source>Proceedings of the National Academy of Sciences of the United States of America</source>
               <volume>102</volume>
               <fpage>12471</fpage>
               <lpage>12476</lpage>
               <pub-id pub-id-type="doi">10.1073/pnas.0503404102</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-22"><label>Fulthorpe et al. (2008)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Fulthorpe</surname>
                     <given-names>RR</given-names>
                  </name>
                  <name>
                     <surname>Roesch</surname>
                     <given-names>LFW</given-names>
                  </name>
                  <name>
                     <surname>Riva</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Triplett</surname>
                     <given-names>EW</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2008">2008</year>
               <article-title>Distantly sampled soils carry few species in common</article-title>
               <source>The ISME Journal</source>
               <volume>2</volume>
               <fpage>901</fpage>
               <lpage>910</lpage>
               <pub-id pub-id-type="doi">10.1038/ismej.2008.55</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-23"><label>García-López, Vázquez-Castellanos &amp; Moya (2015)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>García-López</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Vázquez-Castellanos</surname>
                     <given-names>JF</given-names>
                  </name>
                  <name>
                     <surname>Moya</surname>
                     <given-names>A</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2015">2015</year>
               <article-title>Fragmentation and coverage variation in viral metagenome assemblies, and their effect in diversity calculations</article-title>
               <source>Frontiers in Bioengineering and Biotechnology</source>
               <volume>3</volume>
               <comment>Article 141</comment>
               <pub-id pub-id-type="doi">10.3389/fbioe.2015.00141</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-24"><label>Goldsmith et al. (2011)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Goldsmith</surname>
                     <given-names>DB</given-names>
                  </name>
                  <name>
                     <surname>Crosti</surname>
                     <given-names>G</given-names>
                  </name>
                  <name>
                     <surname>Dwivedi</surname>
                     <given-names>B</given-names>
                  </name>
                  <name>
                     <surname>McDaniel</surname>
                     <given-names>LD</given-names>
                  </name>
                  <name>
                     <surname>Varsani</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Suttle</surname>
                     <given-names>CA</given-names>
                  </name>
                  <name>
                     <surname>Weinbauer</surname>
                     <given-names>MG</given-names>
                  </name>
                  <name>
                     <surname>Sandaa</surname>
                     <given-names>R-A</given-names>
                  </name>
                  <name>
                     <surname>Breitbart</surname>
                     <given-names>M</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2011">2011</year>
               <article-title>Development of phoH as a novel signature gene for assessing marine phage diversity</article-title>
               <source>Applied and Environmental Microbiology</source>
               <volume>77</volume>
               <fpage>7730</fpage>
               <lpage>7739</lpage>
               <pub-id pub-id-type="doi">10.1128/AEM.05531-11</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-25"><label>Greenwald et al. (2017)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Greenwald</surname>
                     <given-names>WW</given-names>
                  </name>
                  <name>
                     <surname>Klitgord</surname>
                     <given-names>N</given-names>
                  </name>
                  <name>
                     <surname>Seguritan</surname>
                     <given-names>V</given-names>
                  </name>
                  <name>
                     <surname>Yooseph</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Venter</surname>
                     <given-names>JC</given-names>
                  </name>
                  <name>
                     <surname>Garner</surname>
                     <given-names>C</given-names>
                  </name>
                  <name>
                     <surname>Nelson</surname>
                     <given-names>KE</given-names>
                  </name>
                  <name>
                     <surname>Li</surname>
                     <given-names>W</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2017">2017</year>
               <article-title>Utilization of defined microbial communities enables effective evaluation of meta-genomic assemblies</article-title>
               <source>BMC Genomics</source>
               <volume>18</volume>
               <fpage>296</fpage>
               <pub-id pub-id-type="doi">10.1186/s12864-017-3679-5</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-26"><label>Gregory et al. (2016)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Gregory</surname>
                     <given-names>AC</given-names>
                  </name>
                  <name>
                     <surname>Solonenko</surname>
                     <given-names>SA</given-names>
                  </name>
                  <name>
                     <surname>Ignacio-Espinoza</surname>
                     <given-names>JC</given-names>
                  </name>
                  <name>
                     <surname>LaButti</surname>
                     <given-names>K</given-names>
                  </name>
                  <name>
                     <surname>Copeland</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Sudek</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Maitland</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Chittick</surname>
                     <given-names>L</given-names>
                  </name>
                  <name>
                     <surname>Dos Santos</surname>
                     <given-names>F</given-names>
                  </name>
                  <name>
                     <surname>Weitz</surname>
                     <given-names>JS</given-names>
                  </name>
                  <name>
                     <surname>Worden</surname>
                     <given-names>AZ</given-names>
                  </name>
                  <name>
                     <surname>Woyke</surname>
                     <given-names>T</given-names>
                  </name>
                  <name>
                     <surname>Sullivan</surname>
                     <given-names>MB</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2016">2016</year>
               <article-title>Genomic differentiation among wild cyanophages despite widespread horizontal gene transfer</article-title>
               <source>BMC Genomics</source>
               <volume>17</volume>
               <fpage>930</fpage>
               <pub-id pub-id-type="doi">10.1186/s12864-016-3286-x</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-27"><label>Haegeman et al. (2013)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Haegeman</surname>
                     <given-names>B</given-names>
                  </name>
                  <name>
                     <surname>Hamelin</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Moriarty</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Neal</surname>
                     <given-names>P</given-names>
                  </name>
                  <name>
                     <surname>Dushoff</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Weitz</surname>
                     <given-names>JS</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2013">2013</year>
               <article-title>Robust estimation of microbial diversity in theory and in practice</article-title>
               <source>The ISME Journal</source>
               <volume>7</volume>
               <fpage>1092</fpage>
               <lpage>1101</lpage>
               <pub-id pub-id-type="doi">10.1038/ismej.2013.10</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-28"><label>Haider et al. (2014)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Haider</surname>
                     <given-names>B</given-names>
                  </name>
                  <name>
                     <surname>Ahn</surname>
                     <given-names>TH</given-names>
                  </name>
                  <name>
                     <surname>Bushnell</surname>
                     <given-names>B</given-names>
                  </name>
                  <name>
                     <surname>Chai</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Copeland</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Pan</surname>
                     <given-names>C</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2014">2014</year>
               <article-title>Omega: an overlap-graph de novo assembler for metagenomics</article-title>
               <source>Bioinformatics</source>
               <volume>30</volume>
               <fpage>2717</fpage>
               <lpage>2722</lpage>
               <pub-id pub-id-type="doi">10.1093/bioinformatics/btu395</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-29"><label>Hill et al. (2003)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Hill</surname>
                     <given-names>TCJ</given-names>
                  </name>
                  <name>
                     <surname>Walsh</surname>
                     <given-names>KA</given-names>
                  </name>
                  <name>
                     <surname>Harris</surname>
                     <given-names>JA</given-names>
                  </name>
                  <name>
                     <surname>Moffett</surname>
                     <given-names>BF</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2003">2003</year>
               <article-title>Using ecological diversity measures with bacterial communities</article-title>
               <source>FEMS Microbiology Ecology</source>
               <volume>43</volume>
               <fpage>1</fpage>
               <lpage>11</lpage>
               <pub-id pub-id-type="doi">10.1111/j.1574-6941.2003.tb01040.x</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-30"><label>Hurwitz, Brum &amp; Sullivan (2015)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Hurwitz</surname>
                     <given-names>BL</given-names>
                  </name>
                  <name>
                     <surname>Brum</surname>
                     <given-names>JR</given-names>
                  </name>
                  <name>
                     <surname>Sullivan</surname>
                     <given-names>MB</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2015">2015</year>
               <article-title>Depth-stratified functional and taxonomic niche specialization in the “core” and “flexible” Pacific Ocean Virome</article-title>
               <source>The ISME Journal</source>
               <volume>9</volume>
               <fpage>472</fpage>
               <lpage>484</lpage>
               <pub-id pub-id-type="doi">10.1038/ismej.2014.143</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-31"><label>Hurwitz, Hallam &amp; Sullivan (2013)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Hurwitz</surname>
                     <given-names>BL</given-names>
                  </name>
                  <name>
                     <surname>Hallam</surname>
                     <given-names>SJ</given-names>
                  </name>
                  <name>
                     <surname>Sullivan</surname>
                     <given-names>MB</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2013">2013</year>
               <article-title>Metabolic reprogramming by viruses in the sunlit and dark ocean</article-title>
               <source>Genome Biology</source>
               <volume>14</volume>
               <comment>Article R123</comment>
               <pub-id pub-id-type="doi">10.1186/gb-2013-14-11-r123</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-32"><label>Jia et al. (2013)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Jia</surname>
                     <given-names>B</given-names>
                  </name>
                  <name>
                     <surname>Xuan</surname>
                     <given-names>L</given-names>
                  </name>
                  <name>
                     <surname>Cai</surname>
                     <given-names>K</given-names>
                  </name>
                  <name>
                     <surname>Hu</surname>
                     <given-names>Z</given-names>
                  </name>
                  <name>
                     <surname>Ma</surname>
                     <given-names>L</given-names>
                  </name>
                  <name>
                     <surname>Wei</surname>
                     <given-names>C</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2013">2013</year>
               <article-title>NeSSM: a next-generation sequencing simulator for metagenomics</article-title>
               <source>PLOS ONE</source>
               <volume>8</volume>
               <elocation-id>e75448</elocation-id>
               <pub-id pub-id-type="doi">10.1371/journal.pone.0075448</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-33"><label>Li et al. (2016)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Li</surname>
                     <given-names>D</given-names>
                  </name>
                  <name>
                     <surname>Luo</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Liu</surname>
                     <given-names>CM</given-names>
                  </name>
                  <name>
                     <surname>Leung</surname>
                     <given-names>CM</given-names>
                  </name>
                  <name>
                     <surname>Ting</surname>
                     <given-names>HF</given-names>
                  </name>
                  <name>
                     <surname>Sadakane</surname>
                     <given-names>K</given-names>
                  </name>
                  <name>
                     <surname>Yamashita</surname>
                     <given-names>H</given-names>
                  </name>
                  <name>
                     <surname>Lam</surname>
                     <given-names>TW</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2016">2016</year>
               <article-title>MEGAHIT v1.0: a fast and scalable metagenome assembler driven by advanced methodologies and community practices</article-title>
               <source>Methods</source>
               <volume>102</volume>
               <fpage>3</fpage>
               <lpage>11</lpage>
               <pub-id pub-id-type="doi">10.1016/j.ymeth.2016.02.020</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-34"><label>Logares et al. (2014)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Logares</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Sunagawa</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Salazar</surname>
                     <given-names>G</given-names>
                  </name>
                  <name>
                     <surname>Cornejo-Castillo</surname>
                     <given-names>FM</given-names>
                  </name>
                  <name>
                     <surname>Ferrera</surname>
                     <given-names>I</given-names>
                  </name>
                  <name>
                     <surname>Sarmento</surname>
                     <given-names>H</given-names>
                  </name>
                  <name>
                     <surname>Hingamp</surname>
                     <given-names>P</given-names>
                  </name>
                  <name>
                     <surname>Ogata</surname>
                     <given-names>H</given-names>
                  </name>
                  <name>
                     <surname>De Vargas</surname>
                     <given-names>C</given-names>
                  </name>
                  <name>
                     <surname>Lima-Mendez</surname>
                     <given-names>G</given-names>
                  </name>
                  <name>
                     <surname>Raes</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Poulain</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Jaillon</surname>
                     <given-names>O</given-names>
                  </name>
                  <name>
                     <surname>Wincker</surname>
                     <given-names>P</given-names>
                  </name>
                  <name>
                     <surname>Kandels-Lewis</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Karsenti</surname>
                     <given-names>E</given-names>
                  </name>
                  <name>
                     <surname>Bork</surname>
                     <given-names>P</given-names>
                  </name>
                  <name>
                     <surname>Acinas</surname>
                     <given-names>SG</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2014">2014</year>
               <article-title>Metagenomic 16S rDNA Illumina tags are a powerful alternative to amplicon sequencing to explore diversity and structure of microbial communities</article-title>
               <source>Environmental Microbiology</source>
               <volume>16</volume>
               <fpage>2659</fpage>
               <lpage>2671</lpage>
               <pub-id pub-id-type="doi">10.1111/1462-2920.12250</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-35"><label>Marston &amp; Amrich (2009)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Marston</surname>
                     <given-names>MF</given-names>
                  </name>
                  <name>
                     <surname>Amrich</surname>
                     <given-names>CG</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2009">2009</year>
               <article-title>Recombination and microdiversity in coastal marine cyanophages</article-title>
               <source>Environmental Microbiology</source>
               <volume>11</volume>
               <fpage>2893</fpage>
               <lpage>2903</lpage>
               <pub-id pub-id-type="doi">10.1111/j.1462-2920.2009.02037.x</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-36"><label>Marston &amp; Martiny (2016)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Marston</surname>
                     <given-names>MF</given-names>
                  </name>
                  <name>
                     <surname>Martiny</surname>
                     <given-names>JBH</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2016">2016</year>
               <article-title>Genomic diversification of marine cyanophages into stable ecotypes</article-title>
               <source>Environmental Microbiology</source>
               <volume>18</volume>
               <fpage>4240</fpage>
               <lpage>4253</lpage>
               <pub-id pub-id-type="doi">10.1111/1462-2920.13556</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-37"><label>Martínez-García et al. (2014)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Martínez-García</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Santos</surname>
                     <given-names>F</given-names>
                  </name>
                  <name>
                     <surname>Moreno-Paz</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Parro</surname>
                     <given-names>V</given-names>
                  </name>
                  <name>
                     <surname>Antón</surname>
                     <given-names>J</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2014">2014</year>
               <article-title>Unveiling viral–host interactions within the “microbial dark matter”</article-title>
               <source>Nature Communications</source>
               <volume>5</volume>
               <fpage>1</fpage>
               <lpage>8</lpage>
               <pub-id pub-id-type="doi">10.1038/ncomms5542</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-38"><label>Martinez-Hernandez et al. (2017)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Martinez-Hernandez</surname>
                     <given-names>F</given-names>
                  </name>
                  <name>
                     <surname>Fornas</surname>
                     <given-names>O</given-names>
                  </name>
                  <name>
                     <surname>Lluesma</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Bolduc</surname>
                     <given-names>B</given-names>
                  </name>
                  <name>
                     <surname>Cruz</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Martinez Martinez</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Anton</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Gasol</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Rosselli</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Rodriguez-Valera</surname>
                     <given-names>F</given-names>
                  </name>
                  <name>
                     <surname>Sullivan</surname>
                     <given-names>MB</given-names>
                  </name>
                  <name>
                     <surname>Acinas</surname>
                     <given-names>SG</given-names>
                  </name>
                  <name>
                     <surname>Martinez-Garcia</surname>
                     <given-names>M</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2017">2017</year>
               <article-title>Single-virus genomics reveals hidden cosmopolitan and abundant viruses</article-title>
               <source>Nature Communications</source>
               <volume>8</volume>
               <comment>Article 15892</comment>
               <pub-id pub-id-type="doi">10.1038/ncomms15892</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-39"><label>Mavromatis et al. (2007)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Mavromatis</surname>
                     <given-names>K</given-names>
                  </name>
                  <name>
                     <surname>Ivanova</surname>
                     <given-names>N</given-names>
                  </name>
                  <name>
                     <surname>Barry</surname>
                     <given-names>K</given-names>
                  </name>
                  <name>
                     <surname>Shapiro</surname>
                     <given-names>H</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2007">2007</year>
               <article-title>Use of simulated data sets to evaluate the fidelity of metagenomic processing methods</article-title>
               <source>Nature Methods</source>
               <volume>4</volume>
               <fpage>495</fpage>
               <lpage>500</lpage>
               <pub-id pub-id-type="doi">10.1038/NMETH1043</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-40"><label>McMurdie &amp; Holmes (2014)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>McMurdie</surname>
                     <given-names>PJ</given-names>
                  </name>
                  <name>
                     <surname>Holmes</surname>
                     <given-names>S</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2014">2014</year>
               <article-title>Waste not, want not: why rarefying microbiome data is inadmissible</article-title>
               <source>PLOS Computational Biology</source>
               <volume>10</volume>
               <elocation-id>e1003531</elocation-id>
               <pub-id pub-id-type="doi">10.1371/journal.pcbi.1003531</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-41"><label>Mende et al. (2012)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Mende</surname>
                     <given-names>DR</given-names>
                  </name>
                  <name>
                     <surname>Waller</surname>
                     <given-names>AS</given-names>
                  </name>
                  <name>
                     <surname>Sunagawa</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Järvelin</surname>
                     <given-names>AI</given-names>
                  </name>
                  <name>
                     <surname>Chan</surname>
                     <given-names>MM</given-names>
                  </name>
                  <name>
                     <surname>Arumugam</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Raes</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Bork</surname>
                     <given-names>P</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2012">2012</year>
               <article-title>Assessment of metagenomic assembly using simulated next generation sequencing data</article-title>
               <source>PLOS ONE</source>
               <volume>7</volume>
               <elocation-id>e31386</elocation-id>
               <pub-id pub-id-type="doi">10.1371/journal.pone.0031386</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-42"><label>Namiki et al. (2012)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Namiki</surname>
                     <given-names>T</given-names>
                  </name>
                  <name>
                     <surname>Hachiya</surname>
                     <given-names>T</given-names>
                  </name>
                  <name>
                     <surname>Tanaka</surname>
                     <given-names>H</given-names>
                  </name>
                  <name>
                     <surname>Sakakibara</surname>
                     <given-names>Y</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2012">2012</year>
               <article-title>MetaVelvet: an extension of Velvet assembler to de novo metagenome assembly from short sequence reads</article-title>
               <source>Nucleic Acids Research</source>
               <volume>40</volume>
               <elocation-id>e155</elocation-id>
               <pub-id pub-id-type="doi">10.1093/nar/gks678</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-43"><label>Nurk et al. (2017)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Nurk</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Meleshko</surname>
                     <given-names>D</given-names>
                  </name>
                  <name>
                     <surname>Korobeynikov</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Pevzner</surname>
                     <given-names>PA</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2017">2017</year>
               <article-title>metaSPAdes: a new versatile metagenomic assembler</article-title>
               <source>Genome Research</source>
               <volume>27</volume>
               <fpage>824</fpage>
               <lpage>834</lpage>
               <pub-id pub-id-type="doi">10.1101/gr.213959.116</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-44"><label>Oksanen et al. (2017)</label><element-citation publication-type="software">
               <person-group person-group-type="author">
                  <name>
                     <surname>Oksanen</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Blanchet</surname>
                     <given-names>FG</given-names>
                  </name>
                  <name>
                     <surname>Friendly</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Kindt</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Legendre</surname>
                     <given-names>P</given-names>
                  </name>
                  <name>
                     <surname>McGlinn</surname>
                     <given-names>D</given-names>
                  </name>
                  <name>
                     <surname>Minchin</surname>
                     <given-names>PR</given-names>
                  </name>
                  <name>
                     <surname>O’Hara</surname>
                     <given-names>RB</given-names>
                  </name>
                  <name>
                     <surname>Simpson</surname>
                     <given-names>GL</given-names>
                  </name>
                  <name>
                     <surname>Solymos</surname>
                     <given-names>P</given-names>
                  </name>
                  <name>
                     <surname>Stevens</surname>
                     <given-names>MHH</given-names>
                  </name>
                  <name>
                     <surname>Szoecs</surname>
                     <given-names>E</given-names>
                  </name>
                  <name>
                     <surname>Wagner</surname>
                     <given-names>H</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2017">2017</year>
               <data-title>vegan: Community Ecology Package</data-title>
               <version designator="2.4-3">R package version 2.4-3</version>
               <uri>https://CRAN.R-project.org/package=vegan</uri>
            </element-citation>
         </ref>
         <ref id="ref-45"><label>Paez-Espino et al. (2016)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Paez-Espino</surname>
                     <given-names>D</given-names>
                  </name>
                  <name>
                     <surname>Chen</surname>
                     <given-names>I-MA</given-names>
                  </name>
                  <name>
                     <surname>Palaniappan</surname>
                     <given-names>K</given-names>
                  </name>
                  <name>
                     <surname>Ratner</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Chu</surname>
                     <given-names>K</given-names>
                  </name>
                  <name>
                     <surname>Szeto</surname>
                     <given-names>E</given-names>
                  </name>
                  <name>
                     <surname>Pillay</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Huang</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Markowitz</surname>
                     <given-names>VM</given-names>
                  </name>
                  <name>
                     <surname>Nielsen</surname>
                     <given-names>T</given-names>
                  </name>
                  <name>
                     <surname>Huntemann</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Reddy</surname>
                     <given-names>TBK</given-names>
                  </name>
                  <name>
                     <surname>Pavlopoulos</surname>
                     <given-names>GA</given-names>
                  </name>
                  <name>
                     <surname>Sullivan</surname>
                     <given-names>MB</given-names>
                  </name>
                  <name>
                     <surname>Campbell</surname>
                     <given-names>BJ</given-names>
                  </name>
                  <name>
                     <surname>Chen</surname>
                     <given-names>F</given-names>
                  </name>
                  <name>
                     <surname>McMahon</surname>
                     <given-names>K</given-names>
                  </name>
                  <name>
                     <surname>Hallam</surname>
                     <given-names>SJ</given-names>
                  </name>
                  <name>
                     <surname>Denef</surname>
                     <given-names>V</given-names>
                  </name>
                  <name>
                     <surname>Cavicchioli</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Caffrey</surname>
                     <given-names>SM</given-names>
                  </name>
                  <name>
                     <surname>Streit</surname>
                     <given-names>WR</given-names>
                  </name>
                  <name>
                     <surname>Webster</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Handley</surname>
                     <given-names>KM</given-names>
                  </name>
                  <name>
                     <surname>Salekdeh</surname>
                     <given-names>GH</given-names>
                  </name>
                  <name>
                     <surname>Tsesmetzis</surname>
                     <given-names>N</given-names>
                  </name>
                  <name>
                     <surname>Setubal</surname>
                     <given-names>JC</given-names>
                  </name>
                  <name>
                     <surname>Pope</surname>
                     <given-names>PB</given-names>
                  </name>
                  <name>
                     <surname>Liu</surname>
                     <given-names>W-T</given-names>
                  </name>
                  <name>
                     <surname>Rivers</surname>
                     <given-names>AR</given-names>
                  </name>
                  <name>
                     <surname>Ivanova</surname>
                     <given-names>NN</given-names>
                  </name>
                  <name>
                     <surname>Kyrpides</surname>
                     <given-names>NC</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2016">2016</year>
               <article-title>IMG/VR: a database of cultured and uncultured DNA Viruses and retroviruses</article-title>
               <source>Nucleic Acids Research</source>
               <volume>45</volume>
               <fpage>D457</fpage>
               <lpage>D465</lpage>
               <pub-id pub-id-type="doi">10.1093/nar/gkw1030</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-46"><label>Parks et al. (2015)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Parks</surname>
                     <given-names>DH</given-names>
                  </name>
                  <name>
                     <surname>Imelfort</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Skennerton</surname>
                     <given-names>CT</given-names>
                  </name>
                  <name>
                     <surname>Hugenholtz</surname>
                     <given-names>P</given-names>
                  </name>
                  <name>
                     <surname>Tyson</surname>
                     <given-names>GW</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2015">2015</year>
               <article-title>CheckM: assessing the quality of microbial genomes recovered from isolates, single cells, and metagenomes</article-title>
               <source>Genome Research</source>
               <volume>25</volume>
               <fpage>1043</fpage>
               <lpage>1055</lpage>
               <pub-id pub-id-type="doi">10.1101/gr.186072.114</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-47"><label>Paulson et al. (2013)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Paulson</surname>
                     <given-names>JN</given-names>
                  </name>
                  <name>
                     <surname>Stine</surname>
                     <given-names>OC</given-names>
                  </name>
                  <name>
                     <surname>Bravo</surname>
                     <given-names>HC</given-names>
                  </name>
                  <name>
                     <surname>Pop</surname>
                     <given-names>M</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2013">2013</year>
               <article-title>Differential abundance analysis for microbial marker-gene surveys</article-title>
               <source>Nature Methods</source>
               <volume>10</volume>
               <fpage>1200</fpage>
               <lpage>1202</lpage>
               <pub-id pub-id-type="doi">10.1038/nmeth.2658</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-48"><label>Peng et al. (2012)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Peng</surname>
                     <given-names>Y</given-names>
                  </name>
                  <name>
                     <surname>Leung</surname>
                     <given-names>HCM</given-names>
                  </name>
                  <name>
                     <surname>Yiu</surname>
                     <given-names>SM</given-names>
                  </name>
                  <name>
                     <surname>Chin</surname>
                     <given-names>FYL</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2012">2012</year>
               <article-title>IDBA-UD: a de novo assembler for single-cell and metagenomic sequencing data with highly uneven depth</article-title>
               <source>Bioinformatics</source>
               <volume>28</volume>
               <fpage>1420</fpage>
               <lpage>1428</lpage>
               <pub-id pub-id-type="doi">10.1093/bioinformatics/bts174</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-49"><label>Robinson, McCarthy &amp; Smyth (2009)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Robinson</surname>
                     <given-names>MD</given-names>
                  </name>
                  <name>
                     <surname>McCarthy</surname>
                     <given-names>DJ</given-names>
                  </name>
                  <name>
                     <surname>Smyth</surname>
                     <given-names>GK</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2009">2009</year>
               <article-title>edgeR: a Bioconductor package for differential expression analysis of digital gene expression data</article-title>
               <source>Bioinformatics</source>
               <volume>26</volume>
               <fpage>139</fpage>
               <lpage>140</lpage>
               <pub-id pub-id-type="doi">10.1093/bioinformatics/btp616</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-50"><label>Rodriguez-Brito et al. (2010)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Rodriguez-Brito</surname>
                     <given-names>B</given-names>
                  </name>
                  <name>
                     <surname>Li</surname>
                     <given-names>L</given-names>
                  </name>
                  <name>
                     <surname>Wegley</surname>
                     <given-names>L</given-names>
                  </name>
                  <name>
                     <surname>Furlan</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Angly</surname>
                     <given-names>F</given-names>
                  </name>
                  <name>
                     <surname>Breitbart</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Buchanan</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Desnues</surname>
                     <given-names>C</given-names>
                  </name>
                  <name>
                     <surname>Dinsdale</surname>
                     <given-names>E</given-names>
                  </name>
                  <name>
                     <surname>Edwards</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Felts</surname>
                     <given-names>B</given-names>
                  </name>
                  <name>
                     <surname>Haynes</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Liu</surname>
                     <given-names>H</given-names>
                  </name>
                  <name>
                     <surname>Lipson</surname>
                     <given-names>D</given-names>
                  </name>
                  <name>
                     <surname>Mahaffy</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Martin-Cuadrado</surname>
                     <given-names>AB</given-names>
                  </name>
                  <name>
                     <surname>Mira</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Nulton</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Pašić</surname>
                     <given-names>L</given-names>
                  </name>
                  <name>
                     <surname>Rayhawk</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Rodriguez-Mueller</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Rodriguez-Valera</surname>
                     <given-names>F</given-names>
                  </name>
                  <name>
                     <surname>Salamon</surname>
                     <given-names>P</given-names>
                  </name>
                  <name>
                     <surname>Srinagesh</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Thingstad</surname>
                     <given-names>TF</given-names>
                  </name>
                  <name>
                     <surname>Tran</surname>
                     <given-names>T</given-names>
                  </name>
                  <name>
                     <surname>Thurber</surname>
                     <given-names>RV</given-names>
                  </name>
                  <name>
                     <surname>Willner</surname>
                     <given-names>D</given-names>
                  </name>
                  <name>
                     <surname>Youle</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Rohwer</surname>
                     <given-names>F</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2010">2010</year>
               <article-title>Viral and microbial community dynamics in four aquatic environments</article-title>
               <source>The ISME Journal</source>
               <volume>4</volume>
               <fpage>739</fpage>
               <lpage>751</lpage>
               <pub-id pub-id-type="doi">10.1038/ismej.2010.1</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-51"><label>Roesch et al. (2007)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Roesch</surname>
                     <given-names>L</given-names>
                  </name>
                  <name>
                     <surname>Fulthorpe</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Riva</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Casella</surname>
                     <given-names>G</given-names>
                  </name>
                  <name>
                     <surname>Hadwin</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Kent</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Daroub</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Camargo</surname>
                     <given-names>F</given-names>
                  </name>
                  <name>
                     <surname>Farmerie</surname>
                     <given-names>W</given-names>
                  </name>
                  <name>
                     <surname>Triplett</surname>
                     <given-names>E</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2007">2007</year>
               <article-title>Pyrosequencing enumerates and contrasts soil microbial diversity</article-title>
               <source>The ISME Journal</source>
               <volume>1</volume>
               <fpage>283</fpage>
               <lpage>290</lpage>
               <pub-id pub-id-type="doi">10.1038/ismej.2007.53</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-52"><label>Rose et al. (2016)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Rose</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Constantinides</surname>
                     <given-names>B</given-names>
                  </name>
                  <name>
                     <surname>Tapinos</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Robertson</surname>
                     <given-names>DL</given-names>
                  </name>
                  <name>
                     <surname>Prosperi</surname>
                     <given-names>M</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2016">2016</year>
               <article-title>Challenges in the analysis of viral metagenomes</article-title>
               <source>Virus Evolution</source>
               <volume>2</volume>
               <comment>Article vew022</comment>
               <pub-id pub-id-type="doi">10.1093/ve/vew022</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-53"><label>Roux et al. (2012)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Roux</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Enault</surname>
                     <given-names>F</given-names>
                  </name>
                  <name>
                     <surname>Robin</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Ravet</surname>
                     <given-names>V</given-names>
                  </name>
                  <name>
                     <surname>Personnic</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Theil</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Colombet</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Sime-Ngando</surname>
                     <given-names>T</given-names>
                  </name>
                  <name>
                     <surname>Debroas</surname>
                     <given-names>D</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2012">2012</year>
               <article-title>Assessing the diversity and specificity of two freshwater viral communities through metagenomics</article-title>
               <source>PLOS ONE</source>
               <volume>7</volume>
               <elocation-id>e33641</elocation-id>
               <pub-id pub-id-type="doi">10.1371/journal.pone.0033641</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-54"><label>Roux et al. (2016)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Roux</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Solonenko</surname>
                     <given-names>NE</given-names>
                  </name>
                  <name>
                     <surname>Dang</surname>
                     <given-names>VT</given-names>
                  </name>
                  <name>
                     <surname>Poulos</surname>
                     <given-names>BT</given-names>
                  </name>
                  <name>
                     <surname>Schwenck</surname>
                     <given-names>SM</given-names>
                  </name>
                  <name>
                     <surname>Goldsmith</surname>
                     <given-names>DB</given-names>
                  </name>
                  <name>
                     <surname>Coleman</surname>
                     <given-names>ML</given-names>
                  </name>
                  <name>
                     <surname>Breitbart</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Sullivan</surname>
                     <given-names>MB</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2016">2016</year>
               <article-title>Towards quantitative viromics for both double-stranded and single-stranded DNA viruses</article-title>
               <source>PeerJ</source>
               <volume>4</volume>
               <elocation-id>e2777</elocation-id>
               <pub-id pub-id-type="doi">10.7717/peerj.2777</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-55"><label>Roux et al. (2014)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Roux</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Tournayre</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Mahul</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Debroas</surname>
                     <given-names>D</given-names>
                  </name>
                  <name>
                     <surname>Enault</surname>
                     <given-names>F</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2014">2014</year>
               <article-title>Metavir 2: new tools for viral metagenome comparison and assembled virome analysis</article-title>
               <source>BMC Bioinformatics</source>
               <volume>15</volume>
               <fpage>1</fpage>
               <lpage>12</lpage>
            </element-citation>
         </ref>
         <ref id="ref-56"><label>Sangwan, Xia &amp; Gilbert (2016)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Sangwan</surname>
                     <given-names>N</given-names>
                  </name>
                  <name>
                     <surname>Xia</surname>
                     <given-names>F</given-names>
                  </name>
                  <name>
                     <surname>Gilbert</surname>
                     <given-names>JA</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2016">2016</year>
               <article-title>Recovering complete and draft population genomes from metagenome datasets</article-title>
               <source>Microbiome</source>
               <volume>4</volume>
               <comment>Article 8</comment>
               <pub-id pub-id-type="doi">10.1186/s40168-016-0154-5</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-57"><label>Schoenfeld et al. (2008)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Schoenfeld</surname>
                     <given-names>T</given-names>
                  </name>
                  <name>
                     <surname>Patterson</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Richardson</surname>
                     <given-names>PM</given-names>
                  </name>
                  <name>
                     <surname>Wommack</surname>
                     <given-names>KE</given-names>
                  </name>
                  <name>
                     <surname>Young</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Mead</surname>
                     <given-names>D</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2008">2008</year>
               <article-title>Assembly of viral metagenomes from Yellowstone hot springs</article-title>
               <source>Applied and Environmental Microbiology</source>
               <volume>74</volume>
               <fpage>4164</fpage>
               <lpage>4174</lpage>
               <pub-id pub-id-type="doi">10.1128/AEM.02598-07</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-58"><label>Sczyrba et al. (2017)</label><element-citation publication-type="workingpaper">
               <person-group person-group-type="author">
                  <name>
                     <surname>Sczyrba</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Hofmann</surname>
                     <given-names>P</given-names>
                  </name>
                  <name>
                     <surname>Belmann</surname>
                     <given-names>P</given-names>
                  </name>
                  <name>
                     <surname>Koslicki</surname>
                     <given-names>D</given-names>
                  </name>
                  <name>
                     <surname>Dröge</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Gregor</surname>
                     <given-names>I</given-names>
                  </name>
                  <name>
                     <surname>Majda</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Fiedler</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Dahms</surname>
                     <given-names>E</given-names>
                  </name>
                  <name>
                     <surname>Bremges</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Fritz</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Garrido-Oter</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Jørgensen</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Shapiro</surname>
                     <given-names>N</given-names>
                  </name>
                  <name>
                     <surname>Blood</surname>
                     <given-names>PD</given-names>
                  </name>
                  <name>
                     <surname>Gurevich</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Hansen</surname>
                     <given-names>LH</given-names>
                  </name>
                  <name>
                     <surname>Sørensen</surname>
                     <given-names>SJ</given-names>
                  </name>
                  <name>
                     <surname>Chia</surname>
                     <given-names>BKH</given-names>
                  </name>
                  <name>
                     <surname>Denis</surname>
                     <given-names>B</given-names>
                  </name>
                  <name>
                     <surname>Froula</surname>
                     <given-names>JL</given-names>
                  </name>
                  <name>
                     <surname>Wang</surname>
                     <given-names>Z</given-names>
                  </name>
                  <name>
                     <surname>Egan</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Kang</surname>
                     <given-names>DD</given-names>
                  </name>
                  <name>
                     <surname>Singer</surname>
                     <given-names>W</given-names>
                  </name>
                  <name>
                     <surname>Jain</surname>
                     <given-names>C</given-names>
                  </name>
                  <name>
                     <surname>Strous</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Klingenberg</surname>
                     <given-names>H</given-names>
                  </name>
                  <name>
                     <surname>Meinicke</surname>
                     <given-names>P</given-names>
                  </name>
                  <name>
                     <surname>Barton</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Lingner</surname>
                     <given-names>T</given-names>
                  </name>
                  <name>
                     <surname>Lin</surname>
                     <given-names>H</given-names>
                  </name>
                  <name>
                     <surname>Liao</surname>
                     <given-names>Y</given-names>
                  </name>
                  <name>
                     <surname>Silva</surname>
                     <given-names>GZ</given-names>
                  </name>
                  <name>
                     <surname>Cuevas</surname>
                     <given-names>DA</given-names>
                  </name>
                  <name>
                     <surname>Edwards</surname>
                     <given-names>RA</given-names>
                  </name>
                  <name>
                     <surname>Saha</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Piro</surname>
                     <given-names>VC</given-names>
                  </name>
                  <name>
                     <surname>Renard</surname>
                     <given-names>BY</given-names>
                  </name>
                  <name>
                     <surname>Hill</surname>
                     <given-names>CM</given-names>
                  </name>
                  <name>
                     <surname>Pop</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Goeker</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Kyrpides</surname>
                     <given-names>N</given-names>
                  </name>
                  <name>
                     <surname>Woyke</surname>
                     <given-names>T</given-names>
                  </name>
                  <name>
                     <surname>Vorholt</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Rubin</surname>
                     <given-names>EM</given-names>
                  </name>
                  <name>
                     <surname>Darling</surname>
                     <given-names>AE</given-names>
                  </name>
                  <name>
                     <surname>Rattei</surname>
                     <given-names>T</given-names>
                  </name>
                  <name>
                     <surname>McHardy</surname>
                     <given-names>AC</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2017">2017</year>
               <article-title>Critical assessment of metagenome interpretation—a comprehensive benchmark of computational metagenomics software</article-title>
               <source>bioRxiv</source>
               <pub-id pub-id-type="doi">10.1101/099127</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-59"><label>Sharon et al. (2015)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Sharon</surname>
                     <given-names>I</given-names>
                  </name>
                  <name>
                     <surname>Kertesz</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Hug</surname>
                     <given-names>LA</given-names>
                  </name>
                  <name>
                     <surname>Pushkarev</surname>
                     <given-names>D</given-names>
                  </name>
                  <name>
                     <surname>Blauwkamp</surname>
                     <given-names>TA</given-names>
                  </name>
                  <name>
                     <surname>Castelle</surname>
                     <given-names>CJ</given-names>
                  </name>
                  <name>
                     <surname>Amirebrahimi</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Thomas</surname>
                     <given-names>BC</given-names>
                  </name>
                  <name>
                     <surname>Burstein</surname>
                     <given-names>D</given-names>
                  </name>
                  <name>
                     <surname>Tringe</surname>
                     <given-names>SG</given-names>
                  </name>
                  <name>
                     <surname>Williams</surname>
                     <given-names>KH</given-names>
                  </name>
                  <name>
                     <surname>Banfield</surname>
                     <given-names>JF</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2015">2015</year>
               <article-title>Accurate, multi-kb reads resolve complex populations and detect rare microorganisms</article-title>
               <source>Genome Research</source>
               <volume>25</volume>
               <fpage>534</fpage>
               <lpage>543</lpage>
               <pub-id pub-id-type="doi">10.1101/gr.183012.114</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-60"><label>Sharon et al. (2013)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Sharon</surname>
                     <given-names>I</given-names>
                  </name>
                  <name>
                     <surname>Morowitz</surname>
                     <given-names>MJ</given-names>
                  </name>
                  <name>
                     <surname>Thomas</surname>
                     <given-names>BC</given-names>
                  </name>
                  <name>
                     <surname>Costello</surname>
                     <given-names>EK</given-names>
                  </name>
                  <name>
                     <surname>Relman</surname>
                     <given-names>DA</given-names>
                  </name>
                  <name>
                     <surname>Banfield</surname>
                     <given-names>JF</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2013">2013</year>
               <article-title>Time series community genomics analysis reveals rapid shifts in bacterial species, strains, and phage during infant gut colonization</article-title>
               <source>Genome Research</source>
               <volume>23</volume>
               <fpage>111</fpage>
               <lpage>120</lpage>
               <pub-id pub-id-type="doi">10.1101/gr.142315.112</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-61"><label>Solden, Lloyd &amp; Wrighton (2016)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Solden</surname>
                     <given-names>L</given-names>
                  </name>
                  <name>
                     <surname>Lloyd</surname>
                     <given-names>K</given-names>
                  </name>
                  <name>
                     <surname>Wrighton</surname>
                     <given-names>K</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2016">2016</year>
               <article-title>The bright side of microbial dark matter: lessons learned from the uncultivated majority</article-title>
               <source>Current Opinion in Microbiology</source>
               <volume>31</volume>
               <fpage>217</fpage>
               <lpage>226</lpage>
               <pub-id pub-id-type="doi">10.1016/j.mib.2016.04.020</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-62"><label>Steward et al. (2013)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Steward</surname>
                     <given-names>GF</given-names>
                  </name>
                  <name>
                     <surname>Culley</surname>
                     <given-names>AI</given-names>
                  </name>
                  <name>
                     <surname>Mueller</surname>
                     <given-names>JA</given-names>
                  </name>
                  <name>
                     <surname>Wood-Charlson</surname>
                     <given-names>EM</given-names>
                  </name>
                  <name>
                     <surname>Belcaid</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Poisson</surname>
                     <given-names>G</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2013">2013</year>
               <article-title>Are we missing half of the viruses in the ocean?</article-title>
               <source>The ISME Journal</source>
               <volume>7</volume>
               <fpage>672</fpage>
               <lpage>679</lpage>
               <pub-id pub-id-type="doi">10.1038/ismej.2012.121</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-63"><label>Tadmor et al. (2011)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Tadmor</surname>
                     <given-names>AD</given-names>
                  </name>
                  <name>
                     <surname>Ottesen</surname>
                     <given-names>EA</given-names>
                  </name>
                  <name>
                     <surname>Leadbetter</surname>
                     <given-names>JR</given-names>
                  </name>
                  <name>
                     <surname>Phillips</surname>
                     <given-names>R</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2011">2011</year>
               <article-title>Probing individual environmental bacteria for viruses by using microfluidic digital PCR</article-title>
               <source>Science</source>
               <volume>333</volume>
               <issue>6038</issue>
               <fpage>58</fpage>
               <lpage>62</lpage>
               <pub-id pub-id-type="doi">10.1126/science.1200758</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-64"><label>Vázquez-Castellanos et al. (2014)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Vázquez-Castellanos</surname>
                     <given-names>JF</given-names>
                  </name>
                  <name>
                     <surname>García-López</surname>
                     <given-names>R</given-names>
                  </name>
                  <name>
                     <surname>Pérez-Brocal</surname>
                     <given-names>V</given-names>
                  </name>
                  <name>
                     <surname>Pignatelli</surname>
                     <given-names>M</given-names>
                  </name>
                  <name>
                     <surname>Moya</surname>
                     <given-names>A</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2014">2014</year>
               <article-title>Comparison of different assembly and annotation tools on analysis of simulated viral metagenomic communities in the gut</article-title>
               <source>BMC Genomics</source>
               <volume>15</volume>
               <fpage>37</fpage>
               <pub-id pub-id-type="doi">10.1186/1471-2164-15-37</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-65"><label>Vollmers, Wiegand &amp; Kaster (2017)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Vollmers</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Wiegand</surname>
                     <given-names>S</given-names>
                  </name>
                  <name>
                     <surname>Kaster</surname>
                     <given-names>AK</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2017">2017</year>
               <article-title>Comparing and evaluating metagenome assembly tools from a microbiologist’s perspective—not only size matters!</article-title>
               <source>PLOS ONE</source>
               <volume>12</volume>
               <fpage>1</fpage>
               <lpage>31</lpage>
               <pub-id pub-id-type="doi">10.1371/journal.pone.0169662</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-66"><label>Waldor et al. (2015)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Waldor</surname>
                     <given-names>MK</given-names>
                  </name>
                  <name>
                     <surname>Tyson</surname>
                     <given-names>G</given-names>
                  </name>
                  <name>
                     <surname>Borenstein</surname>
                     <given-names>E</given-names>
                  </name>
                  <name>
                     <surname>Ochman</surname>
                     <given-names>H</given-names>
                  </name>
                  <name>
                     <surname>Moeller</surname>
                     <given-names>A</given-names>
                  </name>
                  <name>
                     <surname>Finlay</surname>
                     <given-names>BB</given-names>
                  </name>
                  <name>
                     <surname>Kong</surname>
                     <given-names>HH</given-names>
                  </name>
                  <name>
                     <surname>Gordon</surname>
                     <given-names>JI</given-names>
                  </name>
                  <name>
                     <surname>Nelson</surname>
                     <given-names>KE</given-names>
                  </name>
                  <name>
                     <surname>Dabbagh</surname>
                     <given-names>K</given-names>
                  </name>
                  <name>
                     <surname>Smith</surname>
                     <given-names>H</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2015">2015</year>
               <article-title>Where next for microbiome research?</article-title>
               <source>PLOS Biology</source>
               <volume>13</volume>
               <fpage>1</fpage>
               <lpage>9</lpage>
               <pub-id pub-id-type="doi">10.1371/journal.pbio.1002050</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-67"><label>White, Wang &amp; Hall (2017)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>White</surname>
                     <given-names>DJ</given-names>
                  </name>
                  <name>
                     <surname>Wang</surname>
                     <given-names>J</given-names>
                  </name>
                  <name>
                     <surname>Hall</surname>
                     <given-names>RJ</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2017">2017</year>
               <article-title>Assessing the impact of assemblers on virus detection in a de novo metagenomic analysis pipeline</article-title>
               <source>Journal of Computational Biology</source>
               <volume>24</volume>
               <elocation-id>cmb.2017.0008</elocation-id>
               <pub-id pub-id-type="doi">10.1089/cmb.2017.0008</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-68"><label>Wickham (2009)</label><element-citation publication-type="book">
               <person-group person-group-type="author">
                  <name>
                     <surname>Wickham</surname>
                     <given-names>H</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2009">2009</year>
               <source>ggplot2: elegant graphics for data analysis</source>
               <publisher-name>Springer Publishing Company</publisher-name>
               <publisher-loc>New York</publisher-loc>
            </element-citation>
         </ref>
         <ref id="ref-69"><label>Wrighton et al. (2012)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Wrighton</surname>
                     <given-names>KC</given-names>
                  </name>
                  <name>
                     <surname>Thomas</surname>
                     <given-names>BC</given-names>
                  </name>
                  <name>
                     <surname>Sharon</surname>
                     <given-names>I</given-names>
                  </name>
                  <name>
                     <surname>Miller</surname>
                     <given-names>CS</given-names>
                  </name>
                  <name>
                     <surname>Castelle</surname>
                     <given-names>CJ</given-names>
                  </name>
                  <name>
                     <surname>Verberkmoes</surname>
                     <given-names>NC</given-names>
                  </name>
                  <name>
                     <surname>Wilkins</surname>
                     <given-names>MJ</given-names>
                  </name>
                  <name>
                     <surname>Hettich</surname>
                     <given-names>RL</given-names>
                  </name>
                  <name>
                     <surname>Lipton</surname>
                     <given-names>MS</given-names>
                  </name>
                  <name>
                     <surname>Williams</surname>
                     <given-names>KH</given-names>
                  </name>
                  <name>
                     <surname>Long</surname>
                     <given-names>PE</given-names>
                  </name>
                  <name>
                     <surname>Banfield</surname>
                     <given-names>JF</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2012">2012</year>
               <article-title>Fermentation, hydrogen, and sulfur metabolism in multiple uncultivated bacterial phyla</article-title>
               <source>Science</source>
               <volume>337</volume>
               <fpage>1661</fpage>
               <lpage>1666</lpage>
               <pub-id pub-id-type="doi">10.1126/science.1224041</pub-id>
            </element-citation>
         </ref>
         <ref id="ref-70"><label>Yang, Chockalingam &amp; Aluru (2013)</label><element-citation publication-type="journal">
               <person-group person-group-type="author">
                  <name>
                     <surname>Yang</surname>
                     <given-names>X</given-names>
                  </name>
                  <name>
                     <surname>Chockalingam</surname>
                     <given-names>SP</given-names>
                  </name>
                  <name>
                     <surname>Aluru</surname>
                     <given-names>S</given-names>
                  </name>
               </person-group>
               <year iso-8601-date="2013">2013</year>
               <article-title>A survey of error-correction methods for next-generation sequencing</article-title>
               <source>Briefings in Bioinformatics</source>
               <volume>14</volume>
               <fpage>56</fpage>
               <lpage>66</lpage>
               <pub-id pub-id-type="doi">10.1093/bib/bbs015</pub-id>
            </element-citation>
         </ref>
      </ref-list>
   </back>
</article>
