Commit 6b32071

Drop support for old sequence/metadata inputs
Removes the deprecated `sequences` and `metadata` inputs from the configuration file, along with the Snakemake logic required to support them. Also removes references to this deprecated input format from the example profiles and the "multiple inputs" tutorial.

Since we no longer support this old input format, we no longer need empty origin wildcards. We drop support for empty origin wildcards, remove all trimming of origin wildcards that start with an underscore, and update all rules to reference the origin wildcard with an explicit underscore in the filename.

We also now print helpful errors when inputs aren't defined properly, by checking for configurations with old-style input definitions or with no inputs defined at all. These error messages recommend how to update the workflow configuration to fix the issue.
1 parent 3647377 commit 6b32071

11 files changed

Lines changed: 112 additions & 114 deletions
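The configuration migration this commit asks users to perform can be illustrated with a small, hypothetical helper (not part of this commit; the name `migrate_config` and the `my-data` input name are illustrative assumptions):

```python
def migrate_config(config):
    """Convert a deprecated top-level `sequences`/`metadata` config
    into the new `inputs` format introduced by this commit (sketch)."""
    if "sequences" not in config and "metadata" not in config:
        return config  # nothing to migrate
    # Keep everything else, replace the two deprecated keys with one named input.
    migrated = {k: v for k, v in config.items() if k not in ("sequences", "metadata")}
    migrated["inputs"] = [{
        "name": "my-data",
        "metadata": config.get("metadata", ""),
        "sequences": config.get("sequences", ""),
    }]
    return migrated

old = {"sequences": "data/sequences.fasta", "metadata": "data/metadata.tsv", "builds": {}}
new = migrate_config(old)
# The deprecated keys are gone and a single named input remains.
```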

Snakefile

Lines changed: 22 additions & 1 deletion
@@ -8,6 +8,7 @@ from getpass import getuser
 from snakemake.logging import logger
 from snakemake.utils import validate
 from collections import OrderedDict
+import textwrap
 import time
 
 # Store the user's configuration prior to loading defaults, so we can check for
@@ -72,6 +73,26 @@ if "builds" not in config:
 
 include: "workflow/snakemake_rules/reference_build_definitions.smk"
 
+# Check for old-style input file references and alert users to the new format.
+if "sequences" in config or "metadata" in config:
+    logger.error("ERROR: Your configuration file includes references to a deprecated specification of input files (e.g., `config['sequences']` or `config['metadata']`).")
+    logger.error("Update your configuration file (e.g., 'builds.yaml') to define your inputs as follows and try running the workflow again:")
+    logger.error(textwrap.indent(
+        f"\ninputs:\n  metadata: {config['metadata']}\n  sequences: {config['sequences']}\n",
+        "  "
+    ))
+    sys.exit(1)
+
+# Check for missing inputs.
+if "inputs" not in config:
+    logger.error("ERROR: Your workflow does not define any input files to start with.")
+    logger.error("Update your configuration file (e.g., 'builds.yaml') to define at least one input dataset as follows and try running the workflow again:")
+    logger.error(textwrap.indent(
+        f"\ninputs:\n  metadata: data/example_metadata.tsv\n  sequences: data/example_sequences.fasta.gz\n",
+        "  "
+    ))
+    sys.exit(1)
+
 # Allow users to specify a list of active builds from the command line.
 if config.get("active_builds"):
     BUILD_NAMES = config["active_builds"].split(",")
@@ -91,7 +112,7 @@ wildcard_constraints:
     # but not special strings used for Nextstrain builds.
     build_name = r'(?:[_a-zA-Z-](?!(tip-frequencies)))+',
     date = r"[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]",
-    origin = r"(_[a-zA-Z0-9-]+)?" # origin starts with an underscore _OR_ it's the empty string
+    origin = r"[a-zA-Z0-9-_]+"
 
 localrules: download_metadata, download_sequences, download, upload, clean
 
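The tightened `origin` wildcard constraint above can be checked with a quick regex sketch (illustrative only; Snakemake applies the constraint during rule matching):

```python
import re

# The new wildcard constraint from the Snakefile above.
ORIGIN = re.compile(r"[a-zA-Z0-9-_]+")

assert ORIGIN.fullmatch("worldwide")         # plain names are valid
assert ORIGIN.fullmatch("gisaid-data_2021")  # hyphens and underscores are fine
assert not ORIGIN.fullmatch("")              # the empty origin is no longer supported
assert not ORIGIN.fullmatch("data/evil")     # path separators are rejected
```

The old pattern `(_[a-zA-Z0-9-]+)?` matched the empty string, which is exactly the deprecated-input case this commit removes.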

defaults/parameters.yaml

Lines changed: 0 additions & 5 deletions
@@ -6,11 +6,6 @@
 # This must be a relative path to the top-level Snakefile directory (e.g., `ncov/`).
 conda_environment: "workflow/envs/nextstrain.yaml"
 
-# These are the two main starting files for the run.
-# If they do not exist, we will attempt to fetch them from a S3 bucket (see below)
-sequences: "data/sequences.fasta"
-metadata: "data/metadata.tsv"
-
 reference_node_name: "USA/WA1/2020"
 
 # Define files used for external configuration. Common examples consist of a

docs/multiple_inputs.md

Lines changed: 9 additions & 11 deletions
@@ -51,15 +51,17 @@ my_profiles/example_multiple_inputs/my_auspice_config.json
 
 ## Setting up the config
 
-Typically, inside the `builds.yaml` one would specify input files such as
+You can define a single input dataset in `builds.yaml` as follows.
 
 ```yaml
-# traditional syntax for specifying starting files
-sequences: "data/sequences.fasta"
-metadata: "data/metadata.tsv"
+inputs:
+  - name: my-data
+    metadata: "data/metadata.tsv"
+    sequences: "data/sequences.fasta"
 ```
 
-For multiple inputs, we shall use the new `inputs` section of the config to specify that we have two different inputs, and we will give them the names "aus" and "worldwide":
+For multiple inputs, you can add another entry to the `inputs` config list.
+Here, we will give them the names "aus" and "worldwide":
 
 ```yaml
 # my_profiles/example_multiple_inputs/builds.yaml
@@ -72,15 +74,11 @@ inputs:
     sequences: "data/example_sequences_worldwide.fasta"
 ```
 
-> Note that if you also specify `sequences` or `metadata` as top level entries in the config, they will be ignored.
-
 ### Snakemake terminology
 
 Inside the Snakemake rules, we use a wildcard `origin` to define different starting points.
-For instance, if we ask for the file `results/aligned_worldwide.fasta` then `wildcards.origin="_worldwide"` and we expect that the config has defined
-a sequences input via `config["sequences"]["worldwide"]=<path to fasta>` (note the leading `_` has been stripped from the `origin` in the config).
-If we use the older syntax (specifying `sequences` or `metadata` as top level entries in the config) then `wildcards.origin=""`.
-
+For instance, if we ask for the file `results/aligned_worldwide.fasta` then `wildcards.origin="worldwide"` and we expect that the config has defined
+a sequences input as shown above.
 
 ## How is metadata combined?
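The filename-to-wildcard mapping the tutorial describes can be sketched in plain Python (a simplification of what Snakemake does when matching the output template `results/aligned_{origin}.fasta`):

```python
import re

# Pattern corresponding to the rule output "results/aligned_{origin}.fasta",
# with origin constrained as in the Snakefile.
pattern = re.compile(r"results/aligned_(?P<origin>[a-zA-Z0-9-_]+)\.fasta")

match = pattern.fullmatch("results/aligned_worldwide.fasta")
origin = match.group("origin")
# origin == "worldwide": used directly as a key into config["inputs"],
# with no leading-underscore trimming required anymore.

# A filename with an empty origin no longer matches at all.
assert pattern.fullmatch("results/aligned_.fasta") is None
```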

my_profiles/example/builds.yaml

Lines changed: 6 additions & 0 deletions
@@ -11,6 +11,12 @@
 
 # In this example, we use these default methods. See other templates for examples of how to customize this subsampling scheme.
 
+# Define input files.
+inputs:
+  - name: example-data
+    metadata: data/example_metadata.tsv
+    sequences: data/example_sequences.fasta.gz
+
 builds:
   # Focus on King County (location) in Washington State (division) in the USA (country)
   # with a build name that will produce the following URL fragment on Nextstrain/auspice:

my_profiles/example/config.yaml

Lines changed: 0 additions & 4 deletions
@@ -10,10 +10,6 @@ configfile:
   - defaults/parameters.yaml # Pull in the default values
   - my_profiles/example/builds.yaml # Pull in our list of desired builds
 
-config:
-  - sequences=data/example_sequences.fasta
-  - metadata=data/example_metadata.tsv
-
 # Set the maximum number of cores you want Snakemake to use for this pipeline.
 cores: 2
 

my_profiles/getting_started/builds.yaml

Lines changed: 6 additions & 0 deletions
@@ -8,6 +8,12 @@
 # These subsample primarily from the area of interest ("focus"), and add in background ("contextual") sequences from the rest of the world.
 # Contextual sequences that are genetically similar to (hamming distance) and geographically near the focal sequences are heavily prioritized.
 
+# Define input files.
+inputs:
+  - name: example-data
+    metadata: data/example_metadata.tsv
+    sequences: data/example_sequences.fasta.gz
+
 # In this example, we use these default methods. See other templates for examples of how to customize this subsampling scheme.
 builds:
   # This build samples evenly from the globe

my_profiles/getting_started/config.yaml

Lines changed: 0 additions & 4 deletions
@@ -10,10 +10,6 @@ configfile:
   - defaults/parameters.yaml # Pull in the default values
   - my_profiles/getting_started/builds.yaml # Pull in our list of desired builds
 
-config:
-  - sequences=data/example_sequences.fasta
-  - metadata=data/example_metadata.tsv
-
 # Set the maximum number of cores you want Snakemake to use for this pipeline.
 cores: 1
 

workflow/snakemake_rules/common.smk

Lines changed: 21 additions & 41 deletions
@@ -25,89 +25,69 @@ def numeric_date(dt=None):
 
     return res
 
-def _trim_origin(origin):
-    """the origin wildcard includes a leading `_`. This function returns the value without this `_`"""
-    if origin=="":
-        return ""
-    return origin[1:]
-
 def _get_subsampling_scheme_by_build_name(build_name):
     return config["builds"][build_name].get("subsampling_scheme", build_name)
 
 def _get_filter_value(wildcards, key):
     default = config["filter"].get(key, "")
     if wildcards["origin"] == "":
         return default
-    return config["filter"].get(_trim_origin(wildcards["origin"]), {}).get(key, default)
+    return config["filter"].get(wildcards["origin"], {}).get(key, default)
 
 def _get_path_for_input(stage, origin_wildcard):
     """
     A function called to define an input for a Snakemake rule
     This function always returns a local filepath, the format of which decides whether rules should
     create this by downloading from a remote resource, or create it by a local compute rule.
     """
-    if not origin_wildcard:
-        # No origin wildcards => deprecated single inputs (e.g. `config["sequences"]`) which cannot
-        # be downloaded from remote resources
-        if config.get("inputs"):
-            raise Exception("ERROR: empty origin wildcard but config defines 'inputs`")
-        path_or_url = config[stage] if stage in ["metadata", "sequences"] else ""
-        remote = False
-    else:
-        trimmed_origin = _trim_origin(origin_wildcard)
-        path_or_url = config.get("inputs", {}).get(trimmed_origin, {}).get(stage, "")
-        scheme = urlsplit(path_or_url).scheme
-        remote = bool(scheme)
+    path_or_url = config.get("inputs", {}).get(origin_wildcard, {}).get(stage, "")
+    scheme = urlsplit(path_or_url).scheme
+    remote = bool(scheme)
 
-        # Following checking should be the remit of the rule which downloads the remote resource
-        if scheme and scheme!="s3":
-            raise Exception(f"Input defined scheme {scheme} which is not yet supported.")
+    # Following checking should be the remit of the rule which downloads the remote resource
+    if scheme and scheme!="s3":
+        raise Exception(f"Input defined scheme {scheme} which is not yet supported.")
 
-        ## Basic checking which could be taken care of by the config schema
-        ## If asking for metadata/sequences, the config _must_ supply a `path_or_url`
-        if path_or_url=="" and stage in ["metadata", "sequences"]:
-            raise Exception(f"ERROR: config->input->{trimmed_origin}->{stage} is not defined.")
+    ## Basic checking which could be taken care of by the config schema
+    ## If asking for metadata/sequences, the config _must_ supply a `path_or_url`
+    if path_or_url=="" and stage in ["metadata", "sequences"]:
+        raise Exception(f"ERROR: config->input->{origin_wildcard}->{stage} is not defined.")
 
     if stage=="metadata":
-        return f"data/downloaded{origin_wildcard}.tsv" if remote else path_or_url
+        return f"data/downloaded_{origin_wildcard}.tsv" if remote else path_or_url
     if stage=="sequences":
-        return f"data/downloaded{origin_wildcard}.fasta" if remote else path_or_url
+        return f"data/downloaded_{origin_wildcard}.fasta" if remote else path_or_url
     if stage=="aligned":
-        return f"results/precomputed-aligned{origin_wildcard}.fasta" if remote else f"results/aligned{origin_wildcard}.fasta"
+        return f"results/precomputed-aligned_{origin_wildcard}.fasta" if remote else f"results/aligned_{origin_wildcard}.fasta"
     if stage=="to-exclude":
-        return f"results/precomputed-to-exclude{origin_wildcard}.txt" if remote else f"results/to-exclude{origin_wildcard}.txt"
+        return f"results/precomputed-to-exclude_{origin_wildcard}.txt" if remote else f"results/to-exclude_{origin_wildcard}.txt"
     if stage=="masked":
-        return f"results/precomputed-masked{origin_wildcard}.fasta" if remote else f"results/masked{origin_wildcard}.fasta"
+        return f"results/precomputed-masked_{origin_wildcard}.fasta" if remote else f"results/masked_{origin_wildcard}.fasta"
     if stage=="filtered":
         if remote:
-            return f"results/precomputed-filtered{origin_wildcard}.fasta"
+            return f"results/precomputed-filtered_{origin_wildcard}.fasta"
         elif path_or_url:
            return path_or_url
        else:
-            return f"results/filtered{origin_wildcard}.fasta"
+            return f"results/filtered_{origin_wildcard}.fasta"
 
    raise Exception(f"_get_path_for_input with unknown stage \"{stage}\"")
 
 
 def _get_unified_metadata(wildcards):
     """
     Returns a single metadata file representing the input metadata file(s).
-    If there was only one supplied metadata file (e.g. the deprecated
-    `config["metadata"]` syntax, or one entry in the `config["inputs"] dict`)
+    If there was only one supplied metadata file in the `config["inputs"] dict`,
     then that file is returned. Else "results/combined_metadata.tsv" is returned
     which will run the `combine_input_metadata` rule to make it.
     """
-    if not config.get("inputs"):
-        return config["metadata"]
     if len(list(config["inputs"].keys()))==1:
-        return _get_path_for_input("metadata", "_"+list(config["inputs"].keys())[0])
+        return _get_path_for_input("metadata", list(config["inputs"].keys())[0])
     return "results/combined_metadata.tsv"
 
 def _get_unified_alignment(wildcards):
-    if not config.get("inputs"):
-        return "results/filtered.fasta"
     if len(list(config["inputs"].keys()))==1:
-        return _get_path_for_input("filtered", "_"+list(config["inputs"].keys())[0])
+        return _get_path_for_input("filtered", list(config["inputs"].keys())[0])
     return "results/combined_sequences_for_subsampling.fasta",
 
 def _get_metadata_by_build_name(build_name):
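A trimmed-down, self-contained sketch of the simplified `_get_path_for_input` logic (metadata stage only; the real function handles several more stages and a module-level `config`):

```python
from urllib.parse import urlsplit

def get_path_for_metadata(config, origin):
    """Return a local filepath for an input's metadata; remote (s3) inputs
    map to a download target named after the origin, with the underscore
    now written explicitly into the filename template."""
    path_or_url = config.get("inputs", {}).get(origin, {}).get("metadata", "")
    if path_or_url == "":
        raise ValueError(f"config->input->{origin}->metadata is not defined.")
    scheme = urlsplit(path_or_url).scheme
    if scheme and scheme != "s3":
        raise ValueError(f"Input defined scheme {scheme} which is not yet supported.")
    # Remote inputs become local download targets; local paths pass through.
    return f"data/downloaded_{origin}.tsv" if scheme else path_or_url

config = {"inputs": {
    "worldwide": {"metadata": "s3://my-bucket/metadata.tsv.gz"},
    "aus": {"metadata": "data/example_metadata.tsv"},
}}
get_path_for_metadata(config, "worldwide")  # -> "data/downloaded_worldwide.tsv"
get_path_for_metadata(config, "aus")        # -> "data/example_metadata.tsv"
```

Note that with the origin wildcard no longer carrying its own leading underscore, the wildcard value can index `config["inputs"]` directly, which is what removes the need for `_trim_origin`.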

workflow/snakemake_rules/download.smk

Lines changed: 24 additions & 24 deletions
@@ -8,8 +8,8 @@
 # rule: align
 # input: _get_path_for_input
 # ...
-# will result in an input file looking like "results/aligned{origin}.fasta" or
-# "results/download-aligned{origin}.fasta" (which one is chosen depends on the
+# will result in an input file looking like "results/aligned_{origin}.fasta" or
+# "results/download-aligned_{origin}.fasta" (which one is chosen depends on the
 # supplied `config`). In the latter case, `rule download_aligned` will be used.
 # See https://github.com/nextstrain/ncov/compare/remote-files for an example of
 # how we could leverage snakemake to do this without needing a separate rule!
@@ -33,11 +33,11 @@ def _infer_decompression(input):
 rule download_sequences:
     message: "Downloading sequences from {params.address} -> {output.sequences}"
     output:
-        sequences = "data/downloaded{origin}.fasta"
+        sequences = "data/downloaded_{origin}.fasta"
     conda: config["conda_environment"]
     params:
-        address = lambda w: config["inputs"][_trim_origin(w.origin)]["sequences"],
-        deflate = lambda w: _infer_decompression(config["inputs"][_trim_origin(w.origin)]["sequences"])
+        address = lambda w: config["inputs"][w.origin]["sequences"],
+        deflate = lambda w: _infer_decompression(config["inputs"][w.origin]["sequences"])
     shell:
         """
         aws s3 cp {params.address} - | {params.deflate} > {output.sequences:q}
@@ -46,10 +46,10 @@ rule download_sequences:
 rule download_metadata:
     message: "Downloading metadata from {params.address} -> {output.metadata}"
     output:
-        metadata = "data/downloaded{origin}.tsv"
+        metadata = "data/downloaded_{origin}.tsv"
     conda: config["conda_environment"]
     params:
-        address = lambda w: config["inputs"][_trim_origin(w.origin)]["metadata"]
+        address = lambda w: config["inputs"][w.origin]["metadata"]
     shell:
         """
         aws s3 cp {params.address} - | gunzip -cq >{output.metadata:q}
@@ -58,11 +58,11 @@ rule download_metadata:
 rule download_aligned:
     message: "Downloading aligned fasta files from {params.address} -> {output.sequences}"
     output:
-        sequences = "results/precomputed-aligned{origin}.fasta"
+        sequences = "results/precomputed-aligned_{origin}.fasta"
     conda: config["conda_environment"]
     params:
-        address = lambda w: config["inputs"][_trim_origin(w.origin)]["aligned"],
-        deflate = lambda w: _infer_decompression(config["inputs"][_trim_origin(w.origin)]["aligned"])
+        address = lambda w: config["inputs"][w.origin]["aligned"],
+        deflate = lambda w: _infer_decompression(config["inputs"][w.origin]["aligned"])
     shell:
         """
         aws s3 cp {params.address} - | {params.deflate} > {output.sequences:q}
@@ -76,17 +76,17 @@ rule download_diagnostic:
         {params.to_exclude_address} -> {output.to_exclude}
         """
     output:
-        diagnostics = "results/precomputed-sequence-diagnostics{origin}.tsv",
-        flagged = "results/precomputed-flagged-sequences{origin}.tsv",
-        to_exclude = "results/precomputed-to-exclude{origin}.txt"
+        diagnostics = "results/precomputed-sequence-diagnostics_{origin}.tsv",
+        flagged = "results/precomputed-flagged-sequences_{origin}.tsv",
+        to_exclude = "results/precomputed-to-exclude_{origin}.txt"
     conda: config["conda_environment"]
     params:
         # Only `to-exclude` is defined via the config, so we make some assumptions about the format of the other filenames
-        to_exclude_address = lambda w: config["inputs"][_trim_origin(w.origin)]["to-exclude"],
-        flagged_address = lambda w: config["inputs"][_trim_origin(w.origin)]["to-exclude"].replace(f'to-exclude{w.origin}.txt', f'flagged-sequences{w.origin}.tsv'),
-        diagnostics_address = lambda w: config["inputs"][_trim_origin(w.origin)]["to-exclude"].replace(f'to-exclude{w.origin}.txt', f'sequence-diagnostics{w.origin}.tsv'),
+        to_exclude_address = lambda w: config["inputs"][w.origin]["to-exclude"],
+        flagged_address = lambda w: config["inputs"][w.origin]["to-exclude"].replace(f'to-exclude{w.origin}.txt', f'flagged-sequences{w.origin}.tsv'),
+        diagnostics_address = lambda w: config["inputs"][w.origin]["to-exclude"].replace(f'to-exclude{w.origin}.txt', f'sequence-diagnostics{w.origin}.tsv'),
         # assume the compression is the same across all 3 addresses
-        deflate = lambda w: _infer_decompression(config["inputs"][_trim_origin(w.origin)]["to-exclude"])
+        deflate = lambda w: _infer_decompression(config["inputs"][w.origin]["to-exclude"])
     shell:
         """
         aws s3 cp {params.to_exclude_address} - | {params.deflate} > {output.to_exclude:q}
@@ -98,11 +98,11 @@ rule download_diagnostic:
 rule download_masked:
     message: "Downloading aligned & masked FASTA from {params.address} -> {output.sequences}"
     output:
-        sequences = "results/precomputed-masked{origin}.fasta"
+        sequences = "results/precomputed-masked_{origin}.fasta"
     conda: config["conda_environment"]
     params:
-        address = lambda w: config["inputs"][_trim_origin(w.origin)]["masked"],
-        deflate = lambda w: _infer_decompression(config["inputs"][_trim_origin(w.origin)]["masked"])
+        address = lambda w: config["inputs"][w.origin]["masked"],
+        deflate = lambda w: _infer_decompression(config["inputs"][w.origin]["masked"])
     shell:
         """
         aws s3 cp {params.address} - | {params.deflate} > {output.sequences:q}
@@ -112,12 +112,12 @@ rule download_masked:
 rule download_filtered:
     message: "Downloading pre-computed filtered alignment from {params.address} -> {output.sequences}"
     output:
-        sequences = "results/precomputed-filtered{origin}.fasta"
+        sequences = "results/precomputed-filtered_{origin}.fasta"
     conda: config["conda_environment"]
     params:
-        address = lambda w: config["inputs"][_trim_origin(w.origin)]["filtered"],
-        deflate = lambda w: _infer_decompression(config["inputs"][_trim_origin(w.origin)]["filtered"])
+        address = lambda w: config["inputs"][w.origin]["filtered"],
+        deflate = lambda w: _infer_decompression(config["inputs"][w.origin]["filtered"])
     shell:
         """
         aws s3 cp {params.address} - | {params.deflate} > {output.sequences:q}
-        """
+        """
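These rules all call `_infer_decompression`, whose body the diff does not show. A plausible sketch, purely an assumption about how such a helper might pick a shell filter from the remote file's extension (the actual implementation may differ):

```python
def infer_decompression(address):
    """Guess the shell filter needed to decompress a remote input,
    based on its file extension (sketch; not the repo's actual helper)."""
    if address.endswith(".gz"):
        return "gunzip -cq"
    if address.endswith(".xz"):
        return "xz -dcq"
    return "cat"  # already uncompressed: pass bytes through unchanged

infer_decompression("s3://bucket/sequences.fasta.gz")  # -> "gunzip -cq"
infer_decompression("s3://bucket/metadata.tsv")        # -> "cat"
```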

workflow/snakemake_rules/export_for_nextstrain.smk

Lines changed: 4 additions & 4 deletions
@@ -94,14 +94,14 @@ rule mutation_summary:
         reference = config["files"]["alignment_reference"],
         genemap = config["files"]["annotation"]
     output:
-        mutation_summary = "results/mutation_summary{origin}.tsv"
+        mutation_summary = "results/mutation_summary_{origin}.tsv"
     log:
-        "logs/mutation_summary{origin}.txt"
+        "logs/mutation_summary_{origin}.txt"
     benchmark:
-        "benchmarks/mutation_summary{origin}.txt"
+        "benchmarks/mutation_summary_{origin}.txt"
     params:
         outdir = "results/translations",
-        basename = "seqs{origin}"
+        basename = "seqs_{origin}"
     conda: config["conda_environment"]
     shell:
         """
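Every renamed template above follows one pattern: a literal underscore before the `{origin}` wildcard. A plain-Python sketch of how such a template expands per origin (Snakemake performs the equivalent substitution when resolving outputs):

```python
# Output template from the mutation_summary rule, expanded per origin name.
template = "results/mutation_summary_{origin}.tsv"
origins = ["aus", "worldwide"]
outputs = [template.format(origin=o) for o in origins]
# -> ["results/mutation_summary_aus.tsv", "results/mutation_summary_worldwide.tsv"]
```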
