Use io.read_metadata during export

Replaces a call to the older `utils.read_metadata` function with the newer `io.read_metadata` function while processing metadata for export to an Auspice JSON. This new function returns a pandas DataFrame indexed by the first viable strain name column found in the metadata file (removing this column from the data itself), while the original function returns a dictionary indexed by strain name (keeping the originally named column like `strain` or `name` in the data). To avoid changing the downstream code that consumes the metadata, this commit converts the pandas DataFrame to a dictionary that matches the output of the original function (adding a `strain` key holding the index value to each record that lacks one). The main advantage here is that the calling code does not need to know what the id column is named, since `io.read_metadata` handles this and indexes the data frame by that column. This commit also adds functional tests for the expected behavior of export v2 with metadata inputs. Fixes #905.
nextstrain · Apr 28, 2022 · 3be4d18 · 3be4d18
1 parent 4b71e7d
commit 3be4d18
Show file tree

Hide file tree

Showing 2 changed files with 40 additions and 2 deletions.
diff --git a/augur/export_v2.py b/augur/export_v2.py
@@ -9,7 +9,9 @@
 import numbers
 import re
 from Bio import Phylo
-from .utils import read_metadata, read_node_data, write_json, read_config, read_lat_longs, read_colors
+
+from .io import read_metadata
+from .utils import read_node_data, write_json, read_config, read_lat_longs, read_colors
 from .validate import export_v2 as validate_v2, auspice_config_v2 as validate_auspice_config_v2, ValidateError
 
 # Set up warnings & exceptions
@@ -992,7 +994,10 @@ def run_v2(args):
 
     if args.metadata is not None:
         try:
-            metadata_file, _ = read_metadata(args.metadata)
+            metadata_file = read_metadata(args.metadata).to_dict(orient="index")
+            for strain in metadata_file.keys():
+                if "strain" not in metadata_file[strain]:
+                    metadata_file[strain]["strain"] = strain
         except FileNotFoundError:
             print(f"ERROR: meta data file ({args.metadata}) does not exist")
             sys.exit(2)

diff --git a/tests/functional/export_v2.t b/tests/functional/export_v2.t
@@ -65,3 +65,36 @@ Export with auspice config JSON with an extensions block
   $ python3 "$TESTDIR/../../scripts/diff_jsons.py"  export_v2/dataset2.json "$TMP/dataset3.json" \
   >   --exclude-paths "root['meta']['updated']"
   {}
+
+Run export with metadata using the default id column of "strain".
+
+  $ ${AUGUR} export v2 \
+  >  --tree export_v2/tree.nwk \
+  >  --metadata export_v2/dataset1_metadata_with_strain.tsv \
+  >  --node-data export_v2/div_node-data.json export_v2/location_node-data.json \
+  >  --auspice-config export_v2/auspice_config1.json \
+  >  --maintainers "Nextstrain Team" \
+  >  --output "$TMP/dataset1.json" > /dev/null
+
+  $ python3 "$TESTDIR/../../scripts/diff_jsons.py" export_v2/dataset1.json "$TMP/dataset1.json" \
+  >   --exclude-paths "root['meta']['updated']" "root['meta']['maintainers']"
+  {}
+  $ rm -f "$TMP/dataset1.json"
+
+Run export with metadata that uses an id column other than "strain".
+In this case, the column is "name" (one of the default columns expected by Augur's `io.read_metadata` function).
+
+  $ ${AUGUR} export v2 \
+  >  --tree export_v2/tree.nwk \
+  >  --metadata export_v2/dataset1_metadata_with_name.tsv \
+  >  --node-data export_v2/div_node-data.json export_v2/location_node-data.json \
+  >  --auspice-config export_v2/auspice_config1.json \
+  >  --maintainers "Nextstrain Team" \
+  >  --output "$TMP/dataset1.json" > /dev/null
+
+  $ python3 "$TESTDIR/../../scripts/diff_jsons.py" export_v2/dataset1.json "$TMP/dataset1.json" \
+  >   --exclude-paths "root['meta']['updated']" "root['meta']['maintainers']"
+  {}
+  $ rm -f "$TMP/dataset1.json"
+
+  $ popd > /dev/null