nextstrain · huddlej · Aug 9, 2022 · Jul 25, 2022 · Jul 25, 2022 · Aug 8, 2022
diff --git a/CHANGES.md b/CHANGES.md
@@ -33,6 +33,7 @@
 
 * export: `--skip-validation` now also skips version compatibility checks [#902][]. (@corneliusroemer)
 * filter: Report names of duplicate strains found during metadata parsing [#1008][] (@huddlej)
+* translate: Add support for Nextclade gene map GFFs [#1017][] (@huddlej)
 
 ### Bug Fixes
 
@@ -43,6 +44,7 @@
 [#1002]: https://github.com/nextstrain/augur/pull/1002
 [#1006]: https://github.com/nextstrain/augur/pull/1006
 [#1008]: https://github.com/nextstrain/augur/pull/1008
+[#1017]: https://github.com/nextstrain/augur/pull/1017
 
 ## 16.0.3 (6 July 2022)
 

diff --git a/augur/utils.py b/augur/utils.py
@@ -147,16 +147,24 @@ def load_features(reference, feature_names=None):
         with open(reference, encoding='utf-8') as in_handle:
             for rec in GFF.parse(in_handle, limit_info=limit_info):
                 for feat in rec.features:
-                    if feature_names is not None: #check both tags; user may have used either
+                    # Check for gene names stored in qualifiers commonly used by
+                    # virus-specific gene maps first (e.g., 'gene',
+                    # 'gene_name'). Then, check for qualifiers used by non-viral
+                    # pathogens (e.g., 'locus_tag').
+                    if feature_names is not None:
                         if "gene" in feat.qualifiers and feat.qualifiers["gene"][0] in feature_names:
                             fname = feat.qualifiers["gene"][0]
+                        elif "gene_name" in feat.qualifiers and feat.qualifiers["gene_name"][0] in feature_names:
+                            fname = feat.qualifiers["gene_name"][0]
                         elif "locus_tag" in feat.qualifiers and feat.qualifiers["locus_tag"][0] in feature_names:
                             fname = feat.qualifiers["locus_tag"][0]
                         else:
                             fname = None
                     else:
                         if "gene" in feat.qualifiers:
                             fname = feat.qualifiers["gene"][0]
+                        elif "gene_name" in feat.qualifiers:
+                            fname = feat.qualifiers["gene_name"][0]
                         else:
                             fname = feat.qualifiers["locus_tag"][0]
                     if feat.type == "source":

diff --git a/scripts/diff_jsons.py b/scripts/diff_jsons.py
@@ -14,6 +14,7 @@
     parser.add_argument("second_json", help="second JSON to compare")
     parser.add_argument("--significant-digits", type=int, default=5, help="number of significant digits to use when comparing numeric values")
     parser.add_argument("--exclude-paths", nargs="+", help="list of paths to exclude from consideration when performing a diff", default=["root['generated_by']['version']"])
+    parser.add_argument("--exclude-regex-paths", nargs="+", help="list of path regular expressions to exclude from consideration when performing a diff")
 
     args = parser.parse_args()
 
@@ -28,6 +29,7 @@
             first_json,
             second_json,
             significant_digits=args.significant_digits,
-            exclude_paths=args.exclude_paths
+            exclude_paths=args.exclude_paths,
+            exclude_regex_paths=args.exclude_regex_paths,
         )
     )
diff --git a/tests/functional/translate/cram/_setup.sh b/tests/functional/translate/cram/_setup.sh
@@ -0,0 +1,2 @@
+pushd "$TESTDIR/../../" > /dev/null
+export AUGUR="${AUGUR:-../../bin/augur}"
diff --git a/tests/functional/translate/cram/translate-with-genbank.t b/tests/functional/translate/cram/translate-with-genbank.t
@@ -0,0 +1,18 @@
+Setup
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ source _setup.sh
+
+Translate amino acids for genes using a GenBank file.
+
+  $ ${AUGUR} translate \
+  >   --tree translate/data/zika/tree.nwk \
+  >   --ancestral-sequences translate/data/zika/nt_muts.json \
+  >   --reference-sequence translate/data/zika/zika_outgroup.gb \
+  >   --genes CA PRO \
+  >   --output-node-data $TMP/aa_muts.json
+  Validating schema of 'translate/data/zika/nt_muts.json'...
+  Read in 3 features from reference sequence file
+  amino acid mutations written to .* (re)
+  $ python3 "../../scripts/diff_jsons.py" translate/data/zika/aa_muts_genbank.json $TMP/aa_muts.json
+  {}
diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene-name.t b/tests/functional/translate/cram/translate-with-gff-and-gene-name.t
@@ -0,0 +1,31 @@
+Setup
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ source _setup.sh
+
+Translate amino acids for genes using a GFF3 file where the gene names are stored in a qualifier named "gene_name".
+
+  $ cat >$TMP/genemap.gff <<~~
+  > ##gff-version 3
+  > ##sequence-region PF13/251013_18 1 10769
+  > PF13/251013_18	GenBank	gene	91	456	.	+	.	gene_name="CA"
+  > PF13/251013_18	GenBank	gene	457	735	.	+	.	gene_name="PRO"
+  > ~~
+
+  $ ${AUGUR} translate \
+  >   --tree translate/data/zika/tree.nwk \
+  >   --ancestral-sequences translate/data/zika/nt_muts.json \
+  >   --reference-sequence "$TMP/genemap.gff" \
+  >   --output-node-data $TMP/aa_muts.json
+  Validating schema of 'translate/data/zika/nt_muts.json'...
+  Read in 2 features from reference sequence file
+  amino acid mutations written to .* (re)
+
+Other than the sequence ids which will include a temporary path, the JSONs
+should be identical.
+
+  $ python3 "../../scripts/diff_jsons.py" \
+  >  --exclude-regex-paths "['seqid']" -- \
+  >  translate/data/zika/aa_muts_gff.json \
+  >  $TMP/aa_muts.json
+  {}
diff --git a/tests/functional/translate/cram/translate-with-gff-and-gene.t b/tests/functional/translate/cram/translate-with-gff-and-gene.t
@@ -0,0 +1,27 @@
+Setup
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ source _setup.sh
+
+Translate amino acids for genes using a GFF3 file where the gene names are stored in a qualifier named "gene".
+
+  $ cat >$TMP/genemap.gff <<~~
+  > ##gff-version 3
+  > ##sequence-region PF13/251013_18 1 10769
+  > PF13/251013_18	GenBank	gene	91	456	.	+	.	gene="CA"
+  > PF13/251013_18	GenBank	gene	457	735	.	+	.	gene="PRO"
+  > ~~
+
+  $ ${AUGUR} translate \
+  >   --tree translate/data/zika/tree.nwk \
+  >   --ancestral-sequences translate/data/zika/nt_muts.json \
+  >   --reference-sequence "$TMP/genemap.gff" \
+  >   --output-node-data $TMP/aa_muts.json
+  Validating schema of 'translate/data/zika/nt_muts.json'...
+  Read in 2 features from reference sequence file
+  amino acid mutations written to .* (re)
+  $ python3 "../../scripts/diff_jsons.py" \
+  >  --exclude-regex-paths "['seqid']" -- \
+  >  translate/data/zika/aa_muts_gff.json \
+  >  $TMP/aa_muts.json
+  {}
diff --git a/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t b/tests/functional/translate/cram/translate-with-gff-and-locus-tag.t
@@ -0,0 +1,23 @@
+Setup
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ source _setup.sh
+
+Translate amino acids for genes using a GFF3 file where the gene names are stored in a qualifier named "locus_tag".
+
+  $ ${AUGUR} translate \
+  >   --tree translate/data/tb/tree.nwk \
+  >   --genes translate/data/tb/genes.txt \
+  >   --vcf-reference translate/data/tb/ref.fasta \
+  >   --ancestral-sequences translate/data/tb/nt_muts.vcf \
+  >   --reference-sequence translate/data/tb/Mtb_H37Rv_NCBI_Annot.gff \
+  >   --output-node-data $TMP/aa_muts.json \
+  >   --alignment-output $TMP/translations.vcf \
+  >   --vcf-reference-output $TMP/translations_reference.fasta
+  Gene length of rrs_Rvnr01 is not a multiple of 3. will pad with N
+  Read in 187 specified genes to translate.
+  Read in 187 features from reference sequence file
+  162 genes had no mutations and so have been be excluded.
+  amino acid mutations written to .* (re)
+  $ python3 "../../scripts/diff_jsons.py" translate/data/tb/aa_muts.json $TMP/aa_muts.json
+  {}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		pushd "$TESTDIR/../../" > /dev/null
		export AUGUR="${AUGUR:-../../bin/augur}"
Comment on lines +1 to +2 Copy link Member victorlin Aug 8, 2022 Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. (non-blocking) It would be nice to have just one of these files (the same as tests/functional/filter/cram/_setup.sh), but this is fine for now. huddlej reacted with thumbs up emoji