nextstrain · huddlej · Oct 21, 2020 · Oct 12, 2020 · Oct 12, 2020 · Oct 12, 2020
diff --git a/augur/refine.py b/augur/refine.py
@@ -6,7 +6,7 @@
 from Bio import Phylo
 from .utils import read_metadata, read_tree, get_numerical_dates, write_json, InvalidTreeError
 from treetime.vcf_utils import read_vcf, write_vcf
-
+from treetime.seq_utils import profile_maps
 
 def refine(tree=None, aln=None, ref=None, dates=None, branch_length_inference='auto',
              confidence=False, resolve_polytomies=True, max_iter=2, precision='auto',
@@ -145,18 +145,23 @@ def run(args):
         return 1
 
     if not args.alignment:
-        # fake alignment to appease treetime when only using it for naming nodes...
         if args.timetree:
-            print("ERROR: alignment is required for ancestral reconstruction or timetree inference")
+            print("ERROR: alignment is required for ancestral reconstruction or timetree inference", file=sys.stderr)
             return 1
+
+        if args.divergence_units=='mutations':
+            print("ERROR: alignment is required for divergence in units of mutations", file=sys.stderr)
+            return 1
+
+        # fake alignment to appease treetime when only using it for naming nodes...
         from Bio import SeqRecord, Seq, Align
         seqs = []
         for n in T.get_terminals():
             seqs.append(SeqRecord.SeqRecord(seq=Seq.Seq('ACGT'), id=n.name, name=n.name, description=''))
         aln = Align.MultipleSeqAlignment(seqs)
     elif any([args.alignment.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]):
         if not args.vcf_reference:
-            print("ERROR: a reference Fasta is required with VCF-format alignments")
+            print("ERROR: a reference Fasta is required with VCF-format alignments", file=sys.stderr)
             return 1
 
         compress_seq = read_vcf(args.alignment, args.vcf_reference)
@@ -185,7 +190,7 @@ def run(args):
     if args.timetree:
         # load meta data and covert dates to numeric
         if args.metadata is None:
-            print("ERROR: meta data with dates is required for time tree reconstruction")
+            print("ERROR: meta data with dates is required for time tree reconstruction", file=sys.stderr)
             return 1
         metadata, columns = read_metadata(args.metadata)
         if args.year_bounds:
@@ -217,7 +222,7 @@ def run(args):
                 node_data['skyline'] = [[float(x) for x in skyline.x], [float(y) for y in conf[0]],
                                         [float(y) for y in skyline.y], [float(y) for y in conf[1]]]
             except:
-                print("ERROR: skyline optimization by TreeTime has failed.")
+                print("ERROR: skyline optimization by TreeTime has failed.", file=sys.stderr)
                 return 1
 
         attributes.extend(['numdate', 'clock_length', 'mutation_length', 'raw_date', 'date'])
@@ -241,13 +246,35 @@ def run(args):
     if args.divergence_units=='mutations-per-site': #default
         pass
     elif args.divergence_units=='mutations':
-        L = tt.seq_len
-        for node in node_data['nodes']:
+        if not args.timetree:
+            tt.infer_ancestral_sequences()
+        nuc_map = profile_maps['nuc']
+
+        def are_sequence_states_different(nuc1, nuc2):
+            '''
+            determine whether two ancestral states should count as mutation for divergence estimates
+            while correctly accounting for ambiguous nucleotides
+            '''
+            if nuc1 in ['-', 'N'] or nuc2 in ['-', 'N']:
+                return False
+            elif nuc1 in nuc_map and nuc2 in nuc_map:
+                return np.sum(nuc_map[nuc1]*nuc_map[nuc2])==0
+            else:
+                return False
+
+        for node in T.find_clades():
+            n_muts = len([
+                position
+                for ancestral, position, derived in node.mutations
+                if are_sequence_states_different(ancestral, derived)
+            ])
+
             if args.timetree:
-                node_data['nodes'][node]['mutation_length'] *= L
-            node_data['nodes'][node]['branch_length'] *= L
+                node_data['nodes'][node.name]['mutation_length'] = n_muts
+
+            node_data['nodes'][node.name]['branch_length'] = n_muts
     else:
-        print("ERROR: divergence unit",args.divergence_units,"not supported!")
+        print("ERROR: divergence unit",args.divergence_units,"not supported!", file=sys.stderr)
         return 1
 
     # Export refined tree and node data

diff --git a/tests/functional/refine.t b/tests/functional/refine.t
@@ -0,0 +1,115 @@
+Integration tests for augur refine.
+
+  $ pushd "$TESTDIR" > /dev/null
+  $ export AUGUR="../../bin/augur"
+
+Try building a time tree.
+
+  $ ${AUGUR} refine \
+  >  --tree "refine/tree_raw.nwk" \
+  >  --alignment "refine/aligned.fasta" \
+  >  --metadata "refine/metadata.tsv" \
+  >  --output-tree "$TMP/tree.nwk" \
+  >  --output-node-data "$TMP/branch_lengths.json" \
+  >  --timetree \
+  >  --coalescent opt \
+  >  --date-confidence \
+  >  --date-inference marginal \
+  >  --clock-filter-iqd 4 \
+  >  --seed 314159 > /dev/null
+  */treetime/aa_models.py:108: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray (glob)
+    [0.800038509951648, 1.20274751778601, 1.55207513886163, 1.46600946033173, 0.830022143283238, 1.5416250309563, 1.53255698189437, 1.41208067821187, 1.47469999960758, 0.351200119909572, 0.570542199221932, 1.21378822764856, 0.609532859331199, 0.692733248746636, 1.40887880416009, 1.02015839286433, 0.807404666228614, 1.268589159299, 0.933095433689795]
+
+Confirm that TreeTime trees match expected topology and branch lengths.
+
+  $ python3 "$TESTDIR/../../scripts/diff_trees.py" "refine/tree.nwk" "$TMP/tree.nwk" --significant-digits 2
+  {}
+
+Build a time tree with mutations as the reported divergence unit.
+
+  $ ${AUGUR} refine \
+  >  --tree "refine/tree_raw.nwk" \
+  >  --alignment "refine/aligned.fasta" \
+  >  --metadata "refine/metadata.tsv" \
+  >  --output-tree "$TMP/tree.nwk" \
+  >  --output-node-data "$TMP/branch_lengths.json" \
+  >  --timetree \
+  >  --coalescent opt \
+  >  --date-confidence \
+  >  --date-inference marginal \
+  >  --clock-filter-iqd 4 \
+  >  --seed 314159 \
+  >  --divergence-units mutations > /dev/null
+  */treetime/aa_models.py:108: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray (glob)
+    [0.800038509951648, 1.20274751778601, 1.55207513886163, 1.46600946033173, 0.830022143283238, 1.5416250309563, 1.53255698189437, 1.41208067821187, 1.47469999960758, 0.351200119909572, 0.570542199221932, 1.21378822764856, 0.609532859331199, 0.692733248746636, 1.40887880416009, 1.02015839286433, 0.807404666228614, 1.268589159299, 0.933095433689795]
+
+Confirm that TreeTime trees match expected topology and branch lengths.
+
+  $ python3 "$TESTDIR/../../scripts/diff_trees.py" "refine/tree.nwk" "$TMP/tree.nwk" --significant-digits 2
+  {}
+
+Run refine without inferring a time tree.
+This is one way to get named internal nodes for downstream analyses and does not require an alignment FASTA.
+
+  $ ${AUGUR} refine \
+  >  --tree "refine/tree_raw.nwk" \
+  >  --metadata "refine/metadata.tsv" \
+  >  --output-tree "$TMP/tree.nwk" \
+  >  --output-node-data "$TMP/branch_lengths.json" \
+  >  --coalescent opt \
+  >  --date-confidence \
+  >  --date-inference marginal \
+  >  --clock-filter-iqd 4 \
+  >  --seed 314159 \
+  >  --divergence-units mutations-per-site > /dev/null
+  */treetime/aa_models.py:108: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray (glob)
+    [0.800038509951648, 1.20274751778601, 1.55207513886163, 1.46600946033173, 0.830022143283238, 1.5416250309563, 1.53255698189437, 1.41208067821187, 1.47469999960758, 0.351200119909572, 0.570542199221932, 1.21378822764856, 0.609532859331199, 0.692733248746636, 1.40887880416009, 1.02015839286433, 0.807404666228614, 1.268589159299, 0.933095433689795]
+
+Confirm that trees match expected topology and branch lengths, given that the output should not be a time tree.
+
+  $ python3 "$TESTDIR/../../scripts/diff_trees.py" "refine/not_time_tree.nwk" "$TMP/tree.nwk" --significant-digits 2
+  {}
+  $ diff -u "refine/mutations_per_site_branch_lengths.json" "$TMP/branch_lengths.json"
+
+Run refine again without a time tree, but request number of mutations per branch as the divergence unit.
+This approach only works when we provide an alignment FASTA.
+
+  $ ${AUGUR} refine \
+  >  --tree "refine/tree_raw.nwk" \
+  >  --alignment "refine/aligned.fasta" \
+  >  --metadata "refine/metadata.tsv" \
+  >  --output-tree "$TMP/tree.nwk" \
+  >  --output-node-data "$TMP/branch_lengths.json" \
+  >  --coalescent opt \
+  >  --date-confidence \
+  >  --date-inference marginal \
+  >  --clock-filter-iqd 4 \
+  >  --seed 314159 \
+  >  --divergence-units mutations > /dev/null
+  */treetime/aa_models.py:108: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray (glob)
+    [0.800038509951648, 1.20274751778601, 1.55207513886163, 1.46600946033173, 0.830022143283238, 1.5416250309563, 1.53255698189437, 1.41208067821187, 1.47469999960758, 0.351200119909572, 0.570542199221932, 1.21378822764856, 0.609532859331199, 0.692733248746636, 1.40887880416009, 1.02015839286433, 0.807404666228614, 1.268589159299, 0.933095433689795]
+
+Confirm that trees match expected topology and branch lengths, given that the output should not be a time tree.
+
+  $ python3 "$TESTDIR/../../scripts/diff_trees.py" "refine/not_time_tree.nwk" "$TMP/tree.nwk" --significant-digits 2
+  {}
+  $ diff -u "refine/integer_branch_lengths.json" "$TMP/branch_lengths.json"
+
+Run refine again without a time tree, but try to request number of mutations per branch as the divergence unit.
+This approach does not make sense and should not work without an alignment FASTA.
+
+  $ ${AUGUR} refine \
+  >  --tree "refine/tree_raw.nwk" \
+  >  --metadata "refine/metadata.tsv" \
+  >  --output-tree "$TMP/tree.nwk" \
+  >  --output-node-data "$TMP/branch_lengths.json" \
+  >  --coalescent opt \
+  >  --date-confidence \
+  >  --date-inference marginal \
+  >  --clock-filter-iqd 4 \
+  >  --seed 314159 \
+  >  --divergence-units mutations > /dev/null
+  *ERROR: alignment is required* (glob)
+  [1]
+
+  $ popd > /dev/null