Skip to content

Commit

Permalink
Merge pull request #283 from nextstrain/better_node_naming
Browse files Browse the repository at this point in the history
Errors and warnings for trees without internal node names
  • Loading branch information
rneher authored Jul 10, 2019
2 parents 6df4133 + a391380 commit fddb5b3
Show file tree
Hide file tree
Showing 5 changed files with 102 additions and 29 deletions.
10 changes: 10 additions & 0 deletions augur/ancestral.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,16 @@ def run(args):
print("ERROR: reading tree from %s failed."%args.tree)
return 1

import numpy as np
missing_internal_node_names = [n.name is None for n in T.get_nonterminals()]
if np.all(missing_internal_node_names):
print("\n*** WARNING: Tree has no internal node names!")
print("*** Without internal node names, ancestral sequences can't be linked up to the correct node later.")
print("*** If you want to use 'augur export' or `augur translate` later, re-run this command with the output of 'augur refine'.")
print("*** If you haven't run 'augur refine', you can add node names to your tree by running:")
print("*** augur refine --tree %s --output-tree <filename>.nwk"%(args.tree) )
print("*** And use <filename>.nwk as the tree when running 'ancestral', 'translate', and 'traits'")

if any([args.alignment.lower().endswith(x) for x in ['.vcf', '.vcf.gz']]):
if not args.vcf_reference:
print("ERROR: a reference Fasta is required with VCF-format alignments")
Expand Down
4 changes: 3 additions & 1 deletion augur/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ def convert_tree_to_json_structure(node, metadata, div=0, nextflu_schema=False,
cdiv = div + metadata[child.name]['mutation_length']
elif 'branch_length' in metadata[child.name]:
cdiv = div + metadata[child.name]['branch_length']
else:
print("ERROR: Cannot find branch length information for %s"%(child.name))
node_struct["children"].append(convert_tree_to_json_structure(child, metadata, div=cdiv, nextflu_schema=nextflu_schema, strains=strains)[0])

return (node_struct, strains)
Expand All @@ -78,7 +80,7 @@ def recursively_decorate_tree_json_nextflu_schema(node, node_metadata, decoratio
metadata = node_metadata[node["strain"]]
metadata["strain"] = node["strain"]
except KeyError:
raise Exception("ERROR: node %s is not found in the node metadata."%n.name)
raise Exception("ERROR: node %s is not found in the node metadata."%node.name)

for data in decorations:
val = None
Expand Down
10 changes: 8 additions & 2 deletions augur/refine.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def run(args):
attributes = ['branch_length']

# check if tree is provided an can be read
T = None #otherwise get 'referenced before assignment' error if reading fails
for fmt in ["newick", "nexus"]:
try:
T = Phylo.read(args.tree, fmt)
Expand Down Expand Up @@ -166,8 +167,10 @@ def run(args):
# if not specified, construct default output file name with suffix _tt.nwk
if args.output_tree:
tree_fname = args.output_tree
else:
elif args.alignment:
tree_fname = '.'.join(args.alignment.split('.')[:-1]) + '_tt.nwk'
else:
tree_fname = '.'.join(args.tree.split('.')[:-1]) + '_tt.nwk'

if args.root and len(args.root) == 1: #if anything but a list of seqs, don't send as a list
args.root = args.root[0]
Expand Down Expand Up @@ -222,10 +225,13 @@ def run(args):
import json
tree_success = Phylo.write(T, tree_fname, 'newick', format_branch_length='%1.8f')
print("updated tree written to",tree_fname, file=sys.stdout)

if args.output_node_data:
node_data_fname = args.output_node_data
else:
elif args.alignment:
node_data_fname = '.'.join(args.alignment.split('.')[:-1]) + '.node_data.json'
else:
node_data_fname = '.'.join(args.tree.split('.')[:-1]) + '.node_data.json'

write_json(node_data, node_data_fname)
print("node attributes written to",node_data_fname, file=sys.stdout)
Expand Down
11 changes: 11 additions & 0 deletions augur/traits.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,17 @@ def run(args):
tree_fname = args.tree
traits, columns = read_metadata(args.metadata)

from Bio import Phylo
T = Phylo.read(tree_fname, 'newick')
missing_internal_node_names = [n.name is None for n in T.get_nonterminals()]
if np.all(missing_internal_node_names):
print("\n*** WARNING: Tree has no internal node names!")
print("*** Without internal node names, ancestral traits can't be linked up to the correct node later.")
print("*** If you want to use 'augur export' later, re-run this command with the output of 'augur refine'.")
print("*** If you haven't run 'augur refine', you can add node names to your tree by running:")
print("*** augur refine --tree %s --output-tree <filename>.nwk"%(tree_fname) )
print("*** And use <filename>.nwk as the tree when running 'ancestral', 'translate', and 'traits'")

mugration_states = defaultdict(dict)
for column in args.columns:
T, gtr, alphabet = mugration_inference(tree=tree_fname, seq_meta=traits,
Expand Down
96 changes: 70 additions & 26 deletions augur/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@
from .utils import read_node_data, load_features, write_json, write_VCF_translation
from treetime.vcf_utils import read_vcf

class MissingNodeError(Exception):
pass

class MismatchNodeError(Exception):
pass

def safe_translate(sequence, report_exceptions=False):
"""Returns an amino acid translation of the given nucleotide sequence accounting
for gaps in the given sequence.
Expand Down Expand Up @@ -199,7 +205,15 @@ def assign_aa_vcf(tree, translations):
#get mutations on the root
root = tree.root
aa_muts[root.name]={"aa_muts":{}}
#If has no root node name, exit with error
if root.name is None:
print("\n*** Can't find node name for the tree root!")
raise MissingNodeError()

for fname, prot in translations.items():
if root.name not in prot['sequences']:
print("\n*** Can't find %s in the alignment provided!"%(root.name))
raise MismatchNodeError()
root_muts = prot['sequences'][root.name]
tmp = []
for pos in prot['positions']:
Expand All @@ -211,9 +225,15 @@ def assign_aa_vcf(tree, translations):
for c in n:
aa_muts[c.name]={"aa_muts":{}}
for fname, prot in translations.items():
if n.name not in prot['sequences']:
print("\n*** Can't find %s in the alignment provided!"%(root.name))
raise MismatchNodeError()
n_muts = prot['sequences'][n.name]
for c in n:
tmp = []
if c.name is None:
print("\n*** Internal node missing a node name!")
raise MissingNodeError()
c_muts = prot['sequences'][c.name]
for pos in prot['positions']:
#if pos in both, check if same
Expand All @@ -229,6 +249,39 @@ def assign_aa_vcf(tree, translations):

return aa_muts

def assign_aa_fasta(tree, translations):
aa_muts = {}

#fasta input shouldn't have mutations on root, so give empty entry
root = tree.get_nonterminals()[0]
aa_muts[root.name]={"aa_muts":{}}

for n in tree.get_nonterminals():
if n.name is None:
print("\n*** Tree is missing node names!")
raise MissingNodeError()
for c in n:
aa_muts[c.name]={"aa_muts":{}}
for fname, aln in translations.items():
for c in n:
if c.name in aln and n.name in aln:
tmp = [construct_mut(a, int(pos+1), d) for pos, (a,d) in
enumerate(zip(aln[n.name], aln[c.name])) if a!=d]
aa_muts[c.name]["aa_muts"][fname] = tmp
elif c.name not in aln and n.name not in aln:
print("\n*** Can't find %s OR %s in the alignment provided!"%(c.name, n.name))
raise MismatchNodeError()
else:
print("no sequence pair for nodes %s-%s"%(c.name, n.name))

if n==tree.root:
aa_muts[n.name]={"aa_muts":{}, "aa_sequences":{}}
for fname, aln in translations.items():
if n.name in aln:
aa_muts[n.name]["aa_sequences"][fname] = "".join(aln[n.name])

return aa_muts

def get_genes_from_file(fname):
genes = []
if os.path.isfile(fname):
Expand Down Expand Up @@ -332,32 +385,23 @@ def run(args):
'strand': 1}

## determine amino acid mutations for each node
if is_vcf:
aa_muts = assign_aa_vcf(tree, translations)
else:
aa_muts = {}

#fasta input shouldn't have mutations on root, so give empty entry
root = tree.get_nonterminals()[0]
aa_muts[root.name]={"aa_muts":{}}

for n in tree.get_nonterminals():
for c in n:
aa_muts[c.name]={"aa_muts":{}}
for fname, aln in translations.items():
for c in n:
if c.name in aln and n.name in aln:
tmp = [construct_mut(a, int(pos+1), d) for pos, (a,d) in
enumerate(zip(aln[n.name], aln[c.name])) if a!=d]
aa_muts[c.name]["aa_muts"][fname] = tmp
else:
print("no sequence pair for nodes %s-%s"%(c.name, n.name))
if n==tree.root:
aa_muts[n.name]={"aa_muts":{}, "aa_sequences":{}}
for fname, aln in translations.items():
if n.name in aln:
aa_muts[n.name]["aa_sequences"][fname] = "".join(aln[n.name])

try:
if is_vcf:
aa_muts = assign_aa_vcf(tree, translations)
else:
aa_muts = assign_aa_fasta(tree, translations)
except MissingNodeError as err:
print("\n*** ERROR: Some/all nodes have no node names!")
print("*** Please check you are providing the tree output by 'augur refine'.")
print("*** If you haven't run 'augur refine', please add node names to your tree by running:")
print("*** augur refine --tree %s --output-tree <filename>.nwk"%(args.tree) )
print("*** And use <filename>.nwk as the tree when running 'ancestral', 'translate', and 'traits'")
return 1
except MismatchNodeError as err:
print("\n*** ERROR: Mismatch between node names in %s and in %s"%(args.tree, args.ancestral_sequences))
print("*** Ensure you are using the same tree you used to run 'ancestral' as input here.")
print("*** Or, re-run 'ancestral' using %s, then use the new %s as input here."%(args.tree, args.ancestral_sequences))
return 1

write_json({'annotations':annotations, 'nodes':aa_muts}, args.output)
print("amino acid mutations written to",args.output, file=sys.stdout)
Expand Down

0 comments on commit fddb5b3

Please sign in to comment.