Skip to content

Commit

Permalink
Merge pull request #288 from nextstrain/fix_clade_assignment
Browse files Browse the repository at this point in the history
augur/clades.py: change clade assignment from mutations to sequences …
  • Loading branch information
rneher authored May 25, 2019
2 parents 5f3806c + e3d984a commit 2f4b86e
Showing 1 changed file with 213 additions and 135 deletions.
348 changes: 213 additions & 135 deletions augur/clades.py
Original file line number Diff line number Diff line change
@@ -1,135 +1,213 @@
"""
Assign clades to nodes in a tree based on amino-acid or nucleotide signatures.
"""

import sys
from Bio import Phylo
import pandas as pd
from .utils import get_parent_name_by_child_name_for_tree, read_node_data, write_json

def read_in_clade_definitions(clade_file):
'''
Reads in tab-seperated file that defines clades by amino acid or nucleotide mutations
Format:
clade gene site alt
Clade_1 ctpE 81 D
Clade_2 nuc 30642 T
Clade_3 nuc 444296 A
Clade_4 pks8 634 T
'''

clades = {}

df = pd.read_csv(clade_file, sep='\t' if clade_file.endswith('.tsv') else ',')
for index, row in df.iterrows():
allele = (row.gene, row.site, row.alt)
if row.clade in clades:
clades[row.clade].append(allele)
else:
clades[row.clade] = [allele]

return clades

def get_node_alleles(node_name, muts, parents):
'''
Retrieve the set of mutations leading to a node
This includes node-specific mutations as well as mutations
along its full line of descent
'''
node_alleles = []
sites_encountered = set()
focal_node = node_name
while focal_node is not None:
if 'aa_muts' in muts[focal_node]:
for gene in muts[focal_node]['aa_muts']:
for mut in muts[focal_node]['aa_muts'][gene]:
site = (gene, int(mut[1:-1]))
if site not in sites_encountered:
sites_encountered.add(site)
allele = (gene, int(mut[1:-1]), mut[-1])
node_alleles.append(allele)
if 'muts' in muts[focal_node]:
for mut in muts[focal_node]['muts']:
site = (gene, int(mut[1:-1]))
if site not in sites_encountered:
sites_encountered.add(site)
allele = ('nuc', int(mut[1:-1]), mut[-1])
node_alleles.append(allele)

focal_node = parents.get(focal_node)

return node_alleles

def is_node_in_clade(clade_alleles, node_alleles):
'''
Determines whether a node contains all mutations that define a clade
'''
is_clade = False
# mutations are stored on nodes in format 'R927H' matching the clade definitions
# if all of the clade-defining mutations are in the node mutations, it's part of this clade
if all([ clade_allele in node_alleles for clade_allele in clade_alleles ]):
is_clade = True
return is_clade

def assign_clades(clade_designations, all_muts, tree):
'''
Ensures all nodes have an entry (or auspice doesn't display nicely), tests each node
to see if it's the first member of a clade (assigns 'clade_annotation'), and sets
all nodes's clade_membership to the value of their parent. This will change if later found to be
the first member of a clade.
'''

clade_membership = {}
parents = get_parent_name_by_child_name_for_tree(tree)

# first pass to set all nodes to unassigned as precaution to ensure attribute is set
for node in tree.find_clades(order = 'preorder'):
clade_membership[node.name] = {'clade_membership': 'unassigned'}

# second pass to assign 'clade_annotation' to basal nodes within each clade
# if multiple nodes match, assign annotation to largest
# otherwise occasional unwanted cousin nodes get assigned the annotation
for clade_name, clade_alleles in clade_designations.items():
node_counts = {}
for node in tree.find_clades(order = 'preorder'):
node_alleles = get_node_alleles(node.name, all_muts, parents)
if is_node_in_clade(clade_alleles, node_alleles):
node_counts[node.name] = node.count_terminals()
sorted_nodes = list(sorted(node_counts.items(), key=lambda x: x[1], reverse=True))
if len(sorted_nodes) > 0:
target_node = sorted_nodes[0][0]
clade_membership[target_node] = {'clade_annotation': clade_name, 'clade_membership': clade_name}

# third pass to propagate 'clade_membership'
# don't propagate if encountering 'clade_annotation'
for node in tree.find_clades(order = 'preorder'):
for child in node:
if 'clade_annotation' not in clade_membership[child.name]:
clade_membership[child.name]['clade_membership'] = clade_membership[node.name]['clade_membership']

return clade_membership


def register_arguments(parser):
parser.add_argument('--tree', help="prebuilt Newick -- no tree will be built if provided")
parser.add_argument('--mutations', nargs='+', help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ')
parser.add_argument('--clades', type=str, help='TSV file containing clade definitions by amino-acid')
parser.add_argument('--output', type=str, help="name of JSON files for clades")


def run(args):
## read tree and data, if reading data fails, return with error code
tree = Phylo.read(args.tree, 'newick')
node_data = read_node_data(args.mutations, args.tree)
if node_data is None:
print("ERROR: could not read node data (incl sequences)")
return 1
all_muts = node_data['nodes']

clade_designations = read_in_clade_definitions(args.clades)

clade_membership = assign_clades(clade_designations, all_muts, tree)

write_json({'nodes': clade_membership}, args.output)
print("clades written to", args.output, file=sys.stdout)
"""
Assign clades to nodes in a tree based on amino-acid or nucleotide signatures.
"""

import sys
from Bio import Phylo
import pandas as pd
import numpy as np
from .utils import get_parent_name_by_child_name_for_tree, read_node_data, write_json

def read_in_clade_definitions(clade_file):
'''
Reads in tab-seperated file that defines clades by amino acid or nucleotide mutations
Format
------
clade gene site alt
Clade_1 ctpE 81 D
Clade_2 nuc 30642 T
Clade_3 nuc 444296 A
Clade_4 pks8 634 T
Parameters
----------
clade_file : str
meta data file
Returns
-------
dict
clade definitions as :code:`{clade_name:[(gene, site, allele),...]}`
'''

clades = {}

df = pd.read_csv(clade_file, sep='\t' if clade_file.endswith('.tsv') else ',')
for index, row in df.iterrows():
allele = (row.gene, row.site-1, row.alt)
if row.clade in clades:
clades[row.clade].append(allele)
else:
clades[row.clade] = [allele]

return clades


def is_node_in_clade(clade_alleles, node, ref):
'''
Determines whether a node matches the clade definition based on sequence
For any condition, will first look in mutations stored in node.sequences,
then check whether a reference sequence is available, and other reports 'non-match'
Parameters
----------
clade_alleles : list
list of clade defining alleles
node : Phylo.Node
node to check, assuming sequences (as mutations) are attached to node
ref : str/list
positions
Returns
-------
bool
True if in clade
'''
conditions = []
for gene, pos, clade_state in clade_alleles:
if gene in node.sequences and pos in node.sequences[gene]:
state = node.sequences[gene][pos]
elif ref and gene in ref:
state = ref[gene][pos]
else:
state = ''

conditions.append(state==clade_state)

return all(conditions)


def assign_clades(clade_designations, all_muts, tree, ref=None):
'''
Ensures all nodes have an entry (or auspice doesn't display nicely), tests each node
to see if it's the first member of a clade (assigns 'clade_annotation'), and sets
all nodes's clade_membership to the value of their parent. This will change if later found to be
the first member of a clade.
Parameters
----------
clade_designations : dict
clade definitions as :code:`{clade_name:[(gene, site, allele),...]}`
all_muts : dict
mutations in each node
tree : Phylo.Tree
phylogenetic tree to process
ref : str/list, optional
reference sequence to look up state when not mutated
Returns
-------
dict
mapping of node to clades
'''

clade_membership = {}
parents = get_parent_name_by_child_name_for_tree(tree)

# first pass to set all nodes to unassigned as precaution to ensure attribute is set
for node in tree.find_clades(order = 'preorder'):
clade_membership[node.name] = {'clade_membership': 'unassigned'}

# count leaves
for node in tree.find_clades(order = 'postorder'):
node.leaf_count = 1 if node.is_terminal() else np.sum([c.leaf_count for c in node])

for node in tree.get_nonterminals():
for c in node:
c.up=node
tree.root.up = None
tree.root.sequences = {'nuc':{}}
tree.root.sequences.update({gene:{} for gene in all_muts[tree.root.name]['aa_muts']})

# attach sequences to all nodes
for node in tree.find_clades(order='preorder'):
if node.up:
node.sequences = {gene:muts.copy() for gene, muts in node.up.sequences.items()}
for mut in all_muts[node.name]['muts']:
a, pos, d = mut[0], int(mut[1:-1])-1, mut[-1]
node.sequences['nuc'][pos] = d
if 'aa_muts' in all_muts[node.name]:
for gene in all_muts[node.name]['aa_muts']:
for mut in all_muts[node.name]['aa_muts'][gene]:
a, pos, d = mut[0], int(mut[1:-1])-1, mut[-1]

if gene not in node.sequences:
node.sequences[gene]={}
node.sequences[gene][pos] = d


# second pass to assign 'clade_annotation' to basal nodes within each clade
# if multiple nodes match, assign annotation to largest
# otherwise occasional unwanted cousin nodes get assigned the annotation
for clade_name, clade_alleles in clade_designations.items():
node_counts = []
for node in tree.find_clades(order = 'preorder'):
if is_node_in_clade(clade_alleles, node, ref):
node_counts.append(node)
sorted_nodes = sorted(node_counts, key=lambda x: x.leaf_count, reverse=True)
if len(sorted_nodes) > 0:
target_node = sorted_nodes[0]
clade_membership[target_node.name] = {'clade_annotation': clade_name, 'clade_membership': clade_name}

# third pass to propagate 'clade_membership'
# don't propagate if encountering 'clade_annotation'
for node in tree.find_clades(order = 'preorder'):
for child in node:
if 'clade_annotation' not in clade_membership[child.name]:
clade_membership[child.name]['clade_membership'] = clade_membership[node.name]['clade_membership']

return clade_membership


def get_reference_sequence_from_root_node(all_muts, root_name):
# attach sequences to root
ref = {}
try:
ref['nuc'] = list(all_muts[root_name]["sequence"])
except:
print("WARNING in augur.clades: nucleotide mutation json does not contain full sequences for the root node.")

if "aa_muts" in all_muts[root_name]:
try:
ref.update({gene:list(seq) for gene, seq in all_muts[root_name]["aa_sequences"].items()})
except:
print("WARNING in augur.clades: amino acid mutation json does not contain full sequences for the root node.")

return ref


def register_arguments(parser):
parser.add_argument('--tree', help="prebuilt Newick -- no tree will be built if provided")
parser.add_argument('--mutations', nargs='+', help='JSON(s) containing ancestral and tip nucleotide and/or amino-acid mutations ')
parser.add_argument('--reference', nargs='+', help='fasta files containing reference and tip nucleotide and/or amino-acid sequences ')
parser.add_argument('--clades', type=str, help='TSV file containing clade definitions by amino-acid')
parser.add_argument('--output', type=str, help="name of JSON files for clades")


def run(args):
## read tree and data, if reading data fails, return with error code
tree = Phylo.read(args.tree, 'newick')
node_data = read_node_data(args.mutations, args.tree)
if node_data is None:
print("ERROR: could not read node data (incl sequences)")
return 1
all_muts = node_data['nodes']

if args.reference:
# PLACE HOLDER FOR vcf WORKFLOW.
# Works without a reference for now but can be added if clade defs contain positions
# that are monomorphic across reference and sequence sample.
ref = None
else:
# extract reference sequences from the root node entry in the mutation json
# if this doesn't exist, it will complain but not error.
ref = get_reference_sequence_from_root_node(all_muts, tree.root.name)

clade_designations = read_in_clade_definitions(args.clades)

clade_membership = assign_clades(clade_designations, all_muts, tree, ref)

write_json({'nodes': clade_membership}, args.output)
print("clades written to", args.output, file=sys.stdout)

0 comments on commit 2f4b86e

Please sign in to comment.