Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow inheritance in clades.tsv #846

Merged
merged 5 commits into from
Feb 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 84 additions & 12 deletions augur/clades.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,32 @@
import pandas as pd
import numpy as np
from collections import defaultdict
import networkx as nx
from itertools import islice
from .utils import get_parent_name_by_child_name_for_tree, read_node_data, write_json, get_json_name

def read_in_clade_definitions(clade_file):
'''
Reads in a tab-separated file that defines clades by amino acid or nucleotide mutations

Inheritance is allowed, but needs to be acyclic. Alleles can be overwritten by inheriting clades.

Sites are 1-indexed in the file and are converted to 0-indexed in the output

Empty lines are ignored, comments after # are ignored

Format
------
clade gene site alt
Clade_1 ctpE 81 D
Clade_2 nuc 30642 T
Clade_3 nuc 444296 A
Clade_4 pks8 634 T
clade gene site alt
Clade_1 ctpE 81 D
Clade_2 nuc 30642 T
Clade_3 nuc 444296 A
Clade_3 S 1 P
\\# Clade_4 inherits from Clade_3
Clade_4 clade Clade_3
Clade_4 pks8 634 T
\\# Inherited allele can be overwritten
Clade_4 S 1 L

Parameters
----------
Expand All @@ -32,13 +45,72 @@ def read_in_clade_definitions(clade_file):
clade definitions as :code:`{clade_name:[(gene, site, allele),...]}`
'''

clades = defaultdict(list)
df = pd.read_csv(clade_file, sep='\t' if clade_file.endswith('.tsv') else ',')
for index, row in df.iterrows():
allele = (row.gene, row.site-1, row.alt)
clades[row.clade].append(allele)
clades.default_factory = None

clades = defaultdict(lambda: defaultdict(str))
df = pd.read_csv(
clade_file,
sep='\t' if clade_file.endswith('.tsv') else ',',
comment='#'
)

clade_inheritance_rows = df[df['gene'] == 'clade']

# Identify clades that inherit more than once
clades_with_multiple_inheritance = clade_inheritance_rows[clade_inheritance_rows.duplicated(subset=["clade"])]['clade'].tolist()
if len(clades_with_multiple_inheritance) > 0:
raise ValueError(f"Clades {clades_with_multiple_inheritance} have multiple inheritance, that's not allowed")

# Identify clades that inherit from non-existent clades
missing_parent_clades = set(clade_inheritance_rows['site']) - set(df["clade"])
if len(missing_parent_clades) > 0:
raise ValueError(f"Clades {missing_parent_clades} are inherited from but are not defined")


G = nx.DiGraph()

# Use integer 0 as root so as not to conflict with any string clade names
# String '0' can still be used this way
root = 0
# For every clade, add edge from root as default
# This way all clades can be reached by traversal
for clade in df.clade.unique():
G.add_edge(root, clade)

# Build inheritance graph
# For clades that inherit, disconnect from root
# Add edge from parent
for _, row in clade_inheritance_rows.iterrows():
G.remove_edge(root, row.clade)
G.add_edge(row.site, row.clade)

if not nx.is_directed_acyclic_graph(G):
raise ValueError(f"Clade definitions contain cycles {list(nx.simple_cycles(G))}")

# Traverse graph top down, so that children can inherit from parents and grandparents
# Topological sort ensures parents are visited before children
# islice is used to skip the root node (which has no parent)
for clade in islice(nx.topological_sort(G),1,None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

a few comments that explain what these functions do would be helpful. This is some sort of traversal, I guess, but can't tell from the code what kind.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do the comments I added help?

# Get name of parent clade
# G.predecessors(clade) returns iterator, thus next() necessary
# despite the fact that there should only be one parent
parent_clade = next(G.predecessors(clade))
# Inheritance from parents happens here
# Allele dict is initialized with alleles from parent
clades[clade] = clades[parent_clade].copy()
for _, row in df[(df.clade == clade) & (df.gene != 'clade')].iterrows():
# Overwrite of parent alleles is possible and happens here
clades[clade][(row.gene, int(row.site)-1)] = row.alt

# Convert items from dict[str, dict[(str,int),str]] to dict[str, list[(str,int,str)]]
clades = {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe point out that mutations defined for child overwrite inherited definitions.

clade: [
gene_site + (alt,)
for gene_site, alt in clade_definition.items()
]
for clade, clade_definition in clades.items()
# If clause prevents the root (helper) node from being emitted
if clade != root
}

return clades


Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
"bcbio-gff >=0.6.0, ==0.6.*",
"biopython >=1.67, !=1.77, !=1.78",
"jsonschema >=3.0.0, ==3.*",
"networkx >= 2.5, ==2.*",
"packaging >=19.2",
"pandas >=1.0.0, ==1.*",
"phylo-treetime ==0.8.*",
Expand Down
7 changes: 7 additions & 0 deletions tests/data/clades/commented_clades.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
clade gene site alt
# Comment
Clade_1 ctpE 81 D
# Comment
Clade_2 nuc 30642 T
Clade_3 nuc 444296 A# Comment
Clade_3 pks8 634 T
8 changes: 8 additions & 0 deletions tests/data/clades/empty_lines_clades.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
clade gene site alt
Clade_1 ctpE 81 D
Clade_2 nuc 30642 T

Clade_3 nuc 444296 A


Clade_3 pks8 634 T
6 changes: 6 additions & 0 deletions tests/data/clades/inherit_chained_clades.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
clade gene site alt
Clade_1 ctpE 81 D
Clade_2 clade Clade_1
Clade_2 nuc 30642 T
Clade_3 clade Clade_2
Clade_3 pks8 634 T
5 changes: 5 additions & 0 deletions tests/data/clades/inherit_clades.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
clade gene site alt
Clade_1 ctpE 81 D
Clade_2 nuc 30642 T
Clade_3 clade Clade_2
Clade_3 pks8 634 T
6 changes: 6 additions & 0 deletions tests/data/clades/inherit_cycle_clades.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
clade gene site alt
Clade_1 ctpE 81 D
Clade_2 nuc 30642 T
Clade_2 clade Clade_3
Clade_3 clade Clade_2
Clade_3 pks8 634 T
6 changes: 6 additions & 0 deletions tests/data/clades/multiple_inheritance_clades.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
clade gene site alt
Clade_1 ctpE 81 D
Clade_2 nuc 30642 T
Clade_3 clade Clade_1
Clade_3 clade Clade_2
Clade_3 pks8 634 T
5 changes: 5 additions & 0 deletions tests/data/clades/nonexistent_clade_inheritance_clades.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
clade gene site alt
Clade_1 ctpE 81 D
Clade_2 nuc 30642 T
Clade_3 clade Clade_X
Clade_3 pks8 634 T
5 changes: 5 additions & 0 deletions tests/data/clades/self_inherit_clades.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
clade gene site alt
Clade_1 ctpE 81 D
Clade_2 nuc 30642 T
Clade_3 clade Clade_3
Clade_3 pks8 634 T
5 changes: 5 additions & 0 deletions tests/data/clades/simple_clades.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
clade gene site alt
Clade_1 ctpE 81 D
Clade_2 nuc 30642 T
Clade_3 nuc 444296 A
Clade_3 pks8 634 T
16 changes: 16 additions & 0 deletions tests/functional/clades.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
Integration tests for augur clades.

$ pushd "$TESTDIR" > /dev/null
$ export AUGUR="../../bin/augur"

Test augur clades with simple Zika input files and hierarchical clades.

$ ${AUGUR} clades \
> --tree clades/tree.nwk \
> --mutations clades/aa_muts.json clades/nt_muts_small.json \
> --clades clades/clades.tsv \
> --output-node-data "$TMP/clades.json" &>/dev/null

$ python3 "$TESTDIR/../../scripts/diff_jsons.py" clades/clades.json "$TMP/clades.json" \
> --exclude-paths "root['generated_by']"
{}
Loading