Skip to content

Commit 1f5a23c

Browse files
committed
Add reference outgroup for 2y/6m builds
Add reference outgroup for 2y and 6m builds to avoid incorrect rooting of trees. The reference is pruned from the tree after the refine step using James' code snippet¹. The references added are the references used by Nextclade, but with the strain names changed to match the strain name within fauna. ¹ nextstrain/augur#340 (comment)
1 parent 1666271 commit 1f5a23c

File tree

4 files changed

+74
-27
lines changed

4 files changed

+74
-27
lines changed

Snakefile

+3-3
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ def _get_node_data_for_predictors(wildcards):
9191

9292
rule convert_node_data_to_table:
9393
input:
94-
tree = rules.refine.output.tree,
94+
tree = rules.prune_reference.output.tree,
9595
metadata = rules.parse.output.metadata,
9696
node_data = _get_node_data_for_predictors
9797
output:
@@ -114,7 +114,7 @@ rule convert_node_data_to_table:
114114

115115
rule convert_frequencies_to_table:
116116
input:
117-
tree = rules.refine.output.tree,
117+
tree = rules.prune_reference.output.tree,
118118
frequencies = rules.tip_frequencies.output.tip_freq
119119
output:
120120
table = "results/frequencies_{center}_{lineage}_{segment}_{resolution}_{passage}_{assay}.tsv"
@@ -230,7 +230,7 @@ rule merge_weighted_distances_to_future:
230230

231231
rule export:
232232
input:
233-
tree = rules.refine.output.tree,
233+
tree = rules.prune_reference.output.tree,
234234
metadata = rules.parse.output.metadata,
235235
auspice_config = files.auspice_config,
236236
node_data = _get_node_data_for_export,

Snakefile_WHO

+3-3
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ rule global_mutation_frequencies:
146146
rule scores:
147147
input:
148148
metadata = rules.parse.output.metadata,
149-
tree = rules.refine.output.tree
149+
tree = rules.prune_reference.output.tree
150150
output:
151151
scores = "results/scores_{center}_{lineage}_{segment}_{resolution}_{passage}_{assay}.json"
152152
conda: "environment.yaml"
@@ -209,7 +209,7 @@ rule export_entropy:
209209
rule export_sequence_json:
210210
input:
211211
aln = rules.ancestral.output.node_data,
212-
tree = rules.refine.output.tree,
212+
tree = rules.prune_reference.output.tree,
213213
aa_seqs = translations
214214
params:
215215
genes = gene_names
@@ -260,7 +260,7 @@ def _get_node_data_for_report_export(wildcards):
260260

261261
rule export_who:
262262
input:
263-
tree = rules.refine.output.tree,
263+
tree = rules.prune_reference.output.tree,
264264
metadata = rules.parse.output.metadata,
265265
auspice_config = "config/auspice_config_who_{lineage}.json",
266266
node_data = _get_node_data_for_report_export,

Snakefile_base

+67-20
Original file line numberDiff line numberDiff line change
@@ -369,10 +369,36 @@ rule select_strains:
369369
--output {output.strains}
370370
"""
371371

372+
def _get_lineage_reference(wildcards):
373+
if wildcards.resolution in {'2y', '6m'}:
374+
if wildcards.lineage == "h1n1pdm":
375+
return "A/California/7/2009"
376+
if wildcards.lineage == "h3n2":
377+
return "A/Wisconsin/67/2005"
378+
if wildcards.lineage == "vic":
379+
return "B/Brisbane/60/2008"
380+
381+
return ""
382+
383+
384+
rule include_reference_strain:
385+
message: "Adding reference strain to selected strains (only for 2y and 6m builds)"
386+
input:
387+
strains = "results/strains_{center}_{lineage}_{resolution}_{passage}_{assay}.txt"
388+
output:
389+
strains = "results/strains_with_reference_{center}_{lineage}_{resolution}_{passage}_{assay}.txt"
390+
params:
391+
reference = _get_lineage_reference
392+
shell:
393+
"""
394+
cp {input.strains} {output.strains}
395+
echo "\n{params.reference}" >> {output.strains}
396+
"""
397+
372398
rule extract:
373399
input:
374400
sequences = rules.filter.output.sequences,
375-
strains = rules.select_strains.output.strains
401+
strains = rules.include_reference_strain.output.strains
376402
output:
377403
sequences = 'results/extracted_{center}_{lineage}_{segment}_{resolution}_{passage}_{assay}.fasta'
378404
conda: "environment.yaml"
@@ -467,6 +493,9 @@ rule tree:
467493
--exclude-sites {input.exclude_sites}
468494
"""
469495

496+
def _get_refine_root(wildcards):
497+
return _get_lineage_reference(wildcards) or "best"
498+
470499
rule refine:
471500
message:
472501
"""
@@ -488,7 +517,8 @@ rule refine:
488517
date_inference = "marginal",
489518
clock_filter_iqd = 4,
490519
clock_rate = clock_rate,
491-
clock_std_dev = clock_std_dev
520+
clock_std_dev = clock_std_dev,
521+
root = _get_refine_root
492522
conda: "environment.yaml"
493523
resources:
494524
mem_mb=16000
@@ -507,13 +537,30 @@ rule refine:
507537
--coalescent {params.coalescent} \
508538
--date-confidence \
509539
--date-inference {params.date_inference} \
510-
--clock-filter-iqd {params.clock_filter_iqd}
540+
--clock-filter-iqd {params.clock_filter_iqd} \
541+
--root {params.root}
511542
"""
512543

544+
rule prune_reference:
545+
message: "Pruning reference strain from the tree (only for 2y and 6m builds)"
546+
input:
547+
tree = rules.refine.output.tree
548+
output:
549+
tree = "results/tree_pruned_{center}_{lineage}_{segment}_{resolution}_{passage}_{assay}.nwk"
550+
params:
551+
reference = _get_lineage_reference
552+
run:
553+
from Bio import Phylo
554+
T = Phylo.read(input.tree, "newick")
555+
reference = [ c for c in T.find_clades() if str(c.name) == params.reference][0]
556+
T.prune(reference)
557+
Phylo.write(T, output.tree, "newick")
558+
559+
513560
rule ancestral:
514561
message: "Reconstructing ancestral sequences and mutations"
515562
input:
516-
tree = rules.refine.output.tree,
563+
tree = rules.prune_reference.output.tree,
517564
alignment = rules.align.output
518565
output:
519566
node_data = "results/nt-muts_{center}_{lineage}_{segment}_{resolution}_{passage}_{assay}.json"
@@ -534,7 +581,7 @@ rule ancestral:
534581
rule translate:
535582
message: "Translating amino acid sequences"
536583
input:
537-
tree = rules.refine.output.tree,
584+
tree = rules.prune_reference.output.tree,
538585
node_data = rules.ancestral.output.node_data,
539586
reference = files.reference
540587
output:
@@ -552,7 +599,7 @@ rule translate:
552599
rule reconstruct_translations:
553600
message: "Reconstructing translations required for titer models and frequencies"
554601
input:
555-
tree = rules.refine.output.tree,
602+
tree = rules.prune_reference.output.tree,
556603
node_data = "results/aa-muts_{center}_{lineage}_{segment}_{resolution}_{passage}_{assay}.json",
557604
output:
558605
aa_alignment = "results/aa-seq_{center}_{lineage}_{segment}_{resolution}_{passage}_{assay}_{gene}.fasta"
@@ -572,7 +619,7 @@ rule reconstruct_translations:
572619

573620
rule convert_translations_to_json:
574621
input:
575-
tree = rules.refine.output.tree,
622+
tree = rules.prune_reference.output.tree,
576623
translations = translations
577624
output:
578625
translations = "results/aa-seq_{center}_{lineage}_{segment}_{resolution}_{passage}_{assay}.json"
@@ -595,7 +642,7 @@ rule traits:
595642
Inferring ancestral traits for {params.columns!s}
596643
"""
597644
input:
598-
tree = rules.refine.output.tree,
645+
tree = rules.prune_reference.output.tree,
599646
metadata = rules.parse.output.metadata
600647
output:
601648
node_data = "results/traits_{center}_{lineage}_{segment}_{resolution}_{passage}_{assay}.json",
@@ -617,7 +664,7 @@ rule titers_sub:
617664
titers = rules.download_titers.output.titers,
618665
aa_muts = rules.translate.output,
619666
alignments = translations,
620-
tree = rules.refine.output.tree
667+
tree = rules.prune_reference.output.tree
621668
params:
622669
genes = gene_names
623670
output:
@@ -637,7 +684,7 @@ rule titers_sub:
637684
rule titers_tree:
638685
input:
639686
titers = rules.download_titers.output.titers,
640-
tree = rules.refine.output.tree
687+
tree = rules.prune_reference.output.tree
641688
output:
642689
titers_model = "results/titers-tree-model_{center}_{lineage}_{segment}_{resolution}_{passage}_{assay}.json",
643690
conda: "environment.yaml"
@@ -652,7 +699,7 @@ rule titers_tree:
652699

653700
rule tip_frequencies:
654701
input:
655-
tree = rules.refine.output.tree,
702+
tree = rules.prune_reference.output.tree,
656703
metadata = rules.parse.output.metadata,
657704
params:
658705
narrow_bandwidth = 2 / 12.0,
@@ -681,7 +728,7 @@ rule tip_frequencies:
681728

682729
rule tree_frequencies:
683730
input:
684-
tree = rules.refine.output.tree,
731+
tree = rules.prune_reference.output.tree,
685732
metadata = rules.parse.output.metadata,
686733
params:
687734
min_date = min_date,
@@ -709,7 +756,7 @@ rule tree_frequencies:
709756

710757
rule diffusion_frequencies:
711758
input:
712-
tree = rules.refine.output.tree,
759+
tree = rules.prune_reference.output.tree,
713760
metadata = rules.parse.output.metadata,
714761
params:
715762
min_date = min_date,
@@ -735,7 +782,7 @@ rule diffusion_frequencies:
735782

736783
rule delta_frequency:
737784
input:
738-
tree = rules.refine.output.tree,
785+
tree = rules.prune_reference.output.tree,
739786
frequencies = rules.diffusion_frequencies.output.frequencies
740787
output:
741788
delta_frequency = "results/delta_frequency_{center}_{lineage}_{segment}_{resolution}_{passage}_{assay}.json"
@@ -756,7 +803,7 @@ rule delta_frequency:
756803
rule clades:
757804
message: "Annotating clades"
758805
input:
759-
tree = "results/tree_{center}_{lineage}_ha_{resolution}_{passage}_{assay}.nwk",
806+
tree = "results/tree_pruned_{center}_{lineage}_ha_{resolution}_{passage}_{assay}.nwk",
760807
nt_muts = rules.ancestral.output,
761808
aa_muts = rules.translate.output,
762809
clades = _get_clades_file_for_wildcards
@@ -781,7 +828,7 @@ rule clades:
781828

782829
rule antigenic_distances_between_strains:
783830
input:
784-
tree="results/tree_{center}_{lineage}_{segment}_{resolution}_{passage}_{assay}.nwk",
831+
tree="results/tree_pruned_{center}_{lineage}_{segment}_{resolution}_{passage}_{assay}.nwk",
785832
clades="results/clades_{center}_{lineage}_{segment}_{resolution}_{passage}_{assay}.json",
786833
titer_model="results/titers-sub-model_{center}_{lineage}_{segment}_{resolution}_{passage}_{assay}.json",
787834
titers="data/{center}_{lineage}_{passage}_{assay}_titers.tsv",
@@ -836,7 +883,7 @@ rule plot_antigenic_distances_between_strains:
836883

837884
rule distances:
838885
input:
839-
tree = rules.refine.output.tree,
886+
tree = rules.prune_reference.output.tree,
840887
alignments = translations,
841888
distance_maps = _get_distance_maps_by_lineage_and_segment
842889
params:
@@ -862,7 +909,7 @@ rule distances:
862909

863910
rule pairwise_titer_tree_distances:
864911
input:
865-
tree = rules.refine.output.tree,
912+
tree = rules.prune_reference.output.tree,
866913
frequencies = rules.tip_frequencies.output.tip_freq,
867914
model = rules.titers_tree.output.titers_model,
868915
date_annotations = rules.refine.output.node_data
@@ -917,7 +964,7 @@ rule titer_tree_cross_immunities:
917964

918965
rule glyc:
919966
input:
920-
tree = rules.refine.output.tree,
967+
tree = rules.prune_reference.output.tree,
921968
alignment = _get_glyc_alignment
922969
output:
923970
glyc = "results/glyc_{center}_{lineage}_{segment}_{resolution}_{passage}_{assay}.json"
@@ -933,7 +980,7 @@ rule glyc:
933980
rule lbi:
934981
message: "Calculating LBI"
935982
input:
936-
tree = rules.refine.output.tree,
983+
tree = rules.prune_reference.output.tree,
937984
branch_lengths = rules.refine.output.node_data
938985
params:
939986
tau = _get_lbi_tau_for_wildcards,

Snakefile_countries

+1-1
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ def _get_node_data_for_export(wildcards):
104104

105105
rule export:
106106
input:
107-
tree = rules.refine.output.tree,
107+
tree = rules.prune_reference.output.tree,
108108
metadata = rules.parse.output.metadata,
109109
auspice_config = files.auspice_config,
110110
node_data = _get_node_data_for_export,

0 commit comments

Comments
 (0)