-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnextclade.smk
95 lines (82 loc) · 2.97 KB
/
nextclade.smk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""
This part of the workflow handles running Nextclade on the curated metadata
and sequences.
REQUIRED INPUTS:
metadata = data/subset_metadata.tsv
sequences = results/sequences.fasta
OUTPUTS:
metadata = results/metadata.tsv
nextclade = results/nextclade.tsv
alignment = results/alignment.fasta
translations = results/translations.zip
See Nextclade docs for more details on usage, inputs, and outputs if you would
like to customize the rules:
https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html
"""
DATASET_NAME = config["nextclade"]["dataset_name"]
rule get_nextclade_dataset:
"""Download Nextclade dataset"""
output:
dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
params:
dataset_name=DATASET_NAME
shell:
"""
nextclade3 dataset get \
--name={params.dataset_name:q} \
--output-zip={output.dataset} \
--verbose
"""
rule run_nextclade:
input:
dataset=f"data/nextclade_data/{DATASET_NAME}.zip",
sequences="results/sequences.fasta",
output:
nextclade="results/nextclade.tsv",
alignment="results/alignment.fasta",
translations="results/translations.zip",
params:
# The lambda is used to deactivate automatic wildcard expansion.
# https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000
translations=lambda w: "results/translations/{cds}.fasta",
shell:
"""
nextclade3 run \
{input.sequences} \
--input-dataset {input.dataset} \
--output-tsv {output.nextclade} \
--output-fasta {output.alignment} \
--output-translations {params.translations}
zip -rj {output.translations} results/translations
"""
rule join_metadata_and_nextclade:
input:
nextclade="results/nextclade.tsv",
metadata="data/subset_metadata.tsv",
nextclade_field_map=config["nextclade"]["field_map"],
output:
metadata="results/metadata.tsv",
params:
metadata_id_field=config["curate"]["output_id_field"],
nextclade_id_field=config["nextclade"]["id_field"],
shell:
"""
export SUBSET_FIELDS=`grep -v '^#' {input.nextclade_field_map} | awk '{{print $1}}' | tr '\n' ',' | sed 's/,$//g'`
csvtk -tl cut -f $SUBSET_FIELDS \
{input.nextclade} \
| csvtk -tl rename2 \
-F \
-f '*' \
-p '(.+)' \
-r '{{kv}}' \
-k {input.nextclade_field_map} \
| tsv-join -H \
--filter-file - \
--key-fields {params.nextclade_id_field} \
--data-fields {params.metadata_id_field} \
--append-fields '*' \
--write-all ? \
{input.metadata} \
| tsv-select -H --exclude {params.nextclade_id_field} \
> {output.metadata}
"""