Skip to content

Commit ef89072

Browse files
authored
Derive URL column during ingest #66
2 parents 9a49047 + eb9c401 commit ef89072

File tree

5 files changed

+36
-15
lines changed

5 files changed

+36
-15
lines changed

ingest/defaults/config.yaml

+8
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,14 @@ curate:
9797
output_id_field: "accession"
9898
# The field in the NDJSON record that contains the actual genomic sequence
9999
output_sequence_field: "sequence"
100+
# The metadata column that contains the GenBank accession; used to derive the NCBI 'url' column
101+
genbank_accession: 'accession'
102+
# Extra metadata columns to add during curation (passed to scripts/add-field-names via --metadata-columns)
103+
added_columns: [
104+
'longitude',
105+
'latitude'
106+
]
107+
100108
# The list of metadata columns to keep in the final output of the curation pipeline.
101109
metadata_columns: [
102110
'accession',

ingest/rules/curate.smk

+24-5
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ rule curate:
4848
annotations=config["curate"]["annotations"],
4949
manual_mapping="defaults/host_hostgenus_hosttype_map.tsv",
5050
output:
51-
metadata="data/raw_metadata_curated.tsv",
51+
metadata= "data/all_metadata.tsv",
5252
sequences="results/sequences.fasta",
5353
log:
5454
"logs/curate.txt",
@@ -68,7 +68,7 @@ rule curate:
6868
authors_default_value=config["curate"]["authors_default_value"],
6969
abbr_authors_field=config["curate"]["abbr_authors_field"],
7070
annotations_id=config["curate"]["annotations_id"],
71-
metadata_columns=config["curate"]["metadata_columns"],
71+
added_columns=config["curate"]["added_columns"],
7272
id_field=config["curate"]["output_id_field"],
7373
sequence_field=config["curate"]["output_sequence_field"],
7474
shell:
@@ -98,7 +98,7 @@ rule curate:
9898
| ./scripts/transform-state-names \
9999
| ./scripts/post_process_metadata.py \
100100
| ./scripts/add-field-names \
101-
--metadata-columns {params.metadata_columns} \
101+
--metadata-columns {params.added_columns} \
102102
| ./scripts/transform-new-fields \
103103
--map-tsv {input.manual_mapping} \
104104
--map-id host \
@@ -113,12 +113,31 @@ rule curate:
113113
--output-id-field {params.id_field} \
114114
--output-seq-field {params.sequence_field} ) 2>> {log}
115115
"""
116+
rule add_metadata_columns:
117+
"""Add columns to metadata
118+
Notable columns:
119+
- [NEW] url: URL linking to the NCBI GenBank record ('https://www.ncbi.nlm.nih.gov/nuccore/*').
120+
"""
121+
input:
122+
metadata = "data/all_metadata.tsv"
123+
output:
124+
metadata = temp("data/all_metadata_added.tsv")
125+
params:
126+
accession=config['curate']['genbank_accession']
127+
shell:
128+
"""
129+
csvtk mutate2 -t \
130+
-n url \
131+
-e '"https://www.ncbi.nlm.nih.gov/nuccore/" + ${params.accession}' \
132+
{input.metadata} \
133+
> {output.metadata}
134+
"""
116135

117136
rule subset_metadata:
118137
input:
119-
metadata="data/raw_metadata_curated.tsv",
138+
metadata="data/all_metadata_added.tsv",
120139
output:
121-
metadata="data/raw_metadata.tsv",
140+
metadata="data/subset_metadata.tsv",
122141
params:
123142
metadata_fields=",".join(config["curate"]["metadata_columns"]),
124143
shell:

ingest/rules/nextclade.smk

+2-2
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
This part of the workflow handles running Nextclade on the curated metadata
33
and sequences.
44
REQUIRED INPUTS:
5-
metadata = data/raw_metadata_all.tsv
5+
metadata = data/subset_metadata.tsv
66
sequences = data/sequences_all.fasta
77
nextclade_datasets = ../nextclade/dataset
88
OUTPUTS:
@@ -101,7 +101,7 @@ rule append_nextclade_columns:
101101
Append the nextclade results to the metadata
102102
"""
103103
input:
104-
metadata="data/raw_metadata.tsv",
104+
metadata="data/subset_metadata.tsv",
105105
nextclade_subtypes="data/nextclade_clades.tsv",
106106
output:
107107
metadata_all="data/metadata_nextclade.tsv",

ingest/scripts/post_process_metadata.py

-6
Original file line numberDiff line numberDiff line change
@@ -26,18 +26,12 @@ def _set_strain_name(record):
2626
)
2727

2828

29-
def _set_url(record):
30-
"""Set url column from accession"""
31-
return "https://www.ncbi.nlm.nih.gov/nuccore/" + str(record["accession"])
32-
33-
3429
def main():
3530
args = parse_args()
3631

3732
for index, record in enumerate(stdin):
3833
record = json.loads(record)
3934
record["strain"] = _set_strain_name(record)
40-
record["url"] = _set_url(record)
4135
stdout.write(json.dumps(record) + "\n")
4236

4337

nextclade/rules/prepare_sequences.smk

+2-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ This part of the workflow prepares sequences for constructing the phylogenetic t
33
44
REQUIRED INPUTS:
55
6-
metadata = data/metadata.tsv
6+
metadata = data/all_metadata_added.tsv
77
sequences = data/sequences.fasta
88
reference = ../shared/reference.fasta
99
@@ -25,7 +25,7 @@ rule create_colors:
2525
message:
2626
"Creating custom color scale in {output.colors}"
2727
input:
28-
metadata = "data/metadata_all.tsv"
28+
metadata = "data/all_metadata_added.tsv"
2929
output:
3030
colors = "results/colors.tsv"
3131
log:

0 commit comments

Comments
 (0)