Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add host taxonomic categories #8

Merged
merged 5 commits into from
Aug 20, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ ncbi_datasets_fields:
- update-date
- length
- host-name
- host-tax-id
- isolate-lineage-source
- biosample-acc
- submitter-names
Expand Down Expand Up @@ -53,12 +54,18 @@ curate:
release-date: date_released
update-date: date_updated
length: length
host-name: host
host-name: host_latin_name
host-tax-id: host_tax_id
isolate-lineage-source: sample_type
biosample-acc: biosample_accessions
submitter-names: authors
submitter-affiliation: institution
submitter-country: submitter_country
Group name: host_group
Curator common name: host_common_name
Family name: host_family
Genus name: host_genus

# Standardized strain name regex
# Currently accepts any characters because we do not have a clear standard for strain names across pathogens
strain_regex: "^.+$"
Expand Down Expand Up @@ -109,6 +116,11 @@ curate:
"location",
"length",
"host",
"host_latin_name",
"host_family",
"host_genus",
"host_group",
"host_common_name",
"date_released",
"date_updated",
"sra_accessions",
Expand Down
3 changes: 3 additions & 0 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,9 @@ rule curate:
--abbr-authors-field {params.abbr_authors_field} \
| augur curate apply-geolocation-rules \
--geolocation-rules {input.all_geolocation_rules} \
| scripts/add-host-categories.py \
--latin-field host_latin_name --family-field host_family \
--genus-field host_genus --group-field host_group \
| augur curate apply-record-annotations \
--annotations {input.annotations} \
--id-field {params.annotations_id} \
Expand Down
63 changes: 62 additions & 1 deletion ingest/rules/fetch_from_ncbi.smk
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,67 @@ rule format_ncbi_dataset_report:
> {output.ncbi_dataset_tsv}
"""

rule extract_ncbi_dataset_hosttaxid:
    # Reduce the NCBI dataset report to the list of distinct host taxon IDs.
    input:
        ncbi_dataset_tsv="data/ncbi_dataset_report.tsv",
    output:
        ncbi_dataset_hosttaxid="data/ncbi_dataset_hosttaxid.tsv",
    log:
        "logs/extract_ncbi_dataset_hosttaxid.txt",
    benchmark:
        "benchmarks/extract_ncbi_dataset_hosttaxid.txt"
    # Raw string: the `\-` escapes are meant for tsv-select's field-name
    # parser, not for Python, where `\-` is an invalid escape sequence.
    shell:
        r"""
        tsv-select {input.ncbi_dataset_tsv} -H -f 'host\-tax\-id' | \
            tsv-filter --is-numeric 1 | \
            tsv-uniq \
            2> {log} > {output.ncbi_dataset_hosttaxid}
        """
        # NOTE(review): tsv-filter runs without -H, so the header row is
        # presumably discarded by the numeric test along with empty or
        # non-numeric IDs — confirm against tsv-utils behaviour.

rule get_ncbi_hosttax_info:
    # Download the NCBI taxonomy package for the taxon IDs listed in the
    # input file and store it as a zip archive.
    input:
        ncbi_dataset_hosttaxid="data/ncbi_dataset_hosttaxid.tsv",
    output:
        ncbi_hosttax_info="data/hosttax_info.zip",
    log:
        "logs/get_ncbi_hosttax_info.txt",
    benchmark:
        "benchmarks/get_ncbi_hosttax_info.txt"
    # The download depends on the network, so let Snakemake retry on failure
    retries: 5
    shell:
        """
        datasets download taxonomy taxon \
            --inputfile {input.ncbi_dataset_hosttaxid} \
            --filename {output.ncbi_hosttax_info} \
            2>&1 | tee {log}
        """

rule join_metadata_and_hostinfo:
    # Append the host taxonomy columns (group, curator common name, family,
    # genus) to every row of the NCBI dataset report, matching the report's
    # `host-tax-id` column against the taxonomy summary's `Query` column.
    input:
        ncbi_hosttax_info="data/hosttax_info.zip",
        ncbi_dataset_tsv="data/ncbi_dataset_report.tsv",
    output:
        metadata = "data/metadata_with_taxinfo.tsv",
    log:
        "logs/join_metadata_and_hostinfo.txt",
    benchmark:
        "benchmarks/join_metadata_and_hostinfo.txt"
    params:
        # Raw string: `\ ` escapes the spaces for tsv-select's field-name
        # parser and is not a valid Python escape sequence.
        ncbi_hosttax_columns = r"Query,'Group\ name','Curator\ common\ name','Family\ name','Genus\ name'"
    shell:
        # `--write-all '?'` keeps report rows without a taxonomy match and
        # fills their appended columns with `?`. The `?` is quoted so the shell
        # cannot glob-expand it to a single-character filename in the working
        # directory.
        r"""
        unzip -p {input.ncbi_hosttax_info} ncbi_dataset/data/taxonomy_summary.tsv \
            | tsv-select -H -f {params.ncbi_hosttax_columns} \
            | tsv-join -H \
                --filter-file - \
                --key-fields Query \
                --data-fields 'host\-tax\-id' \
                --append-fields '*' \
                --write-all '?' \
                {input.ncbi_dataset_tsv} \
            2> {log} > {output.metadata}
        """

# Technically you can bypass this step and directly provide FASTA and TSV files
# as input files for the curate pipeline.
Expand All @@ -103,7 +164,7 @@ rule format_ncbi_dataset_report:
rule format_ncbi_datasets_ndjson:
input:
ncbi_dataset_sequences="data/ncbi_dataset_sequences.fasta",
ncbi_dataset_tsv="data/ncbi_dataset_report.tsv",
ncbi_dataset_tsv="data/metadata_with_taxinfo.tsv",
output:
ndjson="data/ncbi.ndjson",
log:
Expand Down
70 changes: 70 additions & 0 deletions ingest/scripts/add-host-categories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#! /usr/bin/env python3
"""
Reads NDJSON records from stdin and, for each record, derives a host category from its NCBI host taxonomy fields (Latin name, family, genus, group), storing the result in the record's 'host' field.

Outputs the modified record to stdout.
"""

import argparse
import json
from sys import stdin, stdout

def parse_args():
    """Parse the command-line options that name the record fields to read.

    Every option takes the name of a field in the incoming records and
    defaults to the corresponding ``host_*`` field name listed below.

    Returns:
        The ``argparse.Namespace`` with ``latin_field``, ``family_field``,
        ``genus_field`` and ``group_field`` attributes.
    """
    # (flag, default field name, help text) for each supported option
    field_options = (
        ("--latin-field", 'host_latin_name',
         "Field from the records to use as the host latin name."),
        ("--family-field", 'host_family',
         "Field from the records to use as the host Family name."),
        ("--genus-field", 'host_genus',
         "Field from the records to use as the host genus name."),
        ("--group-field", 'host_group',
         "Field from the records to use as the host group."),
    )

    cli = argparse.ArgumentParser(
        description="Generate host names and output to 'host'.")
    for flag, default_field, help_text in field_options:
        cli.add_argument(flag, default=default_field, help=help_text)
    return cli.parse_args()

# Lookup tables for host categories. They are module-level constants so they
# are built once rather than for every record.
#
# NOTE: review feedback on the pull request that introduced this script
# (non-blocking) suggested supplying these tables via config files passed as
# CLI args, possibly borrowing the format of the hierarchical geolocation
# rules, so the script is generic enough to move into augur or be reused in
# other repos. The generic version is discussed in nextstrain/augur#1586.

# Family -> genus -> host category; checked before any other rule.
_FAMILY_GENUS_REPLACEMENTS = {
    "Canidae": {"Vulpes": "Fox (Vulpes sp.)"},
    "Procyonidae": {"Procyon": "Raccoon"},
}

# Exact Latin name -> host category.
_LATIN_REPLACEMENTS = {
    "Canis lupus familiaris": "Domestic Dog",
    "Homo sapiens": "Human",
    "Bos taurus": "Cattle",
    "Didelphis albiventris": "Other Mammal",
    "Elephas maximus": "Other Mammal",
    "Dasypus novemcinctus": "Other Mammal",
}

# Family name -> host category.
_FAMILY_REPLACEMENTS = {"Mephitidae": "Skunk"}

# Taxonomy group name -> host category.
_GROUP_REPLACEMENTS = {
    "odd-toed ungulates": "Other Ungulate",
    "even-toed ungulates & whales": "Other Ungulate",
    "carnivores": "Other Carnivore",
    "bats": "Bat",
    "birds": "Bird",
    "primates": "Other Mammal",
    "rodents": "Other Mammal",
    "mammals": "Other Mammal",
}


def _set_host_name_transformed(record, args):
    """Return the host category for one record.

    Rules are tried from most to least specific: family + genus, exact Latin
    name, family, then group. If nothing matches, the record's group value is
    returned unchanged.

    Args:
        record: Mapping holding the fields named by ``args``.
        args: Object with ``latin_field``, ``family_field``, ``genus_field``
            and ``group_field`` attributes giving the record keys to read.

    Returns:
        The host category, or the raw group value when no rule applies.

    Raises:
        KeyError: If the Latin, family or group field is missing from the
            record, or the genus field is missing for a family that has a
            genus-specific rule.
    """
    latin = record[args.latin_field]
    family = record[args.family_field]
    group = record[args.group_field]

    # The genus is read only for families that have a genus-specific rule, so
    # records of other families do not need the genus field at all.
    genus_rules = _FAMILY_GENUS_REPLACEMENTS.get(family)
    if genus_rules is not None:
        genus = record[args.genus_field]
        if genus in genus_rules:
            return genus_rules[genus]

    for value, replacements in (
        (latin, _LATIN_REPLACEMENTS),
        (family, _FAMILY_REPLACEMENTS),
        (group, _GROUP_REPLACEMENTS),
    ):
        if value in replacements:
            return replacements[value]

    return group

def main():
    """Add a 'host' value to each NDJSON record from stdin and write it to stdout.

    Each input line is parsed as one JSON record; the record's 'host' key is
    set from `_set_host_name_transformed` and the record is written back out
    as a single JSON line.
    """
    args = parse_args()

    # Iterate the lines directly: no index is needed, and the raw line and the
    # parsed record get separate names.
    for line in stdin:
        record = json.loads(line)
        record['host'] = _set_host_name_transformed(record, args)
        stdout.write(json.dumps(record) + "\n")

if __name__ == "__main__":
main()
12 changes: 11 additions & 1 deletion phylogenetic/defaults/auspice_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,16 @@
"key": "host",
"title": "Host",
"type": "categorical"
},
{
"key": "host_latin_name",
"title": "Host latin name",
"type": "categorical"
},
{
"key": "host_common_name",
"title": "Host common name",
"type": "categorical"
}
],
"geo_resolutions": [
Expand All @@ -39,7 +49,7 @@
],
"display_defaults": {
"map_triplicate": true,
"color_by": "region"
"color_by": "host"
},
"filters": [
"region",
Expand Down
13 changes: 13 additions & 0 deletions phylogenetic/defaults/colors.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,16 @@ region Africa #8ABB6A
region Europe #BEBB48
region South America #E29E39
region North America #E2562B
#
# Host taxa
host Bat #3F47C9
host Domestic Dog #4274CE
host Fox (Vulpes sp.) #4F97BB
host Raccoon #64AC99
host Skunk #7EB976
host Other Carnivore #9EBE5A
host Cattle #BEBB48
host Other Ungulate #D9AE3E
host Human #E69036
host Other Mammal #E35F2D
host Bird #DB2823