Skip to content

Commit fe37067

Browse files
authored
Merge pull request #114 from nextstrain/improve-ingest-andersen
Improve ingest for Andersen lab/SRA sequences
2 parents 0780396 + 058b27d commit fe37067

File tree

4 files changed

+28
-9
lines changed

4 files changed

+28
-9
lines changed

ingest/build-configs/ncbi/bin/curate-andersen-lab-data

+17-4
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,11 @@ Parses NDJSON record from stdin and outputs new record to stdout.
77
"""
88
import copy
99
import json
10+
import re
1011
from datetime import datetime
1112
from enum import Enum
1213
from sys import stdin, stdout, stderr
14+
from augur.curate.parse_genbank_location import parse_location
1315

1416

1517
NEXTSTRAIN_RECORD = {
@@ -45,8 +47,13 @@ def create_new_record(anderson_record: dict) -> dict:
4547
new_record['sra_accessions'] = anderson_record['Run']
4648
new_record['region'] = anderson_record['geo_loc_name_country_continent']
4749
new_record['country'] = anderson_record['geo_loc_name_country']
48-
new_record['division'] = anderson_record.get('US State', '')
49-
new_record['location'] = anderson_record.get('US State', '')
50+
# Parse the geolocation as the GenBank format `country:division,location`
51+
new_record = parse_location(new_record, 'country')
52+
# Fall back to `US State` for division or location if either was not parsed
53+
if new_record['division'] == '':
54+
new_record['division'] = anderson_record.get('US State', '')
55+
if new_record['location'] == '':
56+
new_record['location'] = anderson_record.get('US State', '')
5057
new_record['host'] = anderson_record['Host']
5158
new_record['date_released'] = anderson_record['ReleaseDate']
5259

@@ -85,12 +92,18 @@ def construct_strain_name(record: dict, sample_id: str) -> str:
8592
to include host, country, and year.
8693
8794
Removes all spaces in the constructed strain name because they are not
88-
allowed in the downstream phylogenetic workflow.
95+
allowed in the downstream phylogenetic workflow. Also replaces invalid
96+
characters with `_` to match iqtree¹ so augur tree will not modify strain
97+
names and cause a mismatch between the tree and the alignment FASTA.²
98+
99+
¹ <https://github.com/iqtree/iqtree2/blob/74da454bbd98d6ecb8cb955975a50de59785fbde/utils/tools.cpp#L607>
100+
² <https://github.com/nextstrain/avian-flu/issues/113>
89101
"""
90102
host = record['host']
91103
country = record['country']
92104
year = parse_year(record['date'])
93-
return f"A/{host}/{country}/{sample_id}/{year}".replace(" ", "")
105+
strain = f"A/{host}/{country}/{sample_id}/{year}".replace(" ", "")
106+
return re.sub(r'[^\w\_\-\.\|\/]', '_', strain)
94107

95108

96109
def parse_year(date_string: str) -> str:

ingest/build-configs/ncbi/defaults/annotations.tsv

+3
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,6 @@ A/ALPACA/USA/24-014328-007/2024 division Idaho # Based on news reports https://w
1010
A/Alpaca/USA/24-015080-001/2024 division Idaho # Based on news reports https://www.cidrap.umn.edu/avian-influenza-bird-flu/alpacas-infected-h5n1-avian-flu-idaho
1111
A/House-Mouse/USA/24-014780-002/2024 division New Mexico # Based on https://wahis.woah.org/#/in-review/4451?reportId=167771&fromPage=event-dashboard-url
1212
A/House-Mouse/USA/24-014782-003/2024 division New Mexico # Based on https://wahis.woah.org/#/in-review/4451?reportId=167771&fromPage=event-dashboard-url
13+
A/PETFOOD/USA/24-037325-011/2024 host avian # This sequence is likely from cat food, which is turkey https://apps.oregon.gov/oregon-newsroom/OR/ODA/Posts/Post/morasch-meats-voluntary-recall-feline-raw-pet-food-hpai
14+
A/PETFOOD/USA/24-037325-012/2024 host avian # This sequence is likely from cat food, which is turkey https://apps.oregon.gov/oregon-newsroom/OR/ODA/Posts/Post/morasch-meats-voluntary-recall-feline-raw-pet-food-hpai
15+
A/PETFOOD/USA/24-037325-013/2024 host avian # This sequence is likely from cat food, which is turkey https://apps.oregon.gov/oregon-newsroom/OR/ODA/Posts/Post/morasch-meats-voluntary-recall-feline-raw-pet-food-hpai

ingest/build-configs/ncbi/defaults/host-map.tsv

+6-5
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,10 @@ Cygnus olor Avian
3737
Dairy cattle Cattle
3838
domestic cat Nonhuman Mammal
3939
domestic-cat Nonhuman Mammal
40+
Dromaius novaehollandiae Avian
4041
duck Avian
42+
EMU Avian
43+
Emu Avian
4144
environment Environment
4245
EURASIAN COLLARED DOVE Avian
4346
Falco peregrinus Avian
@@ -78,9 +81,11 @@ Quiscalus quiscula Avian
7881
raccoon Nonhuman Mammal
7982
red fox Nonhuman Mammal
8083
red tailed hawk Avian
84+
Rock Pigeon Avian
8185
skunk Nonhuman Mammal
8286
snow goose Avian
8387
Streptopelia decaocto Avian
88+
tiger Nonhuman Mammal
8489
Turdus merula Avian
8590
turkey Avian
8691
turkey vulture Avian
@@ -89,9 +94,5 @@ Vulpes vulpes Nonhuman Mammal
8994
western gull Avian
9095
western kingbird Avian
9196
western sandpiper Avian
92-
Wild-Bird Avian
93-
EMU Avian
94-
Emu Avian
95-
Dromaius novaehollandiae Avian
9697
White-winged Dove Avian
97-
Rock Pigeon Avian
98+
Wild-Bird Avian

ingest/defaults/geolocation_rules.tsv

+2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
/USA/*/* North America/USA/*/*
12
# Better for an unknown division to be listed as unknown than as "USA"
23
North America/USA// North America/USA/?/?
34
North America/USA/USA/ North America/USA/?/?
@@ -6,5 +7,6 @@ North America/USA/CA/CA North America/USA/California/California
67
North America/USA/KS/KS North America/USA/Kansas/Kansas
78
North America/USA/MI/MI North America/USA/Michigan/Michigan
89
North America/USA/NM/NM North America/USA/New Mexico/New Mexico
10+
North America/USA/OR/ North America/USA/Oregon/Oregon
911
North America/USA/TX/TX North America/USA/Texas/Texas
1012
North America/USA/WY/WY North America/USA/Wyoming/Wyoming

0 commit comments

Comments
 (0)