From 6b9f1cf0354c5e93c35dbbe817089ecb8e728922 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 16 May 2024 12:12:18 +1200 Subject: [PATCH 1/6] Fix typo --- ingest/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingest/Snakefile b/ingest/Snakefile index 4adcbea..c657a6a 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -7,7 +7,7 @@ wildcard_constraints: rule all: input: - sequences=expand("results/metadata_{segment}.tsv", segment=config["segments"]), + sequences=expand("results/sequences_{segment}.fasta", segment=config["segments"]), metadata="results/metadata.tsv", rule upload_all: From 30b2640d676bfbdd0cb924e10a40c893b1aae69d Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 16 May 2024 13:23:30 +1200 Subject: [PATCH 2/6] use LF not CRLF for metadata The default line endings for `csv.DictWriter` are CRLF (amazingly) --- ingest/scripts/add_segment_counts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingest/scripts/add_segment_counts.py b/ingest/scripts/add_segment_counts.py index b302d5e..c94d2d8 100644 --- a/ingest/scripts/add_segment_counts.py +++ b/ingest/scripts/add_segment_counts.py @@ -63,7 +63,7 @@ def summary(strain_count): row[column]=strain_count[row['strain']] with open(args.output, 'w') as fh: - writer = csv.DictWriter(fh, fieldnames=fieldnames, delimiter='\t') + writer = csv.DictWriter(fh, fieldnames=fieldnames, delimiter='\t', lineterminator='\n') writer.writeheader() for row in rows: writer.writerow(row) From 05dd9ff807797eeaeb6ad47c5cf4fc0fac5f9591 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 16 May 2024 12:50:47 +1200 Subject: [PATCH 3/6] namespace fauna ingest files in preparation for the subsequent commit which will add another ingest source --- ingest/README.md | 2 +- ingest/Snakefile | 10 ++++++---- ingest/rules/upload_from_fauna.smk | 25 +++++++++++++------------ rules/common.smk | 4 ++-- 4 files changed, 22 insertions(+), 19 deletions(-) diff --git 
a/ingest/README.md b/ingest/README.md index d2507b0..eb8c25c 100644 --- a/ingest/README.md +++ b/ingest/README.md @@ -28,7 +28,7 @@ nextstrain build \ . ``` -This command produces one metadata file, `results/metadata.tsv`, and one sequences file per gene segment like `results/sequences_ha.fasta`. +This command produces one metadata file, `results/fauna/metadata.tsv`, and one sequences file per gene segment like `results/fauna/sequences_ha.fasta`. Each file represents all available subtypes. Add the `upload_all` target to the command above to run the complete ingest pipeline _and_ upload results to AWS S3. diff --git a/ingest/Snakefile b/ingest/Snakefile index c657a6a..210a934 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -6,14 +6,16 @@ wildcard_constraints: segment = "|".join(config["segments"]) rule all: + # As of 2024-05-16 the default ingest only ingests data from fauna input: - sequences=expand("results/sequences_{segment}.fasta", segment=config["segments"]), - metadata="results/metadata.tsv", + sequences=expand("results/fauna/sequences_{segment}.fasta", segment=config["segments"]), + metadata="results/fauna/metadata.tsv", rule upload_all: + # As of 2024-05-16 the default upload only uploads data from fauna input: - sequences=expand("s3/sequences_{segment}.done", segment=config["segments"]), - metadata="s3/metadata.done", + sequences=expand("s3/fauna/sequences_{segment}.done", segment=config["segments"]), + metadata="s3/fauna/metadata.done", include: "rules/upload_from_fauna.smk" include: "rules/ingest_andersen_lab.smk" diff --git a/ingest/rules/upload_from_fauna.smk b/ingest/rules/upload_from_fauna.smk index 86be0c1..eb9ef1e 100644 --- a/ingest/rules/upload_from_fauna.smk +++ b/ingest/rules/upload_from_fauna.smk @@ -1,8 +1,9 @@ rule download_segment: output: - sequences = "data/{segment}.fasta", + sequences = "data/fauna/{segment}.fasta", params: fasta_fields = "strain virus accession collection_date region country division location host 
domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade", + output_dir = "data/fauna", benchmark: "benchmarks/download_segment_{segment}.txt" shell: @@ -12,16 +13,16 @@ rule download_segment: --virus avian_flu \ --fasta_fields {params.fasta_fields} \ --select locus:{wildcards.segment} \ - --path data \ + --path {params.output_dir} \ --fstem {wildcards.segment} """ rule parse_segment: input: - sequences = "data/{segment}.fasta", + sequences = "data/fauna/{segment}.fasta", output: - sequences = "results/sequences_{segment}.fasta", - metadata = "results/metadata_{segment}.tsv", + sequences = "results/fauna/sequences_{segment}.fasta", + metadata = "results/fauna/metadata_{segment}.tsv", params: fasta_fields = "strain virus isolate_id date region country division location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade", prettify_fields = "region country division location host originating_lab submitting_lab authors PMID" @@ -44,10 +45,10 @@ rule merge_segment_metadata: for each segment, but that would be a nice improvement. 
""" input: - segments = expand("results/metadata_{segment}.tsv", segment=config["segments"]), - metadata = "results/metadata_ha.tsv", + segments = expand("results/fauna/metadata_{segment}.tsv", segment=config["segments"]), + metadata = "results/fauna/metadata_ha.tsv", output: - metadata = "results/metadata.tsv", + metadata = "results/fauna/metadata.tsv", shell: """ python scripts/add_segment_counts.py \ @@ -58,9 +59,9 @@ rule merge_segment_metadata: rule upload_sequences: input: - sequences="results/sequences_{segment}.fasta", + sequences="results/fauna/sequences_{segment}.fasta", output: - flag=touch("s3/sequences_{segment}.done"), + flag=touch("s3/fauna/sequences_{segment}.done"), params: s3_dst=config["s3_dst"], shell: @@ -73,9 +74,9 @@ rule upload_sequences: rule upload_metadata: input: - metadata="results/metadata.tsv", + metadata="results/fauna/metadata.tsv", output: - flag=touch("s3/metadata.done"), + flag=touch("s3/fauna/metadata.done"), params: s3_dst=config["s3_dst"], shell: diff --git a/rules/common.smk b/rules/common.smk index 2603f9b..dd95a6f 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -15,7 +15,7 @@ if LOCAL_INGEST: output: sequences = "data/{segment}/sequences.fasta", params: - sequences = lambda w: f"ingest/results/sequences_{w.segment}.fasta" + sequences = lambda w: f"ingest/results/fauna/sequences_{w.segment}.fasta" shell: """ cp {params.sequences} {output.sequences} @@ -26,7 +26,7 @@ if LOCAL_INGEST: metadata = "data/metadata.tsv", shell: """ - cp ingest/results/metadata.tsv {output.metadata} + cp ingest/results/fauna/metadata.tsv {output.metadata} """ else: From a5869b2beea6fdf6edeba773f3bdf73bfebd72f5 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 16 May 2024 13:35:00 +1200 Subject: [PATCH 4/6] New genome ingest ruleset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds a new ingest "source", namely hardcoded source-data which is not yet available from the existing two 
sources. The point of this is to produce up-to-date whole genome builds, so we tailor the ingest for this purpose. From internal discussion¹² we are aiming to keep this data separate to the fauna-derived (and S3 uploaded) dataset. ¹ ² --- ingest/README.md | 15 +++++++++++++ ingest/Snakefile | 7 ++++++ ingest/rules/ingest_genome_data.smk | 31 ++++++++++++++++++++++++++ ingest/source-data/metadata.tsv | 2 ++ ingest/source-data/sequences_ha.fasta | 2 ++ ingest/source-data/sequences_mp.fasta | 2 ++ ingest/source-data/sequences_na.fasta | 2 ++ ingest/source-data/sequences_np.fasta | 2 ++ ingest/source-data/sequences_ns.fasta | 2 ++ ingest/source-data/sequences_pa.fasta | 2 ++ ingest/source-data/sequences_pb1.fasta | 2 ++ ingest/source-data/sequences_pb2.fasta | 2 ++ 12 files changed, 71 insertions(+) create mode 100644 ingest/rules/ingest_genome_data.smk create mode 100644 ingest/source-data/metadata.tsv create mode 100644 ingest/source-data/sequences_ha.fasta create mode 100644 ingest/source-data/sequences_mp.fasta create mode 100644 ingest/source-data/sequences_na.fasta create mode 100644 ingest/source-data/sequences_np.fasta create mode 100644 ingest/source-data/sequences_ns.fasta create mode 100644 ingest/source-data/sequences_pa.fasta create mode 100644 ingest/source-data/sequences_pb1.fasta create mode 100644 ingest/source-data/sequences_pb2.fasta diff --git a/ingest/README.md b/ingest/README.md index eb8c25c..06ffd69 100644 --- a/ingest/README.md +++ b/ingest/README.md @@ -56,6 +56,21 @@ nextstrain build . merge_andersen_segment_metadata The results will be available in `results/andersen-lab/`. + +### Ingest for whole genome builds + +> This section is in flux + +To produce ingest files specifically tailored for the H5N1 cattle outbreak whole genome build, which combines fauna data +with extra data in `source-data`, run the following (you may need rethink credentials as above): + +```sh +nextstrain build . 
all_genome +``` + +The results will be available in `results/genome/`. + + ## Configuration ### Environment Variables diff --git a/ingest/Snakefile b/ingest/Snakefile index 210a934..d76be16 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -17,5 +17,12 @@ rule upload_all: sequences=expand("s3/fauna/sequences_{segment}.done", segment=config["segments"]), metadata="s3/fauna/metadata.done", +rule all_genome: + input: + sequences=expand("results/genome/sequences_{segment}.fasta", segment=config["segments"]), + metadata="results/genome/metadata.tsv", + include: "rules/upload_from_fauna.smk" include: "rules/ingest_andersen_lab.smk" +include: "rules/ingest_genome_data.smk" + diff --git a/ingest/rules/ingest_genome_data.smk b/ingest/rules/ingest_genome_data.smk new file mode 100644 index 0000000..3b9b0c2 --- /dev/null +++ b/ingest/rules/ingest_genome_data.smk @@ -0,0 +1,31 @@ + +## This ruleset is in flux and will change often +## Currently it merges hardcoded (committed) source-data with fauna-derived data +## No checking is done for duplicate strains + +rule merge_genome_metadata: + input: + fauna = "results/fauna/metadata.tsv" + params: + source_data = "source-data/metadata.tsv" + output: + metadata = "results/genome/metadata.tsv" + shell: + """ + diff <(head -n 1 {params.source_data}) <(head -n 1 {input.fauna}) && + cp {params.source_data} {output.metadata} && \ + tail -n +2 {input.fauna} >> {output.metadata} + """ + +rule merge_genome_sequences: + input: + fauna = "results/fauna/sequences_{segment}.fasta" + params: + source_data = "source-data/sequences_{segment}.fasta" + output: + metadata = "results/genome/sequences_{segment}.fasta" + shell: + """ + cat {params.source_data} {input.fauna} > {output.metadata} + """ + diff --git a/ingest/source-data/metadata.tsv b/ingest/source-data/metadata.tsv new file mode 100644 index 0000000..57cb932 --- /dev/null +++ b/ingest/source-data/metadata.tsv @@ -0,0 +1,2 @@ +strain virus isolate_id date region country division 
location host domestic_status subtype originating_lab submitting_lab authors PMID gisaid_clade h5_clade n_segments +A/environment/USA/CO-UW-9084466/2024 avian_flu PP796043 2024-04-XX North America Usa Colorado Colorado Environment domestic h5n1 University of Washington University of Washington Roychoudhury,P., Han,P., Kong,K., Xie,H., Gamboa,L., Rodriguez-Salas,L., Ellis,S.E., Greninger,A., Bedford,T., Starita,L. and Chu,H. ? 2.3.4.4b ? 8 diff --git a/ingest/source-data/sequences_ha.fasta b/ingest/source-data/sequences_ha.fasta new file mode 100644 index 0000000..8426c9e --- /dev/null +++ b/ingest/source-data/sequences_ha.fasta @@ -0,0 +1,2 @@ +>A/environment/USA/CO-UW-9084466/2024 +GATCAGATTTGCATTGGTTACCATGCAAACAATTCGACAGAGCAAGTTGACACGATAATGGAAAAGAACGTCACTGTTACACATGCCCAAGACATACTGGAAAAAACACACAACGGGAAGCTATGCGACCTAAATGGGGTGAAGCCACTGATTTTAAAGGACTGCAGTGTAGCTGGATGGCTCCTCGGAAACCCAATGTGCGACGAATTCATCAGAGTGCCGGAATGGTCTTACATAGTGGAGCGGGCTAACCCAGCTAATGACCTCTGTTACCCAGGGAGCCTCAATGACTATGAAGAACTGAAACACATGTTGAGCAGAATAAATCATTTTGAGAAGATTCAGATCATTCCCAAGAGTTCCTGGCCAAATCATGAAACATCACTAGGGGTGAGCGCAGCTTGTCCATACCAGGGAGCACCCTCCTTTTTCAGAAATGTGGTGTGGCTTATCAAAAAGAACGATGCATACCCAACAATAAAGATAAGCTACAATAATACTAATCGGGAAGATCTCTTGATACTGTGGGGGATTCATCATTCCAACAATGCAGAAGAGCAGACAAATCTCTACAAAAACCCAATCACCTACATTTCAGTTGGAACATCAACTTTAAACCAGAGGKTGGCACCAAAAATAGCTACTAGATCCCAAGTAAACGGGCAACGTGGAAGAATGGACTTCTTCTGGACAATCTTAAAACCAGATGATGCAATCCATTTCGAGAGTAACGGAAATTTCATTGCTCCAGAATATGCATACAAAATTGTTAAGAAAGGGGACTCGACAATTATGAAAAGTGGAGTGGAATATGGCCATTGCAACACCAAATGTCAAACCCCAGTAGGTGCGATAAATTCTAGTATGCCATTTCACAACATACATCCTCTCACCATTGGGGAATGCCCCAAATACGTGAAATCAAACAAGTTGGTCCTTGCGACTGGGCTCAGAAATAGTCCTCTAAGAGAAAAGAGAAGAAAAAGAGGTCTGTTTGGGGCGATAGCAGGGTTTATAGAGGGAGGATGGCAGGGAATGGTTGATGGTTGGTATGGGTACCATCATAGCAATGAGCAGGGGAGTGGGTACGCTGCGGACAAAGAATCCACCCAAAAGGCAATAGATGGAGTTACCAATAAGGTCAACTCAATCATTGACAAAATGAACACTCAATTTGAGGCAGTTGGAAGGGAGTTTAATAACTTAGAAAGGAGGATAGAGAATTTGAACAAGAAAATGGAAGACGGATTCCTAGATGTCTGGACATATAATGCTGAACTTCTAGTTCTCATGGAAAACGAGAGGACT
CTAGATTTCCATGATTCAAATGTCAAGAACCTTTACGACAAAGTCAGATTACAGCTTAGGGATAATGCAAAGGAGCTGGGTAACGGCTGTTTCGAATTCTATCACAAATGTGATAATGAATGTATGGAAAGTGTGAGAAATGGGACGTATGACTACCCTCAGTATTCAGAAGAAGCAAGATTAAAAAGAGAAGAAATAAGCGGAGTGAAATTAGAATCAGTAGGAACTTACCAGATACTGTCAATTTATTCAACAGCGGCAAGTTCCCTAGCACTGGCAATCATGATGGCTGGTCTATCTTTATGGATGTGCTCCAATGGGTCGTTACAATGCAGAATTTGCATTTAGATTTATGAGCTCAGATTGTAGTTAAAAACACC diff --git a/ingest/source-data/sequences_mp.fasta b/ingest/source-data/sequences_mp.fasta new file mode 100644 index 0000000..dd1ffd7 --- /dev/null +++ b/ingest/source-data/sequences_mp.fasta @@ -0,0 +1,2 @@ +>A/environment/USA/CO-UW-9084466/2024 +GATATTGAAAGATGAGTCTTCTAACCGAGGTCGAAACGTACGTTCTCTCTATCGTCCCGTCGGGCCCCCTCAAAGCCGAGATCGCGCAGAGACTTGAAGATGTCTTTGCAGGGAAGAACACCGATCTTGAGGCTCTCATGGAATGGCTAAAGACAAGACCAATCCTGTCACCTCTGACTAAGGGGATTTTGGGATTTGTGTTCACGCTCACCGTGCCCAGTGAGCGAGGACTGCAGCGTAGACGCTTTGTCCAAAGTGCCCTAAGTGGAACTGGAGACCCAAACAACATGGACAGAGCAGTCAAGTTGTACAGGAAACTGAAGAGAGAGATAACATTCCATGGGGCTAAAGAAGTTGCACTCAGTTACTCAACCGGTGCACTTGCCAGTTGTATGGGTCTCATATACAACAGGATGGGGACGGTGACCGCAGAAGTGGCATTGGGCCTAGTGTGTGCCACCTGTGAACAGATTGCTGATTCACAGCATCGGTCTCACAGACAGATAGCTACCACCACCAACCCACTGATCAGACATGAAAACAGAATGGTTTTGGCCAGTACTACAGCTAAGGCTATGGAGCAGATGGCTGGATCGAGTGAGCAAGCAGTGGAAGCCATGGAGGTTGCTAGTCAGGCTAGGCAGATGGTGCAGGCGATGAGGACCATTGGAACTCATCCTAGCTCCAGTACCGGTCTGAGAGATGATCTCCTTGAAAATTTGCAGGCCTACCAAAAACGGATGGGAGTGCAACTGCAGCGATTCAAGTGATCCTCTCGTTATTGCCGCAAGTATCATTGGGATCTTGCACTTGATATTGTGGATTCTTGATCGCCTTTTCTTCAAATGCGTTTATCGTCGCCTTAAATACGGTTTGAAAGGAGGGCCTTCTACGGAAGGAGTACCTGAGTCCATGAGGGAAGAGTACCGGCAGGAACAGCAGAGTGCTGTGGATGTTGACAATGGTCATTTTGTCAACATAGAGCTGGAGTAGAAACAAGGTAGTTTTTTACT diff --git a/ingest/source-data/sequences_na.fasta b/ingest/source-data/sequences_na.fasta new file mode 100644 index 0000000..118ecfe --- /dev/null +++ b/ingest/source-data/sequences_na.fasta @@ -0,0 +1,2 @@ +>A/environment/USA/CO-UW-9084466/2024 
+GTATGGTAATTGGGATAGTCAGCTTGATGCTGCAAATTGGGAACATAATCTCAATATGGGTTAGCCATTCAATCCAAACAGGGAATCAATACCAGCCTGAACCATGCAATCAAAGCATCATTACCTATGAGAACAACACCTGGGTAAATCAGACGTATATCAACATCAGCAGTACCAATTTTCTTGCTGAGCAGGCTGTTACTTCGGTAACATTAGCGGGCAATTCATCTCTTTGCCCTATTAGTGGGTGGGCAATATACAGTAAGGACAACGGTATAAGAATTGGGTCTAAGGGGGATGTGTTTGTTATAAGAGAACCATTCATCTCATGCTCCCACTTGGAATGCAGAACCTTTTTCCTGACCCAGGGAGCTCTGCTGAATGACAAACATTCTAATGGGACAGTTAAGGATAGAAGCCCTTATAGAACTTTGATGAGTTGTCCCGTGGGTGAGGCTCCTTCCCCGTACAATTCAAGATTTGAGTCTGTTGCTTGGTCGGCAAGTGCTTGTCATGATGGCATCAGTTGGTTGACAATCGGTATTTCTGGTCCAGACAATGGAGCTGTGGCTGTATTGAAGTACAATGGCATAATAACGGATACTATCAAGAGTTGGAGAAACAACATTTTGAGAACTCAAGAATCTGAATGTGCTTGCGTAAATGGCTCCTGCTTCACCGTAATGACTGATGGACCAAGCAATGGGCAGGCCTCATATAAAATCTTCAAGATAGAGAAAGGGAAAGTTGTCAAATCAGTTGAAATGAATGCCCCTAATTACCACTACGAGGAATGCTCCTGTTATCCTGATGCGGGTGATATTATGTGTGTGTGCAGGGACAATTGGCATGGCTCGAACCGGCCGTGGGTATCTTTTAATCAAAATCTGGAGTATCAAATAGGATATATATGCAGTGGGATTTTCGGGGACAATCCCCGCCCCAATGATGGAACAGGCAGTTGCAGTCCAATGCCCTCTAATGGGGCATATGGGGTAAAAGGGTTTTCATTTAAGTACGGTAATGGGGTTTGGATCGGAAGAACAAAAAGCACTAGTTCCAGAAGCGGCTTTGAGATGATTTGGGATCCGAATGGGTGGACTGAGACGGACAGTAGTTTCTCAGTGAAGCAAGACATTGTAGAAATAACTGACTGGTCAGGATATAGTGGGAGTTTTGTCCAGCATCCAGAACTGACAGGATTAGATTGCATGAGGCCTTGTTTCTGGGTTGAGCTAATTAGAGGGAGGCCCAAAGAGAATACAATTTGGACTAGCGGGAGCAGCATATCCTTTTGTGGTGTAAATAGTGACACTGTGGGTTGGTCTTGGCCAGACGGTGCTGAGTTGCCATTCACCATTGACAAGTAG diff --git a/ingest/source-data/sequences_np.fasta b/ingest/source-data/sequences_np.fasta new file mode 100644 index 0000000..52209a5 --- /dev/null +++ b/ingest/source-data/sequences_np.fasta @@ -0,0 +1,2 @@ +>A/environment/USA/CO-UW-9084466/2024 
+CTGAGTGACATCCACATCATGGCGTCTCAAGGCACCAAACGATCCTATGAACAAATGGAAACTGGTGGGGAACGCCAGAATGCCACTGAAATCAGAGCATCTGTTGGAAGAATGGTTGGCGGAATCGGGAGATTCTACATACAGATGTGCACTGAGCTCAAACTCAGTGATCACGAAGGGAGGCTGATCCAAAACAGCATAACCATAGAAAGGATGGTTCTCTCGGCATTTGATGAAAGGAGGAACAAGTATCTGGAGGAACATCCCAGTGCTGGAAAGGATCCCAAGAAGACTGGAGGTCCAATCTACAGGAGGAGAGATGGCAAATGGATGAGAGAGTTGATCCTCTACGACAAAGAAGAGATCAGAAGAATTTGGCGTCAAGCTAATAATGGAGAGGATGCAACTGCTGGTCTCACTCACTTGATGATTTGGCATTCCAATCTGAATGATGCCACATACCAGAGAACAAGGGCACTTGTGCGTACTGGAATGGATCCTAGGATGTGCTCACTGATGCAAGGCTCAACCCTCCCTAGGAGATCCGGGGCTGCTGGAGCGGCAGTGAAAGGAGTTGGAACAATGGTGATGGAATTGATTCGAATGATCAAACGAGGAATCAATGATCGGAATTTCTGGAGAGGTGAAAACGGACGGAGAACCAGGATTGCCTACGAGAGAATGTGCAACATCCTCAAGGGAAAGTTCCAAACAGCAGCACAACGAGCAATGATGGACCAAGTGAGGGAAAGCCGGAATCCTGGGAATGCTGAAATTGAAGATCTCATCTTTCTCGCACGATCTGCTCTCATCCTGAGGGGATCAGTGGCTCATAAGTCCTGTCTGCCTGCTTGCGTGTATGGACTTGCTGTAGCCAGTGGATATGACTTTGAAAGAGAGGGATACTCTCTAGTCGGAATTGATCCTTTCCGTCTGCTCCAGAACAGTCAAGTTTTCAGTCTCATCAGACCGAATGAAAATCCAGCTCACAAAAGTCAGCTGGTATGGATGGCATGCCACTCTGCAGCATTTGAGGATCTGAGAGTGTCAAGCTTCATCAGAGGAACAAGAGTAGTCCCAAGAGGACAACTGTCCACCAGAGGAGTTCAGATTGCTTCAAATGAAAACATGGAGACAATGGATTCCAGTACTCTTGAACTGAGGAGCAGATACTGGGCTATAAGAACAAGAAGTGGAGGAAACACCAACCAACAGAGAGCATCTGCAGGACAAATCAGCGTACAGCCCACATTCTCTGTGCAGAGAAACCTCCCATTCGAGAGAGCAACCATCATGGCAGCATTTACGGGAAACACTGAAGGCAGAACTTCAGACATGAGAACTGAGATCATAAGGATGATGGAAAATGCCAGACCTGAAGATGTGTCTTTCCAGGGGCGGGGAGTCTTCGAGCTCTCGGACGAAAAGGCAACGAACCCGATCGTGCCTTCCTTTGACATGAACAATGAAGGATCTTATTTCTTCGGAGACAATGCAGAGGAGTATGACAATTAAAGAAAAATAC diff --git a/ingest/source-data/sequences_ns.fasta b/ingest/source-data/sequences_ns.fasta new file mode 100644 index 0000000..94846fd --- /dev/null +++ b/ingest/source-data/sequences_ns.fasta @@ -0,0 +1,2 @@ +>A/environment/USA/CO-UW-9084466/2024 
+GGATTCCAACACTGTGTTAAGCTTTCAGGTAGACTGCTTTCTTTGGCATGTCCGCAAACGATTTGCAGACCAAGAACTGGGTGATGCCCCATTCCTTGACCGGCTCCGCCGAGACCAGAAGTCTCTAAGAGGAAGAGGCAGCACTCTTGGTCTGGACATCGAGACGGCCACTCGTGCTGGGAAGCAGATAGTGGAGAGGATTCTGGAGGAAGAATCCGACGAGGCACTCAAAATGACTATTGCCTCTGTGCCTGCTCCACGCTACCTAACTGACATGACTCTTGAAGAGATGTCAAGAGACTGGTTCATGCTCATGCCCAAGCAAAAAGTGGCAGGCTCCCTCAGTATCAGAATGGACCAGGCGATTATGGATAAGAACATCATACTGAAGGCAAACTTCAGTGTGATCTTCAATCGGCTGGAGACACTAATACTACTCAGAGCTTTCACTGAAGAGGGAGCAATTGTCGGCGAAATTTCACCATTGCCTTCTCTTCCAGGACATACTGATGAGGATGTCAAAAATGCAATTGGGGTCCTCATCGGAGGACTTGAATGGAATGATAACACAGTTCGAGTCTCTGAAACTTTACAGAGATTCGCTTGGAGAAGCAGTAATGAGGATGGGAGACCTCCACTCCCTCCAAAGCAGAAACGGAAAATGGAGAGGACAATTGAGTCAGAAGTTTGAAGAAATAAGGTGGCTGATTGAAGAAGTGCGACACAGACTAAAGATCACAGAAAATAGTTTTGAACAAATAACATTTATGCAAGCCTTACAACTACTGCTTGAAGTGGAGCAAGAGNNNNNNNNNNNNTCGTTTCAGCTTTTTTATYATTAAATAA diff --git a/ingest/source-data/sequences_pa.fasta b/ingest/source-data/sequences_pa.fasta new file mode 100644 index 0000000..8d0bab8 --- /dev/null +++ b/ingest/source-data/sequences_pa.fasta @@ -0,0 +1,2 @@ +>A/environment/USA/CO-UW-9084466/2024 
+ATTCAAAATGGAAGACTTTGTGCGACAATGCTTCAATCCAATGATTGTCGAGCTTGCGGAAAAAGCAATGAAAGAATATGGGGAAGATCCGAAAATCGAGACAAACAAATTTGCCGCAATATGCACACACTTAGAAGTCTGTTTCATGTATTCGGATTTCCATTTTATTGACGAACGAGGCGAATCAATGATTGTAGAATCTGGCGATCCAAATGCATTATTGAAACACCGATTTGAGATAATCGAAGGGAGAGACCGAGCAATGGCCTGGACAGTGGTGAATAGTATCTGCAACACCACAGGGGTCGAAAAGCCCAAATTCCTCCCTGATTTGTATGACTACAGAGAGAACAGATTCATTGAAATTGGAGTAACGCGAAGGGAAGTTCACATATACTATTTGGAAAAAGCCAACAAGATAAAATCAGAGAAAACACATATTCACATATTCTCATTCACTGGAGAGGAAATGGCCACCAAGGCGGACTACACCCTTGATGAAGAGAGCAGAGCAAGAATAAAAACCAGACTGTTCACTATAAGACAAGAAATGGCCAGTAGAGGTCTATGGGATTCCTTTCGTCAATCCGAGAGAGGCGAAGAGACAATTGAAGAAAGATTTGAAATCACAGGAACCATGCGCAGGCTTGCCGACCAAAGTATTCCACCGAACTTCTCCAGCCTTGAAAACTTTAGAGCCTATGTGGATGGATTCGAACCGAACGGCTGCATTGAGGGCAAGCTTTCTCAAATGTCAAAAGAGGTGAACGCCAGAATTGAGCCATTTCTGAAGACAACACCACGCCCTCTCAGATTACCTGATGGGCCTCCCTGTCCTCAGCGGTCGAAGTTCTTGCTGATGGATGCCCTTAAGTTGAGCATCGAAGACCCTAGTCATGAGGGGGAGGGCATACCGCTGTATGATGCAATCAAATGCATGAAGACATTTTTTGGCTGGAAAGAGCCCAACATCGTAAAGCCGCATGAGAAAGGCATAAACCCTAATTACCTCCTGGCTTGGAAGCAGGTGCTGGCAGAACTTCAAGACATTGAAAATGAGGAGAAAATTCCAAAAACAAAGAACATGAAGAAAACAAGCCAATTGAAGTGGGCACTTGGTGAGAACATGGCTCCAGAAAAAGTGGACTTTGAGGACTGCAAAGATGTTAGCGATCTAAGACAGTACGACAGTGACGAACCAGAGTCTAGATCACTAGCAAGCTGGATTCAGAGTGAATTCAACAAGGCATGCGAACTGACAGATTCGAGTTGGATTGAACTTGATGAGATAGGGGAAGACGTTGCTCCAATCGAACACATTGCGAGTGTGAGGAGGAACTATTTCACAGCGGAGGTATCCCATTGCAGGGCCACTGAATACATAATGAAGGGAGTATACATAAACACAGCCCTATTGAATGCATCCTGTGCAGCCATGGATGACTTCCAATTGATTCCAATGATAAGTAAGTGCAGAACTAAAGAAGGAAGACGGAGGACAAATCTGTATGGATTCATTATAAAAGGAAGATCCCATTTGAGGAATGACACCGATGTGGTAAACTTTGTGAGCATGGAATTCTCTCTAACTGACCCGAGGCTAGAGCCACACAAATGGGAAAARTACTGTGTTCTTGAAATAGGAGACATGCTATTGAGGACTGCGATAGGTCAAGTGTCGAGGCCCATGTTCCTRTATGTGAGAACCAATGGAACYTCCAARATCAARATGAAATGGGGCATGGARATGAGGCGMTGCCTTCTTCAGTCCCTTCAACAAATTGAGAGCATGATTGAGGCCGAATCTTCTGTCAAAGAGAAGGACATGTCCAAGGAATTCTTTGAAAACAAATCAGAAACATGGCCAATTGGAGAATCACCCAAAGGGGTGGAGGAAGGCTCTATTGGGAAAGTATGCAGAACATTGCTAGCAAAGTCTGTGTTCAACAGCCTATATGCATCTCCTCAACTCGAGGGGTTTTCAGCTGAATCAAGAAAA
TTGCTTCTCATTGTTCAGGCACTTAGGGACAACCTGGAACCTGGAACCTTCGATCTTGGGGGGCTATATGAAGCAATTGAGGAGTGCCTGATTAACGATCCCTGGGTTTTGCTTAATGCATCTTGGTTCAACTCCTTCCTCACACATGCACTGAAATAGTTGTGGCAATGCTACTATTTGCTATCCATACTGTCCAAACAAGGTACTTTTTTGGAC diff --git a/ingest/source-data/sequences_pb1.fasta b/ingest/source-data/sequences_pb1.fasta new file mode 100644 index 0000000..5a6e548 --- /dev/null +++ b/ingest/source-data/sequences_pb1.fasta @@ -0,0 +1,2 @@ +>A/environment/USA/CO-UW-9084466/2024 +GAATGGATGTCAATCCGACCTTACTCTTCTTGAAAGTTCCAGCGCAAAATGCCATAAGCACCACATTCCCGTATACTGGAGATCCTCCATACAGCCATGGAACAGGAACAGGATATACCATGGACACAGTTAACAGAACACATCAATATTCAGAAAAAGGGAAATGGACAACAAACTCAGAAACCGGGGCACCTCAACTCAATCCAATTGATGGACCATTGCCTGATGACAATGAGCCAAGTGGATATGCACAAACGGACTGCGTCCTTGAAGCAATGGCTTTCCTTGAAGAATCCCATCCAGGAATCTTTGAAAACTCGTGTCTTGAAACGATGGAAGTTGTTCAACAAACAAGAGTGGACAAGTTGACCCAAGGCCGTCAGACTTATGATTGGACATTAAACAGAAATCAGCCGGCTGCAACTGCATTAGCTAATACTATAGAGGTCTTCAGATCGAACGGTCTTACAGCTAATGAATCAGGAAGGCTAATAGATTTCCTCAAGGATGTGGTGGAATCAATGGATAAAGAGGAAATAGAAATAACAACGCATTTCCAAAGGAAAAGAAGAGTGAGAGACAACATGACCAAGAAAATGGTCACACAACGGACGATAGGAAAGAAGAAACAAAGGTTAAACAAAAGGAGCTATCTGATAAGAGCATTGACACTGAACACAATGACAAAAGACGCCGAAAGAGGCAAATTAAAGAGAAGGGCAATTGCAACACCCGGAATGCAAATCAGAGGGTTTGTGTACTTTGTTGAAACATTAGCAAGGAGCATTTGTGAGAAACTTGAACAATCTGGACTCCCAGTTGGAGGCAATGAAAAGAAGGCCAAACTAGCAAATGTTGTGAGAAAGATGATGACTAATTCGCAAGACACAGAGCTCTCTTTCACAATCACGGGAGACAACACTAAATGGAATGAGAACCAGAATCCTAGGATGTTTCTGGCAATGATAACATAYATWACAAGGAACCAACCTGAATGGTTCAGGAATGTATTGAGCATTGCACCTATAATGTTCTCAAACAAAATGGCAAGACTAGGGAAAGGATACATGTTCGAAAGTAAGAGCATGAAGCTTCGAACACAAATACCGGCAGAAATGCTAGCGAGCATTGATCTGAAATACTTCAATGAGTCAACAAGGAAGAAAATAGAGAAGATAAGACCTCTTCTAATAGATGGTACGGCCTCATTAAGCCCTGGAATGATGATGGGCATGTTCAACATGCTGAGTACAGTTCTGGGAGTTTCGATTCTAAATCTAGGGCAAAAGAAGTACACCAAAACAACATACTGGTGGGATGGACTACAATCTTCTGATGACTTTGCTCTCATCGTGAATGCTCCAAATCATGAGGGAATACAAGCAGGAGTAGACAGATTCTATAGAACCTGCAAGCTGGTAGGAATCAATATGAGCAAAAAGAAGTCATACATAAACAGGACAGGAACATTTGAATTCACAAGTTTTTTCTATCGCTATGGATTTGTAGCCAATTTCAGCATGGAGTTGCC
CAGCTTTGGAGTTTCTGGGATCAATGAATCTGCAGACATGAGCATTGGAGTAACAGTGATAAAGAACAACATGATCAACAATGATCTTGGACCAGCAACAGCCCAAATGGCTCTACAGCTATTCATCAAGGATTACAGATACACATATCGATGTCACAGAGGAGACACACAAATTCAAACAAGGAGGTCATTCGAGCTGAAAAAGTTATGGGAACAAACCCGCTCAAAACCAGGACTGCTGGTCTCAGATGGAGGGCCAAATCTATACAATATCCGAAATCTCCACATTCCGGAAGTCTGCTTAAAATGGGAGCTAATGGACGAAGACTATCAGGGAAGGCTTTGTAATCCCCTGAATCCGTTTGTAAGCCACAAAGAAATAGAGTCTGTGAACAATGCTGTGGTGATGCCAGCTCATGGCCCAGCTAAGAGTATGGAATATGATGCTGTTGCCACCACTCACTCCTGGATCCCTAAGAGGAACCGCTCTATTCTTAATACAAGCCAAAGGGGAATCCTTGAAGACGAACAGATGTATCAAAAGTGCTGCAATCTATTTGAAAAATTCTTCCCTAGCAGTTCATACAGGAGGCCGGTTGGAATTTCCAGCATGGTGGAGGCCATGGTTTCTAGGGCCCGAATTGATGCACGAATTGACTTCGAATCTGGACGGATTAAGAAGGAGGAGTTTGCTGAGATCATGAAGATCTGTTCCACCATTGAAGAGCTCAGACGGCAGAAATAGTGAATTTAGCTTGTCCTTCATGAAAAAATG diff --git a/ingest/source-data/sequences_pb2.fasta b/ingest/source-data/sequences_pb2.fasta new file mode 100644 index 0000000..23ab4ae --- /dev/null +++ b/ingest/source-data/sequences_pb2.fasta @@ -0,0 +1,2 @@ +>A/environment/USA/CO-UW-9084466/2024 
+GAGAGATCTAATGTCACAGTCTCGCACTCGCGAGATACTAACCAAAACCACTGTTGACCACATGGCCATAATCAAGAAGTACACATCAGGAAGACAAGAAAAGAACCCTGCACTCAGAATGAAATGGATGATGGCAATGAAATATCCAATCGCAGCAGACAAGCGAATAATGGAAATGATCCCTGAAAGGAATGAACAAGGACAAACCCTCTGGAGCAAGACAAATGATGCCGGATCAGATCGAGTGATGGTATCACCCCTGGCTGTGACATGGTGGAATAGGAATGGACCAACAACAAGTACAATTCACTATCCAAAGGTATACAAAACTTATTTTGAAAAAGTTGAAAGGTTGAAACACGGGACCTTTGGCCCTGTACACTTCAGAAACCAAATTAAGATAAGACGGAGGGTCGACATAAACCCGGGCCATGCTGACCTCAGCGCCAAAGAGGCGCAGGACGTAATCATGGAAGTTGTCTTTCCAAATGAAGTGGGAGCGAGAATACTGACGTCGGAATCACAATTGACAATAACAAAGGAAAAGAAAGAAGAACTCCAGGACTGCAAAATCGCCCCTCTGATGGTTGCATACATGCTAGAAAGAGAGCTGGTCCGCAAGACAAGGTTCCTCCCAGTTGCTGGTGGAACAAGCAGTGTCTACATTGAGGTGCTGCATTTGACCCAGGGAACATGCTGGGAGCAGATGTATACTCCAGGAGGAGAAGTGAGAAACGATGATGTAGACCAGAGCTTGATCATTGCTGCCAGGAATATAGTAAGAAGAGCAACAGTGTCAGCAGACCCATTAGCATCTCTATTGGAGATGTGCCACAGCACACAAATTGGAGGAATAAGGATGGTAGACATTCTTCGGCAAAATCCAACGGAGGAACAAGCCGTGGACATATGCAAGGCAGCAATGGGCTTGAGGATTAGCTCATCGTTCAGCTTTGGTGGATTCACTTTTAAAAGAACAAGTGGATCATCAGTCAAAAGGGAAGAAGAGGTGCTTACGGGCAACCTTCAAACATTGAAAATAAGAGTACATGAGGGGTATGAAGGGTTCACTATGGTTGGAAGAAGAGCAACGGCCATTCTCAGGAAAGCAACCAGAAGGCTGATTCAGCTAATAGTAAGTGGAAGGGACGAACAGTCAATTGCTGAAGCAATAATCGTGGCCATGGTATTCTCACAAGAGGACTGCATGATAAAGGCAGTTCGAGGTGATCTGAACTTTGTCAACAGGGCGAATCAGCGGCTGAATCCAATGCATCAGCTCTTGAGACACTTCCAAAAGAATGCAAAAGTGCTTTTCCAAAATTGGGGAATTGAGCCCATTGACAATGTGATGGGAATGATCGGGATATTGCCTGACATGACTCCAAGTACTGAGATGTCACTGAGGGGAATAAGAGTCAGTAAGATGGGAGTAGATGAATACTCCAGTACAGAGCGGGTAATAGTAAGCATCGACCGATTTTTAAGAGTTCGAGACCAACGGGGGAACGTACTACTGTCACCCGAAGAAGTCAGCGAGACACAAGGAACAGAGAAATTGACAATCACTTATTCGTCATCAATGATGTGGGARATCAAYGGYCCTGAGTCRGTGYTRGTCAAYACTTATCARTGGATAATCAGAAACTGGGAAAYWGTRAAAATTCAATGGTCACAAGACCCCACAATGTTGTATAATAAGATGGAGTTCGAGCCATTCCAGTCTCTGGTCCCTAAGGCAGCCAGGGGTCAATACAGTGGGTTCGTGAGGACACTATTTCAGCAAATGCGAGATGTGCTTGGAACATTTGACACTGTTCAGATAATAAAACTTCTCCCCTTTGCTGCTGCCCCACCGGAACAAAGTAGACTGCAATTCTCCTCTCTGACTGTGAATGTGAGAGGATCAGGAATGAGAATACTGATAAGAGGCAATTCTCCAGTGTTCAATTACAACAAGGCCACCAAGAGGCTCACAGTTCTCGGGAAAGATGCAGGT
GCATTGGCCGAAGATCCAGATGAAGGCACAGCTGGAGTGGAGTCTGCTGTTTTAAGAGGATTCCTCATTTTGGGCAAAGAAGACAAGAGATATGGCCCAGCACTGAGCATCAATGAGCTGAGCAATTTGGCAAAGGGAGAGAAGGCTAATGTGCTAATTGGGCAAGGAGACGTGGTGTTGGTGATGAAACGGAAACGGGACTCTAGCATACTTACTGACAGCCAGACAGCGACCAAAAGAATTCGGATGGCCATCAATTAGTGTAGAATTGTTTAGAAAC From fa8e7bd69eeb4ff532da438e3645836d49dbaf6c Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 16 May 2024 14:05:49 +1200 Subject: [PATCH 5/6] Allow local_ingest config to refer to a specific ingest source --- README.md | 15 +++++++++++---- rules/common.smk | 11 ++++++++--- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index d686676..1e2658a 100755 --- a/README.md +++ b/README.md @@ -60,18 +60,25 @@ Note that you may need to remove any existing data in `results/` in order for sn #### Using locally ingested data (instead of downloading from S3) -Run the pipeline with `--config 'local_ingest=True'` to use the locally available files produced by the ingest pipeline (see `./ingest/README.md` for details on how to run). -Specifically, the files needed are `ingest/results/metadata.tsv` and `ingest/results/sequences_{SEGMENT}.fasta`. +Run the pipeline with `--config 'local_ingest=fauna'` to use the locally available files produced by the ingest pipeline (see `./ingest/README.md` for details on how to run). +Specifically, the files needed are `ingest/results/fauna/metadata.tsv` and `ingest/results/fauna/sequences_{SEGMENT}.fasta`. +Replace "fauna" with "genome" (or any other namespace which ingest can produce) as needed. #### Running full genome builds -Run full genome builds with the following command. +Run full genome builds using the data on S3 (fauna) with the following command: -``` bash +```bash nextstrain build . --snakefile Snakefile.genome ``` +To include non-fauna data first run the "Ingest for whole genome builds" (see `ingest/README.md`) then run: +```bash +nextstrain build . 
--snakefile Snakefile.genome --config 'local_ingest=genome' +``` + + Currently this is only set up for the "h5n1-cattle-outbreak" build, and restricts the build to a set of strains where we think there's no reassortment (`config/include_strains_h5n1-cattle-outbreak.txt`). Output files will be placed in `results/h5n1-cattle-outbreak/genome`. See `Snakefile.genome` for more details. diff --git a/rules/common.smk b/rules/common.smk index dd95a6f..b5ebc0a 100644 --- a/rules/common.smk +++ b/rules/common.smk @@ -1,5 +1,8 @@ S3_SRC = config.get('s3_src', "s3://nextstrain-data-private/files/workflows/avian-flu") -LOCAL_INGEST = bool(config.get('local_ingest', False)) +LOCAL_INGEST = config.get('local_ingest', False) + +if LOCAL_INGEST: + assert LOCAL_INGEST in ['fauna', 'genome'], "local_ingest config must be either 'fauna' or 'genome'" def subtypes_by_subtype_wildcard(wildcards): db = { @@ -15,7 +18,7 @@ if LOCAL_INGEST: output: sequences = "data/{segment}/sequences.fasta", params: - sequences = lambda w: f"ingest/results/fauna/sequences_{w.segment}.fasta" + sequences = lambda w: f"ingest/results/{LOCAL_INGEST}/sequences_{w.segment}.fasta" shell: """ cp {params.sequences} {output.sequences} @@ -24,9 +27,11 @@ if LOCAL_INGEST: rule copy_metadata_from_ingest: output: metadata = "data/metadata.tsv", + params: + metadata = f"ingest/results/{LOCAL_INGEST}/metadata.tsv" shell: """ - cp ingest/results/fauna/metadata.tsv {output.metadata} + cp {params.metadata} {output.metadata} """ else: From bb27e01f4ad199e9db2ef01358843c102e8f95e4 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Thu, 16 May 2024 14:06:02 +1200 Subject: [PATCH 6/6] [cattle-flu] Update genome include list --- config/include_strains_h5n1-cattle-outbreak.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/config/include_strains_h5n1-cattle-outbreak.txt b/config/include_strains_h5n1-cattle-outbreak.txt index e4442dc..a3e9d83 100644 --- a/config/include_strains_h5n1-cattle-outbreak.txt +++ 
b/config/include_strains_h5n1-cattle-outbreak.txt @@ -222,6 +222,9 @@ A/raccoon/NewMexico/24009496002/2024 A/Texas/37/2024 +# Following strains are sourced from ingest/source-data +A/environment/USA/CO-UW-9084466/2024 + # Dropping these strains from include due to excess private mutations # A/dairycattle/NorthCarolina/24010327002/2024 # A/dairycattle/Texas/24009495007/2024