From a086a497b46f36a6827caf9b20a1e9a12506b03a Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 17 Jun 2024 11:29:29 -0700 Subject: [PATCH 1/4] fetch-and-ingest: Add Snakemake `--stats` output Adding as part of #240 to help collect more data for tackling #446. One unexpected behavior that I ran into when testing the `--stats` option is that Snakemake doesn't generate the stats file if the workflow exits with an error at any step. Note that the Snakemake `--stats` option is not available starting with Snakemake v8, so this will need to be removed when we eventually upgrade Snakemake in our runtimes. --- .github/workflows/fetch-and-ingest-genbank-master.yml | 1 + .github/workflows/fetch-and-ingest-gisaid-master.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/fetch-and-ingest-genbank-master.yml b/.github/workflows/fetch-and-ingest-genbank-master.yml index 1fad4f00..1419c810 100644 --- a/.github/workflows/fetch-and-ingest-genbank-master.yml +++ b/.github/workflows/fetch-and-ingest-genbank-master.yml @@ -108,5 +108,6 @@ jobs: --env SLACK_CHANNELS \ --env PAT_GITHUB_DISPATCH="$GH_TOKEN_NEXTSTRAIN_BOT_WORKFLOW_DISPATCH" \ . \ + --stats snakemake_stats.json \ --configfile config/genbank.yaml \ $CONFIG_OVERRIDES diff --git a/.github/workflows/fetch-and-ingest-gisaid-master.yml b/.github/workflows/fetch-and-ingest-gisaid-master.yml index babbb6c1..3c811bb0 100644 --- a/.github/workflows/fetch-and-ingest-gisaid-master.yml +++ b/.github/workflows/fetch-and-ingest-gisaid-master.yml @@ -109,5 +109,6 @@ jobs: --env SLACK_CHANNELS \ --env PAT_GITHUB_DISPATCH="$GH_TOKEN_NEXTSTRAIN_BOT_WORKFLOW_DISPATCH" \ . \ + --stats snakemake_stats.json \ --configfile config/gisaid.yaml \ $CONFIG_OVERRIDES From 185cd9cbe6de840d9ec3b394c4ce3b09279a5e50 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 17 Jun 2024 15:56:41 -0700 Subject: [PATCH 2/4] Add benchmark to all Snakemake rules Adding as part of #240 to help collect more data for tackling #446. --- workflow/snakemake_rules/curate.smk | 10 +++++++ workflow/snakemake_rules/fetch_sequences.smk | 28 +++++++++++++++++++ workflow/snakemake_rules/nextclade.smk | 22 +++++++++++++++ .../snakemake_rules/slack_notifications.smk | 6 ++++ workflow/snakemake_rules/trigger.smk | 4 +++ workflow/snakemake_rules/upload.smk | 6 ++++ 6 files changed, 76 insertions(+) diff --git a/workflow/snakemake_rules/curate.smk b/workflow/snakemake_rules/curate.smk index 5d1dfe64..75b65f48 100644 --- a/workflow/snakemake_rules/curate.smk +++ b/workflow/snakemake_rules/curate.smk @@ -29,6 +29,8 @@ rule transform_rki_data: output: fasta="data/rki_sequences.fasta", metadata="data/rki_metadata_transformed.tsv", + benchmark: + "benchmarks/transform_rki_data.txt" params: subsampled=config.get("subsampled", False), shell: @@ -45,6 +47,8 @@ rule transform_biosample: biosample = "data/biosample.ndjson" output: biosample = "data/genbank/biosample.tsv" + benchmark: + "benchmarks/transform_biosample.txt" shell: """ ./bin/transform-biosample {input.biosample} \ @@ -85,6 +89,8 @@ rule merge_open_data: output: metadata="data/genbank/metadata_transformed.tsv", sequences="data/genbank/sequences.fasta", + benchmark: + "benchmarks/merge_open_data.txt" shell: """ ./bin/merge-open \ @@ -105,6 +111,8 @@ rule transform_gisaid_data: metadata = "data/gisaid/metadata_transformed.tsv", flagged_annotations = temp("data/gisaid/flagged-annotations"), additional_info = "data/gisaid/additional_info.tsv" + benchmark: + "benchmarks/transform_gisaid_data.txt" shell: """ ./bin/transform-gisaid {input.ndjson} \ @@ -120,6 +128,8 @@ rule flag_metadata: metadata = "data/gisaid/metadata.tsv" output: metadata = "data/gisaid/flagged_metadata.txt" + benchmark: + "benchmarks/flag_metadata.txt" resources: # Memory use scales primarily with the size of the metadata file. mem_mb=20000 diff --git a/workflow/snakemake_rules/fetch_sequences.smk b/workflow/snakemake_rules/fetch_sequences.smk index 1926ed07..f460bb0a 100644 --- a/workflow/snakemake_rules/fetch_sequences.smk +++ b/workflow/snakemake_rules/fetch_sequences.smk @@ -21,6 +21,8 @@ Produces different final outputs for GISAID vs GenBank/RKI: rule fetch_main_gisaid_ndjson: output: ndjson = temp(f"data/gisaid.ndjson") + benchmark: + "benchmarks/fetch_main_gisaid_ndjson.txt" retries: 5 shell: """ @@ -128,6 +130,8 @@ rule fetch_biosample: """Fetching BioSample data (GenBank only)""" output: biosample = temp("data/biosample.ndjson") + benchmark: + "benchmarks/fetch_biosample.txt" retries: 5 shell: """ @@ -140,6 +144,8 @@ rule fetch_cog_uk_accessions: """Fetching COG-UK sample accesions (GenBank only)""" output: cog_uk_accessions = temp("data/cog_uk_accessions.tsv") + benchmark: + "benchmarks/fetch_cog_uk_accessions.txt" retries: 5 shell: """ @@ -152,6 +158,8 @@ rule fetch_cog_uk_metadata: """Fetching COG-UK metadata (GenBank only)""" output: cog_uk_metadata = temp("data/cog_uk_metadata.csv.gz") + benchmark: + "benchmarks/fetch_cog_uk_metadata.txt" retries: 5 shell: """ @@ -164,6 +172,8 @@ rule uncompress_cog_uk_metadata: "data/cog_uk_metadata.csv.gz" output: cog_uk_metadata = temp("data/cog_uk_metadata.csv") + benchmark: + "benchmarks/uncompress_cog_uk_metadata.txt" shell: "gunzip -c {input} > {output}" @@ -171,6 +181,8 @@ rule uncompress_cog_uk_metadata: rule fetch_rki_sequences: output: rki_sequences=temp("data/rki_sequences.fasta.xz"), + benchmark: + "benchmarks/fetch_rki_sequences.txt" retries: 5 shell: """ @@ -181,6 +193,8 @@ rule fetch_rki_sequences: rule fetch_rki_metadata: output: rki_metadata=temp("data/rki_metadata.tsv.xz"), + benchmark: + "benchmarks/fetch_rki_metadata.txt" retries: 5 shell: """ @@ -194,6 +208,8 @@ rule transform_rki_data_to_ndjson: rki_metadata="data/rki_metadata.tsv.xz" output: ndjson="data/rki.ndjson", + benchmark: + "benchmarks/transform_rki_data_to_ndjson.txt" shell: """ ./bin/transform-rki-data-to-ndjson \ @@ -235,6 +251,8 @@ if config.get("s3_dst") and config.get("s3_src"): lines = config.get("subsample",{}).get("main_ndjson", 0) output: ndjson = temp(f"data/{database}.ndjson") + benchmark: + "benchmarks/fetch_main_ndjson_from_s3.txt" shell: """ ./vendored/download-from-s3 {params.file_on_s3_dst} {output.ndjson} {params.lines} || \ @@ -250,6 +268,8 @@ if config.get("s3_dst") and config.get("s3_src"): lines = config.get("subsample",{}).get("biosample", 0) output: biosample = temp("data/biosample.ndjson") + benchmark: + "benchmarks/fetch_biosample_from_s3.txt" shell: """ ./vendored/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \ @@ -263,6 +283,8 @@ if config.get("s3_dst") and config.get("s3_src"): lines = config.get("subsample",{}).get("rki_ndjson", 0) output: rki_ndjson = temp("data/rki.ndjson") + benchmark: + "benchmarks/fetch_rki_ndjson_from_s3.txt" shell: """ ./vendored/download-from-s3 {params.file_on_s3_dst} {output.rki_ndjson} {params.lines} || \ @@ -275,6 +297,8 @@ if config.get("s3_dst") and config.get("s3_src"): lines = config.get("subsample",{}).get("cog_uk_accessions", 0) output: biosample = "data/cog_uk_accessions.tsv" if config.get("keep_temp",False) else temp("data/cog_uk_accessions.tsv") + benchmark: + "benchmarks/fetch_cog_uk_accessions_from_s3.txt" shell: """ ./vendored/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \ @@ -288,6 +312,8 @@ if config.get("s3_dst") and config.get("s3_src"): lines = config.get("subsample",{}).get("cog_uk_metadata", 0) output: biosample = temp("data/cog_uk_metadata.csv") + benchmark: + "benchmarks/fetch_cog_uk_metadata_from_s3.txt" shell: """ ./vendored/download-from-s3 {params.file_on_s3_dst} {output.biosample} {params.lines} || \ @@ -299,5 +325,7 @@ if config.get("s3_dst") and config.get("s3_src"): "data/cog_uk_metadata.csv" output: cog_uk_metadata = "data/cog_uk_metadata.csv.gz" if config.get("keep_temp",False) else temp("data/cog_uk_metadata.csv.gz") + benchmark: + "benchmarks/compress_cog_uk_metadata.txt" shell: "gzip -c {input} > {output}" diff --git a/workflow/snakemake_rules/nextclade.smk b/workflow/snakemake_rules/nextclade.smk index c31077c0..a0a5a047 100644 --- a/workflow/snakemake_rules/nextclade.smk +++ b/workflow/snakemake_rules/nextclade.smk @@ -40,6 +40,8 @@ rule create_empty_nextclade_info: """Creating empty NextClade info cache file""" output: touch(f"data/{database}/nextclade{{reference}}_old.tsv"), + benchmark: + f"benchmarks/create_empty_nextclade_info_{database}{{reference}}.txt" rule create_empty_nextclade_aligned: @@ -51,6 +53,8 @@ rule create_empty_nextclade_aligned: touch(f"data/{database}/nextclade.translation_{gene}.old.fasta") for gene in GENE_LIST ], + benchmark: + f"benchmarks/create_empty_nextclade_aligned_{database}.txt" # Only include rules to fetch from S3 if S3 config params are provided @@ -74,6 +78,8 @@ if config.get("s3_dst") and config.get("s3_src"): lines=config.get("subsample", {}).get("nextclade", 0), output: nextclade=f"data/{database}/nextclade{{reference}}_old.tsv", + benchmark: + f"benchmarks/download_nextclade_tsv_from_s3_{database}{{reference}}.txt" shell: """ ./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.nextclade} 0 || \ @@ -95,6 +101,8 @@ if config.get("s3_dst") and config.get("s3_src"): lines=config.get("subsample", {}).get("nextclade", 0), output: alignment=temp(f"data/{database}/nextclade.{{seqtype}}.old.fasta"), + benchmark: + f"benchmarks/download_previous_alignment_from_s3_{database}{{seqtype}}.txt" shell: """ ./vendored/download-from-s3 {params.dst_rerun_touchfile} {output.alignment} 0 || \ @@ -132,6 +140,8 @@ rule download_nextclade_executable: """Download Nextclade""" output: nextclade="nextclade", + benchmark: + f"benchmarks/download_nextclade_executable_{database}.txt" shell: """ if [ "$(uname)" = "Darwin" ]; then @@ -158,6 +168,8 @@ rule download_nextclade_dataset: "nextclade", output: dataset="data/nextclade_data/{dataset_name}.zip", + benchmark: + f"benchmarks/download_nextclade_dataset_{database}_{{dataset_name}}.txt" shell: """ ./nextclade dataset get --name="{wildcards.dataset_name}" --output-zip={output.dataset} --verbose @@ -185,6 +197,8 @@ rule run_wuhan_nextclade: temp(f"data/{database}/nextclade.translation_{gene}.upd.fasta") for gene in GENE_LIST ], + benchmark: + f"benchmarks/run_wuhan_nextclade_{database}.txt" shell: """ # If there are no sequences to run Nextclade on, create empty output files @@ -214,6 +228,8 @@ rule run_21L_nextclade: sequences=f"data/{database}/nextclade_21L.sequences.fasta", output: info=f"data/{database}/nextclade_21L_new_raw.tsv", + benchmark: + f"benchmarks/run_21L_nextclade_{database}.txt" shell: """ # If there are no sequences to run Nextclade on, create empty output files @@ -235,6 +251,8 @@ rule nextclade_tsv_concat_versions: dataset=lambda w: f"data/nextclade_data/sars-cov-2{w.reference.replace('_','-')}.zip", output: tsv=f"data/{database}/nextclade{{reference}}_new.tsv", + benchmark: + f"benchmarks/nextclade_tsv_concat_versions_{database}{{reference}}.txt" shell: """ if [ -s {input.tsv} ]; then @@ -270,6 +288,8 @@ rule nextclade_info: new_info=rules.nextclade_tsv_concat_versions.output.tsv, output: nextclade_info=f"data/{database}/nextclade{{reference}}.tsv", + benchmark: + f"benchmarks/nextclade_info_{database}{{reference}}.txt" shell: """ tsv-append -H {input.old_info} {input.new_info} \ @@ -286,6 +306,8 @@ rule combine_alignments: new_alignment=f"data/{database}/nextclade.{{seqtype}}.upd.fasta", output: alignment=f"data/{database}/{{seqtype}}.fasta", + benchmark: + f"benchmarks/combine_alignments_{database}{{seqtype}}.txt" params: keep_temp=config.get("keep_temp", "false"), shell: diff --git a/workflow/snakemake_rules/slack_notifications.smk b/workflow/snakemake_rules/slack_notifications.smk index 5fe52eb6..4687cb6e 100644 --- a/workflow/snakemake_rules/slack_notifications.smk +++ b/workflow/snakemake_rules/slack_notifications.smk @@ -30,6 +30,8 @@ rule notify_on_record_change: ndjson_on_s3 = f"{config['s3_src']}/{database}.ndjson.xz" output: touch(f"data/{database}/notify-on-record-change.done") + benchmark: + f"benchmarks/notify_on_record_change_{database}.txt" shell: """ ./vendored/notify-on-record-change {input.ndjson} {params.ndjson_on_s3} {database} @@ -46,6 +48,8 @@ rule notify_gisaid: s3_bucket = config["s3_src"] output: touch("data/gisaid/notify.done") + benchmark: + "benchmarks/notify_gisaid.txt" run: shell("./vendored/notify-slack --upload flagged-annotations < {input.flagged_annotations}") shell("./bin/notify-on-additional-info-change {input.additional_info} {params.s3_bucket}/additional_info.tsv.gz") @@ -60,6 +64,8 @@ rule notify_genbank: s3_bucket = config["s3_src"] output: touch("data/genbank/notify.done") + benchmark: + "benchmarks/notify_genbank.txt" run: shell("./vendored/notify-slack --upload flagged-annotations < {input.flagged_annotations}") # TODO - which rule produces data/genbank/problem_data.tsv? (was not explicit in `ingest-genbank` bash script) diff --git a/workflow/snakemake_rules/trigger.smk b/workflow/snakemake_rules/trigger.smk index 49937a6a..e56fe8f5 100644 --- a/workflow/snakemake_rules/trigger.smk +++ b/workflow/snakemake_rules/trigger.smk @@ -18,6 +18,8 @@ rule trigger_rebuild_pipeline: fasta_upload = f"data/{database}/aligned.fasta.zst.upload", output: touch(f"data/{database}/trigger-rebuild.done") + benchmark: + f"benchmarks/trigger_rebuild_pipeline_{database}.txt" params: dispatch_type = f"{database}/rebuild" shell: @@ -35,6 +37,8 @@ rule trigger_counts_pipeline: f"data/{database}/upload.done" output: touch(f"data/{database}/trigger-counts.done") + benchmark: + f"benchmarks/trigger_counts_pipeline_{database}.txt" params: dispatch_type = f"{database}/clade-counts" shell: diff --git a/workflow/snakemake_rules/upload.smk b/workflow/snakemake_rules/upload.smk index 2002cbf5..2b8a02bb 100644 --- a/workflow/snakemake_rules/upload.smk +++ b/workflow/snakemake_rules/upload.smk @@ -87,6 +87,8 @@ rule upload_single: notifications_flag = f"data/{database}/notify.done" if send_notifications else [], output: "data/{database}/{remote_filename}.upload", + benchmark: + "benchmarks/upload_single_{database}_{remote_filename}.txt" params: quiet = "" if send_notifications else "--quiet", s3_bucket = config.get("s3_dst",""), @@ -108,6 +110,8 @@ rule remove_rerun_touchfile: f"data/{database}/{{remote_filename}}.upload", output: f"data/{database}/{{remote_filename}}.renew.deleted", + benchmark: + f"benchmarks/remove_rerun_touchfile_{database}_{{remote_filename}}.txt" params: dst_rerun_touchfile=config["s3_dst"] + "/{remote_filename}.renew", shell: @@ -131,3 +135,5 @@ rule upload: ] output: touch(f"data/{database}/upload.done") + benchmark: + f"benchmarks/upload_{database}.txt" From 5be2a53466d623baaac7a44baa63638ef47f453a Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 17 Jun 2024 15:58:24 -0700 Subject: [PATCH 3/4] nextclade.smk: remove unused import The use of `shellquote` was removed in 2dab1f94f1478fe100283f0a26607e3a7aa84bb5. --- workflow/snakemake_rules/nextclade.smk | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/workflow/snakemake_rules/nextclade.smk b/workflow/snakemake_rules/nextclade.smk index a0a5a047..a4a861d2 100644 --- a/workflow/snakemake_rules/nextclade.smk +++ b/workflow/snakemake_rules/nextclade.smk @@ -27,8 +27,6 @@ Produces the following outputs: alignment = f"data/{database}/aligned.fasta" """ -from shlex import quote as shellquote - wildcard_constraints: reference="|_21L", @@ -158,7 +156,7 @@ rule download_nextclade_executable: fi NEXTCLADE_VERSION="$(./nextclade --version)" - echo "[ INFO] Nextclade version: $NEXTCLADE_VERSION" + echo "[ INFO] Nextclade version: $NEXTCLADE_VERSION" """ From eb63e2954f6286c0de5c0804bf58c8e3ccd12b44 Mon Sep 17 00:00:00 2001 From: Jover Lee Date: Mon, 17 Jun 2024 16:02:15 -0700 Subject: [PATCH 4/4] Remove all use of `message` directive Per Nextstrain's Snakemake style guide, using the `message` directive overrides the default Snakemake outputs which includes other critical details. --- workflow/snakemake_rules/fetch_sequences.smk | 15 +++++---------- workflow/snakemake_rules/nextclade.smk | 13 +++++-------- workflow/snakemake_rules/trigger.smk | 4 ++-- 3 files changed, 12 insertions(+), 20 deletions(-) diff --git a/workflow/snakemake_rules/fetch_sequences.smk b/workflow/snakemake_rules/fetch_sequences.smk index f460bb0a..7ba24804 100644 --- a/workflow/snakemake_rules/fetch_sequences.smk +++ b/workflow/snakemake_rules/fetch_sequences.smk @@ -126,8 +126,7 @@ rule create_genbank_ndjson: """ rule fetch_biosample: - message: - """Fetching BioSample data (GenBank only)""" + """Fetching BioSample data (GenBank only)""" output: biosample = temp("data/biosample.ndjson") benchmark: @@ -140,8 +139,7 @@ rule fetch_biosample: rule fetch_cog_uk_accessions: - message: - """Fetching COG-UK sample accesions (GenBank only)""" + """Fetching COG-UK sample accesions (GenBank only)""" output: cog_uk_accessions = temp("data/cog_uk_accessions.tsv") benchmark: @@ -154,8 +152,7 @@ rule fetch_cog_uk_accessions: rule fetch_cog_uk_metadata: - message: - """Fetching COG-UK metadata (GenBank only)""" + """Fetching COG-UK metadata (GenBank only)""" output: cog_uk_metadata = temp("data/cog_uk_metadata.csv.gz") benchmark: @@ -243,8 +240,7 @@ if config.get("s3_dst") and config.get("s3_src"): ruleorder: fetch_main_ndjson_from_s3 > create_genbank_ndjson rule fetch_main_ndjson_from_s3: - message: - """Fetching main NDJSON from AWS S3""" + """Fetching main NDJSON from AWS S3""" params: file_on_s3_dst=f"{config['s3_dst']}/{database}.ndjson.zst", file_on_s3_src=f"{config['s3_src']}/{database}.ndjson.zst", @@ -260,8 +256,7 @@ if config.get("s3_dst") and config.get("s3_src"): """ rule fetch_biosample_from_s3: - message: - """Fetching BioSample NDJSON from AWS S3""" + """Fetching BioSample NDJSON from AWS S3""" params: file_on_s3_dst=f"{config['s3_dst']}/biosample.ndjson.zst", file_on_s3_src=f"{config['s3_src']}/biosample.ndjson.zst", diff --git a/workflow/snakemake_rules/nextclade.smk b/workflow/snakemake_rules/nextclade.smk index a4a861d2..8d24c236 100644 --- a/workflow/snakemake_rules/nextclade.smk +++ b/workflow/snakemake_rules/nextclade.smk @@ -34,8 +34,7 @@ wildcard_constraints: rule create_empty_nextclade_info: - message: - """Creating empty NextClade info cache file""" + """Creating empty NextClade info cache file""" output: touch(f"data/{database}/nextclade{{reference}}_old.tsv"), benchmark: @@ -43,8 +42,7 @@ rule create_empty_nextclade_info: rule create_empty_nextclade_aligned: - message: - """Creating empty NextClade aligned cache file""" + """Creating empty NextClade aligned cache file""" output: touch(f"data/{database}/nextclade.aligned.old.fasta"), *[ @@ -277,10 +275,9 @@ rule nextclade_tsv_concat_versions: rule nextclade_info: - message: - """ - Generates nextclade info TSV for all sequences (new + old) - """ + """ + Generates nextclade info TSV for all sequences (new + old) + """ input: old_info=f"data/{database}/nextclade{{reference}}_old.tsv", new_info=rules.nextclade_tsv_concat_versions.output.tsv, diff --git a/workflow/snakemake_rules/trigger.smk b/workflow/snakemake_rules/trigger.smk index e56fe8f5..e4b7ce99 100644 --- a/workflow/snakemake_rules/trigger.smk +++ b/workflow/snakemake_rules/trigger.smk @@ -12,7 +12,7 @@ These output files are empty flag files to force Snakemake to run the trigger ru """ rule trigger_rebuild_pipeline: - message: "Triggering nextstrain/ncov rebuild action (via repository dispatch)" + """Triggering nextstrain/ncov rebuild action (via repository dispatch)""" input: metadata_upload = f"data/{database}/metadata.tsv.zst.upload", fasta_upload = f"data/{database}/aligned.fasta.zst.upload", @@ -32,7 +32,7 @@ rule trigger_rebuild_pipeline: """ rule trigger_counts_pipeline: - message: "Triggering nextstrain/counts clade counts action (via repository dispatch)" + """Triggering nextstrain/counts clade counts action (via repository dispatch)""" input: f"data/{database}/upload.done" output: