Skip to content

Commit

Permalink
Merge branch 'trs/ingest/nextclade-merge'
Browse files Browse the repository at this point in the history
  • Loading branch information
tsibley committed Oct 3, 2024
2 parents 6cac65b + 3fc3e65 commit abdea39
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 42 deletions.
20 changes: 17 additions & 3 deletions ingest/defaults/nextclade_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,23 @@ nextclade:
# The name of the Nextclade dataset to use for running nextclade.
# Run `nextclade dataset list` to get a full list of available Nextclade datasets
dataset_name: ""
# Path to the mapping for renaming Nextclade output columns
# The path should be relative to the ingest directory
field_map: "defaults/nextclade_field_map.tsv"
# Each key should be the original column name in the Nextclade TSV
# Each value should be the new column name to use in the final metadata TSV
# Nextclade can have pathogen-specific output columns, so make sure to check which
# columns would be useful for your downstream phylogenetic analysis.
field_map:
seqName: "seqName"
clade: "clade"
coverage: "coverage"
totalMissing: "missing_data"
totalSubstitutions: "divergence"
totalNonACGTNs: "nonACGTN"
qc.missingData.status: "QC_missing_data"
qc.mixedSites.status: "QC_mixed_sites"
qc.privateMutations.status: "QC_rare_mutations"
qc.frameShifts.status: "QC_frame_shifts"
qc.stopCodons.status: "QC_stop_codons"
frameShifts: "frame_shifts"
# This is the ID field you would use to match the Nextclade output with the record metadata.
# This should be the new name that you have defined in your field map.
id_field: "seqName"
17 changes: 0 additions & 17 deletions ingest/defaults/nextclade_field_map.tsv

This file was deleted.

54 changes: 32 additions & 22 deletions ingest/rules/nextclade.smk
Original file line number Diff line number Diff line change
Expand Up @@ -61,35 +61,45 @@ rule run_nextclade:
"""


rule join_metadata_and_nextclade:
rule nextclade_metadata:
# Renames the columns of the raw Nextclade TSV according to the configured
# field map and keeps only the renamed columns, writing the result to an
# intermediate metadata TSV.
input:
nextclade="results/nextclade.tsv",
output:
# Wrapped in temp(): this is an intermediate file, not a final result.
nextclade_metadata=temp("results/nextclade_metadata.tsv"),
params:
nextclade_id_field=config["nextclade"]["id_field"],
# One "old=new" string per entry of the config's field_map mapping.
nextclade_field_map=[f"{old}={new}" for old, new in config["nextclade"]["field_map"].items()],
# Comma-separated list of the new column names (the mapping's values),
# used below to select only the renamed columns.
nextclade_fields=",".join(config["nextclade"]["field_map"].values()),
# The command renames columns with `augur curate rename`, streams the result
# to stdout, and pipes it through `tsv-select` to keep only the listed fields.
# NOTE(review): the config asks for id_field to be the *new* column name, but
# --id-column is given to a command reading the not-yet-renamed TSV; the
# default maps seqName to seqName so it matches — confirm this still works
# when the ID column is actually renamed.
shell:
r"""
augur curate rename \
--metadata {input.nextclade:q} \
--id-column {params.nextclade_id_field:q} \
--field-map {params.nextclade_field_map:q} \
--output-metadata - \
| tsv-select --header --fields {params.nextclade_fields:q} \
> {output.nextclade_metadata:q}
"""


rule join_metadata_and_nextclade:
# Combines the subset record metadata with the Nextclade metadata into
# results/metadata.tsv.
# NOTE(review): this span is a rendered diff without +/- markers, so lines of
# two implementations appear together: a csvtk / tsv-join / tsv-select
# pipeline driven by a field-map file, and an `augur merge` call that reads
# results/nextclade_metadata.tsv. Presumably the `augur merge` version is the
# current one — confirm against the repository before relying on either.
input:
metadata="data/subset_metadata.tsv",
nextclade_field_map=config["nextclade"]["field_map"],
nextclade_metadata="results/nextclade_metadata.tsv",
output:
metadata="results/metadata.tsv",
params:
# ID column names used to match rows between the two tables: one for the
# record metadata, one for the Nextclade table.
metadata_id_field=config["curate"]["output_id_field"],
nextclade_id_field=config["nextclade"]["id_field"],
# In the `augur merge` command each table is given a name (`metadata`,
# `nextclade`), and the same names key the per-table ID columns.
shell:
"""
export SUBSET_FIELDS=`grep -v '^#' {input.nextclade_field_map} | awk '{{print $1}}' | tr '\n' ',' | sed 's/,$//g'`
csvtk -tl cut -f $SUBSET_FIELDS \
{input.nextclade} \
| csvtk -tl rename2 \
-F \
-f '*' \
-p '(.+)' \
-r '{{kv}}' \
-k {input.nextclade_field_map} \
| tsv-join -H \
--filter-file - \
--key-fields {params.nextclade_id_field} \
--data-fields {params.metadata_id_field} \
--append-fields '*' \
--write-all ? \
{input.metadata} \
| tsv-select -H --exclude {params.nextclade_id_field} \
> {output.metadata}
r"""
augur merge \
--metadata \
metadata={input.metadata:q} \
nextclade={input.nextclade_metadata:q} \
--metadata-id-columns \
metadata={params.metadata_id_field:q} \
nextclade={params.nextclade_id_field:q} \
--output-metadata {output.metadata:q} \
--no-source-columns
"""

0 comments on commit abdea39

Please sign in to comment.