Skip to content

Commit

Permalink
feat: local refs (#100)
Browse files Browse the repository at this point in the history
* feat: add flexibility for reference data

* fix: added new flexibility to test data

* doc: clarified config and helper function

* style: snakefmt

* feat: accepts one reference file if accession number is provided

* fix: snakefmt

* style: snakefmt

* style: removed redundancy

* fix: amde shell cmd more robust, provided comments, renamed rules for clarity

* fix: provided logs for first case

* doc: clarified references

* fix: allowed suffixes

* style: formatting

* fix: clearar more stable reference validation

* fix: trying to avoid downloading references during the ci run

* feat: better differentiation within the error message for genome or annotation missing

* feat: trying explicit none for accession number

* feat: clearer annotation

* fix: refactoring raising value error - should work, now

* fix: format string for error message

* fix: doubled code might have produced the error

---------

Co-authored-by: cmeesters <meesters@uni-mainz.de>
  • Loading branch information
yeising and cmeesters authored Oct 29, 2024
1 parent e9f1495 commit 5fed515
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 14 deletions.
8 changes: 6 additions & 2 deletions .test/config-simple/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,13 @@ repo: "https://github.com/snakemake-workflows/transriptome-differential-expressi
ref:
# species
species: "Homo sapiens"

# Genome FA or FASTA or FNA
genome: "./ngs-test-data/ref/genome.chr21.fa"
# Annotation GFF or GTF
annotation: "./ngs-test-data/ref/annotation.chr21.gtf"
# NCBI accession number of the reference data set.
accession: "GCF_000001405.40"
# empty, because every real reference is too big.
accession: None

# Minimum read length, put 0 if you want to proceed with all reads.
min_length: 10
Expand Down
18 changes: 16 additions & 2 deletions config/Mainz-MogonNHR/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,26 @@ repo: "https://github.com/snakemake-workflows/transriptome-differential-expressi

## Workflow-specific Parameters:

# the reference genome respectively transcriptome, is defined here:
# The reference genome respectively transcriptome, is defined here:
# You can provide either 'genome' or 'annotation' reference files locally.
# If both 'genome' and 'annotation' are available no 'accession' number is needed.
# In any other case an 'accession' number is needed to download missing reference data remotely.
ref:
species: "Chironomus riparius"
# NCBI accession number of the reference data set

# Local reference data
# Genome file path; can be left empty if remote download is prefered.
# allowed extensions: ".fa" or ".fna" or ".fasta"
genome: "/lustre/project/nhr-zdvhpc/transcriptome_data/GCA_917627325.4_PGI_CHIRRI_v4_genomic.fa"
# Annotation file (supported extensions: .gff, .gtf, case-insensitive)
# may be a path or left empty, if download using an accession number is preferred
annotation: "/lustre/project/nhr-zdvhpc/transcriptome_data/GCA_917627325.4_PGI_CHIRRI_v4_genomic.gff"

# Remote reference data
# NCBI accession number of the reference data set; can be left empty if both reference files are available locally"
accession: "GCA_917627325.4"


# Minimum read length, put 0 if you want to proceed with all reads.
min_length: 200

Expand Down
44 changes: 44 additions & 0 deletions workflow/rules/commons.smk
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,50 @@ samples = (
validate(samples, schema="../schemas/samples.schema.yaml")


def get_reference_files(config):
"""
Get reference files from config file and validate them.
"""
ref = config.get("ref", {})
genome_exts = (".fa", ".fna", ".fasta")
annotation_exts = (".gtf", ".gff")
# Validate genome and annotation files
genome = (
ref.get("genome")
if Path(ref["genome"]).exists()
and Path(ref["genome"]).suffix.lower() in genome_exts
else None
)
annotation = (
ref.get("annotation")
if Path(ref["annotation"]).exists()
and Path(ref["annotation"]).suffix.lower() in annotation_exts
else None
)

# Throw errors if reference data are provided, but for only one file
if (genome and not annotation) or (annotation and not genome):
raise ValueError(
f"""Only one reference file provided
(found '{genome}' for genome and '{annotation}' as annotation),
provide either both genome and annotation or an NCBI accession
number."""
)

if genome and annotation:
return {"genome": genome, "annotation": annotation}

accession = ref.get("accession")
if accession:
if genome:
return {"genome": genome}
if annotation:
return {"annotation": annotation}
return {}

raise ValueError("No valid reference files or accession number provided.")


def get_mapped_reads_input(sample):
path = Path(os.path.join(config["inputdir"], sample))
for extension in exts:
Expand Down
26 changes: 16 additions & 10 deletions workflow/rules/ref.smk
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@ localrules:
extract_genome,


rule get_genome:
rule get_references:
output:
# generic name:
temp("ncbi_dataset.zip"),
temp("references/ncbi_dataset.zip"),
params:
accession=config["ref"]["accession"],
log:
"logs/refs/get_genome.log",
"logs/refs/get_references.log",
conda:
"../envs/reference.yml"
shell:
Expand All @@ -20,28 +20,33 @@ rule get_genome:
"""


rule extract_genome:
rule get_genome:
input:
rules.get_genome.output,
lambda wildcards: get_reference_files(config).get(
"genome", "references/ncbi_dataset.zip"
),
output:
"references/genomic.fa",
group:
"reference"
params:
accession=config["ref"]["accession"],
log:
"logs/refs/extract_genome.log",
"logs/refs/get_genome.log",
conda:
"../envs/reference.yml"
shell:
# checks if local genome is available (see commons.smk), if it is, moves it to output path. If not, extract genome from ncbi_dataset.zip
"""
unzip -p {input} ncbi_dataset/data/{params.accession}/*.fna > {output} 2> {log}
[ -f "{input}" ] && mv "{input}" "{output}" 2> "{log}" || unzip -p {input} ncbi_dataset/data/{params.accession}/*.fna > {output} 2>> {log}
"""


rule extract_annotation:
rule get_annotation:
input:
rules.get_genome.output,
lambda wildcards: get_reference_files(config).get(
"annotation", "references/ncbi_dataset.zip"
),
output:
"references/genomic.gff",
group:
Expand All @@ -53,6 +58,7 @@ rule extract_annotation:
conda:
"../envs/reference.yml"
shell:
# checks if local annotation is available (see commons.smk), if it is, moves it to output path. If not, extracts annotation from ncbi_dataset.zip
"""
unzip -p {input} ncbi_dataset/data/{params.accession}/*.gff > references/genomic.gff 2> {log};
[ -f "{input}" ] && mv "{input}" "{output}" 2> "{log}" || unzip -p {input} ncbi_dataset/data/{params.accession}/*.gff > {output} 2>> {log};
"""

0 comments on commit 5fed515

Please sign in to comment.