feat: local refs (#100)

* feat: add flexibility for reference data * fix: added new flexibility to test data * doc: clarified config and helper function * style: snakefmt * feat: accepts one reference file if accession number is provided * fix: snakefmt * style: snakefmt * style: removed redundancy * fix: amde shell cmd more robust, provided comments, renamed rules for clarity * fix: provided logs for first case * doc: clarified references * fix: allowed suffixes * style: formatting * fix: clearar more stable reference validation * fix: trying to avoid downloading references during the ci run * feat: better differentiation within the error message for genome or annotation missing * feat: trying explicit none for accession number * feat: clearer annotation * fix: refactoring raising value error - should work, now * fix: format string for error message * fix: doubled code might have produced the error --------- Co-authored-by: cmeesters <meesters@uni-mainz.de>
snakemake-workflows · Oct 29, 2024 · 5fed515 · 5fed515
1 parent e9f1495
commit 5fed515
Show file tree

Hide file tree

Showing 4 changed files with 82 additions and 14 deletions.
diff --git a/.test/config-simple/config.yml b/.test/config-simple/config.yml
@@ -16,9 +16,13 @@ repo: "https://github.com/snakemake-workflows/transriptome-differential-expressi
 ref:
     # species
     species: "Homo sapiens"
-
+    # Genome FA or FASTA or FNA
+    genome: "./ngs-test-data/ref/genome.chr21.fa"
+    # Annotation GFF or GTF
+    annotation: "./ngs-test-data/ref/annotation.chr21.gtf"    
     # NCBI accession number of the reference data set.
-    accession: "GCF_000001405.40"
+    # empty, because every real reference is too big.
+    accession: None
 
 # Minimum read length, put 0 if you want to proceed with all reads.
 min_length: 10

diff --git a/config/Mainz-MogonNHR/config.yml b/config/Mainz-MogonNHR/config.yml
@@ -13,12 +13,26 @@ repo: "https://github.com/snakemake-workflows/transriptome-differential-expressi
 
 ## Workflow-specific Parameters:
 
-# the reference genome respectively transcriptome, is defined here:
+# The reference genome respectively transcriptome, is defined here:
+# You can provide either 'genome' or 'annotation' reference files locally.
+# If both 'genome' and 'annotation' are available no 'accession' number is needed.
+# In any other case an 'accession' number is needed to download missing reference data remotely.
 ref:
     species: "Chironomus riparius"
-    # NCBI accession number of the reference data set
+
+    # Local reference data
+    # Genome file path; can be left empty if remote download is prefered.
+    # allowed extensions: ".fa" or ".fna" or ".fasta"
+    genome: "/lustre/project/nhr-zdvhpc/transcriptome_data/GCA_917627325.4_PGI_CHIRRI_v4_genomic.fa"
+    # Annotation file (supported extensions: .gff, .gtf, case-insensitive)
+    #                 may be a path or left empty, if download using an accession number is preferred
+    annotation: "/lustre/project/nhr-zdvhpc/transcriptome_data/GCA_917627325.4_PGI_CHIRRI_v4_genomic.gff"
+
+    # Remote reference data
+    # NCBI accession number of the reference data set; can be left empty if both reference files are available locally"
     accession: "GCA_917627325.4"
 
+
 # Minimum read length, put 0 if you want to proceed with all reads.
 min_length: 200
 

diff --git a/workflow/rules/commons.smk b/workflow/rules/commons.smk
@@ -32,6 +32,50 @@ samples = (
 validate(samples, schema="../schemas/samples.schema.yaml")
 
 
+def get_reference_files(config):
+    """
+    Get reference files from config file and validate them.
+    """
+    ref = config.get("ref", {})
+    genome_exts = (".fa", ".fna", ".fasta")
+    annotation_exts = (".gtf", ".gff")
+    # Validate genome and annotation files
+    genome = (
+        ref.get("genome")
+        if Path(ref["genome"]).exists()
+        and Path(ref["genome"]).suffix.lower() in genome_exts
+        else None
+    )
+    annotation = (
+        ref.get("annotation")
+        if Path(ref["annotation"]).exists()
+        and Path(ref["annotation"]).suffix.lower() in annotation_exts
+        else None
+    )
+
+    # Throw errors if reference data are provided, but for only one file
+    if (genome and not annotation) or (annotation and not genome):
+        raise ValueError(
+            f"""Only one reference file provided 
+               (found '{genome}' for genome and '{annotation}' as annotation),
+               provide either both genome and annotation or an NCBI accession
+               number."""
+        )
+
+    if genome and annotation:
+        return {"genome": genome, "annotation": annotation}
+
+    accession = ref.get("accession")
+    if accession:
+        if genome:
+            return {"genome": genome}
+        if annotation:
+            return {"annotation": annotation}
+        return {}
+
+    raise ValueError("No valid reference files or accession number provided.")
+
+
 def get_mapped_reads_input(sample):
     path = Path(os.path.join(config["inputdir"], sample))
     for extension in exts:

diff --git a/workflow/rules/ref.smk b/workflow/rules/ref.smk
@@ -4,14 +4,14 @@ localrules:
     extract_genome,
 
 
-rule get_genome:
+rule get_references:
     output:
         # generic name:
-        temp("ncbi_dataset.zip"),
+        temp("references/ncbi_dataset.zip"),
     params:
         accession=config["ref"]["accession"],
     log:
-        "logs/refs/get_genome.log",
+        "logs/refs/get_references.log",
     conda:
         "../envs/reference.yml"
     shell:
@@ -20,28 +20,33 @@ rule get_genome:
         """
 
 
-rule extract_genome:
+rule get_genome:
     input:
-        rules.get_genome.output,
+        lambda wildcards: get_reference_files(config).get(
+            "genome", "references/ncbi_dataset.zip"
+        ),
     output:
         "references/genomic.fa",
     group:
         "reference"
     params:
         accession=config["ref"]["accession"],
     log:
-        "logs/refs/extract_genome.log",
+        "logs/refs/get_genome.log",
     conda:
         "../envs/reference.yml"
     shell:
+        # checks if local genome is available (see commons.smk), if it is, moves it to output path. If not, extract genome from ncbi_dataset.zip
         """
-        unzip -p {input} ncbi_dataset/data/{params.accession}/*.fna > {output} 2> {log}
+        [ -f "{input}" ] && mv "{input}" "{output}" 2> "{log}"  || unzip -p {input} ncbi_dataset/data/{params.accession}/*.fna > {output} 2>> {log}
         """
 
 
-rule extract_annotation:
+rule get_annotation:
     input:
-        rules.get_genome.output,
+        lambda wildcards: get_reference_files(config).get(
+            "annotation", "references/ncbi_dataset.zip"
+        ),
     output:
         "references/genomic.gff",
     group:
@@ -53,6 +58,7 @@ rule extract_annotation:
     conda:
         "../envs/reference.yml"
     shell:
+        # checks if local annotation is available (see commons.smk), if it is, moves it to output path. If not, extracts annotation from ncbi_dataset.zip
         """
-        unzip -p {input} ncbi_dataset/data/{params.accession}/*.gff > references/genomic.gff 2> {log};
+        [ -f "{input}" ] && mv "{input}" "{output}" 2> "{log}" || unzip -p {input} ncbi_dataset/data/{params.accession}/*.gff > {output} 2>> {log};
         """