fix: linting for github actions (#56)

* fix: linting for github actions * fix: renamed sample to match validator pattern * fix: path for inputdir * fix: provided directory for github action Linting * fix: linting of path composition * fix: output is no parameter anymore * fix: fix of fix for commons - wrong brace order, missing brace * fix: added missing conda envs to remaining rules * fix: added logs and a script for info * fix: relocated localrule declaration to appropriate file * fix: simplified suffix expansion and fixed linter hiccup * fix: added missing conda env to rule merge_counts * fix: gnarf - forgotten closing brace * fix: removed unnecessary rule write coldata, added logs to rules, added conda envs to rules, created script for write_de_params * fix: deleted localrules which do not exist in this file * fix: added localrules to correct file * Update issue templates (#57) * Update issue templates (#59) * style: formatting * style: formatting * style: formatting * fix: get_mapped_reads_input function now returns sample path with extension * style: formatting * ci: updated super-linter * ci: downgraded super-linter to v4 again * refactor: removed optional sequencing summary input for NanoPlot read qc * chore: updated snakemake * fix: restored sample_QC input for rule_all,+ snakefmt * feat: rule all receives input programatically * feat: new rule_all_input function to programmatically apply input for rule_all * feat: generate rule all input programmatically * style: formatting * fix: linting for github actions * fix: renamed sample to match validator pattern * fix: path for inputdir * fix: provided directory for github action Linting * fix: linting of path composition * fix: output is no parameter anymore * fix: fix of fix for commons - wrong brace order, missing brace * fix: simplified suffix expansion and fixed linter hiccup * fix: gnarf - forgotten closing brace * fix: deleted localrules which do not exist in this file * fix: added missing conda envs to remaining rules * fix: added logs and a 
script for info * fix: relocated localrule declaration to appropriate file * fix: added missing conda env to rule merge_counts * fix: removed unnecessary rule write coldata, added logs to rules, added conda envs to rules, created script for write_de_params * fix: added localrules to correct file * style: formatting * style: formatting * style: formatting * fix: get_mapped_reads_input function now returns sample path with extension * ci: updated super-linter * ci: downgraded super-linter to v4 again * refactor: removed optional sequencing summary input for NanoPlot read qc * chore: updated snakemake * fix: restored sample_QC input for rule_all,+ snakefmt * refactor: removed redundant rule all input variable sample_QC * style: formatting * fix: pathing --------- Co-authored-by: cmeesters <meesters@uni-mainz.de> Co-authored-by: Christian Meesters <cmeesters@users.noreply.github.com>
snakemake-workflows · Aug 15, 2024 · 54605ae · 54605ae
1 parent f08e4c1
commit 54605ae
Show file tree

Hide file tree

Showing 16 changed files with 168 additions and 182 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -27,10 +27,10 @@ jobs:
     - name: Lint workflow
       uses: snakemake/snakemake-github-action@v1.24.0
       with:
-        directory: .
+        directory: .test
         snakefile: workflow/Snakefile
         #   stagein: "mamba install -y -n snakemake --channel conda-forge pyarrow=6.0"
-        args: "--lint"
+        args: "--configfile .test/config-simple/config.yml --lint"
 
   Testing:
     runs-on: ubuntu-latest
@@ -45,11 +45,11 @@ jobs:
       with:
         directory: .test
         snakefile: workflow/Snakefile
-        args: "--configfile .test/config-simple/config-yaml --use-conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache --all-temp"
+        args: "--configfile .test/config-simple/config.yml --use-conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache --all-temp"
 
-    - name: Test report
-      uses: snakemake/snakemake-github-action@v1.24.0
-      with:
-        directory: .test
-        snakefile: workflow/Snakefile
-        args: "--report report.zip"
+#    - name: Test report
+#      uses: snakemake/snakemake-github-action@v1.24.0
+#      with:
+#        directory: .test
+#        snakefile: workflow/Snakefile
+#        args: "--report report.zip"
diff --git a/.test/config-simple/config.yml b/.test/config-simple/config.yml
@@ -1,4 +1,4 @@
-samples: config/samples.csv
+samples: samples.csv
 
 
 ## General Workflow Parameters:
@@ -7,20 +7,23 @@ samples: config/samples.csv
 workflow: "workflow-transcriptome-de_phe"
 
 # this is the input directory. All samples are looked for in this directory
-inputdir: "/lustre/project/m2_zdvhpc/transcriptome_data"
+inputdir: "."
 # Repository URL:
 repo: "https://github.com/snakemake-workflows/transriptome-differential-expression"
 
 ## Workflow-specific Parameters:
 
-# Transcriptome fasta (absolute path)
-transcriptome: "/lustre/miifs01/project/m2_zdvhpc/transcriptome_data/GCA_917627325.4_PGI_CHIRRI_v4_genomic.fa"
+# Genome fasta (absolute path)
+genome: "/lustre/miifs01/project/m2_zdvhpc/transcriptome_data/GCA_917627325.4_PGI_CHIRRI_v4_genomic.fa"
 # Annotation GFF/GTF (absolute path)
 annotation: "/lustre/miifs01/project/m2_zdvhpc/transcriptome_data/GCA_917627325.4_PGI_CHIRRI_v4_genomic.gff"
 # these samples ought to contain all samples comprising of the
 
-#NanoPlot QC with sequencing summary. leave as None for QC with .fastq files
-summary: None
+# Minimum read length, put 0 if you want to proceed with all reads.
+min_length: 200
+
+# Maximum number of CPUs in your partition/PC/server
+max_cpus: 40
 
 # Minimap2 indexing options
 minimap_index_opts: ""
@@ -34,9 +37,21 @@ maximum_secondary: 100
 # Secondary score ratio (-p for minimap2)
 secondary_score_ratio: 1.0
 
+# Samtools view opts, "-b" creates BAM from SAM.
+sview_opts: "-b"
+
+# Samtools sort opts
+ssort_opts: ""
+
 # Salmon library type
 salmon_libtype: "U"
 
+
+# QC options
+
+# Samtools stats opts
+sstats_opts: ""
+
 # Count filtering options - customize these according to your experimental design:
 
 # Genes expressed in at least this many samples
@@ -53,27 +68,27 @@ min_feature_expr: 3
 # The (log2) log fold change under the null hypothesis. (default: 0).
 lfc_null: 0.1
 #
-# The alternative hypothesis for computing wald p-values. By default, 
-# the normal Wald test assesses deviation of the estimated log fold 
-# change from the null hypothesis, as given by lfc_null. 
-# One of ["greaterAbs", "lessAbs", "greater", "less"] or None. 
-# The alternative hypothesis corresponds to what the user wants to 
+# The alternative hypothesis for computing wald p-values. By default,
+# the normal Wald test assesses deviation of the estimated log fold
+# change from the null hypothesis, as given by lfc_null.
+# One of ["greaterAbs", "lessAbs", "greater", "less"] or None.
+# The alternative hypothesis corresponds to what the user wants to
 # find rather than the null hypothesis. (default: None).
 alt_hypothesis: "greaterAbs"
 #
-# The marker size in points**2 (typographic points are 1/72 in.). 
-# Default is rcParams['lines.markersize'] ** 2.# minimum count to 
+# The marker size in points**2 (typographic points are 1/72 in.).
+# Default is rcParams['lines.markersize'] ** 2.# minimum count to
 # be considered for subsequent analysis
 point_width: 20
 #
 #
 mincount: 10
 #
-# in addition to the full heatmap, plot the top number of different 
+# in addition to the full heatmap, plot the top number of different
 # values, ranked by the top ratio between the two traits
 threshold_plot: 10
 #
 # the heatmap color map
 # see https://seaborn.pydata.org/tutorial/color_palettes.html for an overview
-colormap: "flare" 
+colormap: "flare"
 
diff --git a/.test/config-simple/samples.csv b/.test/config-simple/samples.csv
@@ -1,3 +1,3 @@
-sample,	condition, condition2, batch_effect
-01.fq,	male,condition2, batch1
-02, female, condition2, batch1
+sample  condition   condition2	batch_effect    platform    purity
+01  	male        condition2  batch1          NANOPORE    1
+02      female      condition2  batch1          NANOPORE    1
diff --git a/config/Mainz-MogonII/config.yml b/config/Mainz-MogonII/config.yml
@@ -25,8 +25,6 @@ min_length: 200
 # Maximum number of CPUs in your partition/PC/server
 max_cpus: 40
 
-# Optional: NanoPlot QC using a summary file from the sequencer. If this file is not supplied, put the parameter to "None". It will then do an independent QC run per FASTQ input.
-summary: None  # "/lustre/project/m2_zdvhpc/transcriptome_data/seqencing_summary/sequencing_summary.txt"
 
 # Minimap2 indexing options
 minimap_index_opts: ""

diff --git a/config/Mainz-MogonNHR/config.yml b/config/Mainz-MogonNHR/config.yml
@@ -25,8 +25,6 @@ min_length: 200
 # Maximum number of CPUs in your partition/PC/server
 max_cpus: 40
 
-# Optional: NanoPlot QC using a summary file from the sequencer. If this file is not supplied, put the parameter to "None". It will then do an independent QC run per FASTQ input.
-summary: None  # "/lustre/project/m2_zdvhpc/transcriptome_data/seqencing_summary/sequencing_summary.txt"
 
 # Minimap2 indexing options
 minimap_index_opts: ""

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -26,15 +26,4 @@ inputdir = config["inputdir"]
 
 rule all:
     input:
-        sample_QC,
-        ver=rules.dump_versions.output.ver,
-        count_tsvs=expand("counts/{sample}_salmon/quant.sf", sample=samples["sample"]),
-        merged_tsv="merged/all_counts.tsv",
-        coldata="de_analysis/coldata.tsv",
-        de_params="de_analysis/de_params.tsv",
-        dispersion_graph="de_analysis/dispersion_graph.svg",
-        ma_graph="de_analysis/ma_graph.svg",
-        de_heatmap="de_analysis/heatmap.svg",
-        lfc_analysis="de_analysis/lfc_analysis.csv",
-        samstats=expand("QC/samstats/{sample}.txt", sample=samples["sample"]),
-        map_qc=expand("QC/qualimap/{sample}.tar.gz", sample=samples["sample"]),
+        rule_all_input(),
diff --git a/workflow/envs/env.yml b/workflow/envs/env.yml
@@ -4,7 +4,7 @@ channels:
     - bioconda
 dependencies:
     - python>=3.12.4
-    - snakemake>=8.14.0
+    - snakemake>=8.17.0
     - snakemake-executor-plugin-slurm
     - snakemake-storage-plugin-fs
     - snakemake-interface-common

diff --git a/workflow/rules/alignment.smk b/workflow/rules/alignment.smk
@@ -1,7 +1,3 @@
-localrules:
-    genome_to_transcriptome,
-
-
 rule build_minimap_index:  ## build minimap2 index
     input:
         target="transcriptome/transcriptome.fa",

diff --git a/workflow/rules/commons.smk b/workflow/rules/commons.smk
@@ -1,5 +1,5 @@
-import glob
 import os
+from pathlib import Path
 import sys
 from itertools import product
 
@@ -8,11 +8,8 @@ from snakemake.remote import FTP
 from snakemake.utils import validate
 from snakemake.exceptions import WorkflowError
 
-
-localrules:
-    dump_versions,
-    info,
-
+# global list of valid suffixes
+exts = (".fastq", ".fq", ".fastq.gz", ".fq.gz")
 
 validate(config, schema="../schemas/config.schema.yaml")
 
@@ -35,19 +32,44 @@ validate(samples, schema="../schemas/samples.schema.yaml")
 
 
 def get_mapped_reads_input(sample):
-    return glob.glob(os.path.join(config["inputdir"], sample) + "*")[0]
+    path = Path(os.path.join(config["inputdir"], sample))
+    for extension in exts:
+        if os.path.exists(path.with_suffix(extension)):
+            return path.with_suffix(extension)
+
+    raise WorkflowError(
+        f"No valid sample found for sample: '{sample}' with possible extension '{exts}'"
+    )
 
 
 def aggregate_input(samples):
     # possible extensions:
-    exts = ["fastq", "fq", "fastq.gz", "fq.gz"]
     valids = list()
     for sample, ext in product(samples, exts):
-        path = os.path.join(config["inputdir"], sample + "." + ext)
+        path = Path(os.path.join(config["inputdir"], sample))
 
-        if os.path.exists(path):
-            valids.append(path)
+        if os.path.exists(path.with_suffix(ext)):
+            valids.append(path.with_suffix(ext))
 
     if not len(valids):
-        raise WorkflowError(f"no valid samples found, allowed extensions are: {exts}")
+        raise WorkflowError(f"no valid samples found, allowed extensions are: '{exts}'")
     return valids
+
+
+def rule_all_input():
+    all_input = list()
+    all_input.append("versions.txt")
+    all_input.extend(expand("QC/NanoPlot/{sample}.tar.gz", sample=samples["sample"]))
+    all_input.append("QC/NanoPlot/all_samples.tar.gz")
+    all_input.extend(expand("QC/samstats/{sample}.txt", sample=samples["sample"]))
+    all_input.extend(expand("QC/qualimap/{sample}.tar.gz", sample=samples["sample"]))
+    all_input.extend(
+        expand("counts/{sample}_salmon/quant.sf", sample=samples["sample"])
+    )
+    all_input.append("merged/all_counts.tsv")
+    all_input.append("de_analysis/de_params.tsv")
+    all_input.append("de_analysis/dispersion_graph.svg")
+    all_input.append("de_analysis/ma_graph.svg")
+    all_input.append("de_analysis/heatmap.svg")
+    all_input.append("de_analysis/lfc_analysis.csv")
+    return all_input
diff --git a/workflow/rules/datamod.smk b/workflow/rules/datamod.smk
@@ -1,3 +1,7 @@
+localrules:
+    genome_to_transcriptome,
+
+
 rule genome_to_transcriptome:
     input:
         genome=config["genome"],

diff --git a/workflow/rules/diffexp.smk b/workflow/rules/diffexp.smk
@@ -1,30 +1,17 @@
 localrules:
-    write_coldata,
     write_de_params,
     de_analysis,
 
 
-rule write_coldata:
-    output:
-        coldata="de_analysis/coldata.tsv",
-    run:
-        with open(f"{output}", "w") as outfile:
-            outstring = "\t".join(samples.head())
-            outfile.write(outstring)
-
-
 rule write_de_params:
     output:
         de_params="de_analysis/de_params.tsv",
-    run:
-        d = OrderedDict()
-        d["Annotation"] = [config["annotation"]]
-        d["min_samps_gene_expr"] = [config["min_samps_gene_expr"]]
-        d["min_samps_feature_expr"] = [config["min_samps_feature_expr"]]
-        d["min_gene_expr"] = [config["min_gene_expr"]]
-        d["min_feature_expr"] = [config["min_feature_expr"]]
-        df = pd.DataFrame(d)
-        df.to_csv(output.de_params, sep="\t", index=False)
+    log:
+        "logs/de_params.log",
+    conda:
+        "envs/env.yml"
+    script:
+        "../scripts/de_params.py"
 
 
 rule de_analysis: