Skip to content

Commit

Permalink
feat: qc rules (#16)
Browse files Browse the repository at this point in the history
* feat: mapping QC rule with qualimap + general QC rule for Snakefile

* feat: added QC rules for samples with NanoPlot and mapping with Qualimap + corresponding config/config.yml and profile/config.yml

* feat: QC-pipeline

* fix: wildcards in rule plot_samples

* fix: removed unused input statements

* feat:qualimap integration

* feat: added samstats QC step

* fix: added .gtf ref in config + added condisition2 in samples.schema

* fix. removed Qualimap because our annotation.gtf is not compatible

* fix: removed Qualimap refs + lint

* fix: added optional summary dir for NanoPlot in config

* fix: rule sam_stats output now overlaps with expected samtools stat output

* feat: added branch in rule plot_samples to allow summary files for QC

* fix:Indentation in rule plot_samples

* fix: removed unwanted merge syntax in Snakefile

* fix: linted with snakefmt

* fix: clarification on optionality of sequencing summary inclusion for qc

* feat: added rule for NanoPlot of all smaples with .fastq files, detached qc rules from snakefile into their own .smk

* fix: provided rule for NanoPlot of all samples with resource configuration

* fix: corrected env pathing

* fix: input for rule plot_all_samples

* fix: added aggregate function for rule plot_all_samples

* fix: workiinig variable input files

* fix: snakefmt and black linting

* fix: removed unneeded .gtf requirement

* fix: clarified NanoPlot requirements

* fix: linting, compression now runs locally

* fix: linting

* fix: NanoPlot compression: localrule + config for max_cpus + logs for stdout+stderr

* fix: linting

* fix: import workflowerror

* fix: added snakemake.exceptions WorkflowError for linting

---------

Co-authored-by: cmeesters <meesters@uni-mainz.de>
  • Loading branch information
yeising and cmeesters authored Jun 5, 2024
1 parent ed226a3 commit 3237e2c
Show file tree
Hide file tree
Showing 10 changed files with 238 additions and 58 deletions.
3 changes: 3 additions & 0 deletions .test/config-simple/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ transcriptome: "/lustre/miifs01/project/m2_zdvhpc/transcriptome_data/GCA_9176273
annotation: "/lustre/miifs01/project/m2_zdvhpc/transcriptome_data/GCA_917627325.4_PGI_CHIRRI_v4_genomic.gff"
# these samples ought to contain all samples comprising of the

#NanoPlot QC with sequencing summary. leave as None for QC with .fastq files
summary: None

# Minimap2 indexing options
minimap_index_opts: ""

Expand Down
6 changes: 6 additions & 0 deletions config/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ transcriptome: "/lustre/miifs01/project/m2_zdvhpc/transcriptome_data/GCA_9176273
annotation: "/lustre/miifs01/project/m2_zdvhpc/transcriptome_data/GCA_917627325.4_PGI_CHIRRI_v4_genomic.gff"
# these samples ought to contain all samples comprising of the

# Maximum number of CPUs in your partition/PC/server
max_cpus: 40

# Optional: NanoPlot QC using a summary file from the sequencer. If this file is not supplied, put the parameter to "None". It will then do an independent QC run per FASTQ input.
summary: None #"/lustre/project/m2_zdvhpc/transcriptome_data/seqencing_summary/sequencing_summary.txt"

# Minimap2 indexing options
minimap_index_opts: ""

Expand Down
20 changes: 19 additions & 1 deletion workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ configfile: "config/config.yml"


include: "rules/commons.smk"
include: "rules/qc.smk"
include: "rules/utils.smk"


Expand All @@ -29,6 +30,7 @@ inputdir = config["inputdir"]

rule all:
input:
sample_QC,
ver=rules.dump_versions.output.ver,
count_tsvs=expand("counts/{sample}_salmon/quant.sf", sample=samples["sample"]),
merged_tsv="merged/all_counts.tsv",
Expand All @@ -38,6 +40,7 @@ rule all:
ma_graph="de_analysis/ma_graph.svg",
de_heatmap="de_analysis/heatmap.svg",
lfc_analysis="de_analysis/lfc_analysis.csv",
samstats=expand("QC/samstats/{sample}.txt", sample=samples["sample"]),


rule build_minimap_index: ## build minimap2 index
Expand Down Expand Up @@ -105,7 +108,22 @@ rule sam_index:
shell:
"""
samtools index -@ {resources.cpus_per_task} {input.sbam} &> {log};
#mv {input.sbam} {output.ibam}
mv {input.sbam} {output.ibam}
"""


rule sam_stats:
input:
bam=rules.map_reads.output,
output:
"QC/samstats/{sample}.txt",
log:
"logs/samtools/samstats_{sample}.log",
conda:
"envs/env.yml"
shell:
"""
samtools stats -@ {resources.cpus_per_task} {input.bam} > {output} 2> {log}
"""


Expand Down
19 changes: 17 additions & 2 deletions workflow/profile/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,31 @@ set-resources:
runtime: "3h"
slurm_partition: "parallel"

plot_samples:
cpus_per_task: 1
mem_mb_per_cpu: 1800
runtime: "5h"

plot_all_samples:
cpus_per_task: 8
mem_mb_per_cpu: 1800
runtime: "2h"

sam_sort:
cpus_per_task: 8
mem_mb_per_cpu: 1800
runtime: "30m"
runtime: "1h"

sam_index:
cpus_per_task: 8
mem_mb_per_cpu: 1800
runtime: "30m"


sam_stats:
cpus_per_task: 8
mem_mb_per_cpu: 1800
runtime: "30m"

count_reads:
cpus_per_task: 8
mem_mb_per_cpu: 1800
Expand Down
18 changes: 18 additions & 0 deletions workflow/rules/commons.smk
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
import glob
import os
import sys
from itertools import product

import pandas as pd
from snakemake.remote import FTP
from snakemake.utils import validate
from snakemake.exceptions import WorkflowError

validate(config, schema="../schemas/config.schema.yaml")

Expand All @@ -27,3 +30,18 @@ validate(samples, schema="../schemas/samples.schema.yaml")

def get_mapped_reads_input(sample):
return glob.glob(os.path.join(config["inputdir"], sample) + "*")[0]


def aggregate_input(samples):
# possible extensions:
exts = ["fastq", "fq", "fastq.gz", "fq.gz"]
valids = list()
for sample, ext in product(samples, exts):
path = os.path.join(config["inputdir"], sample + "." + ext)

if os.path.exists(path):
valids.append(path)

if not len(valids):
raise WorkflowError(f"no valid samples found, allowed extensions are: {exts}")
return valids
110 changes: 110 additions & 0 deletions workflow/rules/qc.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import os


localrules:
compress_nplot,
compress_nplot_all,


configfile: "config/config.yml"


inputdir = config["inputdir"] #"/lustre/project/m2_zdvhpc/transcriptome_data/"


# QC and metadata with NanoPlot

if config["summary"] == "None":
sample_QC = (
(expand("QC/NanoPlot/{sample}.tar.gz", sample=samples["sample"]),),
"QC/NanoPlot/all_samples.tar.gz",
)
else:
sample_QC = "QC/NanoPlot/summary.tar.gz"


if config["summary"] == "None":

rule plot_samples:
input:
fastq=lambda wildcards: get_mapped_reads_input(
samples["sample"][wildcards.sample]
),
output:
directory("NanoPlot/{sample}"),
log:
"logs/NanoPlot/{sample}.log",
resources:
cpus_per_task=min(len({input}), config["max_cpus"]), #problem with max(len(input.fastq),39)
conda:
"../envs/env.yml"
shell:
"mkdir {output}; "
"NanoPlot -t {resources.cpus_per_task} --tsv_stats -f svg "
"--fastq {input.fastq} -o {output} 2> {log}"

rule plot_all_samples:
input:
aggregate_input(samples["sample"]),
output:
directory("NanoPlot/all_samples"),
log:
"logs/NanoPlot/all_samples.log",
conda:
"../envs/env.yml"
shell:
"mkdir {output}; "
"NanoPlot -t {resources.cpus_per_task} --tsv_stats -f svg "
"--fastq {input} -o {output} 2> {log}"

rule compress_nplot:
input:
samples=rules.plot_samples.output,
output:
"QC/NanoPlot/{sample}.tar.gz",
log:
"logs/NanoPlot/compress_{sample}.log",
conda:
None
shell:
"tar zcvf {output} {input} &> {log}"

rule compress_nplot_all:
input:
all_samples=rules.plot_all_samples.output,
output:
"QC/NanoPlot/all_samples.tar.gz",
log:
"logs/NanoPlot/compress_all_samples.log",
conda:
None
shell:
"tar zcvf {output} {input} &> {log}"

else:

rule plot_samples:
input:
summary=config["summary"],
output:
directory("NanoPlot"),
log:
"logs/NanoPlot/NanoPlot.log",
conda:
"../envs/env.yml"
shell:
"mkdir {output}; "
"NanoPlot -t {resources.cpus_per_task} --barcoded --tsv_stats "
"--summary {input.summary} -o {output} 2> {log}"

rule compress_nplot:
input:
samples=rules.plot_samples.output,
output:
"QC/NanoPlot/summary.tar.gz",
log:
"logs/NanoPlot/compress_summary.log",
conda:
None
shell:
"tar zcvf {output} {input} &> {log}"
2 changes: 1 addition & 1 deletion workflow/rules/utils.smk
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ rule dump_versions:
# conda flavour is prefered by the user
shell:
"""
eval $(ensureconda) list > {output.ver}
eval $(ensureconda) list > {output.ver}
"""


Expand Down
8 changes: 3 additions & 5 deletions workflow/scripts/_template_script.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,9 @@
import argparse

# Parse command line arguments:
parser = argparse.ArgumentParser(
description='Template script.')
parser.add_argument(
'-i', metavar='input', type=str, help="Input.")
parser = argparse.ArgumentParser(description="Template script.")
parser.add_argument("-i", metavar="input", type=str, help="Input.")


if __name__ == '__main__':
if __name__ == "__main__":
args = parser.parse_args()
Loading

0 comments on commit 3237e2c

Please sign in to comment.