feat: read length filter (#44)

* feat: read length filter rule * fix: made filtered fastq a tempfile * fix: unconditional uncompress * fix: added .fq functionality * fix: updated profile/config.yaml default resources and removed redundancy * fix: new raise TypeError + command line message for rule filter_reads * fix: formatting - no valid f-string was give --------- Co-authored-by: cmeesters <meesters@uni-mainz.de>
snakemake-workflows · Jul 3, 2024 · 10a8d93 · 10a8d93
1 parent 704b25c
commit 10a8d93
Show file tree

Hide file tree

Showing 4 changed files with 66 additions and 22 deletions.
diff --git a/config/config.yml b/config/config.yml
@@ -19,6 +19,9 @@ genome: "/lustre/miifs01/project/m2_zdvhpc/transcriptome_data/GCA_917627325.4_PG
 annotation: "/lustre/miifs01/project/m2_zdvhpc/transcriptome_data/GCA_917627325.4_PGI_CHIRRI_v4_genomic.gff"
 # these samples ought to contain all samples comprising of the
 
+# Minimum read length, put 0 if you want to proceed with all reads.
+min_length: 200
+
 # Maximum number of CPUs in your partition/PC/server
 max_cpus: 40
 

diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -61,6 +61,23 @@ rule genome_to_transcriptome:
     """
 
 
+rule filter_reads:
+    input:
+        fastq=lambda wildcards: get_mapped_reads_input(
+            samples["sample"][wildcards.sample]
+        ),
+    output:
+        temp("filter/{sample}_filtered.fq"),
+    message: 
+        f"Filtering with read length >= {config['min_length']}."
+    log:
+        "logs/filter_reads/{sample}.log",
+    conda:
+        "envs/env.yml"
+    script:
+        "scripts/read_filter.py"
+
+
 rule build_minimap_index:  ## build minimap2 index
     input:
         transcriptome=rules.genome_to_transcriptome.output,
@@ -82,9 +99,7 @@ rule build_minimap_index:  ## build minimap2 index
 rule map_reads:
     input:
         index=rules.build_minimap_index.output.index,
-        fastq=lambda wildcards: get_mapped_reads_input(
-            samples["sample"][wildcards.sample]
-        ),
+        fastq_filtered=rules.filter_reads.output,
     output:
         "alignments/{sample}.sam",
     log:

diff --git a/workflow/profile/config.yaml b/workflow/profile/config.yaml
@@ -1,65 +1,50 @@
 default-resources:
     slurm_account: "m2_zdvhpc"
     slurm_partition: "smp"
+    cpus_per_task: 1
+    mem_mb_per_cpu: 1800
+    runtime: "1h"
 
 set-resources:
     genome_to_transcriptome:
-        cpus_per_task: 1
-        mem_mb_per_cpu: 1800
         runtime: "2h"
 
     build_minimap_index:
         cpus_per_task: 4
         mem_mb_per_cpu: 3600
-        runtime: "1h"
 
     map_reads:
         cpus_per_task: 40
-        mem_mb_per_cpu: 1800
         runtime: "3h"
         slurm_partition: "parallel"
 
     plot_samples:
         cpus_per_task: 4
-        mem_mb_per_cpu: 1800
         runtime: "3h"
 
     plot_all_samples:
         cpus_per_task: 8
-        mem_mb_per_cpu: 1800
         runtime: "2h"
 
     map_qc:
         cpus_per_task: 8
-        mem_mb_per_cpu: 1800
-        runtime: "1h"
 
     sam_sort:
         cpus_per_task: 4
-        mem_mb_per_cpu: 1800
         runtime: "2h"
 
-    sam_view:
-        cpus_per_task: 1
-        mem_mb_per_cpu: 1800
-        runtime: "1h"
-
     sam_index:
         cpus_per_task: 8
-        mem_mb_per_cpu: 1800
         runtime: "30m"
 
     sam_stats:
         cpus_per_task: 8
-        mem_mb_per_cpu: 1800
         runtime: "30m"
 
     count_reads:
         cpus_per_task: 8
-        mem_mb_per_cpu: 1800
-        runtime: "1h"
 
     de_analysis:
         cpus_per_task: 4
         mem_mb_per_cpu: 5000
-        runtime: "1h"
+
diff --git a/workflow/scripts/read_filter.py b/workflow/scripts/read_filter.py
@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+from Bio import SeqIO
+import gzip
+import shutil
+
+# start logging
+sys.stderr = sys.stdout = open(snakemake.log[0], "wt")
+
+
+def is_zipped(fname):
+    return fname.endswith(".gz")
+
+
+if not isinstance(snakemake.config["min_length"], int):
+    raise TypeError(
+        "Your configuration has no valid minimum read length, allowed are: ('int'), check the 'min_length' parameter, please."
+    )
+
+if snakemake.config["min_length"] <= 0:
+    if is_zipped(snakemake.input[0]):
+        with gzip.open(snakemake.input[0], "wb") as f_in, open(
+            snakemake.output[0], "wb"
+        ) as f_out:
+            shutil.copyfileobj(f_in, f_out)
+    else:
+        shutil.copy2(snakemake.input[0], snakemake.output[0])
+else:
+    open_fh = gzip.open if is_zipped(snakemake.input[0]) else open
+    with open_fh(snakemake.input[0], "rt") as sample:
+        input_iterator = SeqIO.parse(sample, "fastq")
+
+        filter_iterator = (
+            read
+            for read in input_iterator
+            if len(read.seq) >= snakemake.config["min_length"]
+        )
+
+        SeqIO.write(filter_iterator, snakemake.output[0], "fastq")