Skip to content

Commit

Permalink
feat: read length filter (#44)
Browse files Browse the repository at this point in the history
* feat: read length filter rule

* fix: made filtered fastq a tempfile

* fix: unconditional uncompress

* fix: added .fq functionality

* fix: updated profile/config.yaml default resources and removed redundancy

* fix: new raise TypeError + command line message for rule filter_reads

* fix: formatting - no valid f-string was give

---------

Co-authored-by: cmeesters <meesters@uni-mainz.de>
  • Loading branch information
yeising and cmeesters authored Jul 3, 2024
1 parent 704b25c commit 10a8d93
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 22 deletions.
3 changes: 3 additions & 0 deletions config/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@ genome: "/lustre/miifs01/project/m2_zdvhpc/transcriptome_data/GCA_917627325.4_PG
annotation: "/lustre/miifs01/project/m2_zdvhpc/transcriptome_data/GCA_917627325.4_PGI_CHIRRI_v4_genomic.gff"
# these samples ought to contain all samples comprising of the

# Minimum read length, put 0 if you want to proceed with all reads.
min_length: 200

# Maximum number of CPUs in your partition/PC/server
max_cpus: 40

Expand Down
21 changes: 18 additions & 3 deletions workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,23 @@ rule genome_to_transcriptome:
"""


rule filter_reads:
input:
fastq=lambda wildcards: get_mapped_reads_input(
samples["sample"][wildcards.sample]
),
output:
temp("filter/{sample}_filtered.fq"),
message:
f"Filtering with read length >= {config['min_length']}."
log:
"logs/filter_reads/{sample}.log",
conda:
"envs/env.yml"
script:
"scripts/read_filter.py"


rule build_minimap_index: ## build minimap2 index
input:
transcriptome=rules.genome_to_transcriptome.output,
Expand All @@ -82,9 +99,7 @@ rule build_minimap_index: ## build minimap2 index
rule map_reads:
input:
index=rules.build_minimap_index.output.index,
fastq=lambda wildcards: get_mapped_reads_input(
samples["sample"][wildcards.sample]
),
fastq_filtered=rules.filter_reads.output,
output:
"alignments/{sample}.sam",
log:
Expand Down
23 changes: 4 additions & 19 deletions workflow/profile/config.yaml
Original file line number Diff line number Diff line change
@@ -1,65 +1,50 @@
default-resources:
slurm_account: "m2_zdvhpc"
slurm_partition: "smp"
cpus_per_task: 1
mem_mb_per_cpu: 1800
runtime: "1h"

set-resources:
genome_to_transcriptome:
cpus_per_task: 1
mem_mb_per_cpu: 1800
runtime: "2h"

build_minimap_index:
cpus_per_task: 4
mem_mb_per_cpu: 3600
runtime: "1h"

map_reads:
cpus_per_task: 40
mem_mb_per_cpu: 1800
runtime: "3h"
slurm_partition: "parallel"

plot_samples:
cpus_per_task: 4
mem_mb_per_cpu: 1800
runtime: "3h"

plot_all_samples:
cpus_per_task: 8
mem_mb_per_cpu: 1800
runtime: "2h"

map_qc:
cpus_per_task: 8
mem_mb_per_cpu: 1800
runtime: "1h"

sam_sort:
cpus_per_task: 4
mem_mb_per_cpu: 1800
runtime: "2h"

sam_view:
cpus_per_task: 1
mem_mb_per_cpu: 1800
runtime: "1h"

sam_index:
cpus_per_task: 8
mem_mb_per_cpu: 1800
runtime: "30m"

sam_stats:
cpus_per_task: 8
mem_mb_per_cpu: 1800
runtime: "30m"

count_reads:
cpus_per_task: 8
mem_mb_per_cpu: 1800
runtime: "1h"

de_analysis:
cpus_per_task: 4
mem_mb_per_cpu: 5000
runtime: "1h"

41 changes: 41 additions & 0 deletions workflow/scripts/read_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
from Bio import SeqIO
import gzip
import shutil

# start logging
sys.stderr = sys.stdout = open(snakemake.log[0], "wt")


def is_zipped(fname):
return fname.endswith(".gz")


if not isinstance(snakemake.config["min_length"], int):
raise TypeError(
"Your configuration has no valid minimum read length, allowed are: ('int'), check the 'min_length' parameter, please."
)

if snakemake.config["min_length"] <= 0:
if is_zipped(snakemake.input[0]):
with gzip.open(snakemake.input[0], "wb") as f_in, open(
snakemake.output[0], "wb"
) as f_out:
shutil.copyfileobj(f_in, f_out)
else:
shutil.copy2(snakemake.input[0], snakemake.output[0])
else:
open_fh = gzip.open if is_zipped(snakemake.input[0]) else open
with open_fh(snakemake.input[0], "rt") as sample:
input_iterator = SeqIO.parse(sample, "fastq")

filter_iterator = (
read
for read in input_iterator
if len(read.seq) >= snakemake.config["min_length"]
)

SeqIO.write(filter_iterator, snakemake.output[0], "fastq")

0 comments on commit 10a8d93

Please sign in to comment.