From af854c35d6a506bd2c6b3a4ea6189b671e314a5b Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Thu, 17 Feb 2022 11:55:37 +0100 Subject: [PATCH 01/22] add rule to download illumina adapter from trimmomatic repo --- workflow/rules/reference.smk | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/workflow/rules/reference.smk b/workflow/rules/reference.smk index a152af3..08ff5ac 100644 --- a/workflow/rules/reference.smk +++ b/workflow/rules/reference.smk @@ -28,3 +28,27 @@ rule bwa_index: threads: 1 shell: "bwa index -a {params.algorithm} {input.ref}" + + +rule get_trimmomatic_adapters: + output: + "resources/adapter/NexteraPE-PE.fa", + "resources/adapter/TruSeq2-PE.fa", + "resources/adapter/TruSeq2-SE.fa", + "resources/adapter/TruSeq3-PE-2.fa", + "resources/adapter/TruSeq3-PE.fa", + "resources/adapter/TruSeq3-SE.fa", + params: + prefix="resources/adapter/", + URLs = [ + "https://raw.githubusercontent.com/usadellab/Trimmomatic/main/adapters/NexteraPE-PE.fa", + "https://raw.githubusercontent.com/usadellab/Trimmomatic/main/adapters/TruSeq2-PE.fa", + "https://raw.githubusercontent.com/usadellab/Trimmomatic/main/adapters/TruSeq2-SE.fa", + "https://raw.githubusercontent.com/usadellab/Trimmomatic/main/adapters/TruSeq3-PE-2.fa", + "https://raw.githubusercontent.com/usadellab/Trimmomatic/main/adapters/TruSeq3-PE.fa", + "https://raw.githubusercontent.com/usadellab/Trimmomatic/main/adapters/TruSeq3-SE.fa", + ] + log: + "logs/get_trimmomatic_adapter.log" + shell: + "wget -P {params.prefix} {params.URLs} -o {log}" \ No newline at end of file From 320497ba00ecec67e67892fe010dd6bd5a642f80 Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Thu, 17 Feb 2022 17:13:21 +0100 Subject: [PATCH 02/22] change name of get_NGmerge_input to get_trimming_input --- workflow/rules/common.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 0c68618..a10c0fc 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -67,8 +67,8 @@ def get_read_group(sample): RG = f"@RG\\tID:{sample}\\tSM:{RGID}\\tLB:{library}\\tPL:{platform}" return RG -### not fully implemented -> cases: fastQ files as input -> SR,PE -def get_NGmerge_input(wildcards): +### not fully implemented -> cases: fastQ files as input -> SE,PE +def get_trimming_input(wildcards): sample=wildcards.SAMPLE inpath=samples.loc[sample].loc["path"] if ".bam" in inpath.lower(): From 8629006f57e8ae2fdb8a08eeb11a0198b4f18554 Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Thu, 17 Feb 2022 17:15:16 +0100 Subject: [PATCH 03/22] move NGmerge to trimming rule; add trimmomatic to trimming rule --- workflow/rules/mapping.smk | 43 ----------------------- workflow/rules/trimming.smk | 70 +++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 43 deletions(-) create mode 100644 workflow/rules/trimming.smk diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk index 4fc3c1e..338b8a6 100644 --- a/workflow/rules/mapping.smk +++ b/workflow/rules/mapping.smk @@ -1,47 +1,4 @@ -rule NGmerge: - input: - unpack(get_NGmerge_input), - qual_table="resources/qual_profile.txt", - output: - merged=temp("results/{ID}/NGmerge/merged/{SAMPLE}_merged.unfiltered.fastq.gz"), - non_merged_1=temp("results/{ID}/NGmerge/nonmerged/{SAMPLE}_nonmerged_1.fastq.gz"), - non_merged_2=temp("results/{ID}/NGmerge/nonmerged/{SAMPLE}_nonmerged_2.fastq.gz"), - params: - non_merged_prefix="results/{ID}/NGmerge/nonmerged/{SAMPLE}_nonmerged", - minlen=20 - log: - "results/logs/{ID}/NGmerge/{SAMPLE}.log", - conda: - "../envs/cfDNA_prep.yaml" - threads: 64 - shell: - "set +o pipefail;" - "workflow/scripts/NGmerge -w {input.qual_table} -u 41 -d -e {params.minlen} -n {threads} -z " - "-1 {input.r1} -2 {input.r2} -o {output.merged} " - "-f {params.non_merged_prefix} -l {log} -v" - - -rule NGmerge_adapter: - input: - non_merged_1="results/{ID}/NGmerge/nonmerged/{SAMPLE}_nonmerged_1.fastq.gz", - non_merged_2="results/{ID}/NGmerge/nonmerged/{SAMPLE}_nonmerged_2.fastq.gz", - qual_table="resources/qual_profile.txt", - output: - interleaved_output=temp("results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.unfiltered.fastq.gz") - params: - adapt_minlen=1 - log: - "results/logs/{ID}/NGmerge-adapter/{SAMPLE}.log", - conda: - "../envs/cfDNA_prep.yaml" - threads: 64 - shell: - "set +o pipefail;" - "workflow/scripts/NGmerge -a -i -w {input.qual_table} -u 41 -d -e {params.adapt_minlen} -n {threads} -z " - "-1 {input.non_merged_1} -2 {input.non_merged_2} -o {output.interleaved_output} " - "-c {log} -v" - rule filter_interleaved: input: unfiltered="results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.unfiltered.fastq.gz" diff --git a/workflow/rules/trimming.smk b/workflow/rules/trimming.smk new file mode 100644 index 0000000..da2ccd3 --- /dev/null +++ b/workflow/rules/trimming.smk @@ -0,0 +1,70 @@ +rule NGmerge: + input: + unpack(get_trimming_input), + qual_table="resources/qual_profile.txt", + output: + merged=temp("results/{ID}/NGmerge/merged/{SAMPLE}_merged.unfiltered.fastq.gz"), + non_merged_1=temp("results/{ID}/NGmerge/nonmerged/{SAMPLE}_nonmerged_1.fastq.gz"), + non_merged_2=temp("results/{ID}/NGmerge/nonmerged/{SAMPLE}_nonmerged_2.fastq.gz"), + params: + non_merged_prefix="results/{ID}/NGmerge/nonmerged/{SAMPLE}_nonmerged", + minlen=20 + log: + "logs/{ID}/NGmerge/{SAMPLE}.log", + conda: + "../envs/cfDNA_prep.yaml" + threads: 64 + shell: + "set +o pipefail;" + "workflow/scripts/NGmerge -w {input.qual_table} -u 41 -d -e {params.minlen} -n {threads} -z " + "-1 {input.r1} -2 {input.r2} -o {output.merged} " + "-f {params.non_merged_prefix} -l {log} -v" + + +rule NGmerge_adapter: + input: + non_merged_1="results/{ID}/NGmerge/nonmerged/{SAMPLE}_nonmerged_1.fastq.gz", + non_merged_2="results/{ID}/NGmerge/nonmerged/{SAMPLE}_nonmerged_2.fastq.gz", + qual_table="resources/qual_profile.txt", + output: + interleaved_output=temp("results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.unfiltered.fastq.gz") + params: + adapt_minlen=1 + log: + "logs/{ID}/NGmerge-adapter/{SAMPLE}.log", + conda: + "../envs/cfDNA_prep.yaml" + threads: 64 + shell: + "set +o pipefail;" + "workflow/scripts/NGmerge -a -i -w {input.qual_table} -u 41 -d -e {params.adapt_minlen} -n {threads} -z " + "-1 {input.non_merged_1} -2 {input.non_merged_2} -o {output.interleaved_output} " + "-c {log} -v" + +rule trimmomatic_pe: + input: + unpack(get_trimming_input), + output: + r1="results/{ID}/trimmed/trimmomatic/{SAMPLE}.1.fastq.gz", + r2="results/{ID}/trimmed/trimmomatic/{SAMPLE}.2.fastq.gz", + # reads where trimming entirely removed the mate + r1_unpaired="results/{ID}/trimmed/trimmomatic/{SAMPLE}.1.unpaired.fastq.gz", + r2_unpaired="results/{ID}/trimmed/trimmomatic/{SAMPLE}.2.unpaired.fastq.gz" + log: + "logs/{ID}/trimmomatic/{SAMPLE}.log" + params: + # list of trimmers (see manual) + trimmer=["TRAILING:3", "ILLUMINACLIP:resources/adapter/NexteraPE-PE.fa:4:30:10"], + # optional parameters + extra="", + compression_level="-9" + threads: + 32 + # optional specification of memory usage of the JVM that snakemake will respect with global + # resource restrictions (https://snakemake.readthedocs.io/en/latest/snakefiles/rules.html#resources) + # and which can be used to request RAM during cluster job submission as `{resources.mem_mb}`: + # https://snakemake.readthedocs.io/en/latest/executing/cluster.html#job-properties + resources: + mem_mb=1024 + wrapper: + "v1.1.0/bio/trimmomatic/pe" \ No newline at end of file From d026f91c56b175b50a6ec5334d426a1a17f52287 Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Mon, 21 Feb 2022 13:44:50 +0100 Subject: [PATCH 04/22] add separate trimming rule --- workflow/Snakefile | 1 + 1 file changed, 1 insertion(+) diff --git a/workflow/Snakefile b/workflow/Snakefile index 9931f96..0538017 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -9,6 +9,7 @@ include: "rules/common.smk" include: "rules/reference.smk" include: "rules/raw_input_QC.smk" include: "rules/bam_to_fastq.smk" +include: "rules/trimming.smk" include: "rules/mapping.smk" include: "rules/QualityControl.smk" From 3d93a4f18290735446acb1cea33b69a9433908bf Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Mon, 21 Feb 2022 13:47:11 +0100 Subject: [PATCH 05/22] parse trimmomatic options from config.yaml; return properly formatted trimming options --- workflow/rules/common.smk | 141 ++++++++++++++++++++++++++++++------ workflow/rules/trimming.smk | 1 + 2 files changed, 121 insertions(+), 21 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index a10c0fc..699005a 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -24,7 +24,7 @@ def get_final_output(): "results/{ID}/flagstat/{SAMPLE}_flagstat.txt.gz", zip, ID=samples["ID"], - SAMPLE=samples["sample"] + SAMPLE=samples["sample"], ) ) final_output.extend( @@ -33,7 +33,7 @@ def get_final_output(): zip, ID=samples["ID"], SAMPLE=samples["sample"], - GENOME=samples["genome_build"] + GENOME=samples["genome_build"], ) ) final_output.extend( @@ -42,60 +42,159 @@ def get_final_output(): zip, ID=samples["ID"], SAMPLE=samples["sample"], - GENOME=samples["genome_build"] + GENOME=samples["genome_build"], ) ) final_output.extend( - expand( - "results/{ID}/qc/multiqc.html", - ID=samples["ID"].unique() - ) + expand("results/{ID}/qc/multiqc.html", ID=samples["ID"].unique()) ) return final_output + def get_read_group(sample): ID = samples.loc[sample].loc["ID"] library = samples.loc[sample].loc["library_name"] platform = samples.loc[sample].loc["platform"] info = samples.loc[sample].loc["info"] if pd.isna(info): - info="" - else: - info="_"+info + info = "" + else: + info = "_" + info RGID = f"{ID}_{sample}{info}" RG = f"@RG\\tID:{sample}\\tSM:{RGID}\\tLB:{library}\\tPL:{platform}" return RG -### not fully implemented -> cases: fastQ files as input -> SE,PE + +### not fully implemented -> cases: fastQ files as input -> SE,PE def get_trimming_input(wildcards): - sample=wildcards.SAMPLE - inpath=samples.loc[sample].loc["path"] + sample = wildcards.SAMPLE + inpath = samples.loc[sample].loc["path"] if ".bam" in inpath.lower(): - return {"r1":"results/{ID}/fastq/{SAMPLE}_R1.fastq.gz","r2":"results/{ID}/fastq/{SAMPLE}_R2.fastq.gz"} + return { + "r1": "results/{ID}/fastq/{SAMPLE}_R1.fastq.gz", + "r2": "results/{ID}/fastq/{SAMPLE}_R2.fastq.gz", + } +### get reference based on genome_build provided in samples.tsv def get_reference(wildcards): genome_build = wildcards.GENOME gpath = config[genome_build]["reference"] - p=Path(gpath) + p = Path(gpath) if p.exists() and not p.is_dir(): try: p.open().close() return p.as_posix() except PermissionError as f: - print(f,file = sys.stderr) - print(f"Please check file permissions of {gpath} or remove path from config.yaml \ - download a reference.",file = sys.stderr) + print( + f"Please check file permissions of {gpath} or remove path from config.yaml \ + download a reference.", + file=sys.stderr, + ) raise f else: return f"resources/reference/{genome_build}.fa" + +### provides download URLs for UCSC human genomes def get_ref_url(wildcards): genome_build = wildcards.GENOME - url_dict={ - "hg19":"https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz", - "hg38":"https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz", + url_dict = { + "hg19": "https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz", + "hg38": "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz", } return url_dict[genome_build] + +### returns properly formattet trimming rules based on config.yaml +def get_trimmomatic_trimmers(): + default_adapter = { + "nexterape-pe.fa": "resources/adapter/NexteraPE-PE.fa", + "truseq2-pe.fa": "resources/adapter/TruSeq2-PE.fa", + "truseq2-se.fa": "resources/adapter/TruSeq2-SE.fa", + "truseq3-pe-2.fa": "resources/adapter/TruSeq3-PE-2.fa", + "truseq3-pe.fa": "resources/adapter/TruSeq3-PE.fa", + "truseq3-se.fa": "resources/adapter/TruSeq3-SE.fa", + } + trimmers = list() + t_conf = config["trimmers"] + + if default_adapter.get(t_conf["Illuminaclip"]["adapter_files"].lower()) is not None: + adapter_path = Path( + default_adapter.get(t_conf["Illuminaclip"]["adapter_files"].lower()) + ) + else: + adapter_path = Path(t_conf["Illuminaclip"]["adapter_files"]) + + if isinstance(t_conf["Illuminaclip"]["seedMismatches"], int): + mismatches = abs(t_conf["Illuminaclip"]["seedMismatches"]) + else: + mismatches = 4 + print( + f"Invalid config, setting default value for seedMismatches: {mismatches}", + file=sys.stderr, + ) + + if isinstance(t_conf["Illuminaclip"]["palindromeClipThreshold"], int): + read_overlap = abs(t_conf["Illuminaclip"]["palindromeClipThreshold"]) + else: + read_overlap = 30 + print( + f"Invalid config, setting default value for palindromeClipThreshold: {read_overlap}", + file=sys.stderr, + ) + + if isinstance(t_conf["Illuminaclip"]["simpleClipThreshold"], int): + minimal_overlap = abs(t_conf["Illuminaclip"]["simpleClipThreshold"]) + else: + minimal_overlap = 10 + print( + f"Invalid config, setting default value for simpleClipThreshold: {minimal_overlap}", + file=sys.stderr, + ) + + if isinstance(t_conf["Illuminaclip"]["minAdapterLength"], int): + minAdapterLength = abs(t_conf["Illuminaclip"]["minAdapterLength"]) + else: + minAdapterLength = 8 + print( + f"Invalid config, setting default value for minAdapterLength: {minAdapterLength}", + file=sys.stderr, + ) + if t_conf["Illuminaclip"]["keepBothReads"] is True: + keepBothReads = "True" + else: + keepBothReads = "False" + + if adapter_path.exists() and not adapter_path.is_dir(): + try: + adapter_path.open().close() + illumina_string = f"ILLUMINACLIP:{adapter_path}:{mismatches}:{read_overlap}:{minimal_overlap}:{minAdapterLength}:{keepBothReads}" + trimmers.append(illumina_string) + except PermissionError as f: + print( + f"Please check file permissions of {adapter_path} or select one of the provided files.", + file=sys.stderr, + ) + raise f + + if ( + t_conf["SLIDINGWINDOW"]["window"] > 0 + and t_conf["SLIDINGWINDOW"]["quality_threshold"] > 0 + ): + window = t_conf["SLIDINGWINDOW"]["window"] + threshold = t_conf["SLIDINGWINDOW"]["quality_threshold"] + trimmers.append(f"SLIDINGWINDOW:{window}:{threshold}") + + if t_conf["LEADING"] > 0: + LEADING = t_conf["LEADING"] + trimmers.append(f"LEADING:{LEADING}") + if t_conf["TRAILING"] > 0: + TRAILING = t_conf["TRAILING"] + trimmers.append(f"TRAILING:{TRAILING}") + if t_conf["MINLEN"] > 0: + MINLEN = t_conf["MINLEN"] + trimmers.append(f"MINLEN:{MINLEN}") + + return trimmers diff --git a/workflow/rules/trimming.smk b/workflow/rules/trimming.smk index da2ccd3..68eb9bb 100644 --- a/workflow/rules/trimming.smk +++ b/workflow/rules/trimming.smk @@ -55,6 +55,7 @@ rule trimmomatic_pe: params: # list of trimmers (see manual) trimmer=["TRAILING:3", "ILLUMINACLIP:resources/adapter/NexteraPE-PE.fa:4:30:10"], + trimmer1 = get_trimmomatic_trimmers(), # optional parameters extra="", compression_level="-9" From 153574e053db3fc699676338be143730dd3e9061 Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Mon, 21 Feb 2022 14:08:43 +0100 Subject: [PATCH 06/22] add trimmomatic SE rule --- workflow/rules/trimming.smk | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/workflow/rules/trimming.smk b/workflow/rules/trimming.smk index 68eb9bb..aa76caf 100644 --- a/workflow/rules/trimming.smk +++ b/workflow/rules/trimming.smk @@ -54,8 +54,7 @@ rule trimmomatic_pe: "logs/{ID}/trimmomatic/{SAMPLE}.log" params: # list of trimmers (see manual) - trimmer=["TRAILING:3", "ILLUMINACLIP:resources/adapter/NexteraPE-PE.fa:4:30:10"], - trimmer1 = get_trimmomatic_trimmers(), + trimmer = get_trimmomatic_trimmers(), # optional parameters extra="", compression_level="-9" @@ -68,4 +67,29 @@ rule trimmomatic_pe: resources: mem_mb=1024 wrapper: - "v1.1.0/bio/trimmomatic/pe" \ No newline at end of file + "v1.1.0/bio/trimmomatic/pe" + +rule trimmomatic_se: + input: + "reads/{sample}.fastq.gz" # input and output can be uncompressed or compressed + output: + "results/{ID}/trimmed/trimmomatic/{SAMPLE}.trimmed.fastq.gz", + log: + "logs/{ID}/trimmomatic/{SAMPLE}.log" + params: + # list of trimmers (see manual) + trimmer=get_trimmomatic_trimmers(), + # optional parameters + extra="", + # optional compression levels from -0 to -9 and -11 + compression_level="-9" + threads: + 32 + # optional specification of memory usage of the JVM that snakemake will respect with global + # resource restrictions (https://snakemake.readthedocs.io/en/latest/snakefiles/rules.html#resources) + # and which can be used to request RAM during cluster job submission as `{resources.mem_mb}`: + # https://snakemake.readthedocs.io/en/latest/executing/cluster.html#job-properties + resources: + mem_mb=1024 + wrapper: + "v1.1.0/bio/trimmomatic/se" \ No newline at end of file From 2f4a8f33b1c062e94685698d41c22adf8a074c6b Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Mon, 21 Feb 2022 14:09:37 +0100 Subject: [PATCH 07/22] add option to configure min read length for filtering --- workflow/rules/mapping.smk | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk index 338b8a6..f9dd24c 100644 --- a/workflow/rules/mapping.smk +++ b/workflow/rules/mapping.smk @@ -5,7 +5,7 @@ rule filter_interleaved: output: filtered=temp("results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.filtered.fastq.gz") params: - min_RL = 30 + min_RL = config["NGmerge"]["MINLEN"] conda: "../envs/cfDNA_prep.yaml" shell: @@ -17,7 +17,7 @@ rule filter_merged: output: filtered=temp("results/{ID}/NGmerge/merged/{SAMPLE}_merged.filtered.fastq.gz") params: - min_RL = 30 + min_RL = config["NGmerge"]["MINLEN"] conda: "../envs/cfDNA_prep.yaml" shell: @@ -29,7 +29,7 @@ rule filter_single_read: output: filtered=temp("results/{ID}/fastq/{SAMPLE}_single_read.filtered.fastq.gz") params: - min_RL = 30 + min_RL = config["NGmerge"]["MINLEN"] conda: "../envs/cfDNA_prep.yaml" shell: From 2452ac9ab31ec99f24f52cb4b30cd2f24e9d3b94 Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Tue, 22 Feb 2022 17:16:39 +0100 Subject: [PATCH 08/22] move length filter for NGmerge to filtering.smk --- workflow/rules/filtering.smk | 35 +++++++++++++++++++++++++++++++++++ workflow/rules/mapping.smk | 36 ------------------------------------ 2 files changed, 35 insertions(+), 36 deletions(-) create mode 100644 workflow/rules/filtering.smk diff --git a/workflow/rules/filtering.smk b/workflow/rules/filtering.smk new file mode 100644 index 0000000..5dea300 --- /dev/null +++ b/workflow/rules/filtering.smk @@ -0,0 +1,35 @@ +rule filter_interleaved: + input: + unfiltered="results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.unfiltered.fastq.gz" + output: + filtered=temp("results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.filtered.fastq.gz") + params: + min_RL = config["NGmerge"]["MINLEN"] + conda: + "../envs/cfDNA_prep.yaml" + shell: + "awk 'BEGIN {{FS = \"\\t\" ; OFS = \"\\n\"}} {{header = $0 ; getline seq ; getline qheader ; getline qseq ; if (length(seq) >= {params.min_RL}) {{print header, seq, qheader, qseq}}}}' <( zcat {input.unfiltered} ) | gzip -c > {output.filtered}" + +rule filter_merged: + input: + unfiltered="results/{ID}/NGmerge/merged/{SAMPLE}_merged.unfiltered.fastq.gz" + output: + filtered=temp("results/{ID}/NGmerge/merged/{SAMPLE}_merged.filtered.fastq.gz") + params: + min_RL = config["NGmerge"]["MINLEN"] + conda: + "../envs/cfDNA_prep.yaml" + shell: + "awk 'BEGIN {{FS = \"\\t\" ; OFS = \"\\n\"}} {{header = $0 ; getline seq ; getline qheader ; getline qseq ; if (length(seq) >= {params.min_RL}) {{print header, seq, qheader, qseq}}}}' <( zcat {input.unfiltered} ) | gzip -c > {output.filtered}" + +rule filter_single_read: + input: + unfiltered="results/{ID}/fastq/{SAMPLE}_single_read.fastq.gz" + output: + filtered=temp("results/{ID}/fastq/{SAMPLE}_single_read.filtered.fastq.gz") + params: + min_RL = config["NGmerge"]["MINLEN"] + conda: + "../envs/cfDNA_prep.yaml" + shell: + "awk 'BEGIN {{FS = \"\\t\" ; OFS = \"\\n\"}} {{header = $0 ; getline seq ; getline qheader ; getline qseq ; if (length(seq) >= {params.min_RL}) {{print header, seq, qheader, qseq}}}}' <( zcat {input.unfiltered} ) | gzip -c > {output.filtered}" diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk index f9dd24c..295b906 100644 --- a/workflow/rules/mapping.smk +++ b/workflow/rules/mapping.smk @@ -1,40 +1,4 @@ -rule filter_interleaved: - input: - unfiltered="results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.unfiltered.fastq.gz" - output: - filtered=temp("results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.filtered.fastq.gz") - params: - min_RL = config["NGmerge"]["MINLEN"] - conda: - "../envs/cfDNA_prep.yaml" - shell: - "awk 'BEGIN {{FS = \"\\t\" ; OFS = \"\\n\"}} {{header = $0 ; getline seq ; getline qheader ; getline qseq ; if (length(seq) >= {params.min_RL}) {{print header, seq, qheader, qseq}}}}' <( zcat {input.unfiltered} ) | gzip -c > {output.filtered}" - -rule filter_merged: - input: - unfiltered="results/{ID}/NGmerge/merged/{SAMPLE}_merged.unfiltered.fastq.gz" - output: - filtered=temp("results/{ID}/NGmerge/merged/{SAMPLE}_merged.filtered.fastq.gz") - params: - min_RL = config["NGmerge"]["MINLEN"] - conda: - "../envs/cfDNA_prep.yaml" - shell: - "awk 'BEGIN {{FS = \"\\t\" ; OFS = \"\\n\"}} {{header = $0 ; getline seq ; getline qheader ; getline qseq ; if (length(seq) >= {params.min_RL}) {{print header, seq, qheader, qseq}}}}' <( zcat {input.unfiltered} ) | gzip -c > {output.filtered}" - -rule filter_single_read: - input: - unfiltered="results/{ID}/fastq/{SAMPLE}_single_read.fastq.gz" - output: - filtered=temp("results/{ID}/fastq/{SAMPLE}_single_read.filtered.fastq.gz") - params: - min_RL = config["NGmerge"]["MINLEN"] - conda: - "../envs/cfDNA_prep.yaml" - shell: - "awk 'BEGIN {{FS = \"\\t\" ; OFS = \"\\n\"}} {{header = $0 ; getline seq ; getline qheader ; getline qseq ; if (length(seq) >= {params.min_RL}) {{print header, seq, qheader, qseq}}}}' <( zcat {input.unfiltered} ) | gzip -c > {output.filtered}" - rule map_reads: input: From 7cf472eddeaed5b0e6619d27f543dd07bbd45ea9 Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Tue, 22 Feb 2022 17:42:58 +0100 Subject: [PATCH 09/22] remove common.py --- workflow/scripts/common.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 workflow/scripts/common.py diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py deleted file mode 100644 index 8780733..0000000 --- a/workflow/scripts/common.py +++ /dev/null @@ -1 +0,0 @@ -# Any Python script in the scripts folder will be able to import from this module. From 7b3d1d1d7c4137a24c3f15ce73fc343f7994913d Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Tue, 22 Feb 2022 17:43:37 +0100 Subject: [PATCH 10/22] add README.md, containing info on NGmerge --- workflow/scripts/README.md | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 workflow/scripts/README.md diff --git a/workflow/scripts/README.md b/workflow/scripts/README.md new file mode 100644 index 0000000..ba73951 --- /dev/null +++ b/workflow/scripts/README.md @@ -0,0 +1,3 @@ +# NGmerge executable + +The NGmerge executable was downloaded and compiled as described in the official documentation of [NGmerge v0.3](https://github.com/jsh58/NGmerge/releases/tag/v0.3). The main reason is an [outdated version of the bioconda recipe](https://github.com/jsh58/NGmerge/issues/20). \ No newline at end of file From 344fc92f6a46d10b26d38692391adf4abb0426c0 Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Tue, 22 Feb 2022 17:43:54 +0100 Subject: [PATCH 11/22] initial commit --- workflow/scripts/bwa-mem2_wrapper.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 workflow/scripts/bwa-mem2_wrapper.py diff --git a/workflow/scripts/bwa-mem2_wrapper.py b/workflow/scripts/bwa-mem2_wrapper.py new file mode 100644 index 0000000..f670024 --- /dev/null +++ b/workflow/scripts/bwa-mem2_wrapper.py @@ -0,0 +1,12 @@ +# Any Python script in the scripts folder will be able to import from this module. +__author__ = "Sebastian Röner" +__copyright__ = ( + "Copyright 2022, Sebastian Röner" +) +__email__ = "sebastian.roener@bih-charite.de" +__license__ = "MIT" + + +from os import path + +from snakemake.shell import shell \ No newline at end of file From 60e73c9c5e1994123568b5219dd06ff1248a03e8 Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Thu, 24 Feb 2022 13:18:52 +0100 Subject: [PATCH 12/22] add notes on bwa-mem2 --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index 92b2f5b..420f1d3 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ Insert your code into the respective folders, i.e. `scripts`, `rules`, and `envs - [Authors](#authors) - [Table of Contents](#table-of-contents) - [Dependencies](#dependencies) + - [Notes](#notes) - [Usage](#usage) - [Step 1: Obtain a copy of this workflow](#step-1-obtain-a-copy-of-this-workflow) - [Step 2: Configure workflow](#step-2-configure-workflow) @@ -35,6 +36,20 @@ Dependencies are automatically managed by conda environments. Only exception is - NGmerge v0.3.0 ``` +## Notes + +- The indext creation of bwa-mem2 is resource intesive (needs quotation): + +``` +# Indexing the reference sequence (Requires 28N GB memory where N is the size of the reference sequence). +./bwa-mem2 index [-p prefix] +Where + is the path to reference sequence fasta file and + is the prefix of the names of the files that store the resultant index. Default is in.fasta. +``` + +- bwa-mem2 mem gets resource intensive when using many cores + ## Usage If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository and, if available, its DOI (see above). From c54d8d7046d34838b8fb33fb6ea25067a5c0bdbd Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Thu, 24 Feb 2022 13:20:21 +0100 Subject: [PATCH 13/22] substitute bwa with bwa-mem2 --- workflow/envs/cfDNA_prep.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/envs/cfDNA_prep.yaml b/workflow/envs/cfDNA_prep.yaml index 9d3f40b..a7b6602 100644 --- a/workflow/envs/cfDNA_prep.yaml +++ b/workflow/envs/cfDNA_prep.yaml @@ -6,6 +6,6 @@ channels: dependencies: - bedtools - python=3.8 - - bwa + - bwa-mem2 - samtools=1.14 From 54a790ebbde9ecccafae27e929e1eb2e3df19d5f Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Thu, 24 Feb 2022 13:21:13 +0100 Subject: [PATCH 14/22] add separate filtering rule --- workflow/Snakefile | 1 + 1 file changed, 1 insertion(+) diff --git a/workflow/Snakefile b/workflow/Snakefile index 0538017..ae65c0a 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -10,6 +10,7 @@ include: "rules/reference.smk" include: "rules/raw_input_QC.smk" include: "rules/bam_to_fastq.smk" include: "rules/trimming.smk" +include: "rules/filtering.smk" include: "rules/mapping.smk" include: "rules/QualityControl.smk" From a21ccc7400d0b458968684530379497a4aa49035 Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Thu, 24 Feb 2022 13:22:21 +0100 Subject: [PATCH 15/22] substitute bwa indexing with bwa-mem2 version --- workflow/rules/reference.smk | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/workflow/rules/reference.smk b/workflow/rules/reference.smk index 08ff5ac..52a17b7 100644 --- a/workflow/rules/reference.smk +++ b/workflow/rules/reference.smk @@ -14,20 +14,19 @@ rule get_reference: -rule bwa_index: +rule bwa_mem2_index: input: ref="resources/reference/{GENOME}.fa", output: - ref=multiext( "resources/reference/{GENOME}.fa", ".amb", ".ann", ".bwt", ".pac", ".sa"), - params: - algorithm="bwtsw", + ref=multiext( "resources/reference/{GENOME}.fa", ".amb", ".ann", ".pac", ), + #ref=multiext( "resources/reference/{GENOME}.fa", ".amb", ".ann", ".bwt", ".pac", ".sa"), log: "logs/bwa_index-{GENOME}.log", conda: "../envs/cfDNA_prep.yaml" - threads: 1 + threads: 16 shell: - "bwa index -a {params.algorithm} {input.ref}" + "bwa-mem2 index {input.ref}" rule get_trimmomatic_adapters: From 0e4a5744dd2ee4b87960d111041f28e68a0bc2fd Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Thu, 24 Feb 2022 13:26:01 +0100 Subject: [PATCH 16/22] initial commit --- workflow/scripts/bwa-mem2_wrapper.py | 34 +++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/bwa-mem2_wrapper.py b/workflow/scripts/bwa-mem2_wrapper.py index f670024..911ef63 100644 --- a/workflow/scripts/bwa-mem2_wrapper.py +++ b/workflow/scripts/bwa-mem2_wrapper.py @@ -8,5 +8,37 @@ from os import path +from wsgiref.handlers import BaseCGIHandler + +from snakemake.shell import shell + +inputs = snakemake.input +params = snakemake.params + +log = snakemake.log_fmt_shell(stdout=False, stderr=True) + +# Check inputs/arguments. +if not isinstance(snakemake.input.reads, str) and len(snakemake.input.reads) not in { + 1, + 2, +}: + raise ValueError("input must have 1 (single-end) or 2 (paired-end) elements") + + +non_merged_cmd = "" +singleton_cmd = "" + +if snakemake.input.get("non_merged"): + non_merged_cmd = "bwa-mem2 mem -t {snakemake.threads} -R \"{snakemake.params.RG}\" {snakemake.input.ref} {snakemake.input.non_merged} | grep -v \"^@\" || true ; " + +if snakemake.input.get("single_reads"): + singleton_cmd = "bwa-mem2 mem -t {snakemake.threads} -R \"{snakemake.params.RG}\" {snakemake.input.ref} {snakemake.input.single_reads} | grep -v \"^@\" || true ; " + +base_cmd = f"((bwa-mem2 mem -t {{snakemake.threads}} -R \"{{snakemake.params.RG}}\" {{snakemake.input.ref}} {{snakemake.input.reads}}; \ +{non_merged_cmd}\ +{singleton_cmd}\ +) | samtools view -b -o {{snakemake.output.mapped_reads}} - ) 2>{{snakemake.log}}" + + +shell(base_cmd) -from snakemake.shell import shell \ No newline at end of file From 4b4be9bc93eb24b031675845dd2873a92b3a7d8c Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Thu, 24 Feb 2022 13:26:32 +0100 Subject: [PATCH 17/22] update mapping from bwa to bwa-mem2 --- workflow/rules/mapping.smk | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk index 295b906..6cc7544 100644 --- a/workflow/rules/mapping.smk +++ b/workflow/rules/mapping.smk @@ -1,26 +1,22 @@ - -rule map_reads: +rule map_reads_bwa2: input: ref=lambda wc: get_reference(wc), - ref_index=lambda wc: get_reference(wc)+".amb", - merged="results/{ID}/NGmerge/merged/{SAMPLE}_merged.filtered.fastq.gz", + ref_index=lambda wc: multiext(get_reference(wc),".amb",".ann"), + reads="results/{ID}/NGmerge/merged/{SAMPLE}_merged.filtered.fastq.gz", non_merged="results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.filtered.fastq.gz", - single_read = "results/{ID}/fastq/{SAMPLE}_single_read.filtered.fastq.gz", + single_reads = "results/{ID}/fastq/{SAMPLE}_single_read.filtered.fastq.gz", output: - mapped_reads=temp("results/{ID}/mapped_reads/{SAMPLE}_all.{GENOME}.bam") + mapped_reads=temp("results/{ID}/mapped_reads/{SAMPLE}_bwa2.{GENOME}.bam") params: RG=lambda wc: get_read_group(wc.SAMPLE), log: "results/logs/{ID}/mapping/{SAMPLE}_all.{GENOME}.log", conda: "../envs/cfDNA_prep.yaml" - threads: 64 - shell: - "((bwa mem -t {threads} -R \"{params.RG}\" {input.ref} {input.merged}; " - "bwa mem -t {threads} -R \"{params.RG}\" {input.ref} {input.non_merged} | grep -v \"^@\" || true ; " - "bwa mem -t {threads} -R \"{params.RG}\" {input.ref} {input.single_read}" - "| grep -v \"^@\" || true) | samtools view -b -o {output.mapped_reads} - ) 2>{log}" + threads: 32 + script: + "../scripts/bwa-mem2_wrapper.py" rule mark_duplicates: input: From 92f4419462e486cca173a2c702b276c7647de1c7 Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Thu, 24 Feb 2022 18:15:28 +0100 Subject: [PATCH 18/22] update notes section --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 420f1d3..04cafe9 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,9 @@ Where is the prefix of the names of the files that store the resultant index. Default is in.fasta. ``` -- bwa-mem2 mem gets resource intensive when using many cores +- bwa-mem2 mem uses around 4GB memory per thread + +- NGmerge merges/removes adapter from 3' end of the reads. The ends of the merged reads are defined by the 5' ends of the reads. ## Usage From c9fa7f8b42e81e6025f66f8841c1a6f1c41ca13b Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Thu, 24 Feb 2022 18:16:55 +0100 Subject: [PATCH 19/22] add new options for trimming and mapping --- config/example.config.yaml | 50 +++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/config/example.config.yaml b/config/example.config.yaml index fb0bc4c..8f0ccad 100644 --- a/config/example.config.yaml +++ b/config/example.config.yaml @@ -14,4 +14,52 @@ GRCh38: ### global variables ### -TMPDIR: "" # path to directory for writing TMP files \ No newline at end of file +TMPDIR: "" # path to directory for writing TMP files + + + +### trimming ### + +trimming_algorithm: "trimmomatic" #can be either NGmerge or trimmomatic + +#### NGmerge specific options ####9 + +NGmerge: + MINLEN: 30 # min lenght of reads in additional filter steps + +#### trimmomatic specific options #### + + +# Illuminaclip takes a fasta file with adapter sequences and removes them in the trimming step. +# The adapter_files option takes either the path to a custom file +# or one of the following default files provided by Illumina and Trimmomatic: + # "NexteraPE-PE.fa", + # "TruSeq2-PE.fa", + # "TruSeq2-SE.fa", + # "TruSeq3-PE-2.fa", + # "TruSeq3-PE.fa", + # "TruSeq3-SE.fa", +# For more information on Trimmomatic and its functionality can be found at: +# http://www.usadellab.org/cms/?page=trimmomatic +trimmers: + Illuminaclip: + adapter_files: "TruSeq3-PE.fa" # filter not used if empty/invalid; can be either a path to a fasta file or the name of the default files; + seedMismatches: 2 # if not provided, defaults to 4 + palindromeClipThreshold: 30 # if not provided, defaults to 30 + simpleClipThreshold: 10 # if not provided, defaults to 10 + minAdapterLength: 8 #optional argument, default ist 8 + keepBothReads: True # if not provided, defaults to False + LEADING: 3 #(LEADING:3) Remove leading low quality or N bases (below quality 3); deactivated when set to 0 + TRAILING: 3 #(TRAILING:3) Remove trailing low quality or N bases (below quality 3); deactivated when set to 0 + SLIDINGWINDOW: #(SLIDINGWINDOW:4:15) Scan the read with a 4-base wide sliding window, cutting when the average quality per base drops below 15; deactivated when set to 0 + window: 4 # deactivated when set to 0 + quality_threshold: 15 + MINLEN: 30 #Drop reads below a ceartain length (e.g. 36 bp) (MINLEN:36); deactivated when set to 0 + + +### Mapping ### + +# This option lets you add unpaired/singleton reads in the mapping step that were filtered, +# but are either not paired or not merged. Otherwise these reads are not further processed. +mapping: + all_data: False # default is False \ No newline at end of file From e212d065cd75e727c13471f61be73d760b045617 Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Thu, 24 Feb 2022 18:50:40 +0100 Subject: [PATCH 20/22] remove intermediate files aufter processing --- workflow/rules/trimming.smk | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/workflow/rules/trimming.smk b/workflow/rules/trimming.smk index aa76caf..86ff384 100644 --- a/workflow/rules/trimming.smk +++ b/workflow/rules/trimming.smk @@ -27,7 +27,7 @@ rule NGmerge_adapter: non_merged_2="results/{ID}/NGmerge/nonmerged/{SAMPLE}_nonmerged_2.fastq.gz", qual_table="resources/qual_profile.txt", output: - interleaved_output=temp("results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.unfiltered.fastq.gz") + interleaved_output=temp("results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.unfiltered.fastq.gz"), params: adapt_minlen=1 log: @@ -45,11 +45,11 @@ rule trimmomatic_pe: input: unpack(get_trimming_input), output: - r1="results/{ID}/trimmed/trimmomatic/{SAMPLE}.1.fastq.gz", - r2="results/{ID}/trimmed/trimmomatic/{SAMPLE}.2.fastq.gz", + r1=temp("results/{ID}/trimmed/trimmomatic/{SAMPLE}.1.fastq.gz"), + r2=temp("results/{ID}/trimmed/trimmomatic/{SAMPLE}.2.fastq.gz"), # reads where trimming entirely removed the mate - r1_unpaired="results/{ID}/trimmed/trimmomatic/{SAMPLE}.1.unpaired.fastq.gz", - r2_unpaired="results/{ID}/trimmed/trimmomatic/{SAMPLE}.2.unpaired.fastq.gz" + r1_unpaired=temp("results/{ID}/trimmed/trimmomatic/{SAMPLE}.1.unpaired.fastq.gz"), + r2_unpaired=temp("results/{ID}/trimmed/trimmomatic/{SAMPLE}.2.unpaired.fastq.gz") log: "logs/{ID}/trimmomatic/{SAMPLE}.log" params: @@ -73,7 +73,7 @@ rule trimmomatic_se: input: "reads/{sample}.fastq.gz" # input and output can be uncompressed or compressed output: - "results/{ID}/trimmed/trimmomatic/{SAMPLE}.trimmed.fastq.gz", + temp("results/{ID}/trimmed/trimmomatic/{SAMPLE}.trimmed.fastq.gz",) log: "logs/{ID}/trimmomatic/{SAMPLE}.log" params: From 4f99fdf09f207db8f6b6ec4870b7f2a8cb02868e Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Thu, 24 Feb 2022 18:51:56 +0100 Subject: [PATCH 21/22] add input function for mapping --- workflow/rules/common.smk | 19 +++++++++++++++++++ workflow/rules/mapping.smk | 7 ++----- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 699005a..ce47fc8 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -198,3 +198,22 @@ def get_trimmomatic_trimmers(): trimmers.append(f"MINLEN:{MINLEN}") return trimmers + + + +def get_mapping_input(wildcards): + mapping_input = dict() + trimming_algorithm = config["trimming_algorithm"] + all_data = config["mapping"]["all_data"] + + if trimming_algorithm.lower() == "ngmerge": + mapping_input["reads"] = "results/{ID}/NGmerge/merged/{SAMPLE}_merged.filtered.fastq.gz" + if all_data: + mapping_input["non_merged"] = "results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.filtered.fastq.gz" + mapping_input["single_reads"] = "results/{ID}/fastq/{SAMPLE}_single_read.filtered.fastq.gz" + elif trimming_algorithm.lower() == "trimmomatic": + mapping_input["reads"] = ["results/{ID}/trimmed/trimmomatic/{SAMPLE}.1.fastq.gz", "results/{ID}/trimmed/trimmomatic/{SAMPLE}.2.fastq.gz",] + if all_data: + mapping_input["non_merged"] = ["results/{ID}/trimmed/trimmomatic/{SAMPLE}.1.unpaired.fastq.gz","results/{ID}/trimmed/trimmomatic/{SAMPLE}.2.unpaired.fastq.gz"] + + return mapping_input \ No newline at end of file diff --git a/workflow/rules/mapping.smk b/workflow/rules/mapping.smk index 6cc7544..7c8cf5f 100644 --- a/workflow/rules/mapping.smk +++ b/workflow/rules/mapping.smk @@ -1,13 +1,10 @@ - rule map_reads_bwa2: input: + unpack(get_mapping_input), ref=lambda wc: get_reference(wc), ref_index=lambda wc: multiext(get_reference(wc),".amb",".ann"), - reads="results/{ID}/NGmerge/merged/{SAMPLE}_merged.filtered.fastq.gz", - non_merged="results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.filtered.fastq.gz", - single_reads = "results/{ID}/fastq/{SAMPLE}_single_read.filtered.fastq.gz", output: - mapped_reads=temp("results/{ID}/mapped_reads/{SAMPLE}_bwa2.{GENOME}.bam") + mapped_reads=temp("results/{ID}/mapped_reads/{SAMPLE}_all.{GENOME}.bam") params: RG=lambda wc: get_read_group(wc.SAMPLE), log: From 62c827f2b3beee397c5bc03adfccf63723b0fc61 Mon Sep 17 00:00:00 2001 From: sroener <40714954+sroener@users.noreply.github.com> Date: Thu, 24 Feb 2022 18:52:47 +0100 Subject: [PATCH 22/22] refactor: reorder of rules --- workflow/rules/filtering.smk | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/workflow/rules/filtering.smk b/workflow/rules/filtering.smk index 5dea300..358041f 100644 --- a/workflow/rules/filtering.smk +++ b/workflow/rules/filtering.smk @@ -1,8 +1,8 @@ -rule filter_interleaved: +rule filter_merged: input: - unfiltered="results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.unfiltered.fastq.gz" + unfiltered="results/{ID}/NGmerge/merged/{SAMPLE}_merged.unfiltered.fastq.gz" output: - filtered=temp("results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.filtered.fastq.gz") + filtered=temp("results/{ID}/NGmerge/merged/{SAMPLE}_merged.filtered.fastq.gz") params: min_RL = config["NGmerge"]["MINLEN"] conda: @@ -10,11 +10,12 @@ rule filter_interleaved: shell: "awk 'BEGIN {{FS = \"\\t\" ; OFS = \"\\n\"}} {{header = $0 ; getline seq ; getline qheader ; getline qseq ; if (length(seq) >= {params.min_RL}) {{print header, seq, qheader, qseq}}}}' <( zcat {input.unfiltered} ) | gzip -c > {output.filtered}" -rule filter_merged: + +rule filter_interleaved: input: - unfiltered="results/{ID}/NGmerge/merged/{SAMPLE}_merged.unfiltered.fastq.gz" + unfiltered="results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.unfiltered.fastq.gz" output: - filtered=temp("results/{ID}/NGmerge/merged/{SAMPLE}_merged.filtered.fastq.gz") + filtered=temp("results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.filtered.fastq.gz"), params: min_RL = config["NGmerge"]["MINLEN"] conda: