feat: Merge trimming and mapping update; resolves #8, resolves #9, resolves #10 #19

Merged
merged 23 commits
Feb 24, 2022
Commits
23 commits
af854c3
add rule to download illumina adapter from trimmomatic repo
sroener Feb 17, 2022
320497b
change name of get_NGmerge_input to get_trimming_input
sroener Feb 17, 2022
8629006
move NGmerge to trimming rule; add trimmomatic to trimming rule
sroener Feb 17, 2022
d026f91
add separate trimming rule
sroener Feb 21, 2022
3d93a4f
parse trimmomatic options from config.yaml; return properly formatted…
sroener Feb 21, 2022
153574e
add trimmomatic SE rule
sroener Feb 21, 2022
2f4a8f3
add option to configure min read length for filtering
sroener Feb 21, 2022
2452ac9
move length filter for NGmerge to filtering.smk
sroener Feb 22, 2022
7cf472e
remove common.py
sroener Feb 22, 2022
7b3d1d1
add README.md, containing info on NGmerge
sroener Feb 22, 2022
344fc92
initial commit
sroener Feb 22, 2022
60e73c9
add notes on bwa-mem2
sroener Feb 24, 2022
c54d8d7
substitute bwa with bwa-mem2
sroener Feb 24, 2022
54a790e
add separate filtering rule
sroener Feb 24, 2022
a21ccc7
substitute bwa indexing with bwa-mem2 version
sroener Feb 24, 2022
0e4a574
initial commit
sroener Feb 24, 2022
4b4be9b
update mapping from bwa to bwa-mem2
sroener Feb 24, 2022
92f4419
update notes section
sroener Feb 24, 2022
c9fa7f8
add new options for trimming and mapping
sroener Feb 24, 2022
e212d06
remove intermediate files after processing
sroener Feb 24, 2022
4f99fdf
add input function for mapping
sroener Feb 24, 2022
62c827f
refactor: reorder of rules
sroener Feb 24, 2022
bd7f24d
Merge branch 'bwa-mem2_integration' into develop
sroener Feb 24, 2022
17 changes: 17 additions & 0 deletions README.md
@@ -15,6 +15,7 @@ Insert your code into the respective folders, i.e. `scripts`, `rules`, and `envs
- [Authors](#authors)
- [Table of Contents](#table-of-contents)
- [Dependencies](#dependencies)
- [Notes](#notes)
- [Usage](#usage)
- [Step 1: Obtain a copy of this workflow](#step-1-obtain-a-copy-of-this-workflow)
- [Step 2: Configure workflow](#step-2-configure-workflow)
@@ -35,6 +36,22 @@ Dependencies are automatically managed by conda environments. Only exception is
- NGmerge v0.3.0
```

## Notes

- The index creation of bwa-mem2 is resource intensive (quoted from the bwa-mem2 documentation; a rough memory estimate is sketched after these notes):

```
# Indexing the reference sequence (Requires 28N GB memory where N is the size of the reference sequence).
./bwa-mem2 index [-p prefix] <in.fasta>
Where
<in.fasta> is the path to reference sequence fasta file and
<prefix> is the prefix of the names of the files that store the resultant index. Default is in.fasta.
```

- bwa-mem2 mem uses around 4GB memory per thread

- NGmerge merges read pairs and removes adapters from the 3' ends of the reads. The ends of the merged reads are defined by the 5' ends of the input reads.
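
A rough back-of-the-envelope check of the "28N GB" figure quoted above, as a sketch only (the ~3.1 GB size assumed for an uncompressed human reference is an assumption, not part of the workflow):

```
# Rough memory estimate for bwa-mem2 index creation (sketch only, not part of the workflow).
ref_size_gb = 3.1                  # assumed size of an uncompressed human reference such as hg38
index_mem_gb = 28 * ref_size_gb    # "28N GB" rule of thumb quoted from the bwa-mem2 documentation
print(f"~{index_mem_gb:.0f} GB of memory needed for indexing")  # prints "~87 GB ..."
```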

## Usage

If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of this (original) repository and, if available, its DOI (see above).
50 changes: 49 additions & 1 deletion config/example.config.yaml
@@ -14,4 +14,52 @@ GRCh38:

### global variables ###

TMPDIR: "" # path to directory for writing TMP files
TMPDIR: "" # path to directory for writing TMP files



### trimming ###

trimming_algorithm: "trimmomatic" #can be either NGmerge or trimmomatic

#### NGmerge specific options ####

NGmerge:
MINLEN: 30 # minimum length of reads in the additional filter steps

#### trimmomatic specific options ####


# Illuminaclip takes a fasta file with adapter sequences and removes them in the trimming step.
# The adapter_files option takes either the path to a custom file </PATH/TO/CUSTOM/ADAPTER.fa>
# or one of the following default files provided by Illumina and Trimmomatic:
# "NexteraPE-PE.fa",
# "TruSeq2-PE.fa",
# "TruSeq2-SE.fa",
# "TruSeq3-PE-2.fa",
# "TruSeq3-PE.fa",
# "TruSeq3-SE.fa",
# More information on Trimmomatic and its functionality can be found at:
# http://www.usadellab.org/cms/?page=trimmomatic
trimmers:
Illuminaclip:
adapter_files: "TruSeq3-PE.fa" # filter not used if empty/invalid; can be either a path to a fasta file or the name of the default files;
seedMismatches: 2 # if not provided, defaults to 4
palindromeClipThreshold: 30 # if not provided, defaults to 30
simpleClipThreshold: 10 # if not provided, defaults to 10
minAdapterLength: 8 # optional argument, default is 8
keepBothReads: True # if not provided, defaults to False
LEADING: 3 #(LEADING:3) Remove leading low quality or N bases (below quality 3); deactivated when set to 0
TRAILING: 3 #(TRAILING:3) Remove trailing low quality or N bases (below quality 3); deactivated when set to 0
SLIDINGWINDOW: #(SLIDINGWINDOW:4:15) Scan the read with a 4-base wide sliding window, cutting when the average quality per base drops below 15; deactivated when set to 0
window: 4 # deactivated when set to 0
quality_threshold: 15
MINLEN: 30 #Drop reads below a certain length (e.g. 36 bp) (MINLEN:36); deactivated when set to 0


### Mapping ###

# This option includes unpaired/singleton reads in the mapping step, i.e. reads that passed filtering
# but are either not paired or not merged. Otherwise these reads are not processed further.
mapping:
all_data: False # default is False
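
With the example settings above, the list of Trimmomatic trimming steps assembled by the workflow (see `get_trimmomatic_trimmers()` in `workflow/rules/common.smk` below) would look roughly as follows, assuming the bundled `TruSeq3-PE.fa` adapter file has been downloaded to `resources/adapter/`:

```
["ILLUMINACLIP:resources/adapter/TruSeq3-PE.fa:2:30:10:8:True",
 "SLIDINGWINDOW:4:15",
 "LEADING:3",
 "TRAILING:3",
 "MINLEN:30"]
```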
2 changes: 2 additions & 0 deletions workflow/Snakefile
@@ -9,6 +9,8 @@ include: "rules/common.smk"
include: "rules/reference.smk"
include: "rules/raw_input_QC.smk"
include: "rules/bam_to_fastq.smk"
include: "rules/trimming.smk"
include: "rules/filtering.smk"
include: "rules/mapping.smk"
include: "rules/QualityControl.smk"

2 changes: 1 addition & 1 deletion workflow/envs/cfDNA_prep.yaml
@@ -6,6 +6,6 @@ channels:
dependencies:
- bedtools
- python=3.8
- bwa
- bwa-mem2
- samtools=1.14

162 changes: 140 additions & 22 deletions workflow/rules/common.smk
@@ -24,7 +24,7 @@ def get_final_output():
"results/{ID}/flagstat/{SAMPLE}_flagstat.txt.gz",
zip,
ID=samples["ID"],
SAMPLE=samples["sample"]
SAMPLE=samples["sample"],
)
)
final_output.extend(
@@ -33,7 +33,7 @@ def get_final_output():
zip,
ID=samples["ID"],
SAMPLE=samples["sample"],
GENOME=samples["genome_build"]
GENOME=samples["genome_build"],
)
)
final_output.extend(
@@ -42,60 +42,178 @@
zip,
ID=samples["ID"],
SAMPLE=samples["sample"],
GENOME=samples["genome_build"]
GENOME=samples["genome_build"],
)
)
final_output.extend(
expand(
"results/{ID}/qc/multiqc.html",
ID=samples["ID"].unique()
)
expand("results/{ID}/qc/multiqc.html", ID=samples["ID"].unique())
)

return final_output


def get_read_group(sample):
ID = samples.loc[sample].loc["ID"]
library = samples.loc[sample].loc["library_name"]
platform = samples.loc[sample].loc["platform"]
info = samples.loc[sample].loc["info"]
if pd.isna(info):
info=""
else:
info="_"+info
info = ""
else:
info = "_" + info
RGID = f"{ID}_{sample}{info}"
RG = f"@RG\\tID:{sample}\\tSM:{RGID}\\tLB:{library}\\tPL:{platform}"
return RG

### not fully implemented -> cases: fastQ files as input -> SR,PE
def get_NGmerge_input(wildcards):
sample=wildcards.SAMPLE
inpath=samples.loc[sample].loc["path"]

### not fully implemented -> cases: fastQ files as input -> SE,PE
def get_trimming_input(wildcards):
sample = wildcards.SAMPLE
inpath = samples.loc[sample].loc["path"]
if ".bam" in inpath.lower():
return {"r1":"results/{ID}/fastq/{SAMPLE}_R1.fastq.gz","r2":"results/{ID}/fastq/{SAMPLE}_R2.fastq.gz"}
return {
"r1": "results/{ID}/fastq/{SAMPLE}_R1.fastq.gz",
"r2": "results/{ID}/fastq/{SAMPLE}_R2.fastq.gz",
}


### get reference based on genome_build provided in samples.tsv
def get_reference(wildcards):
genome_build = wildcards.GENOME
gpath = config[genome_build]["reference"]
p=Path(gpath)
p = Path(gpath)
if p.exists() and not p.is_dir():
try:
p.open().close()
return p.as_posix()
except PermissionError as f:
print(f,file = sys.stderr)
print(f"Please check file permissions of {gpath} or remove path from config.yaml \
download a reference.",file = sys.stderr)
print(
f"Please check file permissions of {gpath} or remove path from config.yaml \
download a reference.",
file=sys.stderr,
)
raise f
else:
return f"resources/reference/{genome_build}.fa"


### provides download URLs for UCSC human genomes
def get_ref_url(wildcards):
genome_build = wildcards.GENOME
url_dict={
"hg19":"https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz",
"hg38":"https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz",
url_dict = {
"hg19": "https://hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/hg19.fa.gz",
"hg38": "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.fa.gz",
}
return url_dict[genome_build]


### returns properly formatted trimming rules based on config.yaml
def get_trimmomatic_trimmers():
default_adapter = {
"nexterape-pe.fa": "resources/adapter/NexteraPE-PE.fa",
"truseq2-pe.fa": "resources/adapter/TruSeq2-PE.fa",
"truseq2-se.fa": "resources/adapter/TruSeq2-SE.fa",
"truseq3-pe-2.fa": "resources/adapter/TruSeq3-PE-2.fa",
"truseq3-pe.fa": "resources/adapter/TruSeq3-PE.fa",
"truseq3-se.fa": "resources/adapter/TruSeq3-SE.fa",
}
trimmers = list()
t_conf = config["trimmers"]

if default_adapter.get(t_conf["Illuminaclip"]["adapter_files"].lower()) is not None:
adapter_path = Path(
default_adapter.get(t_conf["Illuminaclip"]["adapter_files"].lower())
)
else:
adapter_path = Path(t_conf["Illuminaclip"]["adapter_files"])

if isinstance(t_conf["Illuminaclip"]["seedMismatches"], int):
mismatches = abs(t_conf["Illuminaclip"]["seedMismatches"])
else:
mismatches = 4
print(
f"Invalid config, setting default value for seedMismatches: {mismatches}",
file=sys.stderr,
)

if isinstance(t_conf["Illuminaclip"]["palindromeClipThreshold"], int):
read_overlap = abs(t_conf["Illuminaclip"]["palindromeClipThreshold"])
else:
read_overlap = 30
print(
f"Invalid config, setting default value for palindromeClipThreshold: {read_overlap}",
file=sys.stderr,
)

if isinstance(t_conf["Illuminaclip"]["simpleClipThreshold"], int):
minimal_overlap = abs(t_conf["Illuminaclip"]["simpleClipThreshold"])
else:
minimal_overlap = 10
print(
f"Invalid config, setting default value for simpleClipThreshold: {minimal_overlap}",
file=sys.stderr,
)

if isinstance(t_conf["Illuminaclip"]["minAdapterLength"], int):
minAdapterLength = abs(t_conf["Illuminaclip"]["minAdapterLength"])
else:
minAdapterLength = 8
print(
f"Invalid config, setting default value for minAdapterLength: {minAdapterLength}",
file=sys.stderr,
)
if t_conf["Illuminaclip"]["keepBothReads"] is True:
keepBothReads = "True"
else:
keepBothReads = "False"

if adapter_path.exists() and not adapter_path.is_dir():
try:
adapter_path.open().close()
illumina_string = f"ILLUMINACLIP:{adapter_path}:{mismatches}:{read_overlap}:{minimal_overlap}:{minAdapterLength}:{keepBothReads}"
trimmers.append(illumina_string)
except PermissionError as f:
print(
f"Please check file permissions of {adapter_path} or select one of the provided files.",
file=sys.stderr,
)
raise f

if (
t_conf["SLIDINGWINDOW"]["window"] > 0
and t_conf["SLIDINGWINDOW"]["quality_threshold"] > 0
):
window = t_conf["SLIDINGWINDOW"]["window"]
threshold = t_conf["SLIDINGWINDOW"]["quality_threshold"]
trimmers.append(f"SLIDINGWINDOW:{window}:{threshold}")

if t_conf["LEADING"] > 0:
LEADING = t_conf["LEADING"]
trimmers.append(f"LEADING:{LEADING}")
if t_conf["TRAILING"] > 0:
TRAILING = t_conf["TRAILING"]
trimmers.append(f"TRAILING:{TRAILING}")
if t_conf["MINLEN"] > 0:
MINLEN = t_conf["MINLEN"]
trimmers.append(f"MINLEN:{MINLEN}")

return trimmers



def get_mapping_input(wildcards):
mapping_input = dict()
trimming_algorithm = config["trimming_algorithm"]
all_data = config["mapping"]["all_data"]

if trimming_algorithm.lower() == "ngmerge":
mapping_input["reads"] = "results/{ID}/NGmerge/merged/{SAMPLE}_merged.filtered.fastq.gz"
if all_data:
mapping_input["non_merged"] = "results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.filtered.fastq.gz"
mapping_input["single_reads"] = "results/{ID}/fastq/{SAMPLE}_single_read.filtered.fastq.gz"
elif trimming_algorithm.lower() == "trimmomatic":
mapping_input["reads"] = ["results/{ID}/trimmed/trimmomatic/{SAMPLE}.1.fastq.gz", "results/{ID}/trimmed/trimmomatic/{SAMPLE}.2.fastq.gz",]
if all_data:
mapping_input["non_merged"] = ["results/{ID}/trimmed/trimmomatic/{SAMPLE}.1.unpaired.fastq.gz","results/{ID}/trimmed/trimmomatic/{SAMPLE}.2.unpaired.fastq.gz"]

return mapping_input
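
For illustration, a sketch of the dictionary `get_mapping_input()` returns when `trimming_algorithm` is `trimmomatic` and `mapping: all_data` is `True`; the paths are taken from the rules above, and the `{ID}` and `{SAMPLE}` wildcards are filled in by Snakemake at run time:

```
# Sketch only: expected return value of get_mapping_input() for the trimmomatic branch
# with all_data enabled; wildcards are resolved by Snakemake.
{
    "reads": [
        "results/{ID}/trimmed/trimmomatic/{SAMPLE}.1.fastq.gz",
        "results/{ID}/trimmed/trimmomatic/{SAMPLE}.2.fastq.gz",
    ],
    "non_merged": [
        "results/{ID}/trimmed/trimmomatic/{SAMPLE}.1.unpaired.fastq.gz",
        "results/{ID}/trimmed/trimmomatic/{SAMPLE}.2.unpaired.fastq.gz",
    ],
}
```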
36 changes: 36 additions & 0 deletions workflow/rules/filtering.smk
@@ -0,0 +1,36 @@
rule filter_merged:
input:
unfiltered="results/{ID}/NGmerge/merged/{SAMPLE}_merged.unfiltered.fastq.gz"
output:
filtered=temp("results/{ID}/NGmerge/merged/{SAMPLE}_merged.filtered.fastq.gz")
params:
min_RL = config["NGmerge"]["MINLEN"]
conda:
"../envs/cfDNA_prep.yaml"
shell:
"awk 'BEGIN {{FS = \"\\t\" ; OFS = \"\\n\"}} {{header = $0 ; getline seq ; getline qheader ; getline qseq ; if (length(seq) >= {params.min_RL}) {{print header, seq, qheader, qseq}}}}' <( zcat {input.unfiltered} ) | gzip -c > {output.filtered}"


rule filter_interleaved:
input:
unfiltered="results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.unfiltered.fastq.gz"
output:
filtered=temp("results/{ID}/NGmerge/nonmerged/{SAMPLE}_interleaved_noadapters.filtered.fastq.gz"),
params:
min_RL = config["NGmerge"]["MINLEN"]
conda:
"../envs/cfDNA_prep.yaml"
shell:
"awk 'BEGIN {{FS = \"\\t\" ; OFS = \"\\n\"}} {{header = $0 ; getline seq ; getline qheader ; getline qseq ; if (length(seq) >= {params.min_RL}) {{print header, seq, qheader, qseq}}}}' <( zcat {input.unfiltered} ) | gzip -c > {output.filtered}"

rule filter_single_read:
input:
unfiltered="results/{ID}/fastq/{SAMPLE}_single_read.fastq.gz"
output:
filtered=temp("results/{ID}/fastq/{SAMPLE}_single_read.filtered.fastq.gz")
params:
min_RL = config["NGmerge"]["MINLEN"]
conda:
"../envs/cfDNA_prep.yaml"
shell:
"awk 'BEGIN {{FS = \"\\t\" ; OFS = \"\\n\"}} {{header = $0 ; getline seq ; getline qheader ; getline qseq ; if (length(seq) >= {params.min_RL}) {{print header, seq, qheader, qseq}}}}' <( zcat {input.unfiltered} ) | gzip -c > {output.filtered}"