kircherlab · sroener · Oct 23, 2023 · Aug 18, 2023 · Aug 18, 2023 · Aug 18, 2023
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,8 @@
 !resources/
 !resources/blacklists/
 !resources/blacklists/**
+!resources/testregions/
+!resources/testregions/**
 
 !supplement/
 !supplement/**
@@ -26,9 +28,11 @@
 !config/
 !config/example.config.yaml
 !config/example.samples.tsv
+!config/example.regions.tsv
 !config/multiqc_config.yaml
 !config/test-config.yaml
 !config/test-samples.tsv
+!config/test-regions.tsv
 
 !resources/qual_profile.txt
 

diff --git a/README.md b/README.md
diff --git a/config/example.config.yaml b/config/example.config.yaml
@@ -1,8 +1,11 @@
 # This file should contain everything to configure the workflow on a global scale.
 # In case of sample based data, it should be complemented by a samples.tsv file that contains
 # one row per sample. It can be parsed easily via pandas.
-samples: "config/samples.tsv" 
+samples: "config/example.samples.tsv" 
 
+regions: "config/example.regions.tsv"
+
+control_name: "healthy" # name of the control samples specified in the samples.tsv. Has to match the name in the status field.
 
 ### genome build specific options ###
 
@@ -25,17 +28,25 @@ TMPDIR: "./" # path to directory for writing TMP files
 
 SEED: 42 # seed for increased reproducibility. Mainly used in GCbias estimation
 
+### Utility options ###
+
+utility:
+  GCbias-plot: True
+  GCbias-correction: True
+  ichorCNA: True
+  case-control-plot: True
+
 ### trimming ###
 
-trimming_algorithm: "NGmerge" #can be either NGmerge or trimmomatic
+PE_trimming_algorithm: "NGmerge" #can be either NGmerge or trimmomatic
 
 #### NGmerge specific options ####
 
 length-filter:
   MINLEN: 30 # min lenght of reads in additional filter steps
 
 #### trimmomatic specific options ####
-
+phred-quality-encoding: phred-33 # three options: empty = automatic detection, phred-33 and phred-64
 
 # Illuminaclip takes a fasta file with adapter sequences and removes them in the trimming step.
 # The adapter_files option takes either the path to a custom file </PATH/TO/CUSTOM/ADAPTER.fa>
@@ -66,20 +77,17 @@ trimmers:
 
 ### Mapping ###
 
-# This option lets you add unpaired/singleton reads in the mapping step that were filtered,
-# but are either not paired or not merged. Otherwise these reads are not further processed.
+# This option lets you add unmerged/singleton or single-end reads in the mapping step.
+# Unmerged or singleton reads are paired end reads that were filtered by samtools fastq or NGmerge.
+# Single-end reads are from single end libraries. These categories can be excluded for specialised analyses.
 mapping:
-  unmerged: True # default is True
-  singleton: False # default is False
-
-### Utility ###
-
-utility:
-  GCbias-plot: True
-  GCbias-correction: True
-  ichorCNA: True
-
+  paired_end:
+    unmerged: True # default is True. Reads not merged by NGmerge.
+    singleton: False # default is False. Reads that are from paired end libraries without a matching pair.
+  single_end:
+    SEreads: True # default is True. This option is essential for Single End libraries. Setting to true in PE libraries has no effect on the output.
 
+### Utility parameters ###
 
 #### ichorCNA ####
 
@@ -118,3 +126,31 @@ ichorCNA:
   scStates: '"c(1,3)"'
   txnE: 0.9999
   txnStrength: 10000
+
+#### GCbias ####
+
+##### GCbias estimation #####
+
+GCbias_estimation:
+  normalized_interpolation: True # boolean, True or False. If True, the smooth parameter is normalized such that results are invariant to xdata range and less sensitive to nonuniformity of weights and xdata clumping.
+
+#### Signal extraction ####
+
+minRL: 120 # minimum read length for calculating WPS
+maxRL: 180 # maximum read length for calculating WPS
+bpProtection: 120 # bp protection for calculating WPS
+lengthSR: 76 # length of single reads, if used for calculating WPS
+
+#### Signal processing ####
+
+overlay_mode: "mean" # Can be either "mean" or "median". Sets overlay mode, specifying how regions should be aggregated for each sample.
+smoothing: True # Activates smoothing with Savitzky-Golay filter.
+smooth_window: 21 # Sets window size used for smoothing with Savitzky-Golay filter.
+smooth_polyorder: 2 # Sets order of polynomial used for smoothing with Savitzky-Golay filter.
+rolling: True # Activates trend removal with a rolling median filter.
+rolling_window: 1000 # Sets window size used in rolling median filter.
+flank_norm: True # Activates normalization by dividing the signals by the mean coverage in flanking intervals around the region of interest.
+flank: 2000 # Sets the size of the flanking intervals around the region of interest. Should be <= 0.5 of the extracted signals
+signal: "coverage" # can be either "coverage" or "WPS"
+display_window: [-1500,1500]
+aggregate_controls: True
diff --git a/config/example.regions.tsv b/config/example.regions.tsv
@@ -0,0 +1,3 @@
+target	path
+region1	PATH/TO/region1.bed
+region2	PATH/TO/region2.bed
diff --git a/config/example.samples.tsv b/config/example.samples.tsv
@@ -1,3 +1,3 @@
-ID	sample	bam	fq1	fq2	genome_build	library_name	platform	info
-experiment_ID	samplename1	PATH/TO/BAM	-	-	some_library_kit	Sequencing_platform	healthy
-experiment_ID	samplename2	-	PATH/TO/FQ1	PATH/TO/FQ2	some_library_kit	Sequencing_platform	some_condition
+ID	sample	bam	fq1	fq2	genome_build	library_name	platform	status	info
+experiment_ID	samplename1	PATH/TO/BAM	-	-	hg38	some_library_kit	Sequencing_platform	healthy	SomeAdditionalInfoForReadGroup/ID
+experiment_ID	samplename2	-	PATH/TO/FQ1	PATH/TO/FQ2	hg38	some_library_kit	Sequencing_platform	some_condition	SomeAdditionalInfoForReadGroup/ID
diff --git a/config/multiqc_config.yaml b/config/multiqc_config.yaml
@@ -8,3 +8,5 @@ max_table_rows: 600
 use_filename_as_sample_name:
   - fastqc/zip
   - fastqc/data
+
+show_analysis_paths: False
diff --git a/config/test-config.yaml b/config/test-config.yaml
@@ -1,7 +1,11 @@
 # This file should contain everything to configure the workflow on a global scale.
 # In case of sample based data, it should be complemented by a samples.tsv file that contains
 # one row per sample. It can be parsed easily via pandas.
-samples: "config/test-samples.tsv" #"config/Delfi_samples.tsv"
+samples: "config/test-samples.tsv" 
+
+regions: "config/test-regions.tsv"
+
+control_name: "healthy" # name of the control samples specified in the samples.tsv. Has to match the name in the status field.
 
 
 ### genome build specific options ###
@@ -25,9 +29,17 @@ TMPDIR: "$TMPDIR" # path to directory for writing TMP files
 
 SEED: 42 # seed for increased reproducibility. Mainly used in GCbias estimation
 
+### Utility options ###
+
+utility:
+  GCbias-plot: True
+  GCbias-correction: True
+  ichorCNA: True
+  case-control-plot: True
+
 ### trimming ###
 
-trimming_algorithm: "NGmerge" #can be either NGmerge or trimmomatic
+PE_trimming_algorithm: "NGmerge" #can be either NGmerge or trimmomatic
 
 #### NGmerge specific options ####
 
@@ -36,6 +48,7 @@ length-filter:
 
 #### trimmomatic specific options ####
 
+phred-quality-encoding: phred-33 # three options: empty = automatic detection, phred-33 and phred-64
 
 # Illuminaclip takes a fasta file with adapter sequences and removes them in the trimming step.
 # The adapter_files option takes either the path to a custom file </PATH/TO/CUSTOM/ADAPTER.fa>
@@ -66,20 +79,18 @@ trimmers:
 
 ### Mapping ###
 
-# This option lets you add unpaired/singleton reads in the mapping step that were filtered,
-# but are either not paired or not merged. Otherwise these reads are not further processed.
+# This option lets you add unmerged/singleton or single-end reads in the mapping step.
+# Unmerged or singleton reads are paired end reads that were filtered by samtools fastq or NGmerge.
+# Single-end reads are from single end libraries. These categories can be excluded for specialised analyses.
 mapping:
-  unmerged: True # default is True
-  singleton: True # default is False
-
-### Utility ###
-
-utility:
-  GCbias-plot: True
-  GCbias-correction: True
-  ichorCNA: True
+  paired_end:
+    unmerged: True # default is True. Reads not merged by NGmerge.
+    singleton: False # default is False. Reads that are from paired end libraries without a matching pair.
+  single_end:
+    SEreads: True # default is True. This option is essential for Single End libraries. Setting to true in PE libraries has no effect on the output.
 
 
+### Utility parameters ###
 
 #### ichorCNA ####
 
@@ -118,3 +129,32 @@ ichorCNA:
   scStates: '"c(1,3)"'
   txnE: 0.9999
   txnStrength: 10000
+
+#### GCbias ####
+
+##### GCbias estimation #####
+
+GCbias_estimation:
+  normalized_interpolation: True # boolean, True or False. If True, the smooth parameter is normalized such that results are invariant to xdata range and less sensitive to nonuniformity of weights and xdata clumping.
+
+
+#### Signal extraction ####
+
+minRL: 120 # minimum read length for calculating WPS
+maxRL: 180 # maximum read length for calculating WPS
+bpProtection: 120 # bp protection for calculating WPS
+lengthSR: 76 # length of single reads, if used for calculating WPS
+
+#### Signal processing ####
+
+overlay_mode: "mean" # Can be either "mean" or "median". Sets overlay mode, specifying how regions should be aggregated for each sample.
+smoothing: True # Activates smoothing with Savitzky-Golay filter.
+smooth_window: 21 # Sets window size used for smoothing with Savitzky-Golay filter.
+smooth_polyorder: 2 # Sets order of polynomial used for smoothing with Savitzky-Golay filter.
+rolling: True # Activates trend removal with a rolling median filter.
+rolling_window: 1000 # Sets window size used in rolling median filter.
+flank_norm: True # Activates normalization by dividing the signals by the mean coverage in flanking intervals around the region of interest.
+flank: 2000 # Sets the size of the flanking intervals around the region of interest. Should be <= 0.5 of the extracted signals
+signal: "coverage" # can be either "coverage" or "WPS"
+display_window: [-1500,1500]
+aggregate_controls: True
diff --git a/config/test-regions.tsv b/config/test-regions.tsv
@@ -0,0 +1,3 @@
+target	path
+LYL1	resources/testregions/LYL1.hg38.bed
+GRHL2	resources/testregions/GRHL2.hg38.bed
diff --git a/config/test-samples.tsv b/config/test-samples.tsv
@@ -1,3 +1,3 @@
-ID	sample	bam	fq1	fq2	genome_build	library_name	platform	info
-test-run	test19_chr20-22	resources/testsample/testsample_hg19_1x_chr20-22.bam	-	-	hg19	ThruPLEX_DNA-seq	Illumina_NextSeq_500	
-test-run	test38_chr20-22	resources/testsample/testsample_hg19_1x_chr20-22.bam	-	-	hg38	ThruPLEX_DNA-seq	Illumina_NextSeq_500	
+ID	sample	bam	fq1	fq2	genome_build	library_name	platform	status	info
+test-run	test38_chr20-22	resources/testsample/testsample_hg19_1x_chr20-22.bam	-	-	hg38	ThruPLEX_DNA-seq	Illumina_NextSeq_500	healthy	
+test-run	test38_chr20-22-case	resources/testsample/testsample_hg19_1x_chr20-22.bam	-	-	hg38	ThruPLEX_DNA-seq	Illumina_NextSeq_500	healthy-case