From b28bfd33a5d0ceeda2f311d5c9214943b70658c8 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 30 Jun 2023 10:47:13 +0200
Subject: [PATCH 01/73] added functionality for AssignToBins

---
 DIMS/AssignToBins.R  | 126 +++++++++++++++++++++++++++++++++++++++++++
 DIMS/AssignToBins.nf |  20 +++++++
 2 files changed, 146 insertions(+)
 create mode 100644 DIMS/AssignToBins.R
 create mode 100644 DIMS/AssignToBins.nf

diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R
new file mode 100644
index 0000000..0d9fd0b
--- /dev/null
+++ b/DIMS/AssignToBins.R
@@ -0,0 +1,126 @@
+#!/usr/bin/Rscript
+
+#.libPaths(new = "/hpc/local/CentOS7/dbg_mz/R_libs/3.6.2")
+
+# load required packages 
+suppressPackageStartupMessages(library("xcms"))
+
+# define parameters 
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) cat("  ", arg, "\n", sep="")
+
+filepath <- cmd_args[1] # location of mzML file
+breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData
+resol <- as.numeric(cmd_args[3]) # 140000
+trim <- 0.1
+dimsThresh <- 100
+print(filepath)
+print(breaks_filepath)
+print(resol)
+
+#sampname <- sub('\\..*$', '', basename(filepath))
+#cat(paste0("\n", sampname))
+sampname <- "AssignToBins"
+
+#suppressPackageStartupMessages(library("Cairo"))
+options(digits=16)
+
+### process one sample at a time and find peaks FOR BOTH SCAN MODES! #
+int.factor=1*10^5 # Number of x used to calc area under Gaussian (is not analytic)
+scale=2 # Initial value used to estimate scaling parameter
+width=1024
+height=768
+
+# Aggregate
+trimLeft=NULL
+trimRight=NULL
+breaks.fwhm=NULL
+breaks.fwhm.avg=NULL
+bins=NULL
+pos_results=NULL
+neg_results=NULL
+
+# read in the data for 1 sample
+raw_data <- suppressMessages(xcmsRaw(filepath))
+
+# load breaks.fwhm
+load(breaks_filepath)
+
+# Create empty placeholders for later use
+bins <- rep(0, length(breaks.fwhm) - 1)
+pos_bins <- bins
+neg_bins <- bins
+
+# Generate a matrix
+raw_data_matrix <- rawMat(raw_data)
+
+# Get time values for positive and negative scans
+pos_times <- raw_data@scantime[raw_data@polarity == "positive"]
+neg_times <- raw_data@scantime[raw_data@polarity == "negative"]
+# Select scans between trimLeft and trimRight
+pos_times <- pos_times[pos_times > trimLeft & pos_times < trimRight]
+neg_times <- neg_times[neg_times > trimLeft & neg_times < trimRight]
+
+# Generate an index with which to select values for each mode
+pos_index <- which(raw_data_matrix[ ,"time"] %in% pos_times)
+neg_index <- which(raw_data_matrix[ ,"time"] %in% neg_times)
+# Separate each mode into its own matrix
+pos_raw_data_matrix <- raw_data_matrix[pos_index, ]
+neg_raw_data_matrix <- raw_data_matrix[neg_index, ]
+
+# Get index for binning intensity values
+bin_indices_pos <- cut(pos_raw_data_matrix[ ,"mz"], breaks.fwhm, include.lowest=TRUE, right=TRUE, labels=FALSE)
+bin_indices_neg <- cut(neg_raw_data_matrix[ ,"mz"], breaks.fwhm, include.lowest=TRUE, right=TRUE, labels=FALSE)
+
+# Get the list of intensity values for each bin, and add the
+# intensity values which are in the same bin
+if (nrow(pos_raw_data_matrix) > 0) {
+  # set NA in intensities to zero
+  pos_raw_data_matrix[is.na(pos_raw_data_matrix[,"intensity"]), "intensity"] <- 0
+  # use only values above dimsThresh
+  pos_intensity_above_threshold <- pos_raw_data_matrix[which(pos_raw_data_matrix[ ,"intensity"] > dimsThresh), "intensity"]
+  # aggregate intensities, calculate mean
+  aggr_int_pos <- stats::aggregate(pos_intensity_above_threshold, list(bin_indices_pos), mean)
+  pos_bins[aggr_int_pos[ ,1]] <- aggr_int_pos[ ,2]
+}
+if (nrow(neg_raw_data_matrix) > 0) {
+  # set NA in intensities to zero
+  neg_raw_data_matrix[is.na(neg_raw_data_matrix[,"intensity"]), "intensity"] <- 0
+  # use only values above dimsThresh
+  neg_intensity_above_threshold <- neg_raw_data_matrix[which(neg_raw_data_matrix[ ,"intensity"] > dimsThresh), "intensity"]
+  # aggregate intensities, calculate mean
+  aggr_int_neg <- stats::aggregate(neg_intensity_above_threshold, list(bin_indices_neg), mean)
+  neg_bins[aggr_int_neg[ ,1]] <- aggr_int_neg[ ,2]
+}
+
+# Zero any values that are below the threshold
+pos_bins[pos_bins < dimsThresh] <- 0
+neg_bins[neg_bins < dimsThresh] <- 0
+
+pos_results = cbind(pos_results, pos_bins)
+neg_results = cbind(neg_results, neg_bins)
+
+# transpose
+pos_results_transpose = t(pos_results)
+neg_results_transpose = t(neg_results)
+
+# Add file names as row names
+rownames(pos_results_transpose) = sampname
+rownames(neg_results_transpose) = sampname
+
+# delete the last value of breaks.fwhm.avg to match dimensions of pos_results and neg_results
+breaks.fwhm.avg.minus1 <- breaks.fwhm.avg[-length(breaks.fwhm.avg)]
+# Format as string and show precision of float to 5 digits
+breaks.fwhm.avg.minus1 <- sprintf("%.5f", breaks.fwhm.avg.minus1)
+
+# Use this as the column names
+colnames(pos_results_transpose) <- breaks.fwhm.avg.minus1
+colnames(neg_results_transpose) <- breaks.fwhm.avg.minus1
+
+# transpose back
+pos_results_final <- t(pos_results_transpose)
+neg_results_final <- t(neg_results_transpose)
+
+pklist <- list("pos"=pos_results_final, "neg"=neg_results_final, "breaksFwhm"=breaks.fwhm)
+
+save(pklist, file=paste("./", sampname, ".RData", sep=""))
diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf
new file mode 100644
index 0000000..872e32f
--- /dev/null
+++ b/DIMS/AssignToBins.nf
@@ -0,0 +1,20 @@
+process AssignToBins {
+    label 'AssignToBins'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       tuple(file_id, path(mzML_filename))
+       file(breaks_file)
+       val(resolution)
+
+    output:
+       file 'AssignToBins.RData'
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_filename $breaks_file $resolution
+        """
+}
+
+

From dcbfbcf2cffd91ed8f009022cd1a57ddcc017533 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 30 Jun 2023 10:48:55 +0200
Subject: [PATCH 02/73] added functionality for GenerateBreaks

---
 DIMS/GenerateBreaks.R  | 57 ++++++++++++++++++++++++++++++++++++++++++
 DIMS/GenerateBreaks.nf | 17 +++++++++++++
 2 files changed, 74 insertions(+)
 create mode 100644 DIMS/GenerateBreaks.R
 create mode 100644 DIMS/GenerateBreaks.nf

diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R
new file mode 100644
index 0000000..85065ac
--- /dev/null
+++ b/DIMS/GenerateBreaks.R
@@ -0,0 +1,57 @@
+#### GenerateBreaks.R ####
+## adapted from 1-generateBreaksFwhm.HPC.R ##
+#!/usr/bin/Rscript
+
+# .libPaths(new = "/hpc/local/CentOS7/dbg_mz/R_libs/3.2.2")
+
+# load required package 
+suppressPackageStartupMessages(library("xcms"))
+
+# define parameters 
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) cat("  ", arg, "\n", sep="")
+
+filepath <- cmd_args[1] # 1 of the mzML files
+outdir   <- cmd_args[2] 
+trim     <- as.numeric(cmd_args[3]) # 0.1
+resol    <- as.numeric(cmd_args[4]) # 140000
+
+# initialize
+trimLeft        = NULL
+trimRight       = NULL
+breaks.fwhm     = NULL
+breaks.fwhm.avg = NULL
+bins            = NULL
+posRes          = NULL
+negRes          = NULL
+
+# read in mzML file
+raw_data <- suppressMessages(xcmsRaw(filepath))
+
+# trim scans at the start and end
+trimLeft  = round(raw_data@scantime[length(raw_data@scantime)*trim])
+trimRight = round(raw_data@scantime[length(raw_data@scantime)*(1-trim)])
+
+# Mass range m/z
+lowMZ = raw_data@mzrange[1]
+highMZ = raw_data@mzrange[2]
+
+# determine number of segments (bins)
+nsegment = 2*(highMZ-lowMZ)
+segment  = seq(from=lowMZ, to=highMZ, length.out=nsegment+1)
+
+# determine start and end of each bin.
+for (i in 1:nsegment) {
+  startsegm   <- segment[i]
+  endsegm     <- segment[i+1]
+  resol.mz    <- resol*(1/sqrt(2)^(log2(startsegm/200)))
+  fwhmsegm    <- startsegm/resol.mz
+  breaks.fwhm <- c(breaks.fwhm, seq(from=(startsegm + fwhmsegm), to=endsegm, by=0.2*fwhmsegm))
+  # average the m/z instead of start value
+  range = seq(from=(startsegm + fwhmsegm), to=endsegm, by=0.2*fwhmsegm)
+  deltaMZ = range[2] - range[1]
+  breaks.fwhm.avg <- c(breaks.fwhm.avg, range + 0.5*deltaMZ)
+}
+
+save(breaks.fwhm, breaks.fwhm.avg, trimLeft, trimRight, file=paste(outdir, "breaks.fwhm.RData", sep="/"))
+
diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf
new file mode 100644
index 0000000..f73ad54
--- /dev/null
+++ b/DIMS/GenerateBreaks.nf
@@ -0,0 +1,17 @@
+process GenerateBreaks {
+    label 'GenerateBreaks'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       tuple(file_id, path(mzML_file))
+
+
+    output:
+       file 'breaks.fwhm.RData' 
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R $mzML_file ./ $params.trim $params.resolution 
+        """
+}

From ca3ae4431a82cad812f377ae3a7f36ec3e0bcf77 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 30 Jun 2023 13:27:36 +0200
Subject: [PATCH 03/73] reference to old file added

---
 DIMS/AssignToBins.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R
index 0d9fd0b..90f510a 100644
--- a/DIMS/AssignToBins.R
+++ b/DIMS/AssignToBins.R
@@ -1,4 +1,5 @@
 #!/usr/bin/Rscript
+## adapted from 2-DIMS.R
 
 #.libPaths(new = "/hpc/local/CentOS7/dbg_mz/R_libs/3.6.2")
 

From 0e666355e93ff283a9612184eeebe6245ca99b6b Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 30 Jun 2023 13:28:39 +0200
Subject: [PATCH 04/73] added functionality for MakeInit file

---
 DIMS/MakeInit.R  | 34 ++++++++++++++++++++++++++++++++++
 DIMS/MakeInit.nf | 16 ++++++++++++++++
 2 files changed, 50 insertions(+)
 create mode 100644 DIMS/MakeInit.R
 create mode 100644 DIMS/MakeInit.nf

diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R
new file mode 100644
index 0000000..2657b66
--- /dev/null
+++ b/DIMS/MakeInit.R
@@ -0,0 +1,34 @@
+#!/usr/bin/env Rscript
+## adapted from makeInit in old pipeline
+
+# used for when init.RData has to be created manually
+# arg1 : path to sampleNames.txt or whatever the name of the samplesheet txt file may be
+# arg2 : amount of technical replicates (usually 3)
+
+args <- commandArgs(trailingOnly=TRUE)
+sample_sheet <- read.csv(args[1], sep="\t") 
+nr_replicates <- as.numeric(args[2])
+
+sampleNames <- trimws(as.vector(unlist(sample_sheet[1])))
+nr_sampgrps <- length(sampleNames)/nr_replicates
+groupNames <- trimws(as.vector(unlist(sample_sheet[2])))
+groupNames <- gsub('[^-.[:alnum:]]', '_', groupNames)
+groupNamesUnique <- unique(groupNames)
+#groupNamesNotUnique <- groupNames[duplicated(groupNames)]
+
+repl.pattern <- c()
+for (sampgrp in 1:nr_sampgrps) {
+  tmp <- c()
+  for (repl in nr_replicates:1) {
+    index <- ((sampgrp*nr_replicates) - repl) + 1
+    tmp <- c(tmp, sampleNames[index])
+  }
+  repl.pattern <- c(repl.pattern, list(tmp))
+}
+
+names(repl.pattern) <- groupNamesUnique
+
+# just to preview
+head(repl.pattern)
+
+save(repl.pattern, file="init.RData")
diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf
new file mode 100644
index 0000000..064225c
--- /dev/null
+++ b/DIMS/MakeInit.nf
@@ -0,0 +1,16 @@
+process MakeInit {
+    label 'MakeInit'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       tuple(path(samplesheet), val(nr_replicates))
+
+    output:
+       file 'init.RData' 
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R $samplesheet $nr_replicates 
+        """
+}

From e5a015446a78096f67221f0b06e60bfa79bbe7df Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 11 Aug 2023 15:07:17 +0200
Subject: [PATCH 05/73] added new files

---
 .../10-collectSamplesFilled_extraoutput.R     |  94 +++++
 DIMS/AddOnFunctions/checkOverlap.R            |  16 +
 DIMS/AddOnFunctions/elementInfo.R             |   6 +
 DIMS/AddOnFunctions/export.R                  |  13 +
 DIMS/AddOnFunctions/fit1Peak.R                | 120 +++++++
 DIMS/AddOnFunctions/fit2G.R                   |  43 +++
 DIMS/AddOnFunctions/fit2peaks.R               | 100 ++++++
 DIMS/AddOnFunctions/fit3G.R                   |  18 +
 DIMS/AddOnFunctions/fit3peaks.R               | 122 +++++++
 DIMS/AddOnFunctions/fit4G.R                   |  18 +
 DIMS/AddOnFunctions/fit4peaks.R               | 149 ++++++++
 DIMS/AddOnFunctions/fitG.R                    |  18 +
 DIMS/AddOnFunctions/fitGaussian.R             | 327 ++++++++++++++++++
 DIMS/AddOnFunctions/fitGaussianInit.R         |  60 ++++
 DIMS/AddOnFunctions/generateBreaksFwhm.R      |  26 ++
 DIMS/AddOnFunctions/generateExcelFile.R       |  24 ++
 DIMS/AddOnFunctions/generateGaussian.R        |  48 +++
 DIMS/AddOnFunctions/getArea.R                 |  23 ++
 DIMS/AddOnFunctions/getDeltaMZ.R              |   5 +
 DIMS/AddOnFunctions/getFitQuality.R           |  65 ++++
 DIMS/AddOnFunctions/getFwhm.R                 |  24 ++
 DIMS/AddOnFunctions/getPatients.R             |   9 +
 DIMS/AddOnFunctions/getSD.R                   |   9 +
 DIMS/AddOnFunctions/globalAssignments.HPC.R   | 114 ++++++
 DIMS/AddOnFunctions/iden.code.R               |  45 +++
 DIMS/AddOnFunctions/ident.hires.noise.HPC.R   | 242 +++++++++++++
 DIMS/AddOnFunctions/isWithinXppm.R            |  51 +++
 DIMS/AddOnFunctions/mergeDuplicatedRows.R     |  49 +++
 DIMS/AddOnFunctions/normalization_2.1.R       |  75 ++++
 DIMS/AddOnFunctions/optimizeGauss.R           |  11 +
 DIMS/AddOnFunctions/peak.grouping.Gauss.HPC.R |  88 +++++
 DIMS/AddOnFunctions/remove.dupl.2.1.R         |  32 ++
 DIMS/AddOnFunctions/remove.dupl.R             |  39 +++
 DIMS/AddOnFunctions/replaceZeros.R            | 108 ++++++
 DIMS/AddOnFunctions/replaceZeros_setseed.R    | 108 ++++++
 DIMS/AddOnFunctions/run.vbs                   |  22 ++
 DIMS/AddOnFunctions/runVBAMacro.R             |  48 +++
 DIMS/AddOnFunctions/searchMZRange.R           | 187 ++++++++++
 DIMS/AddOnFunctions/sourceDir.R               |   8 +
 DIMS/AddOnFunctions/statistics_z.R            |  69 ++++
 DIMS/AddOnFunctions/sumCurves.R               |  39 +++
 DIMS/AddOnFunctions/trimZeros.R               |   8 +
 DIMS/AssignToBins.R                           |   4 +-
 DIMS/AssignToBins.nf                          |   5 +-
 DIMS/AverageTechReplicates.R                  | 119 +++++++
 DIMS/AverageTechReplicates.nf                 |  24 ++
 DIMS/GenerateBreaks.R                         |   2 +
 DIMS/GenerateBreaks.nf                        |   2 +-
 DIMS/HMDBparts.R                              | 126 +++++++
 DIMS/HMDBparts.nf                             |  20 ++
 DIMS/MakeInit.nf                              |   2 +-
 DIMS/PeakFinding.R                            |  68 ++++
 DIMS/PeakFinding.nf                           |  23 ++
 DIMS/PeakGroupingIdentified.R                 | 180 ++++++++++
 DIMS/PeakGroupingIdentified.nf                |  18 +
 DIMS/SpectrumPeakFinding.R                    |  77 +++++
 DIMS/SpectrumPeakFinding.nf                   |  17 +
 57 files changed, 3360 insertions(+), 7 deletions(-)
 create mode 100644 DIMS/AddOnFunctions/10-collectSamplesFilled_extraoutput.R
 create mode 100644 DIMS/AddOnFunctions/checkOverlap.R
 create mode 100644 DIMS/AddOnFunctions/elementInfo.R
 create mode 100644 DIMS/AddOnFunctions/export.R
 create mode 100644 DIMS/AddOnFunctions/fit1Peak.R
 create mode 100644 DIMS/AddOnFunctions/fit2G.R
 create mode 100644 DIMS/AddOnFunctions/fit2peaks.R
 create mode 100644 DIMS/AddOnFunctions/fit3G.R
 create mode 100644 DIMS/AddOnFunctions/fit3peaks.R
 create mode 100644 DIMS/AddOnFunctions/fit4G.R
 create mode 100644 DIMS/AddOnFunctions/fit4peaks.R
 create mode 100644 DIMS/AddOnFunctions/fitG.R
 create mode 100644 DIMS/AddOnFunctions/fitGaussian.R
 create mode 100644 DIMS/AddOnFunctions/fitGaussianInit.R
 create mode 100644 DIMS/AddOnFunctions/generateBreaksFwhm.R
 create mode 100644 DIMS/AddOnFunctions/generateExcelFile.R
 create mode 100644 DIMS/AddOnFunctions/generateGaussian.R
 create mode 100644 DIMS/AddOnFunctions/getArea.R
 create mode 100644 DIMS/AddOnFunctions/getDeltaMZ.R
 create mode 100644 DIMS/AddOnFunctions/getFitQuality.R
 create mode 100644 DIMS/AddOnFunctions/getFwhm.R
 create mode 100644 DIMS/AddOnFunctions/getPatients.R
 create mode 100644 DIMS/AddOnFunctions/getSD.R
 create mode 100644 DIMS/AddOnFunctions/globalAssignments.HPC.R
 create mode 100644 DIMS/AddOnFunctions/iden.code.R
 create mode 100644 DIMS/AddOnFunctions/ident.hires.noise.HPC.R
 create mode 100644 DIMS/AddOnFunctions/isWithinXppm.R
 create mode 100644 DIMS/AddOnFunctions/mergeDuplicatedRows.R
 create mode 100644 DIMS/AddOnFunctions/normalization_2.1.R
 create mode 100644 DIMS/AddOnFunctions/optimizeGauss.R
 create mode 100644 DIMS/AddOnFunctions/peak.grouping.Gauss.HPC.R
 create mode 100644 DIMS/AddOnFunctions/remove.dupl.2.1.R
 create mode 100644 DIMS/AddOnFunctions/remove.dupl.R
 create mode 100644 DIMS/AddOnFunctions/replaceZeros.R
 create mode 100644 DIMS/AddOnFunctions/replaceZeros_setseed.R
 create mode 100644 DIMS/AddOnFunctions/run.vbs
 create mode 100644 DIMS/AddOnFunctions/runVBAMacro.R
 create mode 100644 DIMS/AddOnFunctions/searchMZRange.R
 create mode 100644 DIMS/AddOnFunctions/sourceDir.R
 create mode 100644 DIMS/AddOnFunctions/statistics_z.R
 create mode 100644 DIMS/AddOnFunctions/sumCurves.R
 create mode 100644 DIMS/AddOnFunctions/trimZeros.R
 create mode 100644 DIMS/AverageTechReplicates.R
 create mode 100644 DIMS/AverageTechReplicates.nf
 create mode 100644 DIMS/HMDBparts.R
 create mode 100644 DIMS/HMDBparts.nf
 create mode 100644 DIMS/PeakFinding.R
 create mode 100644 DIMS/PeakFinding.nf
 create mode 100644 DIMS/PeakGroupingIdentified.R
 create mode 100644 DIMS/PeakGroupingIdentified.nf
 create mode 100644 DIMS/SpectrumPeakFinding.R
 create mode 100644 DIMS/SpectrumPeakFinding.nf

diff --git a/DIMS/AddOnFunctions/10-collectSamplesFilled_extraoutput.R b/DIMS/AddOnFunctions/10-collectSamplesFilled_extraoutput.R
new file mode 100644
index 0000000..21ccdce
--- /dev/null
+++ b/DIMS/AddOnFunctions/10-collectSamplesFilled_extraoutput.R
@@ -0,0 +1,94 @@
+#!/usr/bin/Rscript
+
+.libPaths(new = "/hpc/local/CentOS7/dbg_mz/R_libs/3.2.2")
+
+# load required packages 
+# none 
+
+# define parameters 
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) cat("  ", arg, "\n")
+
+outdir <- cmd_args[1]
+scanmode <- cmd_args[2]
+normalization <- cmd_args[3]
+scripts <- cmd_args[4]
+z_score <- as.numeric(cmd_args[5])
+ppm <- as.numeric(cmd_args[6])
+
+#outdir <- "/Users/nunen/Documents/Metab/processed/test_old"
+#scanmode <- "negative"
+#normalization <- "disabled"
+#scripts <- "/Users/nunen/Documents/Metab/DIMS/scripts"
+#db <- "/Users/nunen/Documents/Metab/DIMS/db/HMDB_add_iso_corrNaCl_withIS_withC5OH.RData"
+#z_score <- 0
+
+object.files = list.files(paste(outdir, "9-samplePeaksFilled", sep="/"), full.names=TRUE, pattern=scanmode)
+outlist.tot=NULL
+for (i in 1:length(object.files)) {
+  load(object.files[i])
+  print(print(object.files[i]))
+  outlist.tot = rbind(outlist.tot, final.outlist.idpat3)
+}
+
+source(paste(scripts, "AddOnFunctions/sourceDir.R", sep="/"))
+sourceDir(paste(scripts, "AddOnFunctions", sep="/"))
+
+# remove duplicates
+outlist.tot = mergeDuplicatedRows(outlist.tot)
+
+# sort on mass
+outlist.tot = outlist.tot[order(outlist.tot[,"mzmed.pgrp"]),]
+
+# normalization
+load(paste0(outdir, "/repl.pattern.",scanmode,".RData"))
+
+if (normalization != "disabled") {
+  outlist.tot = normalization_2.1(outlist.tot, fileName, names(repl.pattern.filtered), on=normalization, assi_label="assi_HMDB")
+}
+
+if (z_score == 1) {
+  outlist.stats = statistics_z(outlist.tot, sortCol=NULL, adducts=FALSE)
+  nr.removed.samples=length(which(repl.pattern.filtered[]=="character(0)"))
+  order.index.int=order(colnames(outlist.stats)[8:(length(repl.pattern.filtered)-nr.removed.samples+7)])
+  outlist.stats.more = cbind(outlist.stats[,1:7],
+                             outlist.stats[,(length(repl.pattern.filtered)-nr.removed.samples+8):(length(repl.pattern.filtered)-nr.removed.samples+8+6)],
+                             outlist.stats[,8:(length(repl.pattern.filtered)-nr.removed.samples+7)][order.index.int],
+                             outlist.stats[,(length(repl.pattern.filtered)-nr.removed.samples+5+10):ncol(outlist.stats)])
+  
+  tmp.index=grep("_Zscore", colnames(outlist.stats.more), fixed = TRUE)
+  tmp.index.order=order(colnames(outlist.stats.more[,tmp.index]))
+  tmp = outlist.stats.more[,tmp.index[tmp.index.order]]
+  outlist.stats.more=outlist.stats.more[,-tmp.index]
+  outlist.stats.more=cbind(outlist.stats.more,tmp)
+  outlist.tot = outlist.stats.more
+}
+
+# filter identified compounds
+index.1=which((outlist.tot[,"assi_HMDB"]!="") & (!is.na(outlist.tot[,"assi_HMDB"])))
+index.2=which((outlist.tot[,"iso_HMDB"]!="") & (!is.na(outlist.tot[,"iso_HMDB"])))
+index=union(index.1,index.2)
+outlist.ident = outlist.tot[index,]
+outlist.not.ident = outlist.tot[-index,]
+
+if (z_score == 1) {
+  outlist.ident$ppmdev=as.numeric(outlist.ident$ppmdev)
+  outlist.ident <- outlist.ident[which(outlist.ident["ppmdev"] >= -ppm & outlist.ident["ppmdev"] <= ppm),]
+}
+# NAs in theormz_noise <======================================================================= uitzoeken!!!
+outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] = 0
+outlist.ident$theormz_noise=as.numeric(outlist.ident$theormz_noise)
+outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] = 0
+outlist.ident$theormz_noise=as.numeric(outlist.ident$theormz_noise)
+
+save(outlist.not.ident, outlist.ident, file=paste(outdir, "/outlist_identified_", scanmode, ".RData", sep=""))
+
+# Extra output in Excel-readable format:
+remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp")
+remove_colindex <- which(colnames(outlist.ident) %in% remove_columns)
+outlist.ident <- outlist.ident[ , -remove_colindex]
+write.table(outlist.ident, file=paste0(outdir, "/outlist_identified_", scanmode, ".txt"), sep="\t", row.names = FALSE)
+remove_colindex <- which(colnames(outlist.not.ident) %in% remove_columns)
+outlist.not.ident <- outlist.not.ident[ , -remove_colindex]
+write.table(outlist.not.ident, file=paste0(outdir, "/outlist_not_identified_", scanmode, ".txt"), sep="\t", row.names = FALSE)
+
diff --git a/DIMS/AddOnFunctions/checkOverlap.R b/DIMS/AddOnFunctions/checkOverlap.R
new file mode 100644
index 0000000..f7ef818
--- /dev/null
+++ b/DIMS/AddOnFunctions/checkOverlap.R
@@ -0,0 +1,16 @@
+checkOverlap <- function(range1,range2){
+  if (length(intersect(range1,range2))==2) {
+    # Overlap
+    # message("Overlap, smaller range is used")
+    if (length(range1) >= length(range2)){
+      range1=range1[-length(range1)]  
+    } else {
+      range2=range2[-1]
+    }
+  } else if (length(intersect(range1,range2))==3){
+    # message("Overlap, smaller range is used")
+    range1=range1[-length(range1)]  
+    range2=range2[-1]
+  }
+  return(list("range1"=range1,"range2"=range2))
+}
diff --git a/DIMS/AddOnFunctions/elementInfo.R b/DIMS/AddOnFunctions/elementInfo.R
new file mode 100644
index 0000000..18eaeff
--- /dev/null
+++ b/DIMS/AddOnFunctions/elementInfo.R
@@ -0,0 +1,6 @@
+elementInfo <- function(name, elements = NULL) { # from Rdisop function .getElement  
+  if (!is.list(elements) || length(elements)==0 ) {
+    elements <- initializePSE() }
+  if (name=="CH3OH+H"){rex<-"^CH3OH\\+H$"}else{rex <- paste ("^",name,"$", sep="")}
+  elements [[grep (rex, sapply (elements, function(x) {x$name}))]]
+}
\ No newline at end of file
diff --git a/DIMS/AddOnFunctions/export.R b/DIMS/AddOnFunctions/export.R
new file mode 100644
index 0000000..cab79a0
--- /dev/null
+++ b/DIMS/AddOnFunctions/export.R
@@ -0,0 +1,13 @@
+export <- function(peaklist, plotdir, adducts, control_label, case_label, patients, sub, fileName){
+  # peaklist = outlist.adducts
+  # adducts=TRUE
+  # control_label="C"
+  # case_label="P"
+  # patients = getPatients(outlist.adducts)
+  # sub=3000
+  
+  # peaklist = statistics_z_4export(as.data.frame(peaklist), plotdir, patients, adducts, control_label, case_label)
+  
+  # generateExcelFile(peaklist, file.path(plotdir), imageNum=2, fileName, subName=c("","_box"), sub, adducts)
+  
+}
diff --git a/DIMS/AddOnFunctions/fit1Peak.R b/DIMS/AddOnFunctions/fit1Peak.R
new file mode 100644
index 0000000..b09ce20
--- /dev/null
+++ b/DIMS/AddOnFunctions/fit1Peak.R
@@ -0,0 +1,120 @@
+fit1Peak <- function(x2,x,y,index,scale,resol,plot,FQ,useBounds) {
+  #FQ=FQ1
+  
+  if (length(y)<3){
+    message("Range to small, no fit possible!")
+  } else {
+    
+    if ((length(y)==4)) {
+      mu = weighted.mean(x,y)
+      sigma = getSD(x,y)
+      fitP = fitG_2(x,y,sigma,mu,scale,useBounds)
+    } else {
+      
+      if ((length(x) - length(index)) < 2) {
+        range1=c((length(x)-4):length(x))
+      } else if (length(index) < 2) {
+        range1=c(1:5)
+      } else {
+        range1=c(index[1]-2,index[1]-1,index[1],index[1]+1,index[1]+2)
+      }
+      
+      if (range1[1]==0) range1=range1[-1]
+      
+      # remove NA
+      if (length(which(is.na(y[range1])))!=0) range1=range1[-which(is.na(y[range1]))]
+      
+      mu = weighted.mean(x[range1],y[range1])
+      sigma = getSD(x[range1],y[range1])
+      fitP = fitG_2(x,y,sigma,mu,scale,useBounds)
+    }
+    
+    p2 = fitP$par
+    
+    #fq_new = abs(sum(y) - sum(p2[2]*dnorm(x,p2[1],sigma)))/sum(y)
+    fq_new = getFitQuality(x,y,p2[1],p2[1],resol,p2[2],sigma)$fq_new
+    
+    if (plot & (fq_new < FQ)) lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="green")
+    
+    scale_new = 1.2*scale
+    # cat(fq_new)
+    
+    if (fq_new > FQ) { # <=== bad fit?
+      # optimize scaling factor
+      fq = 0
+      scale = 0
+      
+      if (sum(y)>sum(p2[2]*dnorm(x,p2[1],sigma))){
+        while ((round(fq, digits = 3) != round(fq_new, digits = 3)) & (scale_new<10000))  {
+          fq = fq_new
+          scale = scale_new
+          
+          #cat(scale)
+          fitP = fitG_2(x,y,sigma,mu,scale,useBounds)
+          p2 = fitP$par
+          
+          #fq_new = abs(sum(y) - sum(p2[2]*dnorm(x,p2[1],sigma)))/sum(y)
+          fq_new = getFitQuality(x,y,p2[1],p2[1],resol,p2[2],sigma)$fq_new
+          scale_new=1.2*scale
+          
+          if (plot & (fq_new < FQ)) lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="green")
+          #           cat(paste("fq_new: ", fq_new))
+          #           cat(paste("scale_new: ", scale_new))
+          #           (round(fq, digits = 4) != round(fq_new, digits = 4))
+        }
+      } else {
+        while ((round(fq, digits = 3) != round(fq_new, digits = 3)) & (scale_new<10000)) {
+          fq = fq_new
+          scale = scale_new
+          
+          # cat(scale)
+          fitP = fitG_2(x,y,sigma,mu,scale,useBounds)
+          p2 = fitP$par
+          
+          #fq_new = abs(sum(y) - sum(p2[2]*dnorm(x,p2[1],sigma)))/sum(y)
+          fq_new = getFitQuality(x,y,p2[1],p2[1],resol,p2[2],sigma)$fq_new
+          scale_new=0.8*scale
+          
+          if (plot & (fq_new < FQ)) lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="green")
+          # cat(paste("fq_new: ", fq_new))
+        }
+      }
+      
+      if (fq < fq_new) {
+        #         cat(paste("fq_new: ", fq_new))
+        #         cat(paste("fq: ", fq))
+        #         cat(paste("scale_new: ", scale_new))
+        #         cat(paste("scale: ", scale))
+        
+        fitP = fitG_2(x,y,sigma,mu,scale,useBounds)
+        p2 = fitP$par
+        fq_new = fq
+        # cat(paste("==> fq_new: ", fq_new))
+        if (plot & (fq_new < FQ)) lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="dark green")
+        
+      }
+    }
+    
+    if (plot & (fq_new < FQ)) {
+      # plot ###################
+      #lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="green")
+      fwhm = getFwhm(p2[1],resol)
+      half_max = max(p2[2]*dnorm(x2,p2[1],sigma))*0.5
+      lines(c(p2[1] - 0.5*fwhm, p2[1] + 0.5*fwhm),c(half_max,half_max),col="orange")
+      abline(v = p2[1], col="green")
+      h=c(paste("mean =", p2[1], sep=" "),
+          paste("fq =", fq_new, sep=" "))
+      legend("topright", legend=h)
+      ##########################
+      
+      
+      #       abline(v = x[6], col="red")
+      #       fwhm = getFwhm(x[6])
+      #       abline(v =x[6] + 0.6*fwhm, col="red")
+      #       abline(v =x[6] - 0.6*fwhm, col="red")
+      
+    }
+  }
+  
+  return(list("mean"=p2[1], "scale"=p2[2], "sigma"=sigma, "qual"=fq_new))
+}
diff --git a/DIMS/AddOnFunctions/fit2G.R b/DIMS/AddOnFunctions/fit2G.R
new file mode 100644
index 0000000..9dc76dd
--- /dev/null
+++ b/DIMS/AddOnFunctions/fit2G.R
@@ -0,0 +1,43 @@
+fit2G_2 <- function(x,y,sig1,sig2,mu1,scale1,mu2,scale2,useBounds){
+  
+  f = function(p){
+    d = p[2]*dnorm(x,mean=p[1],sd=sig1) + p[4]*dnorm(x,mean=p[3],sd=sig2)
+    sum((d-y)^2)
+  }
+  
+  if (useBounds){
+    lower = c(x[1],0,x[1],0)
+    upper = c(x[length(x)],Inf,x[length(x)],Inf)
+    
+    if (is.null(mu2) && is.null(scale2) && is.null(sig2)){
+      sig2=sig1
+      optim(c(as.numeric(mu1),
+              as.numeric(scale1),
+              as.numeric(mu1),
+              as.numeric(scale1)),
+            f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper)
+    } else {
+      optim(c(as.numeric(mu1),
+              as.numeric(scale1),
+              as.numeric(mu2),
+              as.numeric(scale2)),
+            f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper)
+    }
+    
+  } else {
+    if (is.null(mu2) && is.null(scale2) && is.null(sig2)){
+      sig2=sig1
+      optim(c(as.numeric(mu1),
+              as.numeric(scale1),
+              as.numeric(mu1),
+              as.numeric(scale1)),
+            f,control=list(maxit=10000))
+    } else{
+      optim(c(as.numeric(mu1),
+              as.numeric(scale1),
+              as.numeric(mu2),
+              as.numeric(scale2)),
+            f,control=list(maxit=10000))
+    }  
+  }
+}
diff --git a/DIMS/AddOnFunctions/fit2peaks.R b/DIMS/AddOnFunctions/fit2peaks.R
new file mode 100644
index 0000000..4d9c64d
--- /dev/null
+++ b/DIMS/AddOnFunctions/fit2peaks.R
@@ -0,0 +1,100 @@
+fit2peaks <- function(x2,x,y,index,scale,resol,useBounds=FALSE,plot=FALSE,FQ,int.factor){
+  
+  peak.mean = NULL
+  peak.area = NULL
+  peak.scale = NULL
+  peak.sigma = NULL
+  
+  range1=c(index[1]-2,index[1]-1,index[1],index[1]+1,index[1]+2)
+  if (range1[1]==0) range1=range1[-1]
+  
+  range2=c(index[2]-2,index[2]-1,index[2],index[2]+1,index[2]+2)
+  
+  if (length(x)<range2[length(range2)]) range2=range2[-length(range2)]
+  
+  range1=checkOverlap(range1,range2)[[1]]
+  range2=checkOverlap(range1,range2)[[2]]
+  
+  # check for negative or 0
+  remove=which(range1<1)
+  if (length(remove)>0) range1=range1[-remove]
+  remove=which(range2<1)
+  if (length(remove)>0) range2=range2[-remove]
+  
+  # remove NA
+  if (length(which(is.na(y[range1])))!=0) range1=range1[-which(is.na(y[range1]))]
+  if (length(which(is.na(y[range2])))!=0) range2=range2[-which(is.na(y[range2]))]
+  
+  mu1 = weighted.mean(x[range1],y[range1])
+  sigma1 = getSD(x[range1],y[range1])
+  
+  #   message(paste("fit2peaks mu =>", mu1))
+  #   message(paste("fit2peaks sigma =>", sigma1))
+  #   message(paste("fit2peaks scale =>", scale))
+  
+  fitP = fitG_2(x[range1],y[range1],sigma1,mu1,scale,useBounds)
+  p = fitP$par
+  
+  mu2 = weighted.mean(x[range2],y[range2])
+  sigma2 = getSD(x[range2],y[range2])
+  fitP = fitG_2(x[range2],y[range2],sigma2,mu2,scale,useBounds)
+  p2 = fitP$par
+  
+  fit2P = fit2G_2(x, y, sigma1, sigma2, p[1], p[2], p2[1], p2[2],useBounds)
+  p3 = fit2P$par
+  
+  if (is.null(sigma2)) sigma2=sigma1
+  
+  
+  # plot ###################
+  sumFit2 = (p3[2]*dnorm(x2,p3[1],sigma1))+(p3[4]*dnorm(x2,p3[3],sigma2))
+  sumFit = (p3[2]*dnorm(x,p3[1],sigma1))+(p3[4]*dnorm(x,p3[3],sigma2))
+  fq=getFitQuality(x,y,sort(c(p3[1],p3[3]))[1],sort(c(p3[1],p3[3]))[2],resol,sumFit=sumFit)$fq_new
+  
+  fwhm = getFwhm(p3[1],resol)
+  half_max = max(p3[2]*dnorm(x2,p3[1],sigma1))*0.5
+  if (plot & (fq < FQ)) lines(c(p3[1] - 0.5*fwhm, p3[1] + 0.5*fwhm),c(half_max,half_max),col="orange")
+  if (plot & (fq < FQ)) lines(x2,p3[4]*dnorm(x2,p3[3],sigma2),col="grey")
+  if (plot & (fq < FQ)) abline(v = p3[3], col="grey")
+  
+  fwhm = getFwhm(p3[3],resol)
+  half_max = max(p3[4]*dnorm(x2,p3[3],sigma2))*0.5
+  if (plot & (fq < FQ)) lines(c(p3[3] - 0.5*fwhm, p3[3] + 0.5*fwhm),c(half_max,half_max),col="orange")
+  
+  if (plot & (fq < FQ)) lines(x2,sumFit2,col="black")
+  
+  if (plot & (fq < FQ)) lines(x2,p3[2]*dnorm(x2,p3[1],sigma1),col="grey")
+  if (plot & (fq < FQ)) abline(v = p3[1], col="grey")
+  
+  
+  h2=c(paste("mean =", p3[1], sep=" "),
+       paste("mean =", p3[3], sep=" "),
+       paste("fq =", fq, sep=" "))
+  
+  if (plot & (fq < FQ)) legend("topright", legend=h2)
+  ##########################
+  
+  #  lines(x2,p3[4]*dnorm(x2,p3[3],sigma2),col="red")
+  #   area1 = sum(p3[2]*dnorm(x2,p3[1],sigma1))
+  #   area2 = sum(p3[4]*dnorm(x2,p3[3],sigma2))
+  
+  #   area1 = max(p3[2]*dnorm(x2,p3[1],sigma1))
+  #   area2 = max(p3[4]*dnorm(x2,p3[3],sigma2))
+  
+  area1 = getArea(p3[1],resol,p3[2],sigma1,int.factor)
+  area2 = getArea(p3[3],resol,p3[4],sigma2,int.factor)
+  
+  peak.area = c(peak.area, area1)
+  peak.area = c(peak.area, area2)
+  
+  peak.mean = c(peak.mean, p3[1])
+  peak.mean = c(peak.mean, p3[3])
+  
+  peak.scale = c(peak.scale, p3[2])
+  peak.scale = c(peak.scale, p3[4])
+  
+  peak.sigma = c(peak.sigma, sigma1)
+  peak.sigma = c(peak.sigma, sigma2)
+  
+  return(list("mean"=peak.mean, "scale"=peak.scale, "sigma"=peak.sigma, "area"=peak.area, "qual"=fq))
+}
\ No newline at end of file
diff --git a/DIMS/AddOnFunctions/fit3G.R b/DIMS/AddOnFunctions/fit3G.R
new file mode 100644
index 0000000..05543c5
--- /dev/null
+++ b/DIMS/AddOnFunctions/fit3G.R
@@ -0,0 +1,18 @@
+fit3G_2 <- function(x,y,sig1,sig2,sig3,mu1,scale1,mu2,scale2,mu3,scale3,useBounds){
+  
+  f = function(p){
+    d = p[2]*dnorm(x,mean=p[1],sd=sig1) + p[4]*dnorm(x,mean=p[3],sd=sig2) + p[6]*dnorm(x,mean=p[5],sd=sig3)
+    sum((d-y)^2)
+  }
+  
+  if (useBounds){
+    lower = c(x[1],0,x[1],0,x[1],0)
+    upper = c(x[length(x)],Inf,x[length(x)],Inf,x[length(x)],Inf)
+    
+    optim(c(mu1,scale1,mu2,scale2,mu3,scale3),f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper)
+    
+  } else {
+    optim(c(mu1,scale1,mu2,scale2,mu3,scale3),f,control=list(maxit=10000))
+  }
+  
+}
diff --git a/DIMS/AddOnFunctions/fit3peaks.R b/DIMS/AddOnFunctions/fit3peaks.R
new file mode 100644
index 0000000..e373f14
--- /dev/null
+++ b/DIMS/AddOnFunctions/fit3peaks.R
@@ -0,0 +1,122 @@
+fit3peaks <- function(x2,x,y,index,scale,resol,useBounds=FALSE,plot=FALSE,FQ,int.factor){
+  
+  peak.mean = NULL
+  peak.area = NULL
+  peak.scale = NULL
+  peak.sigma = NULL
+  
+  range1=c(index[1]-2,index[1]-1,index[1],index[1]+1,index[1]+2)
+  range2=c(index[2]-2,index[2]-1,index[2],index[2]+1,index[2]+2)
+  range3=c(index[3]-2,index[3]-1,index[3],index[3]+1,index[3]+2)
+  
+  remove=which(range1<1)
+  if (length(remove)>0) {
+    range1=range1[-remove]
+  }
+  remove=which(range2<1)
+  if (length(remove)>0) {
+    range2=range2[-remove]
+  }
+  
+  if (length(x)<range3[length(range3)]) range3=range3[-length(range3)]
+  
+  range1=checkOverlap(range1,range2)[[1]]
+  range2=checkOverlap(range1,range2)[[2]]
+  range2=checkOverlap(range2,range3)[[1]]
+  range3=checkOverlap(range2,range3)[[2]]
+  
+  # check for negative or 0
+  remove=which(range1<1)
+  if (length(remove)>0) range1=range1[-remove]
+  remove=which(range2<1)
+  if (length(remove)>0) range2=range2[-remove]
+  remove=which(range3<1)
+  if (length(remove)>0) range3=range3[-remove]
+  
+  # remove NA
+  if (length(which(is.na(y[range1])))!=0) range1=range1[-which(is.na(y[range1]))]
+  if (length(which(is.na(y[range2])))!=0) range2=range2[-which(is.na(y[range2]))]
+  if (length(which(is.na(y[range3])))!=0) range3=range3[-which(is.na(y[range3]))]
+  
+  mu1 = weighted.mean(x[range1],y[range1])
+  sigma1 = getSD(x[range1],y[range1])
+  fitP = fitG_2(x[range1],y[range1],sigma1,mu1,scale,useBounds)
+  p = fitP$par
+  
+  mu2 = weighted.mean(x[range2],y[range2])
+  sigma2 = getSD(x[range2],y[range2])
+  fitP = fitG_2(x[range2],y[range2],sigma2,mu2,scale,useBounds)
+  p2 = fitP$par
+  
+  mu3 = weighted.mean(x[range3],y[range3])
+  sigma3 = getSD(x[range3],y[range3])
+  fitP = fitG_2(x[range3],y[range3],sigma3,mu3,scale,useBounds)
+  p3 = fitP$par
+  
+  fit3P = fit3G_2(x, y, sigma1, sigma2, sigma3, p[1], p[2], p2[1], p2[2], p3[1], p3[2], useBounds)
+  p4 = fit3P$par
+  
+  # plot ##############################
+  sumFit2 = (p4[2]*dnorm(x2,p4[1],sigma1))+(p4[4]*dnorm(x2,p4[3],sigma2))+(p4[6]*dnorm(x2,p4[5],sigma3))
+  sumFit = (p4[2]*dnorm(x,p4[1],sigma1))+(p4[4]*dnorm(x,p4[3],sigma2))+(p4[6]*dnorm(x,p4[5],sigma3))
+  fq=getFitQuality(x,y,sort(c(p4[1],p4[3],p4[5]))[1],sort(c(p4[1],p4[3],p4[5]))[3],resol,sumFit=sumFit)$fq_new
+  
+  if (plot & (fq < FQ)) lines(x2,p4[2]*dnorm(x2,p4[1],sigma1),col="yellow")
+  if (plot & (fq < FQ)) abline(v = p4[1], col="yellow")
+  fwhm = getFwhm(p4[1],resol)
+  half_max = max(p4[2]*dnorm(x2,p4[1],sigma1))*0.5
+  if (plot & (fq < FQ)) lines(c(p4[1] - 0.5*fwhm, p4[1] + 0.5*fwhm),c(half_max,half_max),col="orange")
+  
+  if (plot & (fq < FQ)) lines(x2,p4[4]*dnorm(x2,p4[3],sigma2),col="yellow")
+  if (plot & (fq < FQ)) abline(v = p4[3], col="yellow")
+  fwhm = getFwhm(p4[3],resol)
+  half_max = max(p4[4]*dnorm(x2,p4[3],sigma2))*0.5
+  if (plot & (fq < FQ)) lines(c(p4[3] - 0.5*fwhm, p4[3] + 0.5*fwhm),c(half_max,half_max),col="orange")
+  
+  if (plot & (fq < FQ)) lines(x2,p4[6]*dnorm(x2,p4[5],sigma3),col="yellow")
+  if (plot & (fq < FQ)) abline(v = p4[5], col="yellow")
+  fwhm = getFwhm(p4[5],resol)
+  half_max = max(p4[6]*dnorm(x2,p4[5],sigma3))*0.5
+  if (plot & (fq < FQ)) lines(c(p4[5] - 0.5*fwhm, p4[5] + 0.5*fwhm),c(half_max,half_max),col="orange")
+  
+  if (plot & (fq < FQ)) lines(x2,sumFit2,col="red")
+  
+  h2=c(paste("mean =", p4[1], sep=" "),
+       paste("mean =", p4[3], sep=" "),
+       paste("mean =", p4[5], sep=" "),
+       paste("fq =", fq, sep=" "))
+  
+  if (plot & (fq < FQ)) legend("topright", legend=h2)
+  #########################################
+  
+  #   area1 = sum(p4[2]*dnorm(x2,p4[1],sigma1))
+  #   area2 = sum(p4[4]*dnorm(x2,p4[3],sigma2))
+  #   area3 = sum(p4[6]*dnorm(x2,p4[5],sigma3))
+  
+  #   area1 = max(p4[2]*dnorm(x2,p4[1],sigma1))
+  #   area2 = max(p4[4]*dnorm(x2,p4[3],sigma2))
+  #   area3 = max(p4[6]*dnorm(x2,p4[5],sigma3))
+  
+  area1 = getArea(p4[1],resol,p4[2],sigma1,int.factor)
+  area2 = getArea(p4[3],resol,p4[4],sigma2,int.factor)
+  area3 = getArea(p4[5],resol,p4[6],sigma3,int.factor)
+  
+  peak.area = c(peak.area, area1)
+  peak.area = c(peak.area, area2)
+  peak.area = c(peak.area, area3)
+  
+  peak.mean = c(peak.mean, p4[1])
+  peak.mean = c(peak.mean, p4[3])
+  peak.mean = c(peak.mean, p4[5])
+  
+  peak.scale = c(peak.scale, p4[2])
+  peak.scale = c(peak.scale, p4[4])
+  peak.scale = c(peak.scale, p4[6])
+  
+  peak.sigma = c(peak.sigma, sigma1)
+  peak.sigma = c(peak.sigma, sigma2)
+  peak.sigma = c(peak.sigma, sigma3)
+  
+  return(list("mean"=peak.mean, "scale"=peak.scale, "sigma"=peak.sigma, "area"=peak.area, "qual"=fq))
+  
+}
\ No newline at end of file
diff --git a/DIMS/AddOnFunctions/fit4G.R b/DIMS/AddOnFunctions/fit4G.R
new file mode 100644
index 0000000..6cad63d
--- /dev/null
+++ b/DIMS/AddOnFunctions/fit4G.R
@@ -0,0 +1,18 @@
+fit4G_2 <- function(x,y,sig1,sig2,sig3,sig4,mu1,scale1,mu2,scale2,mu3,scale3,mu4,scale4,useBounds){
+  
+  f = function(p){
+    d = p[2]*dnorm(x,mean=p[1],sd=sig1) + p[4]*dnorm(x,mean=p[3],sd=sig2) + p[6]*dnorm(x,mean=p[5],sd=sig3) + p[8]*dnorm(x,mean=p[7],sd=sig4)
+    sum((d-y)^2)
+  }
+  
+  if (useBounds){
+    lower = c(x[1],0,x[1],0,x[1],0,x[1],0)
+    upper = c(x[length(x)],Inf,x[length(x)],Inf,x[length(x)],Inf,x[length(x)],Inf)
+    
+    optim(c(mu1,scale1,mu2,scale2,mu3,scale3,mu4,scale4),f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper)
+    
+  } else {
+    optim(c(mu1,scale1,mu2,scale2,mu3,scale3,mu4,scale4),f,control=list(maxit=10000))
+  }
+  
+}
diff --git a/DIMS/AddOnFunctions/fit4peaks.R b/DIMS/AddOnFunctions/fit4peaks.R
new file mode 100644
index 0000000..a506541
--- /dev/null
+++ b/DIMS/AddOnFunctions/fit4peaks.R
@@ -0,0 +1,149 @@
+fit4peaks <- function(x2,x,y,index,scale,resol,useBounds=FALSE,plot=FALSE,FQ,int.factor) {
+  
+  peak.mean = NULL
+  peak.area = NULL
+  peak.scale = NULL
+  peak.sigma = NULL
+  
+  range1=c(index[1]-2,index[1]-1,index[1],index[1]+1,index[1]+2)
+  range2=c(index[2]-2,index[2]-1,index[2],index[2]+1,index[2]+2)
+  range3=c(index[3]-2,index[3]-1,index[3],index[3]+1,index[3]+2)
+  range4=c(index[4]-2,index[4]-1,index[4],index[4]+1,index[4]+2)
+  if (range1[1]==0) range1=range1[-1]
+  if (length(x)<range4[length(range4)]) range4=range4[-length(range4)]
+  
+  range1=checkOverlap(range1,range2)[[1]]
+  range2=checkOverlap(range1,range2)[[2]]
+  range2=checkOverlap(range2,range3)[[1]]
+  range3=checkOverlap(range2,range3)[[2]]
+  range3=checkOverlap(range3,range4)[[1]]
+  range4=checkOverlap(range3,range4)[[2]]
+  
+  remove=which(range4>length(x))
+  if (length(remove)>0) {
+    range4=range4[-remove]
+    # message(length(range4))
+  }
+  
+  # check for negative or 0
+  remove=which(range1<1)
+  if (length(remove)>0) range1=range1[-remove]
+  remove=which(range2<1)
+  if (length(remove)>0) range2=range2[-remove]
+  remove=which(range3<1)
+  if (length(remove)>0) range3=range3[-remove]
+  remove=which(range4<1)
+  if (length(remove)>0) range4=range4[-remove]
+  
+  # remove NA
+  if (length(which(is.na(y[range1])))!=0) range1=range1[-which(is.na(y[range1]))]
+  if (length(which(is.na(y[range2])))!=0) range2=range2[-which(is.na(y[range2]))]
+  if (length(which(is.na(y[range3])))!=0) range3=range3[-which(is.na(y[range3]))]
+  if (length(which(is.na(y[range4])))!=0) range4=range4[-which(is.na(y[range4]))]
+  
+  mu1 = weighted.mean(x[range1],y[range1])
+  sigma1 = getSD(x[range1],y[range1])
+  fitP = fitG_2(x[range1],y[range1],sigma1,mu1,scale,useBounds)
+  p = fitP$par
+  
+  mu2 = weighted.mean(x[range2],y[range2])
+  sigma2 = getSD(x[range2],y[range2])
+  fitP = fitG_2(x[range2],y[range2],sigma2,mu2,scale,useBounds)
+  p2 = fitP$par
+  
+  mu3 = weighted.mean(x[range3],y[range3])
+  sigma3 = getSD(x[range3],y[range3])
+  fitP = fitG_2(x[range3],y[range3],sigma3,mu3,scale,useBounds)
+  p3 = fitP$par
+  
+  mu4 = weighted.mean(x[range4],y[range4])
+  sigma4 = getSD(x[range4],y[range4])
+  fitP = fitG_2(x[range4],y[range4],sigma4,mu4,scale,useBounds)
+  p4 = fitP$par
+  
+  fit4P = fit4G_2(x, y, sigma1, sigma2, sigma3, sigma3, p[1], p[2], p2[1], p2[2], p3[1], p3[2],  p4[1], p4[2], useBounds)
+  p5 = fit4P$par
+  
+  # plot #####################################
+  sumFit2 = (p5[2]*dnorm(x2,p5[1],sigma1))+(p5[4]*dnorm(x2,p5[3],sigma2))+(p5[6]*dnorm(x2,p5[5],sigma3))+(p5[8]*dnorm(x2,p5[7],sigma3))
+  sumFit = (p5[2]*dnorm(x,p5[1],sigma1))+(p5[4]*dnorm(x,p5[3],sigma2))+(p5[6]*dnorm(x,p5[5],sigma3))+(p5[8]*dnorm(x,p5[7],sigma3))
+  fq=getFitQuality(x,y,sort(c(p5[1],p5[3],p5[5],p5[7]))[1],sort(c(p5[1],p5[3],p5[5],p5[7]))[4],resol,sumFit=sumFit)$fq_new
+  
+  if (plot & (fq < FQ)) lines(x2,p5[2]*dnorm(x2,p5[1],sigma1),col="purple")
+  if (plot & (fq < FQ)) abline(v = p5[1], col="purple")
+  fwhm = getFwhm(p5[1],resol)
+  half_max = max(p5[2]*dnorm(x2,p5[1],sigma1))*0.5
+  if (plot & (fq < FQ)) lines(c(p5[1] - 0.5*fwhm, p5[1] + 0.5*fwhm),c(half_max,half_max),col="orange")
+  
+  if (plot & (fq < FQ)) lines(x2,p5[4]*dnorm(x2,p5[3],sigma2),col="purple")
+  if (plot & (fq < FQ)) abline(v = p5[3], col="purple")
+  fwhm = getFwhm(p5[3],resol)
+  half_max = max(p5[4]*dnorm(x2,p5[3],sigma2))*0.5
+  if (plot & (fq < FQ)) lines(c(p5[3] - 0.5*fwhm, p5[3] + 0.5*fwhm),c(half_max,half_max),col="orange")
+  
+  if (plot & (fq < FQ)) lines(x2,p5[6]*dnorm(x2,p5[5],sigma3),col="purple")
+  if (plot & (fq < FQ)) abline(v = p5[5], col="purple")
+  fwhm = getFwhm(p5[5],resol)
+  half_max = max(p5[6]*dnorm(x2,p5[5],sigma3))*0.5
+  if (plot & (fq < FQ)) lines(c(p5[5] - 0.5*fwhm, p5[5] + 0.5*fwhm),c(half_max,half_max),col="orange")
+  
+  if (plot & (fq < FQ)) lines(x2,p5[8]*dnorm(x2,p5[7],sigma3),col="purple")
+  if (plot & (fq < FQ)) abline(v = p5[7], col="purple")
+  fwhm = getFwhm(p5[7],resol)
+  half_max = max(p5[8]*dnorm(x2,p5[7],sigma3))*0.5
+  if (plot & (fq < FQ)) lines(c(p5[7] - 0.5*fwhm, p5[7] + 0.5*fwhm),c(half_max,half_max),col="orange")
+  
+  if (plot & (fq < FQ)) lines(x2,sumFit2,col="blue")
+  
+  #fq = abs(sum(y) - sum(sumFit))/sum(y)
+  #fq=abs(sum(y) - sum(sumFit))/sum(y)
+  #fq=mean(abs(sumFit - y)/sumFit)
+  
+  
+  h2=c(paste("mean =", p5[1], sep=" "),
+       paste("mean =", p5[3], sep=" "),
+       paste("mean =", p5[5], sep=" "),
+       paste("mean =", p5[7], sep=" "),
+       paste("fq =", fq, sep=" "))
+  
+  if (plot & (fq < FQ)) legend("topright", legend=h2)
+  #############################################
+  
+  #   area1 = sum(p5[2]*dnorm(x2,p5[1],sigma1))
+  #   area2 = sum(p5[4]*dnorm(x2,p5[3],sigma2))
+  #   area3 = sum(p5[6]*dnorm(x2,p5[5],sigma3))
+  #   area4 = sum(p5[8]*dnorm(x2,p5[7],sigma4))
+  
+  #   area1 = max(p5[2]*dnorm(x2,p5[1],sigma1))
+  #   area2 = max(p5[4]*dnorm(x2,p5[3],sigma2))
+  #   area3 = max(p5[6]*dnorm(x2,p5[5],sigma3))
+  #   area4 = max(p5[8]*dnorm(x2,p5[7],sigma4))
+  
+  area1 = getArea(p5[1],resol,p5[2],sigma1,int.factor)
+  area2 = getArea(p5[3],resol,p5[4],sigma2,int.factor)
+  area3 = getArea(p5[5],resol,p5[6],sigma3,int.factor)
+  area4 = getArea(p5[7],resol,p5[8],sigma4,int.factor)
+  
+  peak.area = c(peak.area, area1)
+  peak.area = c(peak.area, area2)
+  peak.area = c(peak.area, area3)
+  peak.area = c(peak.area, area4)
+  
+  peak.mean = c(peak.mean, p5[1])
+  peak.mean = c(peak.mean, p5[3])
+  peak.mean = c(peak.mean, p5[5])
+  peak.mean = c(peak.mean, p5[7])
+  
+  peak.scale = c(peak.scale, p5[2])
+  peak.scale = c(peak.scale, p5[4])
+  peak.scale = c(peak.scale, p5[6])
+  peak.scale = c(peak.scale, p5[8])
+  
+  peak.sigma = c(peak.sigma, sigma1)
+  peak.sigma = c(peak.sigma, sigma2)
+  peak.sigma = c(peak.sigma, sigma3)
+  peak.sigma = c(peak.sigma, sigma4)
+  
+  return(list("mean"=peak.mean, "scale"=peak.scale, "sigma"=peak.sigma, "area"=peak.area, "qual"=fq))
+  
+}
\ No newline at end of file
diff --git a/DIMS/AddOnFunctions/fitG.R b/DIMS/AddOnFunctions/fitG.R
new file mode 100644
index 0000000..b1b595d
--- /dev/null
+++ b/DIMS/AddOnFunctions/fitG.R
@@ -0,0 +1,18 @@
+fitG_2 <- function(x,y,sig,mu,scale,useBounds) {
+  
+  f = function(p) {
+    d = p[2]*dnorm(x,mean=p[1],sd=sig)
+    sum((d-y)^2)
+  }
+  
+  if (useBounds){
+    lower = c(x[1],0,x[1],0)
+    upper = c(x[length(x)],Inf,x[length(x)],Inf)
+    
+    optim(c(as.numeric(mu), as.numeric(scale)),
+          f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper)
+  } else {  
+    #optim(c(mu,scale),f)
+    optim(c(as.numeric(mu),as.numeric(scale)),f,control=list(maxit=10000))
+  }  
+}
diff --git a/DIMS/AddOnFunctions/fitGaussian.R b/DIMS/AddOnFunctions/fitGaussian.R
new file mode 100644
index 0000000..3c5027d
--- /dev/null
+++ b/DIMS/AddOnFunctions/fitGaussian.R
@@ -0,0 +1,327 @@
+fitGaussian <- function(x2,x,y,index,scale,resol,outdir,force,useBounds,plot,scanmode,int.factor,width,height) {
+  # force=length(index)
+  # useBounds=FALSE
+  
+  peak.mean = NULL
+  peak.area = NULL
+  peak.qual = NULL
+  peak.min = NULL
+  peak.max = NULL
+  
+  FQ1 = 0.15
+  FQ = 0.2
+  
+  # One local max
+  if (force==1){
+    
+    retVal = fit1Peak(x2,x,y,index,scale,resol,plot,FQ1,useBounds)
+    
+    scale = 2
+    
+    if (retVal$mean[1]<x[1] | retVal$mean[1]>x[length(x)]) {   # <=== mean outside range
+      
+      # do it again
+      return(fitGaussian(x2,x,y,index,scale,resol,outdir,force=1,useBounds=TRUE,plot,scanmode,int.factor,width,height))
+      
+    } else { # <=== mean within range
+      
+      if (retVal$qual > FQ1){ # <=== bad fit
+        
+        # Try to fit two curves
+        
+        # diff(diff(x)) essentially computes the discrete analogue of the second derivative, so should be negative at local maxima.
+        # The +1 below takes care of the fact that the result of diff is shorter than the input vector.
+        new_index=which(diff(sign(diff(y)))==-2)+1
+        
+        if (length(new_index)!=2) {
+          new_index = round(length(x)/3)
+          new_index = c(new_index, 2*new_index)
+        }
+        
+        #length(new_index)  
+        return(fitGaussian(x2,x,y,new_index,scale,resol,outdir,force=2,useBounds=FALSE,plot,scanmode,int.factor,width,height))
+        
+      } else { # <=== good fit
+        
+        peak.mean = c(peak.mean, retVal$mean)
+        #peak.area = c(peak.area, sum(retVal$scale*dnorm(x2,retVal$mean,retVal$sigma)))
+        # "Centroid mode"
+        #         peak.area = c(peak.area, max(retVal$scale*dnorm(x2,retVal$mean,retVal$sigma)))
+        peak.area = c(peak.area, getArea(retVal$mean,resol,retVal$scale,retVal$sigma,int.factor))
+        peak.qual = retVal$qual
+        peak.min = x[1]
+        peak.max = x[length(x)]
+      }
+    }
+    
+    # Two local max  
+  } else if (force==2 & (length(x)>6)) {
+    
+    # fit two curves
+    retVal = fit2peaks(x2,x,y,index,scale,resol,useBounds,plot,FQ,int.factor) # <=== fit 2 curves
+    
+    if (retVal$mean[1]<x[1] | retVal$mean[1]>x[length(x)] |   # <=== one of means outside range
+        retVal$mean[2]<x[1] | retVal$mean[2]>x[length(x)]) {
+      
+      # check quality
+      if (retVal$qual > FQ) { # <=== bad fit 
+        
+        # do it again
+        return(fitGaussian(x2,x,y,index,scale,resol,outdir,force=2,useBounds=TRUE,plot,scanmode,int.factor,width,height))
+        
+        # good fit
+      } else {
+        
+        # check which mean is outside range
+        # Todo ========> check this!!!!!!!!!!!!!!!
+        for (i in 1:length(retVal$mean)){
+          if (retVal$mean[i]<x[1] | retVal$mean[i]>x[length(x)] ) {
+            peak.mean = c(peak.mean, -i)
+            peak.area = c(peak.area, -i)
+          } else {
+            peak.mean = c(peak.mean, retVal$mean[i])
+            peak.area = c(peak.area, retVal$area[i])
+          }
+        }
+        peak.qual = retVal$qual
+        peak.min = x[1]
+        peak.max = x[length(x)]
+      }
+    } else { # <=== all means within range
+      
+      if (retVal$qual > FQ) { # <=== bad fit 
+        
+        # Try to fit three curves
+        new_index=which(diff(sign(diff(y)))==-2)+1
+        
+        if (length(new_index)!=3) {
+          new_index = round(length(x)/4)
+          new_index = c(new_index, 2*new_index, 3*new_index)
+        }
+        
+        #length(new_index)      
+        return(fitGaussian(x2,x,y,new_index,scale,resol,outdir,force=3,useBounds=FALSE,plot,scanmode,int.factor,width,height))
+        
+      } else { # <======== good fit
+        
+        # check if means are within 3 ppm and sum if so  
+        tmp = retVal$qual 
+        
+        nMeanNew = -1
+        nMean = length(retVal$mean)
+        while (nMean!=nMeanNew){
+          nMean = length(retVal$mean)
+          retVal = isWithinXppm(retVal$mean, retVal$scale, retVal$sigma, retVal$area, x2, x, ppm=4, resol, plot)
+          nMeanNew = length(retVal$mean)
+        }
+        
+        retVal$qual = tmp
+        
+        h2=NULL
+        
+        for (i in 1:length(retVal$mean)){
+          h2 = c(h2, paste("mean =", retVal$mean[i], sep=" "))
+          
+          peak.mean = c(peak.mean, retVal$mean[i])
+          peak.area = c(peak.area, retVal$area[i])
+        }
+        peak.qual = retVal$qual
+        peak.min = x[1]
+        peak.max = x[length(x)]
+        
+        h2 = c(h2, paste("fq =", retVal$qual, sep=" "))
+        if (plot) legend("topright", legend=h2)
+      }  
+    }
+    
+    # Three local max  
+  } else if (force==3 & (length(x)>6)){
+    
+    retVal = fit3peaks(x2,x,y,index,scale,resol,useBounds,plot,FQ,int.factor)
+    
+    # outside range
+    if (retVal$mean[1]<x[1] | retVal$mean[1]>x[length(x)] | # <=== one of means outside range
+        retVal$mean[2]<x[1] | retVal$mean[2]>x[length(x)] |
+        retVal$mean[3]<x[1] | retVal$mean[3]>x[length(x)]) {
+      
+      # check quality
+      if (retVal$qual > FQ) { # <=== bad fit 
+        
+        # do it again
+        return(fitGaussian(x2,x,y,index,scale,resol,outdir,force,useBounds=TRUE,plot,scanmode,int.factor,width,height))
+        
+        # good fit
+      } else {
+        
+        # check which mean is outside range
+        # Todo ========> check this!!!!!!!!!!!!!!!
+        for (i in 1:length(retVal$mean)){
+          if (retVal$mean[i]<x[1] | retVal$mean[i]>x[length(x)] ) {
+            peak.mean = c(peak.mean, -i)
+            peak.area = c(peak.area, -i)
+          } else {
+            peak.mean = c(peak.mean, retVal$mean[i])
+            peak.area = c(peak.area, retVal$area[i])
+          }
+        }
+        peak.qual = retVal$qual
+        peak.min = x[1]
+        peak.max = x[length(x)]
+      }
+      
+    } else { # <=== all means within range
+      
+      if (retVal$qual > FQ) { # <=== bad fit 
+        
+        # Try to fit four curves
+        new_index=which(diff(sign(diff(y)))==-2)+1
+        
+        if (length(new_index)!=4) {
+          new_index = round(length(x)/5)
+          new_index = c(new_index, 2*new_index, 3*new_index, 4*new_index)
+        }
+        
+        #length(new_index)
+        return(fitGaussian(x2,x,y,new_index,scale,resol,outdir,force=4,useBounds=FALSE,plot,scanmode,int.factor,width,height))
+        
+      } else { # <======== good fit
+        
+        # check if means are within 3 ppm and sum if so  
+        tmp = retVal$qual 
+        
+        nMeanNew = -1
+        nMean = length(retVal$mean)
+        while (nMean!=nMeanNew){
+          nMean = length(retVal$mean)
+          retVal = isWithinXppm(retVal$mean, retVal$scale, retVal$sigma, retVal$area, x2, x, ppm=4, resol, plot)
+          nMeanNew = length(retVal$mean)
+        }
+        
+        retVal$qual = tmp
+        
+        h2=NULL
+        #         peak.mean=NULL
+        #         peak.area=NULL
+        
+        for (i in 1:length(retVal$mean)){
+          h2 = c(h2, paste("mean =", retVal$mean[i], sep=" "))
+          
+          peak.mean = c(peak.mean, retVal$mean[i])
+          peak.area = c(peak.area, retVal$area[i])
+        }
+        peak.qual = retVal$qual
+        peak.min = x[1]
+        peak.max = x[length(x)]
+        
+        h2 = c(h2, paste("fq =", retVal$qual, sep=" "))
+        if (plot) legend("topright", legend=h2)
+        
+      }  
+    }
+    
+    # Four local max  
+  } else if (force==4 & (length(x)>6)){
+    
+    retVal = fit4peaks(x2,x,y,index,scale,resol,useBounds,plot,FQ,int.factor)
+    
+    if (retVal$mean[1]<x[1] | retVal$mean[1]>x[length(x)] |
+        retVal$mean[2]<x[1] | retVal$mean[2]>x[length(x)] |
+        retVal$mean[3]<x[1] | retVal$mean[3]>x[length(x)] |
+        retVal$mean[4]<x[1] | retVal$mean[4]>x[length(x)]) {
+      
+      # check quality
+      if (retVal$qual > FQ) { # <=== bad fit 
+        
+        # do it again
+        return(fitGaussian(x2,x,y,index,scale,resol,outdir,force,useBounds=TRUE,plot,scanmode,int.factor,width,height))
+        
+        # good fit
+      } else {
+        
+        # check which mean is outside range
+        # Todo ========> check this!!!!!!!!!!!!!!!
+        for (i in 1:length(retVal$mean)){
+          if (retVal$mean[i]<x[1] | retVal$mean[i]>x[length(x)] ) {
+            peak.mean = c(peak.mean, -i)
+            peak.area = c(peak.area, -i)
+          } else {
+            peak.mean = c(peak.mean, retVal$mean[i])
+            peak.area = c(peak.area, retVal$area[i])
+          }
+        }
+        peak.qual = retVal$qual
+        peak.min = x[1]
+        peak.max = x[length(x)]
+      }
+      
+    } else {
+      
+      if (retVal$qual > FQ) { # <=== bad fit 
+        
+        # Generate 1 curve
+        return(fitGaussian(x2,x,y,index,scale,resol,outdir,force=5,useBounds=FALSE,plot,scanmode,int.factor,width,height))
+        
+      } else { # <======== good fit
+        
+        # check if means are within 3 ppm and sum if so  
+        tmp = retVal$qual 
+        
+        nMeanNew = -1
+        nMean = length(retVal$mean)
+        while (nMean!=nMeanNew){
+          nMean = length(retVal$mean)
+          retVal = isWithinXppm(retVal$mean, retVal$scale, retVal$sigma, retVal$area, x2, x, ppm=4, resol, plot)
+          nMeanNew = length(retVal$mean)
+        }
+        
+        retVal$qual = tmp
+        
+        h2=NULL
+        
+        for (i in 1:length(retVal$mean)){
+          h2 = c(h2, paste("mean =", retVal$mean[i], sep=" "))
+          
+          peak.mean = c(peak.mean, retVal$mean[i])
+          peak.area = c(peak.area, retVal$area[i])
+        }
+        peak.qual = retVal$qual
+        peak.min = x[1]
+        peak.max = x[length(x)]
+        
+        h2 = c(h2, paste("fq =", retVal$qual, sep=" "))
+        if (plot) legend("topright", legend=h2)
+      }  
+    }
+    
+    # More then four local max  
+  } else {
+    
+    scale=2
+    FQ1=0.40
+    useBounds=TRUE
+    index=which(y==max(y))
+    retVal = fit1Peak(x2,x,y,index,scale,resol,plot,FQ1,useBounds)
+    
+    if (retVal$qual > FQ1){ # <=== bad fit
+      
+      if (plot) dev.off()    
+      
+      rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height)
+      peak.mean = c(peak.mean, rval$mean)
+      peak.area = c(peak.area, rval$area)
+      peak.min = rval$min
+      peak.max = rval$max
+      peak.qual = 0
+      
+    } else { # <=== good fit
+      
+      peak.mean = c(peak.mean, retVal$mean)
+      peak.area = c(peak.area, getArea(retVal$mean,resol,retVal$scale,retVal$sigma,int.factor))
+      peak.qual = retVal$qual
+      peak.min = x[1]
+      peak.max = x[length(x)]
+    }
+  }
+  
+  return(list("mean"=peak.mean, "area"=peak.area, "qual" = peak.qual, "min"=peak.min, "max"=peak.max))
+}
diff --git a/DIMS/AddOnFunctions/fitGaussianInit.R b/DIMS/AddOnFunctions/fitGaussianInit.R
new file mode 100644
index 0000000..1c92bce
--- /dev/null
+++ b/DIMS/AddOnFunctions/fitGaussianInit.R
@@ -0,0 +1,60 @@
+fitGaussianInit <- function(x,y,int.factor,scale,resol,outdir,sampname,scanmode,plot,width,height,i,start,end) {
+  # scanmode="negative"
+  
+  mz.range = x[length(x)] - x[1]
+  x2 = seq(x[1],x[length(x)],length=mz.range*int.factor)
+  
+  #   # diff(diff(x)) essentially computes the discrete analogue of the second derivative, so should be negative at local maxima.
+  #   # The +1 below takes care of the fact that the result of diff is shorter than the input vector.
+  #   index=which(diff(sign(diff(y)))==-2)+1
+  
+  # Alway try to fit one curve first
+  index = which(y==max(y))
+  
+  if (scanmode=="positive"){
+    plot_label="pos_fit.png"
+  } else {
+    plot_label="neg_fit.png"
+  }
+  
+  if (plot) {
+    CairoPNG(filename=paste(outdir,"Gaussian_fit",paste(sampname, x[1], plot_label, sep="_"), sep="/"), width, height)
+    plot(x,y,xlab="m/z",ylab="I", ylim=c(0,1.5*max(y)),main=paste(i,start,end, sep=" ")) #, ylim=c(0,1e+05)
+  }
+  
+  retVal = fitGaussian(x2,x,y,index,scale,resol,outdir,force=length(index),useBounds=FALSE,plot,scanmode,int.factor,width,height)
+  
+  if (plot) {
+    if (length(retVal$mean)==1) {
+      
+      result = tryCatch(dev.off(), warning = function(w){},
+                        error=function(e){},
+                        finally = {})
+      
+      # file.rename(paste(outdir,"Gaussian_fit", paste(sampname, x[1], plot_label, sep="_"), sep="/"),
+      #             paste(outdir,"Gaussian_fit", paste(sampname, round(retVal$mean,digits = 6), plot_label, sep="_"), sep="/"))
+    } else {
+      
+      
+      h2=NULL
+      for (i in 1:length(retVal$mean)){
+        h2=c(h2, paste("mean =", retVal$mean[i], sep=" "))
+      }
+      h2=c(h2, paste("fq =", retVal$qual, sep=" "))
+      legend("topright", legend=h2)
+      
+      dev.off()
+      
+      for (i in 1:length(retVal$mean)){
+        if (retVal$mean[i]!=-1){
+          file.copy(paste(outdir,"Gaussian_fit", paste(sampname, x[1], plot_label, sep="_"), sep="/"),
+                    paste(outdir,"Gaussian_fit", paste(sampname, round(retVal$mean[i],digits = 6), plot_label, sep="_"), sep="/"))
+        }
+      }
+      file.remove(paste(outdir,"Gaussian_fit", paste(sampname, x[1], plot_label, sep="_"), sep="/"))
+    }
+  }
+  
+  return(list("mean"=retVal$mean, "area"=retVal$area, "qual"=retVal$qual, "min"=retVal$min, "max"=retVal$max))
+  
+}
diff --git a/DIMS/AddOnFunctions/generateBreaksFwhm.R b/DIMS/AddOnFunctions/generateBreaksFwhm.R
new file mode 100644
index 0000000..1a42631
--- /dev/null
+++ b/DIMS/AddOnFunctions/generateBreaksFwhm.R
@@ -0,0 +1,26 @@
+# options(digits=16)
+# resol = 140000
+# breaks.fwhm = 1
+# i = 1
+# breaks.fwhm.avg = NULL
+#
+# Sys.time()
+#
+# while (breaks.fwhm[length(breaks.fwhm)]<1000){
+#
+#   resol.mz = resol*(1/sqrt(2)^(log2(breaks.fwhm[i]/200)))
+#   fwhm.0.1 = (breaks.fwhm[i]/resol.mz)/10
+#   breaks.fwhm = c(breaks.fwhm, breaks.fwhm[i] + fwhm.0.1)
+#   breaks.fwhm.avg = c(breaks.fwhm.avg,(breaks.fwhm[i] +breaks.fwhm[i+1])/2)
+#
+#   if (i %% 10000 == 0){
+#     cat(paste("i =", i))
+#     cat(paste("breaks.fwhm =", breaks.fwhm[length(breaks.fwhm)]))
+#   }
+#
+#   i = i + 1
+# }
+#
+# Sys.time()
+# save(list=c("breaks.fwhm", "breaks.fwhm.avg"), file="breaks.RData")
+# cat("Breaks saved!")
diff --git a/DIMS/AddOnFunctions/generateExcelFile.R b/DIMS/AddOnFunctions/generateExcelFile.R
new file mode 100644
index 0000000..31474d9
--- /dev/null
+++ b/DIMS/AddOnFunctions/generateExcelFile.R
@@ -0,0 +1,24 @@
+generateExcelFile <- function(peaklist,  fileName, sub, plot = TRUE) {
+  # plotdir=file.path(plotdir)
+  # imageNum=2
+  # fileName="./results/xls/Pos_allpgrps_identified"
+  # subName=c("","_box")
+  
+  end <- 0
+  i <- 0
+  
+  if (dim(peaklist)[1]>=sub & (sub>0)){
+    for (i in 1:floor(dim(peaklist)[1]/sub)){
+      start=-(sub-1)+i*sub
+      end=i*sub
+      message(paste0(start, ":", end))
+      
+      genExcelFileV3(peaklist[c(start:end),], paste(fileName, sep="_"), plot)
+    }
+  }  
+  start = end + 1
+  end = dim(peaklist)[1]
+  message(start)
+  message(end)
+  genExcelFileV3(peaklist[c(start:end),], paste(fileName, i+1, sep="_"), plot)
+}
diff --git a/DIMS/AddOnFunctions/generateGaussian.R b/DIMS/AddOnFunctions/generateGaussian.R
new file mode 100644
index 0000000..da6dec2
--- /dev/null
+++ b/DIMS/AddOnFunctions/generateGaussian.R
@@ -0,0 +1,48 @@
+generateGaussian <- function(x,y,resol,plot,scanmode,int.factor,width,height) {
+  
+  factor=1.5
+  index = which(y==max(y))[1]
+  x=x[index]
+  y=y[index]
+  mu = x
+  fwhm = getFwhm(mu,resol)
+  x.p = c(mu-factor*fwhm, x, mu+factor*fwhm)
+  y.p = c(0, y, 0)
+  
+  #   if (plot) dir.create("./results/plots",showWarnings = FALSE)
+  #   if (plot) dir.create("./results/plots/Gaussian_fit",showWarnings = FALSE)
+  
+  if (plot) {
+    if (scanmode=="positive"){
+      plot_label="pos_fit.png"
+    } else {
+      plot_label="neg_fit.png"
+    }
+  }
+  
+  mz.range = x.p[length(x.p)] - x.p[1]
+  x2 = seq(x.p[1],x.p[length(x.p)],length=mz.range*int.factor)
+  sigma = getSD(x.p,y.p)
+  scale = optimizeGauss(x.p,y.p,sigma,mu)
+  
+  if (plot) {
+    CairoPNG(filename=paste("./results/Gaussian_fit",paste(sampname, mu, plot_label, sep="_"), sep="/"), width, height)
+    
+    plot(x.p,y.p,xlab="m/z",ylab="I", ylim=c(0,1.5*max(y)))
+    lines(x2,scale*dnorm(x2,mu,sigma), col="green")
+    half_max = max(scale*dnorm(x2,mu,sigma))*0.5
+    lines(c(mu - 0.5*fwhm, mu + 0.5*fwhm),c(half_max,half_max),col="orange")
+    abline(v = mu, col="green")
+    h=c(paste("mean =", mu, sep=" "))
+    legend("topright", legend=h)
+    
+    dev.off()
+  }
+  
+  #  area = sum(scale*dnorm(x2,mu,sigma))
+  #  area = max(scale*dnorm(x2,mu,sigma))
+  area = getArea(mu,resol,scale,sigma,int.factor)
+  
+  return(list("mean"=mu, "area"=area, "min"=x2[1] , "max"=x2[length(x2)]))
+  
+}
\ No newline at end of file
diff --git a/DIMS/AddOnFunctions/getArea.R b/DIMS/AddOnFunctions/getArea.R
new file mode 100644
index 0000000..415b050
--- /dev/null
+++ b/DIMS/AddOnFunctions/getArea.R
@@ -0,0 +1,23 @@
+getArea <- function(mu,resol,scale,sigma,int.factor){
+  
+  #   mu=p3[1]
+  #   scale=p3[2]
+  #   sigma=sigma1
+  
+  #   mu=p3[1]
+  #   scale=p3[2]
+  #   sigma=sigma1
+  
+  # avoid too big vector (cannot allocate vector of size ...)
+  if (mu>1200) return(0)
+  
+  fwhm = getFwhm(mu,resol)
+  mzMin = mu - 2*fwhm
+  mzMax = mu + 2*fwhm
+  mz.range = mzMax - mzMin 
+  x_int = seq(mzMin,mzMax,length=mz.range*int.factor)
+  
+  #plot(x_int,scale*dnorm(x_int,mu,sigma),col="red",type="l")
+  
+  return(sum(scale*dnorm(x_int,mu,sigma))/100)  
+}
diff --git a/DIMS/AddOnFunctions/getDeltaMZ.R b/DIMS/AddOnFunctions/getDeltaMZ.R
new file mode 100644
index 0000000..f09a0a4
--- /dev/null
+++ b/DIMS/AddOnFunctions/getDeltaMZ.R
@@ -0,0 +1,5 @@
+getDeltaMZ <- function(mass, breaks.fwhm){
+  index = which(breaks.fwhm<mass)
+  tmp = index[length(index)]
+  return(breaks.fwhm[tmp+1] - breaks.fwhm[tmp]) 
+}
diff --git a/DIMS/AddOnFunctions/getFitQuality.R b/DIMS/AddOnFunctions/getFitQuality.R
new file mode 100644
index 0000000..f9197c8
--- /dev/null
+++ b/DIMS/AddOnFunctions/getFitQuality.R
@@ -0,0 +1,65 @@
+getFitQuality <- function(x,y,muFirst,muLast,resol,scale=NULL,sigma=NULL,sumFit=NULL){
+  # muFirst=sort(c(p5[1],p5[3],p5[5],p5[7]))[1]
+  # muLast=sort(c(p5[1],p5[3],p5[5],p5[7]))[4]
+  
+  # muFirst=p2[1]
+  # muLast=p2[1]
+  # scale=p2[2]
+  
+  # x,y,sort(c(p5[1],p5[3],p5[5],p5[7]))[1],sort(c(p5[1],p5[3],p5[5],p5[7]))[4],resol,sumFit=sumFit
+  
+  if (is.null(sumFit)){
+    # fwhm = getFwhm(muFirst,resol)
+    # intRangeMin = muFirst - 1.2*fwhm
+    # intRangeMax = muFirst + 1.2*fwhm
+    # tmp = which((x>intRangeMin) & (x<intRangeMax))
+    
+    # # mu outside range!!!
+    # if (length(tmp)==0) return(list("fq_new"=1, "x_int"=x, "y_int"=y))
+    # 
+    # x_int = x[tmp]
+    # y_int = y[tmp] 
+    
+    x_int = x
+    y_int = y
+    
+    # fq_new = mean(abs((scale*dnorm(x_int,muFirst,sigma)) - y_int)/(scale*dnorm(x_int,muFirst,sigma)))
+    fq_new = mean(abs((scale*dnorm(x_int,muFirst,sigma)) - y_int)/rep((max(scale*dnorm(x_int,muFirst,sigma))/2),length(x_int)))
+    
+    # lines(x_int,scale*dnorm(x_int,muFirst,sigma), col="blue")
+    # fq_new = abs(sum(y) - sum(p2[2]*dnorm(x,p2[1],sigma)))/sum(y)
+    
+  } else {
+    # fwhm = getFwhm(muFirst,resol)
+    # intRangeMin = muFirst - 1.2*fwhm
+    # fwhm = getFwhm(muLast,resol)
+    # intRangeMax = muLast + 1.2*fwhm
+    # 
+    # tmp = which((x>intRangeMin) & (x<intRangeMax))
+    # 
+    # if (length(tmp)==0){
+    #   # outside range
+    #   fq_new = 1
+    #   x_int = x
+    #   y_int = y
+    #   
+    # } else {
+    
+    # x_int = x[tmp]
+    # y_int = y[tmp]
+    # sumFit_int = sumFit[tmp] 
+    
+    sumFit_int = sumFit
+    y_int = y
+    x_int = x
+    
+    # fq_new = mean(abs(sumFit_int - y_int)/sumFit_int)
+    fq_new = mean(abs(sumFit_int - y_int)/rep(max(sumFit_int)/2,length(sumFit_int)))
+    # }    
+  }
+  
+  # Caused by dividing by 0
+  if (is.nan(fq_new)) fq_new=1 
+  
+  return(list("fq_new"=fq_new, "x_int"=x_int, "y_int"=y_int))  
+}
diff --git a/DIMS/AddOnFunctions/getFwhm.R b/DIMS/AddOnFunctions/getFwhm.R
new file mode 100644
index 0000000..890a865
--- /dev/null
+++ b/DIMS/AddOnFunctions/getFwhm.R
@@ -0,0 +1,24 @@
+getFwhm <- function(mu,resol) {   
+  
+  if (is.nan(mu)) mu=0
+  if (is.na(mu)) mu=0
+  if (is.null(mu)) mu=0
+  if (mu<0) mu=0
+  
+  # tryCatch({
+  resol.mz = resol*(1/sqrt(2)^(log2(mu/200)))
+  fwhm = mu/resol.mz
+  #   }, warning = function(resol.mz, fwhm) {
+  #     message(warnings())
+  #     message(paste("====================> mu: ",mu))
+  #     message(paste("====================> resol.mz: ",resol.mz))
+  #   })
+  
+  #   , error = function(e) {
+  #     error-handler-code
+  #   }, finally = {
+  #     cleanup-code
+  #   })
+  
+  return(fwhm)
+}
diff --git a/DIMS/AddOnFunctions/getPatients.R b/DIMS/AddOnFunctions/getPatients.R
new file mode 100644
index 0000000..3b03f0a
--- /dev/null
+++ b/DIMS/AddOnFunctions/getPatients.R
@@ -0,0 +1,9 @@
+getPatients <- function(peaklist){
+  patients=colnames(peaklist)[grep("P", colnames(peaklist), fixed = TRUE)]
+  patients=unique(as.vector(unlist(lapply(strsplit(patients, ".", fixed = TRUE), function(x) x[1]))))
+  # ToDo: If 2 P's in sample names!!!!!!!!!!!!!
+  # patients=sort(as.numeric(unique(as.vector(unlist(lapply(strsplit(patients, "_P", fixed = TRUE), function(x) x[2]))))))
+  patients=sort(as.numeric(unique(as.vector(unlist(lapply(strsplit(patients, "P", fixed = TRUE), function(x) x[2]))))))
+  
+  return(patients)
+}
diff --git a/DIMS/AddOnFunctions/getSD.R b/DIMS/AddOnFunctions/getSD.R
new file mode 100644
index 0000000..197a002
--- /dev/null
+++ b/DIMS/AddOnFunctions/getSD.R
@@ -0,0 +1,9 @@
+getSD <- function(x,y,resol=140000) {   
+  
+  index = which(y==max(y))
+  mean = x[index]
+  resol.mz = resol*(1/sqrt(2)^(log2(mean/200)))
+  fwhm = mean/resol.mz
+  sd = (fwhm/2)*0.85 
+  return(sd)
+}
diff --git a/DIMS/AddOnFunctions/globalAssignments.HPC.R b/DIMS/AddOnFunctions/globalAssignments.HPC.R
new file mode 100644
index 0000000..9207661
--- /dev/null
+++ b/DIMS/AddOnFunctions/globalAssignments.HPC.R
@@ -0,0 +1,114 @@
+#.libPaths(new="/hpc/local/CentOS7/dbg_mz/R_libs/3.2.2")
+## calculate relative abundancies from theoretical mass and composition
+
+#library(Rdisop)
+
+#theor.table <- read.table(file="C:/Users/mraves/Metabolomics/TheoreticalMZ_Negative_composition.txt", sep="\t", header=TRUE)
+options(digits=16)
+
+#library(OrgMassSpecR)
+
+suppressPackageStartupMessages(library(lattice))
+# The following list was copied from Rdisop elements.R and corrected for C, H, O, Cl, S according to NIST
+Ba <- list(name= 'Ba', mass=130, isotope=list(mass=c(-0.093718, 0, -0.094958, 0, -0.095514, -0.094335, -0.095447, -0.094188, -0.094768),                                      abundance=c(0.00106, 0, 0.00101, 0, 0.02417, 0.06592, 0.07854, 0.1123, 0.717)))
+Br <- list(name= 'Br', mass=79,  isotope=list(mass=c(-0.0816639, 0, -0.083711),                                                                                               abundance=c(0.5069, 0, 0.4931)))
+C <-  list(name= 'C',  mass=12,  isotope=list(mass=c(0, 0.003354838, 0.003241989),                                                                                            abundance=c(0.9893, 0.0107, 0)))
+Ca <- list(name= 'Ca', mass=40,  isotope=list(mass=c(-0.0374094, 0, -0.0413824, -0.0412338, -0.0445194, 0, -0.046311, 0, -0.047467),                                          abundance=c(0.96941, 0, 0.00647, 0.00135, 0.02086, 0, 4e-05, 0, 0.00187)))
+Cl <- list(name= 'Cl', mass=35,  isotope=list(mass=c(-0.03114732, 0, -0.03409741),                                                                                            abundance=c(0.7576, 0, 0.2424)))
+Cr <- list(name= 'Cr', mass=50,  isotope=list(mass=c(-0.0539536, 0, -0.0594902, -0.0593487, -0.0611175),                                                                      abundance=c(0.04345, 0, 0.83789, 0.09501, 0.02365)))
+Cu <- list(name= 'Cu', mass=63,  isotope=list(mass=c(-0.0704011, 0, -0.0722071),                                                                                              abundance=c(0.6917, 0, 0.3083)))
+F <-  list(name= 'F',  mass=19,  isotope=list(mass=c(-0.00159678),                                                                                                            abundance=c(1)))
+Fe <- list(name= 'Fe', mass=54,  isotope=list(mass=c(-0.0603873, 0, -0.0650607, -0.0646042, -0.0667227),                                                                      abundance=c(0.058, 0, 0.9172, 0.022, 0.0028)))
+H <-  list(name= 'H',  mass=1,   isotope=list(mass=c(0.00782503207, 0.014101778, 0.01604928),                                                                                 abundance=c(0.999885, 0.000115, 0)))
+Hg <- list(name= 'Hg', mass=196, isotope=list(mass=c(-0.034193, 0, -0.033257, -0.031746, -0.0317, -0.029723, -0.029383, 0, -0.026533),                                        abundance=c(0.0015, 0, 0.0997, 0.1687, 0.231, 0.1318, 0.2986, 0, 0.0687)))
+I <-  list(name= 'I',  mass=127, isotope=list(mass=c(-0.095527),                                                                                                              abundance=c(1)))
+K <-  list(name= 'K',  mass=39,  isotope=list(mass=c(-0.0362926, -0.0360008, -0.0381746),                                                                                     abundance=c(0.932581, 0.000117, 0.067302)))
+Li <- list(name= 'Li', mass=6,   isotope=list(mass=c(0.0151214, 0.016003),                                                                                                    abundance=c(0.075, 0.925)))
+Mg <- list(name= 'Mg', mass=24,  isotope=list(mass=c(-0.0149577, -0.0141626, -0.0174063),                                                                                     abundance=c(0.7899, 0.1, 0.1101)))
+Mn <- list(name= 'Mn', mass=55,  isotope=list(mass=c(-0.0619529),                                                                                                             abundance=c(1)))
+N <-  list(name= 'N',  mass=14,  isotope=list(mass=c(0.003074002, 0.00010897),                                                                                                abundance=c(0.99634, 0.00366)))
+Na <- list(name= 'Na', mass=23,  isotope=list(mass=c(-0.0102323),                                                                                                             abundance=c(1)))
+Ni <- list(name= 'Ni', mass=58,  isotope=list(mass=c(-0.0646538, 0, -0.0692116, -0.0689421, -0.0716539, 0, -0.0720321),                                                       abundance=c(0.68077, 0, 0.26223, 0.0114, 0.03634, 0, 0.00926)))
+O <-  list(name= 'O',  mass=16,  isotope=list(mass=c(-0.00508538044, -0.0008683, -0.0008397),                                                                                 abundance=c(0.99757, 0.000381, 0.00205)))
+P <-  list(name= 'P',  mass=31,  isotope=list(mass=c(-0.026238),                                                                                                              abundance=c(1)))
+S <-  list(name= 'S',  mass=32,  isotope=list(mass=c(-0.027929, -0.02854124, -0.0321331, 0, -0.03291924),                                                                     abundance=c(0.9499, 0.0075, 0.0425, 0, 1e-04)))
+Se <- list(name= 'Se', mass=74,  isotope=list(mass=c(-0.0775254, 0, -0.080788, -0.0800875, -0.0826924, 0, -0.0834804, 0, -0.0833022),                                         abundance=c(0.0089, 0, 0.0936, 0.0763, 0.2378, 0, 0.4961, 0, 0.0873)))
+Si <- list(name= 'Si', mass=28,  isotope=list(mass=c(-0.0230729, -0.0235051, -0.0262293),                                                                                     abundance=c(0.9223, 0.0467, 0.031)))
+Sn <- list(name= 'Sn', mass=112, isotope=list(mass=c(-0.095174, 0, -0.097216, -0.096652, -0.098253, -0.097044, -0.098391, -0.09669, -0.0978009, 0, -0.0965596, 0, -0.0947257),abundance=c(0.0097, 0, 0.0065, 0.0034, 0.1453, 0.0768, 0.2423, 0.0859, 0.3259, 0, 0.0463, 0, 0.0579)))
+Zn <- list(name= 'Zn', mass=64,  isotope=list(mass=c(-0.0708552, 0, -0.0739653, -0.0728709, -0.0751541, 0, -0.074675),                                                        abundance=c(0.486, 0, 0.279, 0.041, 0.188, 0, 0.006)))
+
+NH4 <- list(name= "NH4", mass=18, isotope=list(mass=c(0.03437, 0.03141, -0.95935)), abundance=c(0.995, 0.004, 0.001)) # SISweb: 18.03437 100 19.03141 0.4 19.04065 0.1
+Ac <- list(name= "Ac", mass=60, isotope=list(mass=c(0.02114, 0.02450, 0.02538)), abundance=c(0.975, 0.021, 0.004))   # SISweb: 60.02114 100 61.02450 2.2 62.02538 0.4
+NaCl <- list(name= "NaCl", mass=58, isotope=list(mass=c(-0.04137, 0, -0.04433)), abundance=c(0.755, 0, 0.245))   # SISweb: 57.95862 100 59.95567 32.4
+NaCl2 <- list(name= "NaCl2", mass=116, isotope=list(mass=c(-0.08274, 0, -0.08866)), abundance=c(0.755, 0, 0.245))   # SISweb: 57.95862 100 59.95567 32.4
+NaCl3 <- list(name= "NaCl3", mass=174, isotope=list(mass=c(-0.12411, 0, -0.13299)), abundance=c(0.755, 0, 0.245))   # SISweb: 57.95862 100 59.95567 32.4
+NaCl4 <- list(name= "NaCl4", mass=232, isotope=list(mass=c(-0.16548, 0, -0.17732)), abundance=c(0.755, 0, 0.245))   # SISweb: 57.95862 100 59.95567 32.4
+NaCl5 <- list(name= "NaCl5", mass=290, isotope=list(mass=c(-0.20685, 0, -0.22165)), abundance=c(0.755, 0, 0.245))   # SISweb: 57.95862 100 59.95567 32.4
+For <- list(name= "For", mass=45, isotope=list(mass=c(-0.00233, 0.00103)), abundance=c(0.989, 0.011))   # SISweb: 46.00549 100 47.00885 1.1 (47.0097 0.1) 48.00973 0.4
+Na2 <- list(name= "2Na-H", mass=46, isotope=list(mass=c(-1.0282896)), abundance=c(1))  # SISweb for Na2: 45.97954 100 # minus 1 H !
+Met <- list(name= "CH3OH", mass=32, isotope=list(mass=c(1.034045,1.037405)), abundance=c(0.989,0.011)) # SISweb: 32.02622 100 33.02958 1.1 33.0325 0.1 34.03046 0.2
+CH3OH <- list(name= "CH3OH", mass=32, isotope=list(mass=c(1.034045,1.037405)), abundance=c(0.989,0.011)) # SISweb: 32.02622 100 33.02958 1.1 33.0325 0.1 34.03046 0.2
+Na3 <- list(name= "3Na-2H", mass=69, isotope=list(mass=c(-2.0463469)), abundance=c(1))  # SISweb for Na2: 45.97954 100 # minus 1 H !
+KCl <- list(name= "KCl", mass=74, isotope=list(mass=c(-0.06744,0.92961,-0.06744,0.92772)), abundance=c(0.7047,0.2283,0.0507,0.0162))   # SISweb: 73.93256 100 75.92961 32.4 75.93067 7.2 77.92772 2.3
+H2PO4 <- list(name= "H2PO4", mass=97, isotope=list(mass=c(-0.03091)), abundance=c(1))
+HSO4 <- list(name= "HSO4", mass=97, isotope=list(mass=c(-0.04042,0,-0.04462)), abundance=c(0.96,0,0.04))
+Met2 <- list(name= "Met2", mass=64, isotope=list(mass=c(1.060265,1.013405)), abundance=c(0.978,0.022))
+Met3 <- list(name= "Met3", mass=96, isotope=list(mass=c(1.086485,1.089845)), abundance=c(0.969,0.031))
+Met4 <- list(name= "Met4", mass=128, isotope=list(mass=c(1.112705,1.116065)), abundance=c(0.959,0.041))
+Met5 <- list(name= "Met5", mass=160, isotope=list(mass=c(1.20935,1.142285)), abundance=c(0.949,0.051))
+NaminH <- list(name= "Na-H", mass=21, isotope=list(mass=c(-0.02571416)), abundance=c(1))
+KminH <- list(name= "K-H", mass=37, isotope=list(mass=c(-0.05194,0.94617)), abundance=c(0.9328,0.0672))
+H2O <- list(name= "H2O", mass=-19, isotope=list(mass=c(-0.01894358)), abundance=c(1))
+NaK <- list(name= "NaK-H", mass=61, isotope=list(mass=c(-0.054345,0.943765)), abundance=c(0.9328,0.0672))
+min2H <- list(name= "min2H", mass=-2, isotope=list(mass=c(-0.0151014)), abundance=c(1))
+plus2H <- list(name= "plus2H", mass=2, isotope=list(mass=c(0.0151014)), abundance=c(1))
+plus2Na <- list(name= "plus2Na", mass=46,  isotope=list(mass=c(-0.02046), abundance=c(1)))
+plusNaH <- list(name= "plusNaH", mass=24,  isotope=list(mass=c(-0.00295588), abundance=c(1)))
+plusKH <- list(name= "plusKH", mass=40, isotope=list(mass=c(-0.029008,0.969101)), abundance=c(0.9328,0.0672))
+plusHNH <- list(name= "plusHNH", mass=19, isotope=list(mass=c(0.04164642)), abundance=c(1))
+min3H <- list(name= "min3H", mass=-3, isotope=list(mass=c(-0.02182926)), abundance=c(1))
+plus3H <- list(name= "plus3H", mass=3, isotope=list(mass=c(0.02182926)), abundance=c(1))
+plus3Na <- list(name= "plus3Na", mass=68, isotope=list(mass=c(0.96931)), abundance=c(1))
+plus2NaH <- list(name= "plus2NaH", mass=47, isotope=list(mass=c(0.2985712)), abundance=c(1))
+plusNa2H <- list(name= "plusNa2H", mass=25, isotope=list(mass=c(0.00432284)), abundance=c(1))
+
+Cl37 <- list(name= "Cl37", mass=37,  isotope=list(mass=c(-0.03409741)),  abundance=c(1))
+
+allelements <- list(Ba, Br, C, Ca, Cl, Cr, Cu, F, Fe, H, Hg, I, K, Li, Mg, Mn, N, Na, Ni, O, P, S, Se, Si, Sn, Zn)
+allAdducts <- list(Ba, Br, Ca, Cl, Cl37, Cr, Cu, Fe, Hg, I, K, Li, Mg, Mn, Na, Ni, Se, Si, Sn, Zn, NH4, Ac, NaCl, For,Na2,CH3OH,NaCl2,NaCl3,NaCl4,NaCl5,Na3,KCl,H2PO4,HSO4,Met2,Met3,Met4,Met5,NaminH,KminH,H2O,NaK,min2H,plus2H,plus2Na,plusNaH,plusKH,min3H,plus3H,plusHNH,plus3Na,plus2NaH,plusNa2H)
+
+#atomsinuse <- c("P",     "O",         "N",           "C",     "H",          "S",        "Cl")
+#atomicWts <- c(30.97376163, 15.99491463, 14.0030740052, 12.0000, 1.0078250321, 31.9720707, 34.968852721)
+#electron <- 0.00054858
+#Mol.comp <- c(0,4,0,0,1,1,0)
+#Mol.exact <- sum(Mol.comp * atomicWts) + electron
+#Mol.corr <- Mol.exact - 0.0022 + 0.000007*Mol.exact  # mass as found in peak group list
+#getMass(Mol) + electron - 0.0022 + 0.000007*getMass(Mol)
+
+#atomsinuse <- c("P",     "O",         "N",           "C",     "H",          "S",        "Cl",          "D",           "34S",          "18O")
+#atomicWts <- c(30.97376163, 15.99491463, 14.0030740052, 12.0000, 1.0078250321, 31.9720707, 34.968852721, 2.0141017778, 33.96786690, 17.9991610)
+#electron <- 0.00054858
+#Mol.comp <- c(0,4,0,0,1,1,0,0,0,0)  # main peak HSO4
+#Mol.comp <- c(0,4,0,0,0,1,0,1,0,0)  # deuterated
+#Mol.comp <- c(0,4,0,0,1,0,0,0,1,0)  # 34S
+#Mol.comp <- c(0,3,0,0,1,1,0,0,0,1)  # 18O
+#Mol.exact <- sum(Mol.comp * atomicWts) + electron
+#Mol.corr <- Mol.exact - 0.0022 + 0.000007*Mol.exact  # mass as found in peak group list
+
+atomsinuse <- c("P",     "O",         "N",           "C",     "H",          "S",        "Cl",          "D",           "13C",       "34S",          "18O",     "37Cl")
+atomicWts <- c(30.97376163, 15.99491463, 14.0030740052, 12.0000, 1.0078250321, 31.9720707, 34.968852721, 2.0141017778, 13.0033548378, 33.96786690, 17.9991610, 36.96590259)
+electron <- 0.00054858
+############# P O N C H  S Cl D 13C 34S 18O 37Cl
+#Mol.comp <- c(0,6,0,6,12,0,1, 0, 0, 0,  0,  0)  # main peak Galactose HCl negative ion
+#Mol.comp <- c(0,6,0,5,12,0,1, 0, 1, 0,  0,  0)  # 13C Galactose HCl negative ion
+#Mol.comp <- c(0,6,0,6,11,0,1, 1, 0, 0,  0,  0)  # D Galactose HCl negative ion
+#Mol.comp <- c(0,5,0,6,12,0,1, 0, 0, 0,  1,  0)  # 18O Galactose HCl negative ion
+#Mol.comp <- c(0,6,0,6,12,0,0, 0, 0, 0,  0,  1)  # 18O Galactose HCl negative ion
+#Mol.exact <- sum(Mol.comp * atomicWts) + electron
+#Mol.corr <- Mol.exact - 0.0022 + 0.000007*Mol.exact  # mass as found in peak group list
+
+Hmass <- H$mass + H$isotope$mass[1]
+Dmass <- H$mass + 1 + H$isotope$mass[2]
+Tmass <- H$mass + 2 + H$isotope$mass[3]
+C13mass <- C$mass + 1 + C$isotope$mass[2]
+N15mass <- N$mass + 1 + N$isotope$mass[2]
diff --git a/DIMS/AddOnFunctions/iden.code.R b/DIMS/AddOnFunctions/iden.code.R
new file mode 100644
index 0000000..4853647
--- /dev/null
+++ b/DIMS/AddOnFunctions/iden.code.R
@@ -0,0 +1,45 @@
+iden.code <- function(peaklist, db, ppm, theor_mass_label) {
+  # theor_mass_label = {"MNeg", "Mpos"}
+  
+  mcol <- peaklist[ , "mzmed.pgrp"]
+  theor.mcol <- db[,theor_mass_label]
+  assi_HMDB <- iso_HMDB <- HMDB_code <- c(rep("",length(mcol)))
+  theormz_HMDB <- c(rep(0,length(mcol)))
+  
+  for(t in 1:length(mcol)){
+    mz<-mcol[t]
+    mtol <- mz*ppm/1000000
+    selp <- which(theor.mcol > (mz - mtol) & theor.mcol < (mz + mtol))
+    assinames <- isonames <- HMDBcodes <- ""
+    
+    if(length(selp)!=0){
+      for(i in 1:length(selp)){
+        
+        if(grepl(" iso ", db[selp[i], "CompoundName"])){
+          mainpeak <- strsplit(db[selp[i],"CompoundName"]," iso ")[[1]][1]
+          
+          # Check if peak without isotope occurs, this assumes that peaklist is ordered on mass!!! 
+          if(length(grep(mainpeak, assi_HMDB, fixed=TRUE)) > 0){
+            isonames <- as.character(paste(isonames,as.character(db[selp[i],"CompoundName"]), sep=";"))
+          }
+          
+        } else {
+          assinames <- as.character(paste(assinames,as.character(db[selp[i], "CompoundName"]), sep=";"))
+          HMDBcodes <- as.character(paste(HMDBcodes, as.character(rownames(db)[selp[i]]), sep=";"))
+        }  
+      }
+    }  
+    
+    assi_HMDB[t] <- as.character(assinames)
+    iso_HMDB[t] <-  as.character(isonames)
+    if ((assinames=="") & (isonames=="")){ 
+      theormz_HMDB[t] <- ""
+    } else {
+      theormz_HMDB[t] <- theor.mcol[selp[1]]
+    }
+    HMDB_code[t] <- as.character(HMDBcodes)
+  }
+  
+  return(cbind(peaklist, assi_HMDB, iso_HMDB, theormz_HMDB, HMDB_code))
+}
+
diff --git a/DIMS/AddOnFunctions/ident.hires.noise.HPC.R b/DIMS/AddOnFunctions/ident.hires.noise.HPC.R
new file mode 100644
index 0000000..0be43d1
--- /dev/null
+++ b/DIMS/AddOnFunctions/ident.hires.noise.HPC.R
@@ -0,0 +1,242 @@
+# modified identify function to also look for adducts and their isotopes
+ident.hires.noise.HPC <- function(peaklist, allAdducts, scanmode="Negative", look4=c("Cl", "Ac"), identlist=NULL, resol=280000, slope=0, incpt=0, ppm.fixed, ppm.iso.fixed) {
+  
+  #     peaklist=outlist_Neg
+  #     scanmode="Negative"
+  #     identlist=noise.Neg.MZ
+  #     look4=look4.addN
+  #     resol=resol
+  #     slope=0 
+  #     incpt=0
+  #     ppm.fixed=3
+  #     ppm.iso.fixed=2  
+  
+  metlin <- assi <- iso <- rep("", nrow(peaklist))
+  theormz <- nisos <- expint <- conf <- rep(0, nrow(peaklist))
+  # nrH <- nrD <- nrC <- nr13C <- nrN <- nr15N <- nrO <- nrP  <- nrS <- nrCl <- rep(0, nrow(peaklist))
+  
+  # add adducts to identification list
+  if (scanmode == "Positive") { add.mode <-  "+" } else { add.mode <- "-" }
+  # identlist <- theorMZ
+  identlist.orig <- identlist
+  
+  for (p in 1:length(look4)) { # loop over type of adduct
+    #print(p)
+    identlist.Adduct <- identlist.orig
+    identlist.Adduct[ , "CompoundName"] <- as.character(identlist.orig[ , "CompoundName"])
+    
+    #add2label <- paste("[M+", look4[p], "]", add.mode, sep="")
+    if (look4[p]=="H2O") {
+      add2label <- paste("[M-", look4[p], "]", add.mode, sep="")
+    } else {
+      add2label <- paste("[M+", look4[p], "]", add.mode, sep="")
+    }
+    
+    identlist.Adduct[ , "CompoundName"] <- paste(identlist.Adduct[ , "CompoundName"], add2label, sep=" ")
+    adductInfo <- elementInfo(look4[p], allAdducts)
+    if (scanmode == "Positive") {
+      adductMass <- adductInfo$mass[1] + adductInfo$isotope$mass[1] - Hmass } else {
+        adductMass <- adductInfo$mass[1] + adductInfo$isotope$mass[1] + Hmass
+      }
+    for (q in 1:nrow(identlist.Adduct)) { # loop over compounds in database
+      # construct information for compound + adduct:
+      if (scanmode == "Positive") {
+        identlist.Adduct[q, "Mpos"] <-  as.numeric(identlist.Adduct[q, "Mpos"]) + adductMass # + Na - H
+        identlist.Adduct[q, "MNeg"] <-  0
+      } else {
+        identlist.Adduct[q, "Mpos"] <-  0
+        identlist.Adduct[q, "MNeg"] <-  as.numeric(identlist.Adduct[q, "MNeg"]) + adductMass # + Cl + H
+      }
+    }
+    
+    #     # modify columns with info for mol. formula:
+    #     if (look4[p] == "2Na-H") {
+    #       identlist.Adduct[, "nrH"] <- as.numeric(identlist.Adduct[, "nrH"]) - 1
+    #     } else if (look4[p] == "NH4") {
+    #       identlist.Adduct[, "nrH"] <- as.numeric(identlist.Adduct[, "nrH"]) + 3
+    #       identlist.Adduct[, "nrN"] <- as.numeric(identlist.Adduct[, "nrN"]) + 1
+    #     } else if (look4[p] == "Cl") {
+    #       identlist.Adduct[, "nrCl"] <- as.numeric(identlist.Adduct[, "nrCl"]) + 1
+    #       identlist.Adduct[, "nrH"] <- as.numeric(identlist.Adduct[, "nrH"]) + 1
+    #     } else if (look4[p] == "Ac") {
+    #       identlist.Adduct[ , "nrC"] <- as.numeric(identlist.Adduct[ , "nrC"]) + 2
+    #       identlist.Adduct[ , "nrH"] <- as.numeric(identlist.Adduct[ , "nrH"]) + 3
+    #       identlist.Adduct[ , "nrO"] <- as.numeric(identlist.Adduct[ , "nrO"]) + 2
+    #     } else if (look4[p] == "CH3OH+H") {
+    #       identlist.Adduct[ , "nrC"] <- as.numeric(identlist.Adduct[ , "nrC"]) + 1
+    #       identlist.Adduct[ , "nrH"] <- as.numeric(identlist.Adduct[ , "nrH"]) + 4
+    #       identlist.Adduct[ , "nrO"] <- as.numeric(identlist.Adduct[ , "nrO"]) + 1
+    #     } else {
+    #       identlist.Adduct[, "nrH"] <- as.numeric(identlist.Adduct[, "nrH"]) - 1
+    #     }
+    identlist <- rbind(identlist, identlist.Adduct) 
+  } # end for p adducts in look4
+  
+  
+  if (scanmode == "Positive") { theor.mcol <- as.numeric(identlist[ , "Mpos"]) } else {
+    theor.mcol <- as.numeric(identlist[ , "MNeg"]) }
+  # apply correction using regression line obtained with ISses
+  theor.mcol <- (1+slope)*theor.mcol + incpt
+  
+  # get mz information from peaklist
+  mcol <- peaklist[ , "mzmed.pgrp"]
+  # if column with average intensities is missing, calculate it:
+  if (!("avg.int" %in% colnames(peaklist))){
+    mzmaxcol <- which(colnames(peaklist) == "mzmax.pgrp")
+    endcol <- ncol(peaklist)
+    peaklist[ , "avg.int"] <- apply(peaklist[ ,(mzmaxcol+1):(endcol)], 1, mean)
+  }
+  
+  #   # generate URL for Metlin:
+  #   for (p in 1:nrow(peaklist)) { # for each peak  # p <- 1
+  #     mzpeak <- as.numeric(mcol[p])
+  #     # resolution as function of mz:
+  #     resol.mz <- resol*(1/sqrt(2)^(log2(mzpeak/200)))
+  #     fwhm <- mzpeak/resol.mz
+  #     #    massmin <- mzpeak - (0.00003 + 0.000003*mzpeak) - (1.0078250321 - 0.00054858) - fwhm  # for Metlin db
+  #     #    massmax <- mzpeak - (0.00003 + 0.000003*mzpeak) - (1.0078250321 - 0.00054858) + fwhm  # for Metlin db
+  #     if (scanmode == "Positive") {
+  #       massmin <- mzpeak - (1.0078250321 - 0.00054858) - fwhm  # for Metlin db
+  #       massmax <- mzpeak - (1.0078250321 - 0.00054858) + fwhm  # for Metlin db
+  #     } else {
+  #       massmin <- mzpeak + (1.0078250321 - 0.00054858) - fwhm  # for Metlin db
+  #       massmax <- mzpeak + (1.0078250321 - 0.00054858) + fwhm  # for Metlin db
+  #     }
+  #     metlin[p] <- paste("http://metlin.scripps.edu/metabo_list.php?mass_min=", massmin, "&mass_max=", massmax, sep="")
+  #   }
+  
+  
+  # do indentification using own database:
+  for (t in 1:nrow(identlist)) { # theoretical mass  # t <- 45
+    # for (t in 1:169) {
+    #print(as.character(identlist[t,"CompoundName"]))
+    theor.mz <- theor.mcol[t]
+    
+    theor.comp <- as.character(identlist[t, "Composition"])
+    #theor.comp <- mol.formula(identlist[t, ])
+    
+    #     # if there's Deuterium, Tritium, 13C or 15N in the composition:
+    #     mass.incr <- 0 + (as.numeric(identlist[t,"nrD"])*Dmass) +
+    #       (as.numeric(identlist[t,"nrT"])*Tmass) +
+    #       (as.numeric(identlist[t,"nrC13"])*C13mass) +
+    #       (as.numeric(identlist[t,"nrN15"])*N15mass)
+    #     theor.comp <- strsplit(theor.comp, "iso")[[1]][1]
+    mass.incr <- 0
+    
+    # resolution as function of mz:
+    resol.mz <- resol*(1/sqrt(2)^(log2(theor.mz/200)))
+    # calculate fine-grained isotopic distribution using MIDAs
+    fwhm <- round(theor.mz/resol.mz,6)
+    
+    #system(paste("C:/Users/awillem7/tools/MIDAs_New/MIDAs_Example ", theor.comp, " 2  C00 \"\"  \"\"  ", fwhm, " 1 0 1e-50 2 tmp", sep=""),ignore.stderr=TRUE)
+    #system(paste("C:/Users/mraves/Metabolomics/MIDAs_New/MIDAs_Example ", theor.comp, " 2  C00 \"\"  \"\"  ", fwhm, " 1 0 1e-50 2 tmp", sep=""),ignore.stderr=TRUE)
+    #system(paste(path2MIDAS, theor.comp, " 2  C00 \"\"  \"\"  ", fwhm, " 1 0 1e-50 2 tmp", sep=""),ignore.stderr=TRUE)
+    
+    #system(paste("/data/home/luyf/Metabolomics/MIDAs/MIDAs_Example ", theor.comp, " 2  C00 \"\"  \"\"  ", fwhm, " 1 0 1e-50 2 tmp", sep=""),ignore.stderr=TRUE)
+    options(stringsAsFactors = FALSE)
+    #fgid <- read.table(file="tmp_Fine_Grained_Isotopic_Distribution", header=FALSE)
+    
+    #     res <- try(fgid <- read.table(file="tmp_Fine_Grained_Isotopic_Distribution", header=FALSE))
+    #     if(inherits(res, "try-error"))
+    #     {
+    #       #error handling code, maybe just skip this iteration using
+    #       message("Skipped")
+    #       next
+    #     }
+    #     
+    #     # correct mass for D, T, 13C and 15N
+    #     fgid[ ,1] <- as.numeric(fgid[ ,1]) + mass.incr
+    #     # calculate percentage intensities from relative intensities
+    #     firstone <- as.numeric(fgid[1,2])
+    #     fgid[ ,3] <- as.numeric(fgid[ , 2]) / firstone
+    #     #fgid <- as.matrix(fgid, ncol=3)
+    #     # the mz in the MIDAs file are of the neutral molecule
+    #     if (scanmode == "Positive") { mz.iso <- as.numeric(fgid[ , 1]) + Hmass - electron }
+    #     if (scanmode == "Negative") { mz.iso <- as.numeric(fgid[ , 1]) - Hmass + electron }
+    #     fgid <- cbind(fgid, mz.iso)
+    #     colnames(fgid) <- c("mz","rel.int", "perc.int", "mz.iso")
+    
+    # compensate mz for presence of adduct
+    #     Adduct.mass <- theor.mz - mz.iso[1]
+    #     fgid[ , "mz.iso"] <- as.numeric(fgid[ , "mz.iso"]) + Adduct.mass
+    
+    # set tolerance for mz accuracy of main peak
+    mtol <- theor.mz*ppm.fixed/1000000
+    # find main peak
+    selp <- which(mcol > (theor.mz - mtol) & mcol < (theor.mz + mtol))
+    # selp <- which(mcol > (theor.mz - 0.01) & mcol < (theor.mz + 0.01))
+    # peaklist[selp, c(1:4,(endcol+1))]
+    
+    # set tolerance for mz accuracy of isotope peaks
+    itol <- theor.mz*ppm.iso.fixed/1000000
+    
+    #     if (length(selp) > 1) { # more than one candidate peak for main; select best one based on isotope pattern
+    #       #cat(as.character(identlist[t, "CompoundName"])); print(" has >1 candidate peaks")
+    #       conf.local <- rep(0, length(selp))
+    #       for (p in 1:length(selp)) { # p <- 2
+    #         # determine isotope pattern for each candidate peak
+    #         # obs.mz <- peaklist[selp[p],"mzmed.pgrp"]
+    #         conf.local[p] <- match.isotope.pattern(peaklist, scanmode, selp[p], fgid, ppm.iso.fixed)
+    #       }
+    #       # selp <- selp[abs(mcol[selp] - theor.mz) == min(abs(mcol[selp] - theor.mz))] }
+    #       selp <- na.exclude(selp[conf.local == max(conf.local)])
+    #     }
+    if (length(selp) > 1) { # more than one candidate peak for main; select best one based on mz.diff
+      selp <- selp[abs(mcol[selp] - theor.mz) == min(abs(mcol[selp] - theor.mz))] 
+    }
+    if (length(selp) == 1) { # match for main
+      assi[selp] <- paste(assi[selp], as.character(identlist[t,"CompoundName"]), sep=";")
+      theormz[selp] <- theor.mz
+      # conf[selp] <- match.isotope.pattern(peaklist, scanmode, selp, fgid, ppm.iso.fixed)
+      
+      #       nrH[selp] <- identlist[t,"nrH"]
+      #       nrD[selp] <- identlist[t,"nrD"]
+      #       nrC[selp] <- identlist[t,"nrC"]
+      #       nr13C[selp] <- identlist[t,"nrC13"]
+      #       nrN[selp] <- identlist[t,"nrN"]
+      #       nr15N[selp] <- identlist[t,"nrN15"]
+      #       nrO[selp] <- identlist[t,"nrO"]
+      #       nrP[selp] <- identlist[t,"nrP"]
+      #       nrS[selp] <- identlist[t,"nrS"]
+      #       nrCl[selp] <- identlist[t,"nrCl"]   
+      
+      # assign isotope peaks
+      #       mz.main <- peaklist[selp, "mzmed.pgrp"]  # mz of main peak
+      #       int.main <- peaklist[selp, "avg.int"]  # intensity of main peak (= 100%)
+      #       # deviation from theoretical mass:
+      #       diff <- theor.mz - mz.main
+      #       # calculate expected intensities and select isotopes with exp.int > threshold
+      #       fgid[ , "exp.int"] <- fgid[ , "perc.int"] * int.main      
+      #       fgid.subset <- fgid[(fgid[ , "exp.int"] > thresh), ]
+      #       nisos[selp] <- nrow(fgid.subset) - 1
+      #       if (nrow(fgid.subset) > 1) { # avoid error message if fgid.subset has only 1 line
+      #         for (f in 2:nrow(fgid.subset)) { # f <- 2
+      #           mz.target <- fgid.subset[f, "mz.iso"] - diff
+      #           int.target <- fgid.subset[f, "exp.int"]
+      #           # print(itarget)
+      #           sel.iso <- peaklist[ , "mzmed.pgrp"] > (mz.target - itol) & peaklist[ , "mzmed.pgrp"] < (mz.target + itol)
+      #           # sum(sel.iso)
+      #           # sel.iso <- peaklist[ , "mzmed.pgrp"] > (mz.target - 0.01) & peaklist[ , "mzmed.pgrp"] < (mz.target + 0.01)
+      #           # peaklist[sel.iso, c(1:4,23)]
+      #           if (sum(sel.iso) == 1) { # 2 separate if-statements because of error if sum(sel.iso) = 0
+      #             if (peaklist[sel.iso, "avg.int"] >  (int.target/2)) { # match
+      #               iso[sel.iso] <- paste(paste(iso[sel.iso], as.character(identlist[t,"CompoundName"]), "iso", f, sep=" "),";", sep="")
+      #               # peaklist[sel.iso, ]
+      #               expint[sel.iso] <- fgid.subset[f, "exp.int"]
+      #             }
+      #           } else if (sum(sel.iso) > 1) {
+      #             nrs.iso <- which(sel.iso)
+      #             nr.iso <- nrs.iso[which(abs(peaklist[sel.iso, "avg.int"] - int.target) == min(abs(peaklist[sel.iso, "avg.int"] - int.target)))]
+      #             if (peaklist[nr.iso, "avg.int"] >  (int.target/2)) {  # match
+      #               # print(peaklist[nr.iso, "avg.int"])
+      #               iso[nr.iso] <- paste(paste(iso[nr.iso], as.character(identlist[t,"CompoundName"]), "iso", f, sep=" "),";", sep="")
+      #               expint[nr.iso] <- fgid.subset[f, "exp.int"]
+      #             } # end if
+      #           } # end else if
+      #         } # end for f 
+      #       } # end if
+    } # end if 
+  } # end for t
+  #   cbind(peaklist, nrH, nrD, nrC, nr13C, nrN, nr15N, nrO, nrP, nrS, nrCl, assi, theormz, conf, nisos, iso, expint, metlin)
+  cbind(peaklist, assi, theormz, conf, nisos, iso, expint, metlin)
+}
diff --git a/DIMS/AddOnFunctions/isWithinXppm.R b/DIMS/AddOnFunctions/isWithinXppm.R
new file mode 100644
index 0000000..5da43c6
--- /dev/null
+++ b/DIMS/AddOnFunctions/isWithinXppm.R
@@ -0,0 +1,51 @@
+isWithinXppm <- function(mean, scale, sigma, area, x2, x, ppm=4, resol, plot) {
+  #   mean=retVal$mean
+  #   scale=retVal$scale
+  #   sigma=retVal$sigma
+  #   area=retVal$area
+  #   ppm=3
+  
+  # sort!!!!!!!!!!!!!!!!
+  index = order(mean)
+  mean = mean[index]
+  scale = scale[index]
+  sigma = sigma[index]
+  area = area[index]
+  
+  summed = NULL
+  remove = NULL
+  
+  if (length(mean)>1){
+    for (i in 2:length(mean)){
+      if ((abs(mean[i-1]-mean[i])/mean[i-1])*10^6 < ppm) {
+        
+        # avoid double occurance in sum
+        if ((i-1) %in% summed) next
+        
+        retVal = sumCurves(mean[i-1], mean[i], scale[i-1], scale[i], sigma[i-1], sigma[i], x2, x, resol, plot)
+        summed = c(summed, i-1, i)
+        if (is.nan(retVal$mean)) retVal$mean=0 
+        mean[i-1] = retVal$mean
+        mean[i] = retVal$mean
+        area[i-1] = retVal$area
+        area[i] = retVal$area
+        scale[i-1] = retVal$scale
+        scale[i] = retVal$scale
+        sigma[i-1] = retVal$sigma
+        sigma[i] = retVal$sigma
+        
+        remove = c(remove, i)
+      }
+    }
+  }
+  
+  if (length(remove)!=0){
+    mean=mean[-c(remove)]
+    area=area[-c(remove)]
+    scale=scale[-c(remove)]
+    sigma=sigma[-c(remove)]
+  }
+  
+  return(list("mean"=mean, "area"=area, "scale"=scale, "sigma"=sigma, "qual"=NULL))
+  
+}
\ No newline at end of file
diff --git a/DIMS/AddOnFunctions/mergeDuplicatedRows.R b/DIMS/AddOnFunctions/mergeDuplicatedRows.R
new file mode 100644
index 0000000..39cb8b4
--- /dev/null
+++ b/DIMS/AddOnFunctions/mergeDuplicatedRows.R
@@ -0,0 +1,49 @@
+mergeDuplicatedRows <- function(peaklist) {
+  # peaklist = outlist.tot
+  # resultDir = "./results"
+  # scanmode = "positive"
+  
+  # peaklist_index=which(peaklist[,"mzmed.pgrp"]=="94.9984524624111")
+  # peaklist[peaklist_index,]
+  
+  collapse <- function(label,pklst,index){
+    # label = "iso_HMDB"
+    # pklst = peaklist
+    # index = peaklist_index
+    tmp2=as.vector(pklst[index,label])
+    if (length(which(is.na(tmp2)))>0) tmp2=tmp2[-which(is.na(tmp2))]
+    return(paste(tmp2,collapse = ";"))
+  } 
+  
+  options(digits=16)
+  
+  collect=NULL
+  remove=NULL
+  
+  index = which(duplicated(peaklist[, "mzmed.pgrp"]))
+  
+  while (length(index) > 0){
+    
+    peaklist_index = which(peaklist[, "mzmed.pgrp"] == peaklist[index[1], "mzmed.pgrp"])
+    # peaklist[peaklist_index,"iso_HMDB",drop=FALSE]
+    tmp=peaklist[peaklist_index[1],,drop=FALSE]
+    
+    tmp[,"assi_HMDB"]=collapse("assi_HMDB",peaklist,peaklist_index)
+    tmp[,"iso_HMDB"]=collapse("iso_HMDB",peaklist,peaklist_index)
+    tmp[,"HMDB_code"]=collapse("HMDB_code",peaklist,peaklist_index)
+    tmp[,"assi_noise"]=collapse("assi_noise",peaklist,peaklist_index)
+    if (tmp[,"assi_noise"]==";") tmp[,"assi_noise"]=NA
+    tmp[,"theormz_noise"]=collapse("theormz_noise",peaklist,peaklist_index)
+    if (tmp[,"theormz_noise"]=="0;0") tmp[,"theormz_noise"]=NA
+    
+    collect = rbind(collect, tmp)
+    remove = c(remove, peaklist_index)
+    
+    index=index[-which(peaklist[index, "mzmed.pgrp"] == peaklist[index[1], "mzmed.pgrp"])]
+  } 
+  
+  if (!is.null(remove)) peaklist = peaklist[-remove,]
+  peaklist = rbind(peaklist,collect)
+  
+  return(peaklist)
+}
diff --git a/DIMS/AddOnFunctions/normalization_2.1.R b/DIMS/AddOnFunctions/normalization_2.1.R
new file mode 100644
index 0000000..ed99964
--- /dev/null
+++ b/DIMS/AddOnFunctions/normalization_2.1.R
@@ -0,0 +1,75 @@
+# normalization_2.1(outlist.pos.id, "Intensity_all_peaks_positive_norm", groupNames.pos, on="total", assi_label="assi_HMDB")
+
+normalization_2.1 <- function(data, filename, groupNames, on="total", assi_label="assi"){
+  # data=outlist.pos.id
+  # filename = "Intensity_all_peaks_positive_norm"
+  # groupNames = groupNames.pos
+  # on="total"
+  # assi_label="assi_HMDB"
+  
+  lastcol = length(groupNames) + 6
+  before = data[ ,c(7:lastcol)]
+  
+  if (on=="total_IS") {
+    
+    assi = which(colnames(data)==assi_label)
+    data.int <- data[ ,c(assi,7:lastcol)]  # assi and samples columns
+    data.assi = data.int[grep("(IS", data.int[,1], ignore.case=FALSE, fixed = TRUE),]
+    
+  } else if (on=="total_ident"){
+    
+    assi = which(colnames(data)==assi_label)
+    assi.hmdb = which(colnames(data)=="assi.hmdb")
+    index = sort(union(which(data[,assi]!=""), which(data[,assi.hmdb]!="")))
+    
+    data.int = data[ ,c(assi,7:lastcol)]  # assi and samples columns
+    data.assi = data.int[index,]
+    
+  } else if (on=="total") {
+    
+    assi = which(colnames(data)==assi_label)
+    data.int = data[ ,c(assi,7:lastcol)]  # assi and samples columns
+    data.assi = data.int
+    
+  }
+  
+  sum <- 0
+  for (c in 2:ncol(data.assi)) {
+    sum <- sum + sum(as.numeric(data.assi[,c]))
+  }
+  average <- sum/(ncol(data.assi)-1)
+  for (c in 2:ncol(data.int)) {
+    factor <- sum(as.numeric(data.assi[,c]))/average
+    if (factor==0) {
+      data.int[ ,c]=0
+      cat(colnames(data.int)[c])
+      cat("factor==0 !!!")
+    } else {
+      data.int[ ,c] <- as.numeric(data.int[ ,c])/factor
+    }
+  }
+  
+  # colnames(data.int[,2:ncol(data.int)])
+  
+  if (dim(data)[2]==lastcol){
+    final.outlist.Pos.idpat.norm <- cbind(data[,1:6],data.int[,2:ncol(data.int)])
+  } else {
+    final.outlist.Pos.idpat.norm <- cbind(data[,1:6],data.int[,2:ncol(data.int)],data[,(lastcol + 1):ncol(data)])
+  }
+  
+  #outdir="./results/normalization"
+  #dir.create(outdir, showWarnings = FALSE)
+  
+  #CairoPNG(filename=paste(outdir, paste(filename, "_before.png", sep=""), sep="/"), width, height)
+  #sub=apply(before,2, function(x) sum(as.numeric(x)))
+  #barplot(as.vector(unlist(sub)), main="Not normalized",names.arg = colnames(before),las=2)
+  #dev.off()
+  
+  #CairoPNG(filename=paste(outdir, paste(filename, "_", on, ".png", sep=""), sep="/"), width, height)
+  #sub=apply(final.outlist.Pos.idpat.norm[,c(7:lastcol)],2, function(x) sum(as.numeric(x)))
+  #barplot(as.vector(unlist(sub)), main=filename,names.arg = colnames(final.outlist.Pos.idpat.norm[,c(7:lastcol)]),las=2)
+  #dev.off()
+  
+  return(final.outlist.Pos.idpat.norm)
+  
+}
diff --git a/DIMS/AddOnFunctions/optimizeGauss.R b/DIMS/AddOnFunctions/optimizeGauss.R
new file mode 100644
index 0000000..0fbce75
--- /dev/null
+++ b/DIMS/AddOnFunctions/optimizeGauss.R
@@ -0,0 +1,11 @@
+optimizeGauss <- function(x,y,sigma,mu) {
+  
+  f = function(p,x,y,sigma,mu) {
+    curve = p*dnorm(x,mu,sigma)
+    return((max(curve)-max(y))^2)
+  }
+  
+  rval = optimize(f, c(0, 100000), tol = 0.0001,x,y,sigma,mu)
+  
+  return(rval$minimum)
+}
diff --git a/DIMS/AddOnFunctions/peak.grouping.Gauss.HPC.R b/DIMS/AddOnFunctions/peak.grouping.Gauss.HPC.R
new file mode 100644
index 0000000..c36259a
--- /dev/null
+++ b/DIMS/AddOnFunctions/peak.grouping.Gauss.HPC.R
@@ -0,0 +1,88 @@
+peak.grouping.Gauss.HPC <- function(outdir, fileIn, scanmode, resol, groupNames) {
+  # fileIn="./results/specpks_all/positive_outlist_i_min_1.197.RData"
+  
+  # ppm/2
+  #range = 1.5e-06
+  range = 2e-06  
+  startcol=7
+  
+  # outlist.copy <- read.table(file=fileIn, sep="\t", header=TRUE)
+  load(fileIn)
+  outlist.copy = outlist_i_min_1
+  batch = strsplit(fileIn, ".",fixed = TRUE)[[1]][3]
+  
+  outpgrlist = NULL
+  
+  while (max(as.numeric(outlist.copy[ , "height.pkt"])) > 0 ) {
+    
+    sel = which(as.numeric(outlist.copy[ , "height.pkt"]) == max(as.numeric(outlist.copy[ , "height.pkt"])))[1]
+    
+    # 3ppm range around max
+    mzref = as.numeric(outlist.copy[sel, "mzmed.pkt"])
+    pkmin = -(range*mzref - mzref)
+    pkmax = 2*mzref-pkmin
+    
+    selp = as.numeric(outlist.copy[ , "mzmed.pkt"]) > pkmin & as.numeric(outlist.copy[ , "mzmed.pkt"]) < pkmax
+    tmplist = outlist.copy[selp, ]
+    
+    nrsamples = sum(selp)
+    if (nrsamples > 1) {
+      # 3ppm range around mean
+      mzmed.pgrp = mean(as.numeric(outlist.copy[selp, "mzmed.pkt"]))
+      mzmin.pgrp = -(range*mzmed.pgrp - mzmed.pgrp)
+      mzmax.pgrp = 2*mzmed.pgrp - mzmin.pgrp
+      
+      selp = as.numeric(outlist.copy[ , "mzmed.pkt"]) > mzmin.pgrp & as.numeric(outlist.copy[ , "mzmed.pkt"]) < mzmax.pgrp
+      tmplist = outlist.copy[selp, ]
+      
+      fq.worst.pgrp = as.numeric(max(outlist.copy[selp, "fq"]))
+      fq.best.pgrp = as.numeric(min(outlist.copy[selp, "fq"]))
+      ints.allsamps = rep(0, length(groupNames))
+      names(ints.allsamps) = groupNames # same order as sample list!!!  
+      
+      #       if (length(unique(as.vector(tmplist[,"samplenr"]))) != length(as.vector(tmplist[,"samplenr"]))) {
+      #         message(paste("bingo", sel))
+      #         break
+      #       }
+      
+      # Check for each sample if multiple peaks exists, if so take the sum! 
+      labels=unique(tmplist[,"samplenr"])
+      ints.allsamps[labels] = as.vector(unlist(lapply(labels, function(x) {sum(as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"]))})))
+      
+      outpgrlist = rbind(outpgrlist, c(mzmed.pgrp, fq.best.pgrp, fq.worst.pgrp, nrsamples, mzmin.pgrp, mzmax.pgrp, ints.allsamps))
+    }
+    outlist.copy[selp, "height.pkt"] = -1
+  }
+  
+  outpgrlist = as.data.frame(outpgrlist)  # ignore warnings of duplicate row names
+  colnames(outpgrlist)[1:6] = c("mzmed.pgrp", "fq.best", "fq.worst", "nrsamples", "mzmin.pgrp", "mzmax.pgrp")
+  
+  #   # filtering ##################################################################################################################
+  #   final.outlist=outpgrlist[,c("mzmed.pgrp", "fq.best", "fq.worst","nrsamples","mzmin.pgrp","mzmax.pgrp", sampleNames)]
+  #   # NB: in centroided mode, data files contains many "-1.000" values, from peak finding. Set these to zero.
+  #   final.outlist[final.outlist == -1] = 0
+  #   
+  #   # keep only peaks which occur in 3 out of 3 technical replicates in at least one sample in peak group list
+  # #   peakFiltering(repl.pattern, final.outlist, nsampgrps, outdir, scanmode, startcol=7)
+  # #   peakFiltering <- function(repl.pattern, final.outlist, nsampgrps, resultDir, scanmode, startcol){
+  #   nsamp = length(repl.pattern)
+  #   nsampgrps = length(repl.pattern[[1]])
+  #   
+  #   keep <- rep(0, nrow(final.outlist))
+  #   for (p in 1:nrow(final.outlist)) {  
+  #     for (g in 1:nsampgrps) {  # g <- 31
+  #       if (keep[p] == 0 & sum(final.outlist[p, repl.pattern[[g]]] > 0) == length(repl.pattern[[g]]) ) { keep[p] <- 1 }
+  #     }
+  #   }
+  #   
+  #   tmp <- cbind(final.outlist, keep)
+  #   final.outlist.filt <- tmp[keep == 1, ]
+  # 
+  #   # omit keep column
+  #   final.outlist.filt <- final.outlist.filt[ , -ncol(final.outlist.filt)]
+  
+  #save(outpgrlist_part, file=paste(outdir, paste(scanmode, "_", mzstart, "_", mzend, ".RData", sep=""), sep="/"))
+  # save(final.outlist.filt, file=paste(outdir, "peak_grouping", paste(scanmode, "_",batch,".RData", sep=""), sep="/"))
+  save(outpgrlist, file=paste(outdir, "peak_grouping", paste(scanmode, "_",batch,".RData", sep=""), sep="/"))
+  
+}
\ No newline at end of file
diff --git a/DIMS/AddOnFunctions/remove.dupl.2.1.R b/DIMS/AddOnFunctions/remove.dupl.2.1.R
new file mode 100644
index 0000000..89e307d
--- /dev/null
+++ b/DIMS/AddOnFunctions/remove.dupl.2.1.R
@@ -0,0 +1,32 @@
+remove.dupl.2.1 <- function(peaklist) {
+  # peaklist = outlist.tot
+  # resultDir = "./results"  
+  # scanmode = "positive"  
+  
+  # peaklist=peaklist[1:100000,]
+  
+  collect=NULL
+  remove=NULL
+  
+  index = which(duplicated(peaklist[, "mzmed.pgrp"]))
+  
+  while (length(index) > 0){
+    
+    peaklist_index = which(peaklist[, "mzmed.pgrp"] == peaklist[index[1], "mzmed.pgrp"])
+    tmp=peaklist[peaklist_index[1],,drop=FALSE]
+    if (!is.na(peaklist[peaklist_index[1],"assi_HMDB"])) tmp[,"assi_HMDB"]=paste(peaklist[peaklist_index,"assi_HMDB"],collapse = ";") else tmp[,"assi_HMDB"]=NA
+    if (!is.na(peaklist[peaklist_index[1],"iso_HMDB"])) tmp[,"iso_HMDB"]=paste(peaklist[peaklist_index,"iso_HMDB"],collapse = ";") else tmp[,"iso_HMDB"]=NA
+    if (!is.na(peaklist[peaklist_index[1],"HMDB_code"])) tmp[,"HMDB_code"]=paste(peaklist[peaklist_index,"HMDB_code"],collapse = ";") else tmp[,"HMDB_code"]=NA
+    if (peaklist[peaklist_index[1],"assi_noise"]!="") tmp[,"assi_noise"]=paste(peaklist[peaklist_index,"assi_noise"],collapse = ";") else tmp[,"assi_noise"]=""
+    
+    collect = rbind(collect, tmp)
+    remove = c(remove, peaklist_index)
+    
+    index=index[-which(index==index[1])]
+  } 
+  
+  peaklist = peaklist[-remove,]
+  peaklist = rbind(peaklist,collect)
+  
+  return(peaklist)
+}
diff --git a/DIMS/AddOnFunctions/remove.dupl.R b/DIMS/AddOnFunctions/remove.dupl.R
new file mode 100644
index 0000000..227f207
--- /dev/null
+++ b/DIMS/AddOnFunctions/remove.dupl.R
@@ -0,0 +1,39 @@
+# remove duplicates, peak groups with mz within a few ppm
+# ppmdef should be 2 times bigger than during identification!!!
+remove.dupl <- function(peaklist, ppmdef=4, tolint=0.01) {
+  
+  #   peaklist = outpgrlist
+  #   ppmdef = 2
+  #   tolint = 0.01
+  
+  int.cols = 7:ncol(peaklist)
+  
+  p <- 1
+  while (p < nrow(peaklist)) {
+    mzref <- peaklist[p, "mzmed.pgrp"]
+    # print(mzref)
+    dist.ppm <- ppmdef * mzref / 1000000 
+    sel <- peaklist[ , "mzmed.pgrp"] > (mzref - dist.ppm) & peaklist[ , "mzmed.pgrp"] < (mzref + dist.ppm)
+    subset <- peaklist[sel, ]
+    if (nrow(subset) > 1) {
+      avi <- rep(1, max(int.cols))
+      for (c in int.cols) { avi[c] <- max(subset[ ,c])/mean(subset[ ,c]) }
+      
+      # remove NaN
+      avi[which(is.nan(avi))] = 1
+      
+      if (mean(avi) > (1-tolint) & mean(avi < (1+tolint))) { 
+        peaklist <- peaklist[-which(sel), ]
+        newline <- subset[1, ]
+        newline[ , "mzmed.pgrp"] <- mean(subset[ , "mzmed.pgrp"])
+        newline[ , "mzmin.pgrp"] <- min(subset[ , "mzmin.pgrp"])
+        newline[ , "mzmax.pgrp"] <- max(subset[ , "mzmax.pgrp"])
+        newline[ , int.cols] <- apply(subset[ , int.cols], 2, max)
+        # newline[ , "avg.int"] <- mean(as.numeric(newline[ , int.cols]))
+        peaklist <- rbind(peaklist, newline)
+      }
+    }
+    p <- p + 1
+  }
+  peaklist
+}
diff --git a/DIMS/AddOnFunctions/replaceZeros.R b/DIMS/AddOnFunctions/replaceZeros.R
new file mode 100644
index 0000000..4fafc9a
--- /dev/null
+++ b/DIMS/AddOnFunctions/replaceZeros.R
@@ -0,0 +1,108 @@
+replaceZeros <- function(file,scanmode,resol,outdir,thresh,scriptDir,ppm){
+  # file="./results/grouping_rest/negative_1.RData"
+  # scanmode= "negative"
+  # scriptDir="./scripts"
+  # resol=140000
+  # thresh=2000
+  # outdir="./results"
+  
+  control_label="C"
+  
+  source(paste(scriptDir, "AddOnFunctions/sourceDir.R", sep="/"))
+  sourceDir(paste(scriptDir, "AddOnFunctions", sep="/"))
+  
+  dir.create(paste(outdir, "9-samplePeaksFilled", sep="/"), showWarnings = FALSE)
+  
+  # int.factor=1*10^5 # Number of x used to calc area under Gaussian (is not analytic)
+  # scale=2 # Initial value used to estimate scaling parameter
+  # width=1024
+  # height=768
+  
+  # message(paste("file", file))
+  # message(paste("scanmode", scanmode))
+  # message(paste("resol", resol))
+  # message(paste("outdir", outdir))
+  # message(paste("thresh", thresh))
+  # message(paste("scriptDir", scriptDir))
+  
+  load(paste0(outdir, "/repl.pattern.",scanmode, ".RData"))
+  
+  name = as.vector(unlist(strsplit(file, "/", fixed=TRUE)))
+  name = name[length(name)]
+  # message(paste("File name: ", name))
+  
+  # load samplePeaks
+  # load  outpgrlist
+  load(file)
+  
+  # #################################################################################
+  # # filter on at least signal in two control samples
+  # int.cols = grep(control_label, colnames(outpgrlist),fixed = TRUE)
+  # # barplot(as.numeric(outpgrlist[753, int.cols]))
+  # keep = NULL
+  # keep = apply(outpgrlist, 1, function(x) if (length(which(as.numeric(x[int.cols]) > 0)) > 1) keep=c(keep,TRUE) else keep=c(keep,FALSE))
+  # outpgrlist = outpgrlist[keep,]
+  # #################################################################################
+  
+  ################################################################################
+  # For now only replace zeros
+  if (!is.null(outpgrlist)) {
+    for (i in 1:length(names(repl.pattern.filtered))){
+      samplePeaks=outpgrlist[,names(repl.pattern.filtered)[i]]
+      index=which(samplePeaks<=0)
+      if (!length(index)){
+        next
+      }
+      for (j in 1:length(index)){
+        area = generateGaussian(outpgrlist[index[j],"mzmed.pgrp"],thresh,resol,FALSE,scanmode,int.factor=1*10^5,1,1)$area
+        # for testing purposes, add a fixed random seed
+        set.seed(123)
+        outpgrlist[index[j], names(repl.pattern.filtered)[i]] = rnorm(n=1, mean=area, sd=0.25*area)
+      }
+    }
+  
+  ################################################################################
+  
+  
+  #################### identification #########################################################
+    # load(paste(scriptDir, "../db/HMDB_add_iso_corrNaCl.RData", sep="/")) # E:\Metabolomics\LargeDataBase\Apr25_2016
+    
+    # Add average column
+    outpgrlist = cbind(outpgrlist, "avg.int"=apply(outpgrlist[, 7:(ncol(outpgrlist)-4)], 1, mean))
+    
+    if (scanmode=="negative"){
+      label = "MNeg"
+      label2 = "Negative"
+      # take out multiple NaCl adducts
+      look4.add2 <- c("Cl", "Cl37", "For", "NaCl","KCl","H2PO4","HSO4","Na-H","K-H","H2O","I") # ,"min2H","min3H"
+      # HMDB_add_iso=HMDB_add_iso.Neg
+    } else {
+      label = "Mpos"
+      label2 = "Positive"
+      # take out NaCl adducts
+      look4.add2 <- c("Na", "K", "NaCl", "NH4","2Na-H","CH3OH","KCl","NaK-H") # ,"NaCl2","NaCl3","NaCl4","NaCl5")
+      # HMDB_add_iso=HMDB_add_iso.Pos
+    }
+    
+    # # Identification using large database
+    # final.outlist.idpat = iden.code(outpgrlist, HMDB_add_iso, ppm=2, label)
+    # message(paste(sum(final.outlist.idpat[ , "assi_HMDB"] != ""), "assigned peakgroups"))
+    # message(paste(sum(final.outlist.idpat[ , "iso_HMDB"] != ""), "assigned isomeres"))
+    
+    # Identify noise peaks
+    noise.MZ <- read.table(file="/hpc/dbg_mz/tools/db/TheoreticalMZ_NegPos_incNaCl.txt", sep="\t", header=TRUE, quote = "")
+    noise.MZ <- noise.MZ[(noise.MZ[ , label] != 0), 1:4]
+    
+    # Replace "Negative" by "negative" in ident.hires.noise
+    final.outlist.idpat2 = ident.hires.noise.HPC(outpgrlist, allAdducts, scanmode=label2, noise.MZ, look4=look4.add2, resol=resol, slope=0, incpt=0, ppm.fixed=ppm, ppm.iso.fixed=ppm)
+    # message(paste(sum(final.outlist.idpat2[ , "assi"] != ""), "assigned noise peaks"))
+    tmp <- final.outlist.idpat2[ , c("assi", "theormz")]
+    colnames(tmp) <- c("assi_noise",  "theormz_noise")
+    
+    final.outlist.idpat3 <- cbind(outpgrlist, tmp)
+    #############################################################################################
+    
+    # message(paste("File saved: ", paste(outdir, "/samplePeaksFilled/", name, sep="")))
+    save(final.outlist.idpat3, file=paste(outdir, "/9-samplePeaksFilled/", name, sep=""))
+  }
+}
diff --git a/DIMS/AddOnFunctions/replaceZeros_setseed.R b/DIMS/AddOnFunctions/replaceZeros_setseed.R
new file mode 100644
index 0000000..4fafc9a
--- /dev/null
+++ b/DIMS/AddOnFunctions/replaceZeros_setseed.R
@@ -0,0 +1,108 @@
+replaceZeros <- function(file,scanmode,resol,outdir,thresh,scriptDir,ppm){
+  # file="./results/grouping_rest/negative_1.RData"
+  # scanmode= "negative"
+  # scriptDir="./scripts"
+  # resol=140000
+  # thresh=2000
+  # outdir="./results"
+  
+  control_label="C"
+  
+  source(paste(scriptDir, "AddOnFunctions/sourceDir.R", sep="/"))
+  sourceDir(paste(scriptDir, "AddOnFunctions", sep="/"))
+  
+  dir.create(paste(outdir, "9-samplePeaksFilled", sep="/"), showWarnings = FALSE)
+  
+  # int.factor=1*10^5 # Number of x used to calc area under Gaussian (is not analytic)
+  # scale=2 # Initial value used to estimate scaling parameter
+  # width=1024
+  # height=768
+  
+  # message(paste("file", file))
+  # message(paste("scanmode", scanmode))
+  # message(paste("resol", resol))
+  # message(paste("outdir", outdir))
+  # message(paste("thresh", thresh))
+  # message(paste("scriptDir", scriptDir))
+  
+  load(paste0(outdir, "/repl.pattern.",scanmode, ".RData"))
+  
+  name = as.vector(unlist(strsplit(file, "/", fixed=TRUE)))
+  name = name[length(name)]
+  # message(paste("File name: ", name))
+  
+  # load samplePeaks
+  # load  outpgrlist
+  load(file)
+  
+  # #################################################################################
+  # # filter on at least signal in two control samples
+  # int.cols = grep(control_label, colnames(outpgrlist),fixed = TRUE)
+  # # barplot(as.numeric(outpgrlist[753, int.cols]))
+  # keep = NULL
+  # keep = apply(outpgrlist, 1, function(x) if (length(which(as.numeric(x[int.cols]) > 0)) > 1) keep=c(keep,TRUE) else keep=c(keep,FALSE))
+  # outpgrlist = outpgrlist[keep,]
+  # #################################################################################
+  
+  ################################################################################
+  # For now only replace zeros
+  if (!is.null(outpgrlist)) {
+    for (i in 1:length(names(repl.pattern.filtered))){
+      samplePeaks=outpgrlist[,names(repl.pattern.filtered)[i]]
+      index=which(samplePeaks<=0)
+      if (!length(index)){
+        next
+      }
+      for (j in 1:length(index)){
+        area = generateGaussian(outpgrlist[index[j],"mzmed.pgrp"],thresh,resol,FALSE,scanmode,int.factor=1*10^5,1,1)$area
+        # for testing purposes, add a fixed random seed
+        set.seed(123)
+        outpgrlist[index[j], names(repl.pattern.filtered)[i]] = rnorm(n=1, mean=area, sd=0.25*area)
+      }
+    }
+  
+  ################################################################################
+  
+  
+  #################### identification #########################################################
+    # load(paste(scriptDir, "../db/HMDB_add_iso_corrNaCl.RData", sep="/")) # E:\Metabolomics\LargeDataBase\Apr25_2016
+    
+    # Add average column
+    outpgrlist = cbind(outpgrlist, "avg.int"=apply(outpgrlist[, 7:(ncol(outpgrlist)-4)], 1, mean))
+    
+    if (scanmode=="negative"){
+      label = "MNeg"
+      label2 = "Negative"
+      # take out multiple NaCl adducts
+      look4.add2 <- c("Cl", "Cl37", "For", "NaCl","KCl","H2PO4","HSO4","Na-H","K-H","H2O","I") # ,"min2H","min3H"
+      # HMDB_add_iso=HMDB_add_iso.Neg
+    } else {
+      label = "Mpos"
+      label2 = "Positive"
+      # take out NaCl adducts
+      look4.add2 <- c("Na", "K", "NaCl", "NH4","2Na-H","CH3OH","KCl","NaK-H") # ,"NaCl2","NaCl3","NaCl4","NaCl5")
+      # HMDB_add_iso=HMDB_add_iso.Pos
+    }
+    
+    # # Identification using large database
+    # final.outlist.idpat = iden.code(outpgrlist, HMDB_add_iso, ppm=2, label)
+    # message(paste(sum(final.outlist.idpat[ , "assi_HMDB"] != ""), "assigned peakgroups"))
+    # message(paste(sum(final.outlist.idpat[ , "iso_HMDB"] != ""), "assigned isomeres"))
+    
+    # Identify noise peaks
+    noise.MZ <- read.table(file="/hpc/dbg_mz/tools/db/TheoreticalMZ_NegPos_incNaCl.txt", sep="\t", header=TRUE, quote = "")
+    noise.MZ <- noise.MZ[(noise.MZ[ , label] != 0), 1:4]
+    
+    # Replace "Negative" by "negative" in ident.hires.noise
+    final.outlist.idpat2 = ident.hires.noise.HPC(outpgrlist, allAdducts, scanmode=label2, noise.MZ, look4=look4.add2, resol=resol, slope=0, incpt=0, ppm.fixed=ppm, ppm.iso.fixed=ppm)
+    # message(paste(sum(final.outlist.idpat2[ , "assi"] != ""), "assigned noise peaks"))
+    tmp <- final.outlist.idpat2[ , c("assi", "theormz")]
+    colnames(tmp) <- c("assi_noise",  "theormz_noise")
+    
+    final.outlist.idpat3 <- cbind(outpgrlist, tmp)
+    #############################################################################################
+    
+    # message(paste("File saved: ", paste(outdir, "/samplePeaksFilled/", name, sep="")))
+    save(final.outlist.idpat3, file=paste(outdir, "/9-samplePeaksFilled/", name, sep=""))
+  }
+}
diff --git a/DIMS/AddOnFunctions/run.vbs b/DIMS/AddOnFunctions/run.vbs
new file mode 100644
index 0000000..640116a
--- /dev/null
+++ b/DIMS/AddOnFunctions/run.vbs
@@ -0,0 +1,22 @@
+Option Explicit
+On Error Resume Next
+RunExcelMacro
+
+Sub RunExcelMacro() 
+
+Dim xlApp 
+Dim xlBook 
+Dim xlBook_persenal
+
+Set xlApp = CreateObject("Excel.Application")
+xlApp.DisplayAlerts = FALSE
+Set xlBook_persenal = xlApp.Workbooks.Open("C:\Users\awillem8\AppData\Roaming\Microsoft\Excel\XLSTART\PERSONAL.XLSB", 0, True)
+Set xlBook = xlApp.Workbooks.Open("C:\Users\mkerkho7\testmap\xls\P181_Aberant_HMDB.xlsx", 0, True)
+xlApp.Run "PERSONAL.XLSB!FixAndResize"
+xlBook.SaveAs "C:\Users\mkerkho7\testmap\results\P181_Aberant_HMDB.xlsx"
+xlBook.Close False 
+xlApp.Quit
+Set xlBook = Nothing
+Set xlBook_persenal = Nothing 
+Set xlApp = Nothing 
+End Sub
diff --git a/DIMS/AddOnFunctions/runVBAMacro.R b/DIMS/AddOnFunctions/runVBAMacro.R
new file mode 100644
index 0000000..3836b2f
--- /dev/null
+++ b/DIMS/AddOnFunctions/runVBAMacro.R
@@ -0,0 +1,48 @@
+runVBAMacro <- function(dir, dir2, vb_script) {
+  
+#   dir="E:\\Metabolomics\\Lynne_BSP-2015-08-05\\results\\xls\\"
+#   dir2="E:\\Metabolomics\\Lynne_BSP-2015-08-05\\results\\xls_fixed\\"  
+  
+dir.create(dir2)
+files=list.files(dir)
+
+script_1 = paste("Option Explicit
+On Error Resume Next
+RunExcelMacro
+
+Sub RunExcelMacro() 
+
+Dim xlApp 
+Dim xlBook 
+Dim xlBook_persenal
+
+Set xlApp = CreateObject(\"Excel.Application\")
+xlApp.DisplayAlerts = FALSE
+Set xlBook_persenal = xlApp.Workbooks.Open(\"C:\\Users\\awillem8\\AppData\\Roaming\\Microsoft\\Excel\\XLSTART\\PERSONAL.XLSB\", 0, True)
+Set xlBook = xlApp.Workbooks.Open(\"", dir, sep="") 
+
+script_2 = "\", 0, True)
+xlApp.Run \"PERSONAL.XLSB!FixAndResize\"
+xlBook.SaveAs \""
+
+script_3 = "\"
+xlBook.Close False 
+xlApp.Quit
+Set xlBook = Nothing
+Set xlBook_persenal = Nothing 
+Set xlApp = Nothing 
+End Sub"
+
+for (i in 1:length(files)){
+  script = paste(script_1, files[i], script_2, dir2, files[i], script_3, sep="")  # dir2, files[i], 
+  
+  message(script)
+  
+  fileConn = file("./src/run.vbs")
+  writeLines(script, fileConn)
+  close(fileConn)
+  
+  system(vb_script)
+}
+}
+
diff --git a/DIMS/AddOnFunctions/searchMZRange.R b/DIMS/AddOnFunctions/searchMZRange.R
new file mode 100644
index 0000000..43a7171
--- /dev/null
+++ b/DIMS/AddOnFunctions/searchMZRange.R
@@ -0,0 +1,187 @@
+searchMZRange <- function(range,values,int.factor,scale,resol,outdir,sampname,scanmode,plot,width,height,thresh){
+  # range=sub_range
+  
+  end=NULL
+  index=as.vector(which(range!=0))
+  
+  # bad infusion
+  if (length(index)==0) return(values)
+  
+  start=index[1]
+  subRangeLength = 15
+  
+  for (i in 1:length(index)){
+    # for (i in 1:129000){
+    
+    if (i<length(index) & (index[i+1] - index[i]) > 1){
+      
+      end=index[i]
+      
+      # start=395626
+      # end=395640
+      # i=25824
+      
+      # gaan met de banaan
+      # 128836
+      # start 1272853
+      # mz start 316.83302
+      # end 1272870
+      # mz end 316.84269
+      
+      x = as.numeric(names(range)[c(start:end)])
+      y = as.vector(range[c(start:end)])
+      #       # Trim zeros
+      #       x = as.vector(trimZeros(x,y)[[1]])
+      #       y = as.vector(trimZeros(x,y)[[2]])
+      
+      if (length(y)!=0) {
+        
+        # check if intensity above thresh
+        if (max(y) < thresh | is.nan(max(y))) {
+          start=index[i+1]
+          next
+        }
+        
+        # cat("gaan met de banaan")
+        # cat(i)
+        # cat(paste("start", start, sep=" "))
+        # cat(paste("mz start", x[1], sep=" "))
+        # cat(paste("end", end, sep=" "))
+        # cat(paste("mz end", x[length(x)], sep=" "))
+        
+        if (length(y)>subRangeLength) {
+          
+          y[which(y<min(y)*1.1)] = 0
+          sub_range = y
+          names(sub_range) = x
+          
+          # Range to long!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+          values = searchMZRange(sub_range,values,int.factor,scale,resol,outdir,sampname,scanmode,plot,width,height,thresh)
+          
+        } else if (length(y)>3) {
+          # Check only zeros
+          if (sum(y)==0) next
+          
+          rval = fitGaussianInit(x,y,int.factor,scale,resol,outdir,sampname, scanmode, plot,width,height,i,start,end)
+          
+          if (rval$qual[1]==1) {
+            
+            rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height)
+            
+            values$mean = c(values$mean, rval$mean)
+            values$area = c(values$area, rval$area)
+            values$nr = c(values$nr, sampname)
+            values$min = c(values$min, rval$min)
+            values$max = c(values$max, rval$max)
+            values$qual = c(values$qual, 0)
+            
+            values$spikes = values$spikes + 1
+            
+          } else {
+            for (j in 1:length(rval$mean)){
+              values$mean = c(values$mean, rval$mean[j])
+              values$area = c(values$area, rval$area[j])
+              values$nr = c(values$nr, sampname)
+              values$min = c(values$min, rval$min[1])
+              values$max = c(values$max, rval$max[1])
+              values$qual = c(values$qual, rval$qual[1])
+            }
+          }
+          
+        } else {
+          
+          rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height)
+          
+          values$mean = c(values$mean, rval$mean)
+          values$area = c(values$area, rval$area)
+          values$nr = c(values$nr, sampname)
+          values$min = c(values$min, rval$min)
+          values$max = c(values$max, rval$max)
+          values$qual = c(values$qual, 0)
+          
+          values$spikes = values$spikes + 1
+        }
+      }
+      start=index[i+1]
+    }
+  }
+  
+  # last little range
+  end = index[length(index)]
+  x = as.numeric(names(range)[c(start:end)])
+  y = as.vector(range[c(start:end)])
+  
+  #   x = as.vector(trimZeros(x,y)[[1]])
+  #   y = as.vector(trimZeros(x,y)[[2]])
+  
+  if (length(y)!=0) {
+    
+    # check if intensity above thresh
+    if (max(y) < thresh | is.nan(max(y))) {
+      #start=index[i+1]
+      # do nothing!!
+    } else {
+      
+      #       cat("gaan met de banaan")
+      #       cat(paste("start", start, sep=" "))
+      #       cat(paste("mz start", x[1], sep=" "))
+      #       cat(paste("end", end, sep=" "))
+      #       cat(paste("mz end", x[length(x)], sep=" "))
+      
+      if (length(y)>subRangeLength) {
+        
+        y[which(y<min(y)*1.1)] = 0
+        sub_range = y
+        names(sub_range) = x
+        
+        # Range to long!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+        values = searchMZRange(sub_range,values,int.factor,scale,resol,outdir,sampname,scanmode,plot,width,height,thresh)
+        
+      } else if (length(y)>3) {
+        
+        # Check only zeros
+        if (sum(y)==0) next
+        
+        rval = fitGaussianInit(x,y,int.factor,scale,resol,outdir,sampname,scanmode,plot,width,height,i,start,end)
+        
+        if (rval$qual[1]==1) {
+          #cat("Quality = 1!!!")
+          rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height)
+          
+          values$mean = c(values$mean, rval$mean)
+          values$area = c(values$area, rval$area)
+          values$nr = c(values$nr, sampname)
+          values$min = c(values$min, rval$min)
+          values$max = c(values$max, rval$max)
+          values$qual = c(values$qual, 0)
+          
+          values$spikes = values$spikes + 1
+          
+        } else {
+          for (j in 1:length(rval$mean)){
+            values$mean = c(values$mean, rval$mean[j])
+            values$area = c(values$area, rval$area[j])
+            values$nr = c(values$nr, sampname)
+            values$min = c(values$min, rval$min[1])
+            values$max = c(values$max, rval$max[1])
+            values$qual = c(values$qual, rval$qual[1])
+          }
+        }
+      } else {
+        rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height)
+        
+        values$mean = c(values$mean, rval$mean)
+        values$area = c(values$area, rval$area)
+        values$nr = c(values$nr, sampname)
+        values$min = c(values$min, rval$min)
+        values$max = c(values$max, rval$max)
+        values$qual = c(values$qual, 0)
+        
+        values$spikes = values$spikes + 1
+      }
+    }
+  }
+  
+  return(values)
+  #return(list("mean"=retVal$mean, "area"=retVal$area, "qual"=retVal$qual, "min"=retVal$min, "max"=retVal$max, "nr"=sample.nr, "spikes"=spikes))
+}
diff --git a/DIMS/AddOnFunctions/sourceDir.R b/DIMS/AddOnFunctions/sourceDir.R
new file mode 100644
index 0000000..abf4c9e
--- /dev/null
+++ b/DIMS/AddOnFunctions/sourceDir.R
@@ -0,0 +1,8 @@
+# source add on functions
+sourceDir <- function(path, trace = TRUE, ...) {
+  for (nm in list.files(path, pattern = "[.][RrSsQq]$")) {
+    #if(trace) cat(nm,":")
+    source(file.path(path, nm), ...)
+    #if(trace) cat("\n")
+  }
+}
diff --git a/DIMS/AddOnFunctions/statistics_z.R b/DIMS/AddOnFunctions/statistics_z.R
new file mode 100644
index 0000000..0a92d3d
--- /dev/null
+++ b/DIMS/AddOnFunctions/statistics_z.R
@@ -0,0 +1,69 @@
+statistics_z <- function(peaklist, sortCol, adducts){
+  # peaklist=as.data.frame(outlist.adducts.HMDB)
+  # plotdir="./results/plots/adducts"
+  # filename="./results/allpgrps_stats.txt"
+  # adducts=TRUE
+  
+  # peaklist=outlist.tot
+  # sortCol=NULL
+  # adducts=FALSE
+  
+  case_label = "P"
+  control_label = "C"
+  startcol = dim(peaklist)[2]+3 
+  
+  # calculate mean and sd for Control group
+  ctrl.cols <- grep(control_label, colnames(peaklist),fixed = TRUE) # 5:41
+  int.cols <- c(grep(control_label, colnames(peaklist),fixed = TRUE), grep(case_label, colnames(peaklist),fixed = TRUE))
+  peaklist[,int.cols][peaklist[,int.cols]==0] = NA
+  
+  # tmp = data.matrix(peaklist[ , ctrl.cols], rownames.force = TRUE)
+  tmp = peaklist[ , ctrl.cols, drop=FALSE]
+  
+  peaklist$avg.ctrls = apply(tmp, 1, function(x) mean(as.numeric(x),na.rm = TRUE))
+  peaklist$sd.ctrls = apply(tmp, 1, function(x) sd(as.numeric(x),na.rm = TRUE))
+  
+  cnames.z = NULL
+  
+  for (i in int.cols) {
+    # message(i)
+    cname = colnames(peaklist)[i]
+    cnames.z = c(cnames.z, paste(cname, "Zscore", sep="_"))
+    zscores.1col = (as.numeric(as.vector(unlist(peaklist[ , i]))) - peaklist$avg.ctrls) / peaklist$sd.ctrls
+    peaklist = cbind(peaklist, zscores.1col)
+  }
+  
+  colnames(peaklist)[startcol:ncol(peaklist)] = cnames.z
+  
+  z.cols = grep("Zscore", colnames(peaklist),fixed = TRUE)
+  
+  if (!adducts){
+    if ((dim(peaklist[, z.cols])[2]+6)!=(startcol-1)){
+      ppmdev=array(1:dim(peaklist)[1], dim=c(dim(peaklist)[1]))
+      
+      # calculate ppm deviation
+      for (i in 1:dim(peaklist)[1]){
+        if (!is.na(peaklist$theormz_HMDB[i]) & !is.null(peaklist$theormz_HMDB[i]) & (peaklist$theormz_HMDB[i]!="")){
+          ppmdev[i] = 10^6*(as.numeric(as.vector(peaklist$mzmed.pgrp[i]))-as.numeric(as.vector(peaklist$theormz_HMDB[i])))/as.numeric(as.vector(peaklist$theormz_HMDB[i]))
+        } else {
+          ppmdev[i]=NA
+        }
+      }
+      
+      peaklist = cbind(peaklist[, 1:6], ppmdev=ppmdev, peaklist[ , 7:ncol(peaklist)])
+    }  
+  }
+  
+  #peaklist = peaklist[order(peaklist[,sortCol]),]
+  
+  #   # Order on average Z-score
+  #   tmp = peaklist[,grep("Zscore", colnames(peaklist))]
+  #   tmp.p = tmp[,grep("P", colnames(tmp)),drop=FALSE]
+  #   tmp.p.avg = apply(tmp.p, 1, mean)
+  #   
+  #   peaklist = cbind(peaklist, "avg.z.score"=tmp.p.avg)
+  #   peaklist = peaklist[order(abs(tmp.p.avg), decreasing = TRUE),]
+  
+  return(peaklist)
+  
+}  
diff --git a/DIMS/AddOnFunctions/sumCurves.R b/DIMS/AddOnFunctions/sumCurves.R
new file mode 100644
index 0000000..bcc1485
--- /dev/null
+++ b/DIMS/AddOnFunctions/sumCurves.R
@@ -0,0 +1,39 @@
+sumCurves <- function(mean1, mean2, scale1, scale2, sigma1, sigma2, x2, x, resol, plot) {
+  #   mean1=mean[i-1]
+  #   mean2=mean[i]
+  #   scale1=scale[i-1]
+  #   scale2=scale[i]
+  #   sigma1=sigma[i-1]
+  #   sigma2=sigma[i]
+  
+  # message("=============================================================> sum 2 curves!")
+  
+  sumFit=(scale1*dnorm(x2,mean1,sigma1))+(scale2*dnorm(x2,mean2,sigma2))
+  if (plot) lines(x2,sumFit,col="brown")
+  
+  #mean1plus2 = mean(c(mean1,mean2))
+  mean1plus2 = weighted.mean(c(mean1,mean2),c(max(scale1*dnorm(x2,mean1,sigma1)),max(scale2*dnorm(x2,mean2,sigma2))))
+  
+  if (plot) abline(v = mean1plus2, col="brown")
+  fwhm = getFwhm(mean1plus2, resol)
+  half_max = max(sumFit)*0.5
+  if (plot) lines(c(mean1plus2 - 0.5*fwhm, mean1plus2 + 0.5*fwhm),c(half_max,half_max),col="orange")
+  
+  # sumFit=(scale1*dnorm(x,mean1,sigma1))+(scale2*dnorm(x,mean2,sigma2))
+  #  fq=abs(sum(y) - sum(sumFit))/sum(y)
+  
+  #   h2=c(paste("mean =", mean1plus2, sep=" "),
+  #        paste("fq =", fq, sep=" "))
+  #   
+  #   legend("topright", legend=h2)
+  
+  # I assume that the sum of the distributions if also normal, which is not
+  #area = sum(scale1*dnorm(x2,mean1,sigma1))+sum(scale2*dnorm(x2,mean2,sigma2))
+  #area = max(scale1*dnorm(x2,mean1,sigma1))+max(scale2*dnorm(x2,mean2,sigma2))
+  area = max(sumFit) 
+  scale = scale1 + scale2
+  sigma = (fwhm/2)*0.85
+  
+  return(list("mean"= mean1plus2,"area"=area, "scale"=scale, "sigma"=sigma)) # "qual"=fq
+  
+} 
diff --git a/DIMS/AddOnFunctions/trimZeros.R b/DIMS/AddOnFunctions/trimZeros.R
new file mode 100644
index 0000000..0fc5ea3
--- /dev/null
+++ b/DIMS/AddOnFunctions/trimZeros.R
@@ -0,0 +1,8 @@
+trimZeros <- function(x, y) {
+  tmp = which(y==0)
+  if (length(tmp)!=0){
+    y = y[-tmp]
+    x = x[-tmp]
+  }
+  return(list(x,y))
+}
diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R
index 90f510a..a218c4d 100644
--- a/DIMS/AssignToBins.R
+++ b/DIMS/AssignToBins.R
@@ -19,9 +19,9 @@ print(filepath)
 print(breaks_filepath)
 print(resol)
 
-#sampname <- sub('\\..*$', '', basename(filepath))
+sampname <- sub('\\..*$', '', basename(filepath))
 #cat(paste0("\n", sampname))
-sampname <- "AssignToBins"
+# sampname <- "AssignToBins"
 
 #suppressPackageStartupMessages(library("Cairo"))
 options(digits=16)
diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf
index 872e32f..0bb7855 100644
--- a/DIMS/AssignToBins.nf
+++ b/DIMS/AssignToBins.nf
@@ -4,12 +4,11 @@ process AssignToBins {
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       tuple(file_id, path(mzML_filename))
-       file(breaks_file)
+       tuple(file_id, path(mzML_filename), path(breaks_file))
        val(resolution)
 
     output:
-       file 'AssignToBins.RData'
+       path("${file_id}.RData")
 
     script:
         """
diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R
new file mode 100644
index 0000000..3bfc484
--- /dev/null
+++ b/DIMS/AverageTechReplicates.R
@@ -0,0 +1,119 @@
+#!/usr/bin/Rscript
+# adapted from 3-AverageTechReplicates.R
+
+# load required packages 
+# none 
+
+# define parameters 
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) cat("  ", arg, "\n", sep="")
+
+init_filepath <- cmd_args[1]
+nr_replicates <- as.numeric(cmd_args[2])
+thresh2remove <- 2000
+dimsThresh <- 100
+
+removeFromRepl.pat <- function(bad_samples, repl.pattern, nr_replicates) {
+  # bad_samples=remove_pos
+  
+  tmp = repl.pattern
+  
+  removeFromGroup=NULL
+  
+  for (i in 1:length(tmp)){
+    tmp2 = repl.pattern[[i]]
+    
+    remove=NULL
+    
+    for (j in 1:length(tmp2)){
+      if (tmp2[j] %in% bad_samples){
+        #cat(tmp2[j])
+        #cat(paste("remove",tmp2[j]))
+        #cat(paste("remove i",i))
+        #cat(paste("remove j",j))
+        remove = c(remove, j)
+      }
+    }
+    
+    if (length(remove)==nr_replicates) removeFromGroup=c(removeFromGroup,i)
+    if (!is.null(remove)) repl.pattern[[i]]=repl.pattern[[i]][-remove]
+  }
+  
+  if (length(removeFromGroup)!=0) {
+    repl.pattern=repl.pattern[-removeFromGroup]
+  }
+  
+  return(list("pattern"=repl.pattern))
+}
+
+
+# get repl.pattern
+load("./init.RData")
+
+remove_neg=NULL
+remove_pos=NULL
+cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n")
+for (i in 1:length(repl.pattern)) {
+  techRepsArray.pos = NULL
+  techRepsArray.neg = NULL
+  
+  tech_reps = as.vector(unlist(repl.pattern[i]))
+  sum_neg=0
+  sum_pos=0
+  n_pos=0
+  n_neg=0
+  cat("\n\nNow sample ", i, " from replication pattern with length ", length(repl.pattern))
+  for (j in 1:length(tech_reps)) {
+    load(paste("./", tech_reps[j], ".RData", sep=""))
+    cat("\n\nParsing", tech_reps[j])
+
+    cat("\n\tNegative pklist sum",sum(pklist$neg[,1]))
+    if (sum(pklist$neg[,1])<thresh2remove){
+      cat(" ... Removed")
+      remove_neg=c(remove_neg, tech_reps[j])
+    } else {
+      n_neg=n_neg+1
+      sum_neg=sum_neg+pklist$neg
+    }
+    
+    techRepsArray.neg = cbind(techRepsArray.neg, pklist$neg)
+    
+    cat("\n\tPositive pklist sum",sum(pklist$pos[,1]))
+    if (sum(pklist$pos[,1])<thresh2remove){
+      cat(" ... Removed")
+      remove_pos=c(remove_pos, tech_reps[j])
+    } else {
+      n_pos=n_pos+1
+      sum_pos=sum_pos+pklist$pos
+    }
+    
+    techRepsArray.pos = cbind(techRepsArray.pos, pklist$pos)
+  }
+  
+  # filter within bins on at least signal in more than one tech. rep.!!!
+  if (!is.null(dim(sum_pos))) sum_pos[apply(techRepsArray.pos,1,function(x) length(which(x>dimsThresh))==1),1]=0
+  if (!is.null(dim(sum_neg))) sum_neg[apply(techRepsArray.neg,1,function(x) length(which(x>dimsThresh))==1),1]=0
+  
+  if (n_neg != 0){
+    sum_neg[,1] <- sum_neg[,1]/n_neg
+    colnames(sum_neg) <- names(repl.pattern)[i]
+    save(sum_neg, file=paste("./", names(repl.pattern)[i], "_neg.RData", sep=""))
+  }
+  if (n_pos != 0) {
+    sum_pos[,1] <- sum_pos[,1]/n_pos
+    colnames(sum_pos) <- names(repl.pattern)[i]
+    save(sum_pos, file=paste("./", names(repl.pattern)[i], "_pos.RData", sep=""))
+  }
+}
+
+retVal <- removeFromRepl.pat(remove_neg, repl.pattern, nr_replicates)
+repl.pattern.filtered <- retVal$pattern
+save(repl.pattern.filtered, file="./repl_pattern_negative.RData")
+write.table(remove_neg, file="./miss_infusions_neg.txt", row.names=FALSE, col.names=FALSE ,sep= "\t")
+
+retVal <- removeFromRepl.pat(remove_pos, repl.pattern, nr_replicates)
+repl.pattern.filtered <- retVal$pattern
+save(repl.pattern.filtered, file="./repl_pattern_positive.RData")
+write.table(remove_pos, file="./miss_infusions_pos.txt", row.names=FALSE, col.names=FALSE ,sep= "\t")
+
+
diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf
new file mode 100644
index 0000000..aa3b99d
--- /dev/null
+++ b/DIMS/AverageTechReplicates.nf
@@ -0,0 +1,24 @@
+process AverageTechReplicates {
+    label 'AverageTechReplicates'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(RData_file)
+       path(init_filepath)
+       val(nr_replicates)
+
+    output:
+       path '*_*.RData', emit: binned
+       path 'repl.pattern.negative.RData'
+       path 'repl.pattern.positive.RData'
+       path 'miss_infusions_neg.txt'
+       path 'miss_infusions_pos.txt'
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_filepath $nr_replicates
+        """
+}
+
+
diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R
index 85065ac..22b80c9 100644
--- a/DIMS/GenerateBreaks.R
+++ b/DIMS/GenerateBreaks.R
@@ -54,4 +54,6 @@ for (i in 1:nsegment) {
 }
 
 save(breaks.fwhm, breaks.fwhm.avg, trimLeft, trimRight, file=paste(outdir, "breaks.fwhm.RData", sep="/"))
+# temporary fix for breaks file:
+# save(breaks.fwhm, breaks.fwhm.avg, trimLeft, trimRight, file=paste(outdir/../, "breaks.fwhm.RData", sep="/"))
 
diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf
index f73ad54..f60d5cc 100644
--- a/DIMS/GenerateBreaks.nf
+++ b/DIMS/GenerateBreaks.nf
@@ -8,7 +8,7 @@ process GenerateBreaks {
 
 
     output:
-       file 'breaks.fwhm.RData' 
+       path 'breaks.fwhm.RData'
 
     script:
         """
diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R
new file mode 100644
index 0000000..10fdc37
--- /dev/null
+++ b/DIMS/HMDBparts.R
@@ -0,0 +1,126 @@
+#!/usr/bin/Rscript
+
+# load required packages 
+# none 
+
+# define parameters 
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) cat("  ", arg, "\n")
+
+# outdir <- cmd_args[1]
+# scanmode <- cmd_args[2]
+db_path <- cmd_args[1] # location of HMDB db file
+breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData
+standard_run  <- "no" # cmd_args[5] # "yes"
+
+# Cut up entire HMDB into small parts based on the new binning/breaks 
+
+# load(paste(outdir, "breaks.fwhm.RData", sep = "/"))
+load(breaks_filepath)
+#outdir_hmdb <- paste(outdir, "hmdb_part", sep = "/")
+#dir.create(outdir_hmdb, showWarnings = FALSE)
+
+# New: in case of a standard run (m/z 69-606) use external HMDB parts
+min_mz <- round(breaks.fwhm[1])
+max_mz <- round(breaks.fwhm[length(breaks.fwhm)])
+# test if standard mz range is used
+if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 600 & max_mz < 610) {
+  # skip generating HMDB parts
+  use_external_HMDB <- TRUE
+  save(use_external_HMDB, file="./using_external_HMDB_parts.RData")
+  hmdb_parts_dir <- cmd_args[6] # "/hpc/dbg_mz/production/DIMS/hmdb_preparts/"
+  hmdb_parts <- list.files(hmdb_parts_dir, pattern=scanmode) # only positive or negative files
+  # save(HMDBstukken, file=paste(outdir, "HMDBstukken.RData", sep="/"))
+  for (hmdb_file in hmdb_parts) {
+    file.copy(paste(hmdb_parts_dir, hmdb_file, sep="/"), outdir_hmdb, recursive = TRUE)
+  }
+} else { 
+  # generate HMDB parts in case of non-standard mz range
+  use_external_HMDB <- FALSE
+  save(use_external_HMDB, file="not_using_external_HMDB_parts.RData")
+  # db <- cmd_args[3]
+  load(db_path)
+  ppm <- as.numeric(cmd_args[4])
+  if (scanmode=="negative"){
+    label = "MNeg"
+    HMDB_add_iso=HMDB_add_iso.Neg
+  } else {
+    label = "Mpos"
+    HMDB_add_iso=HMDB_add_iso.Pos
+  }
+
+  # filter mass range meassured!!!
+  HMDB_add_iso = HMDB_add_iso[which(HMDB_add_iso[,label]>=breaks.fwhm[1] & HMDB_add_iso[,label]<=breaks.fwhm[length(breaks.fwhm)]),]
+
+  # sort on mass
+  outlist = HMDB_add_iso[order(as.numeric(HMDB_add_iso[,label])),]
+
+  n=dim(outlist)[1]
+  sub=20000 # max rows per file
+  end=0
+  min_1_last=sub
+  check=0
+  outlist_part=NULL
+
+
+  if (n < sub) {
+    outlist_part <- outlist
+    save(outlist_part, file = paste(outdir_hmdb, paste0(scanmode, "_hmdb.1.RData"), sep = "/"))
+  } else {
+  
+    if (n >= sub & (floor(n/sub) - 1) >= 2){
+      for (i in 2:floor(n/sub) - 1){
+        start <- -(sub - 1) + i*sub
+        end <- i*sub
+      
+        if (i > 1){
+          outlist_i = outlist[c(start:end),]
+        
+          n_moved = 0
+        
+          # Calculate 3ppm and replace border, avoid cut within peakgroup!
+          while ((as.numeric(outlist_i[1,label]) - as.numeric(outlist_part[min_1_last,label]))*1e+06/as.numeric(outlist_i[1,label]) < ppm) {
+            outlist_part <- rbind(outlist_part, outlist_i[1,])
+            outlist_i <- outlist_i[-1,]
+            n_moved <- n_moved + 1
+          }
+        
+          # message(paste("Process", i-1,":", dim(outlist_part)[1]))
+          save(outlist_part, file = paste(outdir_hmdb, paste(scanmode, paste("hmdb",i-1,"RData", sep="."), sep="_"), sep = "/"))
+          check <- check + dim(outlist_part)[1]
+        
+          outlist_part <- outlist_i
+          min_1_last <- dim(outlist_part)[1]
+        
+        } else {
+          outlist_part <- outlist[c(start:end),]
+        }
+      }
+    }
+  
+    start <- end + 1
+    end <- n
+    outlist_i <- outlist[c(start:end),]
+    n_moved <- 0
+  
+    if (!is.null(outlist_part)) {
+      # Calculate 3ppm and replace border, avoid cut within peakgroup!
+      while ((as.numeric(outlist_i[1,label]) - as.numeric(outlist_part[min_1_last,label]))*1e+06/as.numeric(outlist_i[1,label]) < ppm) {
+        outlist_part = rbind(outlist_part, outlist_i[1,])
+        outlist_i = outlist_i[-1,]
+        n_moved = n_moved + 1
+      }
+    
+      # message(paste("Process", i+1-1,":", dim(outlist_part)[1]))
+      save(outlist_part, file = paste(scanmode, paste("hmdb", i, "RData", sep = "."), sep = "_"))
+      check <- check + dim(outlist_part)[1]
+    }
+  
+    outlist_part <- outlist_i
+    # message(paste("Process", i+2-1,":", dim(outlist_part)[1]))
+    save(outlist_part, file = paste(scanmode, paste("hmdb", i + 1, "RData", sep="."), sep="_"))
+    check <- check + dim(outlist_part)[1]
+    cat("\n", "Check", check == dim(outlist)[1])
+  
+  }
+}
diff --git a/DIMS/HMDBparts.nf b/DIMS/HMDBparts.nf
new file mode 100644
index 0000000..8ed9130
--- /dev/null
+++ b/DIMS/HMDBparts.nf
@@ -0,0 +1,20 @@
+process HMDBparts {
+    // Custom process to cut HMDB db into parts
+    tag {"DIMS HMDBparts"}
+    label 'HMDBparts'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+        path(hmdb_db_file)
+        path(breaks_file)
+
+    output:
+        path('*.RData')
+
+    script:
+
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file
+        """
+}
diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf
index 064225c..4908047 100644
--- a/DIMS/MakeInit.nf
+++ b/DIMS/MakeInit.nf
@@ -7,7 +7,7 @@ process MakeInit {
        tuple(path(samplesheet), val(nr_replicates))
 
     output:
-       file 'init.RData' 
+       path 'init.RData'
 
     script:
         """
diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R
new file mode 100644
index 0000000..4820ef5
--- /dev/null
+++ b/DIMS/PeakFinding.R
@@ -0,0 +1,68 @@
+#!/usr/bin/Rscript
+
+# load required packages 
+# none 
+
+# define parameters 
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) cat("  ", arg, "\n", sep="")
+
+filepath <- cmd_args[1]
+resol <- as.numeric(cmd_args[2])
+scripts <- cmd_args[3]
+breaks_filepath <- cmd_args[4] # location of breaks.fwhm.RData
+
+thresh <- 2000
+
+# if file extension is .txt, do nothing.
+if (grepl(".txt$", filepath) { stop("not run on txt file") }
+
+# create output folder
+dir.create(paste(outdir, "4-specpks", sep="/"),showWarnings = F)
+
+# load in function scripts
+source(paste(scripts, "AddOnFunctions/sourceDir.R", sep="/"))
+sourceDir(paste(scripts, "AddOnFunctions", sep="/"))
+
+# load(paste(outdir, "breaks.fwhm.RData", sep="/"))
+load(breaks_filepath)
+load(filepath)
+if (grepl("_pos", filepath)) { scanmode = "positive" } else
+  if (grepl("_neg", filepath)) { scanmode = "negative" }
+
+options(digits=16)
+int.factor=1*10^5 # Number of x used to calc area under Gaussian (is not analytic)
+scale=2 # Initial value used to estimate scaling parameter
+width=1024
+height=768
+
+### fit Gaussian estimate mean and integrate to obtain intensity
+findPeaks.Gauss.HPC <- function(plist, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, plot, thresh, width, height) {
+  sampname <- colnames(plist)[1]
+  
+  range = as.vector(plist)
+  names(range) = rownames(plist)
+  
+  values = list("mean"=NULL,"area"=NULL,"nr"=NULL,"min"=NULL,"max"=NULL,"qual"=NULL,"spikes"=0)
+  
+  values = searchMZRange(range,values,int.factor,scale,resol,outdir,sampname,scanmode,plot,width,height,thresh)
+  
+  outlist.persample=NULL
+  outlist.persample=cbind("samplenr"=values$nr, "mzmed.pkt"=values$mean, "fq"=values$qual, "mzmin.pkt"=values$min, "mzmax.pkt"=values$max, "height.pkt"=values$area)
+
+  index=which(outlist.persample[,"height.pkt"]==0)
+  if (length(index)>0) {
+    outlist.persample=outlist.persample[-index,]
+  }
+  
+  save(outlist.persample, file=paste(outdir, "4-specpks", paste(sampname, "_", scanmode, ".RData", sep=""), sep="/"))
+  
+  cat(paste("There were", values$spikes, "spikes!"))
+}
+
+
+if (scanmode == "negative") {
+  findPeaks.Gauss.HPC(sum_neg, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height)
+} else {
+  findPeaks.Gauss.HPC(sum_pos, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height)
+}
diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf
new file mode 100644
index 0000000..fb86239
--- /dev/null
+++ b/DIMS/PeakFinding.nf
@@ -0,0 +1,23 @@
+process PeakFinding {
+    label 'PeakFinding'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(RData_file)
+       val(resolution)
+       path(scripts)
+       path(breaks_file)
+
+    output:
+       path '*_neg.RData'
+       path '*_pos.RData'
+       path 'repl.pattern.*.RData', emit: pattern
+       path 'miss_infusions_neg.txt'
+       path 'miss_infusions_pos.txt'
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $RData_file $resolution $scripts $breaks_file
+        """
+}
diff --git a/DIMS/PeakGroupingIdentified.R b/DIMS/PeakGroupingIdentified.R
new file mode 100644
index 0000000..7c8d998
--- /dev/null
+++ b/DIMS/PeakGroupingIdentified.R
@@ -0,0 +1,180 @@
+#!/usr/bin/Rscript
+
+# load required packages 
+# none 
+
+# define parameters 
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) cat("  ", arg, "\n", sep="")
+
+SpecPeaks_file <- cmd_args[1]
+#outdir <- cmd_args[2]
+#scanmode <- cmd_args[3]
+HMDB_part_file <- cmd_args[2]
+pattern_file <- cmd_args[3]
+resol <- as.numeric(cmd_args[4])
+ppm <- as.numeric(cmd_args[5])
+
+options(digits=16)
+
+cat(paste("File to group:", fileIn))
+
+
+options(digits=16)
+load(HMDB_part_file) # outlist_part(HMDB_add_iso)
+HMDB_add_iso = outlist_part
+
+# Windows and Unix-like
+batch = strsplit(fileIn, "/",fixed = TRUE)[[1]]
+batch = batch[length(batch)] 
+batch = strsplit(batch, ".",fixed = TRUE)[[1]][2]
+
+load(SpecPeaks_file)
+outlist.copy = outlist.tot
+rm(outlist.tot)
+
+load(pattern_file)
+load("./breaks.fwhm.RData")
+
+outpgrlist.identified = NULL
+
+if (scanmode=="negative"){
+  label = "MNeg"
+} else {
+  label = "Mpos"
+}
+
+outlist.grouped <- NULL
+
+# First group on HMDB masses
+while (dim(HMDB_add_iso)[1] > 0) { 
+  index = 1
+  # message(HMDB_add_iso[index,"CompoundName"])
+  
+  mass = as.numeric(HMDB_add_iso[index,label])
+  mtol = (mass*ppm)/10^6
+  
+  mzmed = as.numeric(outlist.copy[,"mzmed.pkt"])
+  selp = which((mzmed > (mass - mtol)) & (mzmed < (mass + mtol)))
+  tmplist = outlist.copy[selp,,drop=FALSE]
+  outlist.grouped <- rbind(outlist.grouped, tmplist)
+  
+  nrsamples = length(selp)
+  if (nrsamples > 0) {
+    # message(paste("Bingo ",n))
+    
+    mzmed.pgrp = mean(as.numeric(outlist.copy[selp, "mzmed.pkt"]))
+    # mzmed.pgrp = median(as.numeric(outlist.copy[selp, "mzmed.pkt"]))
+    mzmin.pgrp = mass - mtol
+    mzmax.pgrp = mass + mtol
+    
+    fq.worst.pgrp = as.numeric(max(outlist.copy[selp, "fq"]))
+    fq.best.pgrp = as.numeric(min(outlist.copy[selp, "fq"]))
+    ints.allsamps = rep(0, length(names(repl.pattern.filtered)))
+    names(ints.allsamps) = names(repl.pattern.filtered) # same order as sample list!!!
+    
+    # # Check for each sample if multiple peaks exists, if so take the sum!
+    labels=unique(tmplist[,"samplenr"])
+    ints.allsamps[labels] = as.vector(unlist(lapply(labels, function(x) {sum(as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"]))})))
+    # ints.allsamps[labels] = as.vector(unlist(lapply(labels, function(x) {as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"])})))
+    
+    # Identification
+    # assi_HMDB = iso_HMDB = HMDB_code = ""
+    assi_HMDB = iso_HMDB = HMDB_code = NA
+    tmplist.mass.iso = tmplist.mass.adduct = NULL
+    # index = which(HMDB_add_iso[,label]==mass)
+    
+    # Consider groups off 4ppm
+    mass.all = as.numeric(HMDB_add_iso[,label])
+    index = which((mass.all > (mass - mtol)) & (mass.all < (mass + mtol)))
+    # index = which(mass.all < (mass + 2*mtol))
+    tmplist.mass = HMDB_add_iso[index,,drop=FALSE]
+    
+    if (dim(tmplist.mass)[1]>0) {
+      
+      index.iso = grep(" iso ", tmplist.mass[, "CompoundName"], fixed = TRUE)
+      if (length(index.iso)>0){
+        tmplist.mass.iso = tmplist.mass[index.iso,,drop=FALSE]
+        tmplist.mass = tmplist.mass[-index.iso,,drop=FALSE]
+      }
+      
+      if (dim(tmplist.mass)[1]>0) {        
+        index.adduct = grep(" [M", tmplist.mass[, "CompoundName"], fixed = TRUE)
+        if (length(index.adduct)>0){
+          tmplist.mass.adduct = tmplist.mass[index.adduct,,drop=FALSE]
+          tmplist.mass = tmplist.mass[-index.adduct,,drop=FALSE]
+        }  
+      }  
+      
+      # First compouds without adducts or isotopes
+      if (dim(tmplist.mass)[1]>0) {
+        
+        # pure compounds
+        assi_HMDB = as.character(paste(as.character(tmplist.mass[, "CompoundName"]), collapse = ";"))
+        HMDB_code = as.character(paste(as.character(rownames(tmplist.mass)), collapse = ";"))
+        theormz_HMDB = as.numeric(tmplist.mass[1,label])
+        
+        # adducts
+        if (!is.null(tmplist.mass.adduct)) {
+          if (dim(tmplist.mass.adduct)[1]>0) {
+            if (is.na(assi_HMDB)){
+              assi_HMDB = as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";"))
+              HMDB_code = as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";"))
+            } else {
+              assi_HMDB = paste(assi_HMDB, as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")), sep = ";")
+              HMDB_code = paste(HMDB_code, as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")), sep = ";")
+            }}}
+        
+        # isotopes
+        if (!is.null(tmplist.mass.iso)) {
+          if (dim(tmplist.mass.iso)[1]>0) {
+            iso_HMDB = as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";"))
+          }}
+        
+        # No pure compounts  
+      } else if (!is.null(tmplist.mass.adduct)) {
+        
+        theormz_HMDB = as.numeric(tmplist.mass.adduct[1,label])
+        
+        # adducts
+        if (!is.null(tmplist.mass.adduct)) {
+          if (dim(tmplist.mass.adduct)[1]>0) {
+            if (is.na(assi_HMDB)){
+              assi_HMDB = as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";"))
+              HMDB_code = as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";"))
+            } else {
+              assi_HMDB = paste(assi_HMDB, as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")), sep = ";")
+              HMDB_code = paste(HMDB_code, as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")), sep = ";")
+            }}}
+        
+        # isotopes
+        if (!is.null(tmplist.mass.iso)) {
+          if (dim(tmplist.mass.iso)[1]>0) {
+            iso_HMDB = as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";"))
+          }}
+        
+        # only isotopes  
+      } else if (!is.null(tmplist.mass.iso)) {
+        
+        if (dim(tmplist.mass.iso)[1]>0) {
+          theormz_HMDB = as.numeric(tmplist.mass.iso[1,label])
+          iso_HMDB = as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";"))
+        }
+      }  
+      
+    }
+    
+    outpgrlist.identified = rbind(outpgrlist.identified, cbind(data.frame(mzmed.pgrp, "fq.best"=fq.best.pgrp, "fq.worst"=fq.worst.pgrp, nrsamples, mzmin.pgrp, mzmax.pgrp),
+                                         t(as.matrix(ints.allsamps)),
+                                         data.frame(assi_HMDB, iso_HMDB, HMDB_code, theormz_HMDB)))
+  }
+  
+  HMDB_add_iso = HMDB_add_iso[-index,]
+  
+}
+
+#dir.create(paste(outdir, "6-grouping_hmdb", sep="/"), showWarnings = FALSE)
+save(outpgrlist.identified, file=paste(paste(outdir, "6-grouping_hmdb", sep="/"), paste(paste(batch, scanmode, sep="_"), "RData", sep="."), sep="/"))
+
+#dir.create(paste(outdir, "6-grouping_hmdb_done", sep="/"), showWarnings = FALSE)
+save(outlist.grouped, file=paste(paste(outdir, "6-grouping_hmdb_grouped", sep="/"), paste(paste(batch, scanmode, sep="_"), "RData", sep="."), sep="/"))
diff --git a/DIMS/PeakGroupingIdentified.nf b/DIMS/PeakGroupingIdentified.nf
new file mode 100644
index 0000000..8100780
--- /dev/null
+++ b/DIMS/PeakGroupingIdentified.nf
@@ -0,0 +1,18 @@
+process PeakGroupingIdentified {
+    label 'PeakGroupingIdentified'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(HMDBpart_file)
+       val(resolution)
+
+    output:
+       path 'negative.RData'
+       path 'positive.RData'
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/PeakGroupingIdentified.R $HMDBpart_file $resolution $ppm
+        """
+}
diff --git a/DIMS/SpectrumPeakFinding.R b/DIMS/SpectrumPeakFinding.R
new file mode 100644
index 0000000..f85d9c5
--- /dev/null
+++ b/DIMS/SpectrumPeakFinding.R
@@ -0,0 +1,77 @@
+#!/usr/bin/Rscript
+
+# load required packages 
+# none 
+
+# define parameters 
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) cat("  ", arg, "\n")
+
+# outdir <- cmd_args[1]
+# scanmode <- cmd_args[2]
+scanmodes <- c("positive", "negative")
+
+# Check if all jobs terminated correct!
+notRun = NULL
+
+# new: run for each scanmode
+for (scanmode in scanmodes) {
+  load(paste0("./repl.pattern.", scanmode, ".RData"))
+  groupNames = names(repl.pattern.filtered)
+
+  indir <- "./"
+  object.files = list.files(indir, full.names=TRUE, pattern="*.RData")
+
+  for (i in 1:length(groupNames)) {
+    group <- paste0(indir, "/", paste0(paste(groupNames[i], scanmode, sep = "_"), ".RData"))
+    if (!(group %in% object.files)) {
+      notRun = c(notRun, group)
+    }
+  }
+
+    #if (is.null(notRun)){
+    cat("\nCollecting samples!")
+
+    # negative
+    filepath <- "./")
+    files <- list.files(filepath, recursive=TRUE, full.names=TRUE, pattern=paste("*_", scanmode, ".RData",sep=""))
+
+    outlist.tot=NULL
+    for (i in 1:length(files)) {
+
+      cat("\n", files[i])
+      load(files[i])
+
+      if (is.null(outlist.persample) || (dim(outlist.persample)[1]==0)){
+        tmp=strsplit(files[i], "/")[[1]]
+        fname = tmp[length(tmp)]
+        #fname = strsplit(files[i], "/")[[1]][8]
+        fname = strsplit(fname, ".RData")[[1]][1]
+        fname = substr(fname, 13, nchar(fname))
+
+        if (i == 1) { outlist.tot <- c(fname, rep("-1",5)) } else { outlist.tot <- rbind(outlist.tot, c(fname, rep("-1",5)))}
+      } else {
+        if (i == 1) { outlist.tot <- outlist.persample } else { outlist.tot <- rbind(outlist.tot, outlist.persample)}
+      }
+    }
+
+    # remove negative values
+    index=which(outlist.tot[,"height.pkt"]<=0)
+    if (length(index)>0) outlist.tot = outlist.tot[-index,]
+    index=which(outlist.tot[,"mzmed.pkt"]<=0)
+    if (length(index)>0) outlist.tot = outlist.tot[-index,]
+
+    # outdir_specpks <- paste(outdir, "5-specpks_all", sep = "/")
+    # dir.create(outdir_specpks, showWarnings = F)
+    # save(outlist.tot, file = paste(outdir_specpks, paste(scanmode, "RData", sep = "."), sep = "/"))
+    save(outlist.tot, file = paste(scanmode, "RData", sep = ".")
+
+
+    if (!is.null(notRun)){
+      for (i in 1:length(notRun)){
+        message(paste(notRun[i], "was not generated"))
+      }
+    }
+
+}
+
diff --git a/DIMS/SpectrumPeakFinding.nf b/DIMS/SpectrumPeakFinding.nf
new file mode 100644
index 0000000..66e04a8
--- /dev/null
+++ b/DIMS/SpectrumPeakFinding.nf
@@ -0,0 +1,17 @@
+process SpectrumPeakFinding {
+    label 'SpectrumPeakFinding'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(RData_file)
+
+    output:
+       path 'negative.RData'
+       path 'positive.RData'
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/SpectrumPeakFinding.R $RData_file
+        """
+}

From 5461f865ab93e04852f7c791d79d2c0133fd5355 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 11 Aug 2023 15:23:30 +0200
Subject: [PATCH 06/73] added ThermoRawFileParser.nf to CustomModules

---
 DIMS/ThermoRawFileParser.nf | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)
 create mode 100644 DIMS/ThermoRawFileParser.nf

diff --git a/DIMS/ThermoRawFileParser.nf b/DIMS/ThermoRawFileParser.nf
new file mode 100644
index 0000000..af86aa3
--- /dev/null
+++ b/DIMS/ThermoRawFileParser.nf
@@ -0,0 +1,19 @@
+process ConvertRawFile {
+    // Custom process to convert raw file to mzML format
+    tag {"DIMS ConvertRawFile ${file_id}"}
+    label 'ThermoRawFileParser_1_1_11'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+        tuple(file_id, path(raw_file))
+
+    output:
+        tuple(file_id, path("${file_id}.mzML"))
+
+    script:
+
+        """
+        source /hpc/dbg_mz/tools/mono/etc/profile
+        mono /hpc/dbg_mz/tools/ThermoRawFileParser_1.1.11/ThermoRawFileParser.exe -i=${raw_file} --output=./
+        """
+}

From 0d8f0f0a5e72f69685dde4eb2551b7291e13ce33 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Mon, 14 Aug 2023 13:55:37 +0200
Subject: [PATCH 07/73] added file for listing of raw files

---
 DIMS/Utils/RawFiles.nf | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 DIMS/Utils/RawFiles.nf

diff --git a/DIMS/Utils/RawFiles.nf b/DIMS/Utils/RawFiles.nf
new file mode 100644
index 0000000..8cc0efe
--- /dev/null
+++ b/DIMS/Utils/RawFiles.nf
@@ -0,0 +1,12 @@
+def extractRawfilesFromDir(dir) {
+    // Original code from: https://github.com/SciLifeLab/Sarek - MIT License - Copyright (c) 2016 SciLifeLab
+    dir = dir.tokenize().collect{"$it/*.raw"}
+    Channel
+    .fromPath(dir, type:'file')
+    .ifEmpty { error "No raw files found in ${dir}." }
+    .map { rawfiles_path ->
+        def file_id = rawfiles_path.getSimpleName()
+        [file_id, rawfiles_path]
+    }
+}
+

From 4c29eb8818fd41cd9da0d3061bf6878d1d6459e4 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Mon, 21 Aug 2023 16:17:05 +0200
Subject: [PATCH 08/73] new files added, existing files modified

---
 DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R |  24 +++
 DIMS/AddOnFunctions/removeFromRepl.pat.R  |  31 ++++
 DIMS/AssignToBins.R                       |  31 ++--
 DIMS/AssignToBins.nf                      |   3 +-
 DIMS/AverageTechReplicates.R              |  48 +++---
 DIMS/AverageTechReplicates.nf             |  12 +-
 DIMS/GenerateBreaks.R                     |  22 ++-
 DIMS/HMDBparts.R                          | 181 +++++++++++-----------
 DIMS/HMDBparts.nf                         |   5 +-
 DIMS/HMDBparts_old.R                      | 126 +++++++++++++++
 DIMS/MakeInit.R                           |  15 +-
 DIMS/PeakFinding.R                        | 100 ++++++------
 DIMS/PeakFinding.nf                       |  15 +-
 DIMS/PeakGroupingIdentified.R             |   2 +-
 DIMS/PeakGroupingIdentified.nf            |   9 +-
 DIMS/SpectrumPeakFinding.R                | 117 +++++++-------
 DIMS/SpectrumPeakFinding.nf               |   3 +-
 17 files changed, 464 insertions(+), 280 deletions(-)
 create mode 100644 DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R
 create mode 100644 DIMS/AddOnFunctions/removeFromRepl.pat.R
 create mode 100644 DIMS/HMDBparts_old.R

diff --git a/DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R b/DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R
new file mode 100644
index 0000000..c00aa92
--- /dev/null
+++ b/DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R
@@ -0,0 +1,24 @@
+### fit Gaussian estimate mean and integrate to obtain intensity
+findPeaks.Gauss.HPC <- function(plist, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, plot, thresh, width, height) {
+  sampname <- colnames(plist)[1]
+  
+  range <- as.vector(plist)
+  names(range) <- rownames(plist)
+  
+  values <- list("mean"=NULL, "area"=NULL, "nr"=NULL, "min"=NULL, "max"=NULL, "qual"=NULL, "spikes"=0)
+  
+  values <- searchMZRange(range, values, int.factor, scale, resol, outdir, sampname, scanmode, plot, width, height, thresh)
+  
+  outlist.persample <- NULL
+  outlist.persample <- cbind("samplenr"=values$nr, "mzmed.pkt"=values$mean, "fq"=values$qual, "mzmin.pkt"=values$min, "mzmax.pkt"=values$max, "height.pkt"=values$area)
+
+  index <- which(outlist.persample[ ,"height.pkt"]==0)
+  if (length(index) > 0) {
+    outlist.persample <- outlist.persample[-index,]
+  }
+  
+  # save(outlist.persample, file=paste(outdir, paste(sampname, "_", scanmode, ".RData", sep=""), sep="/"))
+  save(outlist.persample, file=paste("./", sampname, "_", scanmode, ".RData", sep=""))
+  
+  cat(paste("There were", values$spikes, "spikes!"))
+}
diff --git a/DIMS/AddOnFunctions/removeFromRepl.pat.R b/DIMS/AddOnFunctions/removeFromRepl.pat.R
new file mode 100644
index 0000000..ad5b1c8
--- /dev/null
+++ b/DIMS/AddOnFunctions/removeFromRepl.pat.R
@@ -0,0 +1,31 @@
+removeFromRepl.pat <- function(bad_samples, repl_pattern, nr_replicates) {
+
+  tmp = repl_pattern
+
+  removeFromGroup=NULL
+
+  for (i in 1:length(tmp)){
+    tmp2 = repl_pattern[[i]]
+
+    remove=NULL
+
+    for (j in 1:length(tmp2)){
+      if (tmp2[j] %in% bad_samples){
+        #cat(tmp2[j])
+        #cat(paste("remove",tmp2[j]))
+        #cat(paste("remove i",i))
+        #cat(paste("remove j",j))
+        remove = c(remove, j)
+      }
+    }
+
+    if (length(remove)==nr_replicates) removeFromGroup=c(removeFromGroup,i)
+    if (!is.null(remove)) repl_pattern[[i]]=repl_pattern[[i]][-remove]
+  }
+
+  if (length(removeFromGroup)!=0) {
+    repl_pattern=repl_pattern[-removeFromGroup]
+  }
+
+  return(list("pattern"=repl_pattern))
+}
diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R
index a218c4d..c6fdaba 100644
--- a/DIMS/AssignToBins.R
+++ b/DIMS/AssignToBins.R
@@ -1,8 +1,6 @@
 #!/usr/bin/Rscript
 ## adapted from 2-DIMS.R
 
-#.libPaths(new = "/hpc/local/CentOS7/dbg_mz/R_libs/3.6.2")
-
 # load required packages 
 suppressPackageStartupMessages(library("xcms"))
 
@@ -20,10 +18,7 @@ print(breaks_filepath)
 print(resol)
 
 sampname <- sub('\\..*$', '', basename(filepath))
-#cat(paste0("\n", sampname))
-# sampname <- "AssignToBins"
 
-#suppressPackageStartupMessages(library("Cairo"))
 options(digits=16)
 
 ### process one sample at a time and find peaks FOR BOTH SCAN MODES! #
@@ -32,11 +27,11 @@ scale=2 # Initial value used to estimate scaling parameter
 width=1024
 height=768
 
-# Aggregate
+# Initiate  
 trimLeft=NULL
 trimRight=NULL
-breaks.fwhm=NULL
-breaks.fwhm.avg=NULL
+breaks_fwhm=NULL
+breaks_fwhm_avg=NULL
 bins=NULL
 pos_results=NULL
 neg_results=NULL
@@ -44,11 +39,11 @@ neg_results=NULL
 # read in the data for 1 sample
 raw_data <- suppressMessages(xcmsRaw(filepath))
 
-# load breaks.fwhm
+# load breaks_fwhm
 load(breaks_filepath)
 
 # Create empty placeholders for later use
-bins <- rep(0, length(breaks.fwhm) - 1)
+bins <- rep(0, length(breaks_fwhm) - 1)
 pos_bins <- bins
 neg_bins <- bins
 
@@ -70,8 +65,8 @@ pos_raw_data_matrix <- raw_data_matrix[pos_index, ]
 neg_raw_data_matrix <- raw_data_matrix[neg_index, ]
 
 # Get index for binning intensity values
-bin_indices_pos <- cut(pos_raw_data_matrix[ ,"mz"], breaks.fwhm, include.lowest=TRUE, right=TRUE, labels=FALSE)
-bin_indices_neg <- cut(neg_raw_data_matrix[ ,"mz"], breaks.fwhm, include.lowest=TRUE, right=TRUE, labels=FALSE)
+bin_indices_pos <- cut(pos_raw_data_matrix[ ,"mz"], breaks_fwhm, include.lowest=TRUE, right=TRUE, labels=FALSE)
+bin_indices_neg <- cut(neg_raw_data_matrix[ ,"mz"], breaks_fwhm, include.lowest=TRUE, right=TRUE, labels=FALSE)
 
 # Get the list of intensity values for each bin, and add the
 # intensity values which are in the same bin
@@ -109,19 +104,19 @@ neg_results_transpose = t(neg_results)
 rownames(pos_results_transpose) = sampname
 rownames(neg_results_transpose) = sampname
 
-# delete the last value of breaks.fwhm.avg to match dimensions of pos_results and neg_results
-breaks.fwhm.avg.minus1 <- breaks.fwhm.avg[-length(breaks.fwhm.avg)]
+# delete the last value of breaks_fwhm_avg to match dimensions of pos_results and neg_results
+breaks_fwhm_avg_minus1 <- breaks_fwhm_avg[-length(breaks_fwhm_avg)]
 # Format as string and show precision of float to 5 digits
-breaks.fwhm.avg.minus1 <- sprintf("%.5f", breaks.fwhm.avg.minus1)
+breaks_fwhm_avg_minus1 <- sprintf("%.5f", breaks_fwhm_avg_minus1)
 
 # Use this as the column names
-colnames(pos_results_transpose) <- breaks.fwhm.avg.minus1
-colnames(neg_results_transpose) <- breaks.fwhm.avg.minus1
+colnames(pos_results_transpose) <- breaks_fwhm_avg_minus1
+colnames(neg_results_transpose) <- breaks_fwhm_avg_minus1
 
 # transpose back
 pos_results_final <- t(pos_results_transpose)
 neg_results_final <- t(neg_results_transpose)
 
-pklist <- list("pos"=pos_results_final, "neg"=neg_results_final, "breaksFwhm"=breaks.fwhm)
+pklist <- list("pos"=pos_results_final, "neg"=neg_results_final, "breaksFwhm"=breaks_fwhm)
 
 save(pklist, file=paste("./", sampname, ".RData", sep=""))
diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf
index 0bb7855..c7103cd 100644
--- a/DIMS/AssignToBins.nf
+++ b/DIMS/AssignToBins.nf
@@ -5,14 +5,13 @@ process AssignToBins {
 
     input:
        tuple(file_id, path(mzML_filename), path(breaks_file))
-       val(resolution)
 
     output:
        path("${file_id}.RData")
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_filename $breaks_file $resolution
+        Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_filename $breaks_file $params.resolution
         """
 }
 
diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R
index 3bfc484..4303a99 100644
--- a/DIMS/AverageTechReplicates.R
+++ b/DIMS/AverageTechReplicates.R
@@ -1,8 +1,8 @@
 #!/usr/bin/Rscript
 # adapted from 3-AverageTechReplicates.R
 
-# load required packages 
-# none 
+# load required functions 
+# source(paste(scripts_dir, "AddOnFunctions/removeFromRepl.patR", sep=""))
 
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
@@ -13,15 +13,15 @@ nr_replicates <- as.numeric(cmd_args[2])
 thresh2remove <- 2000
 dimsThresh <- 100
 
-removeFromRepl.pat <- function(bad_samples, repl.pattern, nr_replicates) {
+removeFromRepl.pat <- function(bad_samples, repl_pattern, nr_replicates) {
   # bad_samples=remove_pos
   
-  tmp = repl.pattern
+  tmp = repl_pattern
   
   removeFromGroup=NULL
   
   for (i in 1:length(tmp)){
-    tmp2 = repl.pattern[[i]]
+    tmp2 = repl_pattern[[i]]
     
     remove=NULL
     
@@ -36,33 +36,33 @@ removeFromRepl.pat <- function(bad_samples, repl.pattern, nr_replicates) {
     }
     
     if (length(remove)==nr_replicates) removeFromGroup=c(removeFromGroup,i)
-    if (!is.null(remove)) repl.pattern[[i]]=repl.pattern[[i]][-remove]
+    if (!is.null(remove)) repl_pattern[[i]]=repl_pattern[[i]][-remove]
   }
   
   if (length(removeFromGroup)!=0) {
-    repl.pattern=repl.pattern[-removeFromGroup]
+    repl_pattern=repl_pattern[-removeFromGroup]
   }
   
-  return(list("pattern"=repl.pattern))
+  return(list("pattern"=repl_pattern))
 }
 
 
-# get repl.pattern
+# get repl_pattern
 load("./init.RData")
 
 remove_neg=NULL
 remove_pos=NULL
 cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n")
-for (i in 1:length(repl.pattern)) {
+for (i in 1:length(repl_pattern)) {
   techRepsArray.pos = NULL
   techRepsArray.neg = NULL
   
-  tech_reps = as.vector(unlist(repl.pattern[i]))
+  tech_reps = as.vector(unlist(repl_pattern[i]))
   sum_neg=0
   sum_pos=0
   n_pos=0
   n_neg=0
-  cat("\n\nNow sample ", i, " from replication pattern with length ", length(repl.pattern))
+  cat("\n\nNow sample ", i, " from replication pattern with length ", length(repl_pattern))
   for (j in 1:length(tech_reps)) {
     load(paste("./", tech_reps[j], ".RData", sep=""))
     cat("\n\nParsing", tech_reps[j])
@@ -96,24 +96,24 @@ for (i in 1:length(repl.pattern)) {
   
   if (n_neg != 0){
     sum_neg[,1] <- sum_neg[,1]/n_neg
-    colnames(sum_neg) <- names(repl.pattern)[i]
-    save(sum_neg, file=paste("./", names(repl.pattern)[i], "_neg.RData", sep=""))
+    colnames(sum_neg) <- names(repl_pattern)[i]
+    save(sum_neg, file=paste("./", names(repl_pattern)[i], "_neg_avg.RData", sep=""))
   }
   if (n_pos != 0) {
     sum_pos[,1] <- sum_pos[,1]/n_pos
-    colnames(sum_pos) <- names(repl.pattern)[i]
-    save(sum_pos, file=paste("./", names(repl.pattern)[i], "_pos.RData", sep=""))
+    colnames(sum_pos) <- names(repl_pattern)[i]
+    save(sum_pos, file=paste("./", names(repl_pattern)[i], "_pos_avg.RData", sep=""))
   }
 }
 
-retVal <- removeFromRepl.pat(remove_neg, repl.pattern, nr_replicates)
-repl.pattern.filtered <- retVal$pattern
-save(repl.pattern.filtered, file="./repl_pattern_negative.RData")
-write.table(remove_neg, file="./miss_infusions_neg.txt", row.names=FALSE, col.names=FALSE ,sep= "\t")
+retVal <- removeFromRepl.pat(remove_neg, repl_pattern, nr_replicates)
+repl_pattern_filtered <- retVal$pattern
+save(repl_pattern_filtered, file="./negative_repl_pattern.RData")
+write.table(remove_neg, file="./miss_infusions_negative.txt", row.names=FALSE, col.names=FALSE ,sep= "\t")
 
-retVal <- removeFromRepl.pat(remove_pos, repl.pattern, nr_replicates)
-repl.pattern.filtered <- retVal$pattern
-save(repl.pattern.filtered, file="./repl_pattern_positive.RData")
-write.table(remove_pos, file="./miss_infusions_pos.txt", row.names=FALSE, col.names=FALSE ,sep= "\t")
+retVal <- removeFromRepl.pat(remove_pos, repl_pattern, nr_replicates)
+repl_pattern_filtered <- retVal$pattern
+save(repl_pattern_filtered, file="./positive_repl_pattern.RData")
+write.table(remove_pos, file="./miss_infusions_positive.txt", row.names=FALSE, col.names=FALSE ,sep= "\t")
 
 
diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf
index aa3b99d..26f9211 100644
--- a/DIMS/AverageTechReplicates.nf
+++ b/DIMS/AverageTechReplicates.nf
@@ -6,18 +6,16 @@ process AverageTechReplicates {
     input:
        path(RData_file)
        path(init_filepath)
-       val(nr_replicates)
 
     output:
-       path '*_*.RData', emit: binned
-       path 'repl.pattern.negative.RData'
-       path 'repl.pattern.positive.RData'
-       path 'miss_infusions_neg.txt'
-       path 'miss_infusions_pos.txt'
+       path '*_repl_pattern.RData', emit: patterns
+       path '*_avg.RData', emit: binned
+       path 'miss_infusions_negative.txt'
+       path 'miss_infusions_positive.txt'
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_filepath $nr_replicates
+        Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_filepath $params.nr_replicates
         """
 }
 
diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R
index 22b80c9..285da0d 100644
--- a/DIMS/GenerateBreaks.R
+++ b/DIMS/GenerateBreaks.R
@@ -2,8 +2,6 @@
 ## adapted from 1-generateBreaksFwhm.HPC.R ##
 #!/usr/bin/Rscript
 
-# .libPaths(new = "/hpc/local/CentOS7/dbg_mz/R_libs/3.2.2")
-
 # load required package 
 suppressPackageStartupMessages(library("xcms"))
 
@@ -19,8 +17,8 @@ resol    <- as.numeric(cmd_args[4]) # 140000
 # initialize
 trimLeft        = NULL
 trimRight       = NULL
-breaks.fwhm     = NULL
-breaks.fwhm.avg = NULL
+breaks_fwhm     = NULL
+breaks_fwhm_avg = NULL
 bins            = NULL
 posRes          = NULL
 negRes          = NULL
@@ -28,7 +26,7 @@ negRes          = NULL
 # read in mzML file
 raw_data <- suppressMessages(xcmsRaw(filepath))
 
-# trim scans at the start and end
+# trim (remove) scans at the start and end
 trimLeft  = round(raw_data@scantime[length(raw_data@scantime)*trim])
 trimRight = round(raw_data@scantime[length(raw_data@scantime)*(1-trim)])
 
@@ -44,16 +42,16 @@ segment  = seq(from=lowMZ, to=highMZ, length.out=nsegment+1)
 for (i in 1:nsegment) {
   startsegm   <- segment[i]
   endsegm     <- segment[i+1]
-  resol.mz    <- resol*(1/sqrt(2)^(log2(startsegm/200)))
-  fwhmsegm    <- startsegm/resol.mz
-  breaks.fwhm <- c(breaks.fwhm, seq(from=(startsegm + fwhmsegm), to=endsegm, by=0.2*fwhmsegm))
+  resol_mz    <- resol*(1/sqrt(2)^(log2(startsegm/200)))
+  fwhmsegm    <- startsegm/resol_mz
+  breaks_fwhm <- c(breaks_fwhm, seq(from=(startsegm + fwhmsegm), to=endsegm, by=0.2*fwhmsegm))
   # average the m/z instead of start value
   range = seq(from=(startsegm + fwhmsegm), to=endsegm, by=0.2*fwhmsegm)
   deltaMZ = range[2] - range[1]
-  breaks.fwhm.avg <- c(breaks.fwhm.avg, range + 0.5*deltaMZ)
+  breaks_fwhm_avg <- c(breaks_fwhm_avg, range + 0.5*deltaMZ)
 }
 
-save(breaks.fwhm, breaks.fwhm.avg, trimLeft, trimRight, file=paste(outdir, "breaks.fwhm.RData", sep="/"))
-# temporary fix for breaks file:
-# save(breaks.fwhm, breaks.fwhm.avg, trimLeft, trimRight, file=paste(outdir/../, "breaks.fwhm.RData", sep="/"))
+# remove one of these:
+# save(breaks_fwhm, breaks_fwhm_avg, trimLeft, trimRight, file=paste(outdir, "breaks.fwhm.RData", sep="/"))
+save(breaks_fwhm, breaks_fwhm_avg, trimLeft, trimRight, file="./breaks.fwhm.RData")
 
diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R
index 10fdc37..6fb1816 100644
--- a/DIMS/HMDBparts.R
+++ b/DIMS/HMDBparts.R
@@ -1,7 +1,5 @@
 #!/usr/bin/Rscript
-
-# load required packages 
-# none 
+# adapted from hmdb_parts.R
 
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
@@ -11,116 +9,123 @@ for (arg in cmd_args) cat("  ", arg, "\n")
 # scanmode <- cmd_args[2]
 db_path <- cmd_args[1] # location of HMDB db file
 breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData
-standard_run  <- "no" # cmd_args[5] # "yes"
+standard_run  <- cmd_args[3] # "yes"
 
 # Cut up entire HMDB into small parts based on the new binning/breaks 
 
-# load(paste(outdir, "breaks.fwhm.RData", sep = "/"))
 load(breaks_filepath)
-#outdir_hmdb <- paste(outdir, "hmdb_part", sep = "/")
-#dir.create(outdir_hmdb, showWarnings = FALSE)
 
-# New: in case of a standard run (m/z 69-606) use external HMDB parts
-min_mz <- round(breaks.fwhm[1])
-max_mz <- round(breaks.fwhm[length(breaks.fwhm)])
+# In case of a standard run (m/z 69-606) use external HMDB parts
+min_mz <- round(breaks_fwhm[1])
+max_mz <- round(breaks_fwhm[length(breaks_fwhm)])
 # test if standard mz range is used
 if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 600 & max_mz < 610) {
   # skip generating HMDB parts
   use_external_HMDB <- TRUE
   save(use_external_HMDB, file="./using_external_HMDB_parts.RData")
   hmdb_parts_dir <- cmd_args[6] # "/hpc/dbg_mz/production/DIMS/hmdb_preparts/"
-  hmdb_parts <- list.files(hmdb_parts_dir, pattern=scanmode) # only positive or negative files
+  # hmdb_parts <- list.files(hmdb_parts_dir, pattern=scanmode) # only positive or negative files
+  hmdb_parts <- list.files(hmdb_parts_dir, pattern=hmdb) # all files containing hmdb in file name
   # save(HMDBstukken, file=paste(outdir, "HMDBstukken.RData", sep="/"))
   for (hmdb_file in hmdb_parts) {
-    file.copy(paste(hmdb_parts_dir, hmdb_file, sep="/"), outdir_hmdb, recursive = TRUE)
+    # file.copy(paste(hmdb_parts_dir, hmdb_file, sep="/"), outdir_hmdb, recursive = TRUE)
+    file.copy(paste(hmdb_parts_dir, hmdb_file, sep="/"), "./", recursive = TRUE)
   }
 } else { 
   # generate HMDB parts in case of non-standard mz range
   use_external_HMDB <- FALSE
-  save(use_external_HMDB, file="not_using_external_HMDB_parts.RData")
+  save(use_external_HMDB, file="./not_using_external_HMDB_parts.RData")
   # db <- cmd_args[3]
   load(db_path)
   ppm <- as.numeric(cmd_args[4])
-  if (scanmode=="negative"){
-    label = "MNeg"
-    HMDB_add_iso=HMDB_add_iso.Neg
-  } else {
-    label = "Mpos"
-    HMDB_add_iso=HMDB_add_iso.Pos
-  }
 
-  # filter mass range meassured!!!
-  HMDB_add_iso = HMDB_add_iso[which(HMDB_add_iso[,label]>=breaks.fwhm[1] & HMDB_add_iso[,label]<=breaks.fwhm[length(breaks.fwhm)]),]
-
-  # sort on mass
-  outlist = HMDB_add_iso[order(as.numeric(HMDB_add_iso[,label])),]
-
-  n=dim(outlist)[1]
-  sub=20000 # max rows per file
-  end=0
-  min_1_last=sub
-  check=0
-  outlist_part=NULL
-
-
-  if (n < sub) {
-    outlist_part <- outlist
-    save(outlist_part, file = paste(outdir_hmdb, paste0(scanmode, "_hmdb.1.RData"), sep = "/"))
-  } else {
-  
-    if (n >= sub & (floor(n/sub) - 1) >= 2){
-      for (i in 2:floor(n/sub) - 1){
-        start <- -(sub - 1) + i*sub
-        end <- i*sub
-      
-        if (i > 1){
-          outlist_i = outlist[c(start:end),]
-        
-          n_moved = 0
-        
+  scanmodes <- c("positive", "negative")
+
+  for (scanmode in scanmodes) {
+    if (scanmode == "negative") {
+      column_label <- "MNeg"
+      HMDB_add_iso <- HMDB_add_iso.Neg
+    } else if (scanmode == "positive") {
+      column_label <- "Mpos"
+      HMDB_add_iso <- HMDB_add_iso.Pos
+    }
+
+    # filter mass range meassured!!!
+    HMDB_add_iso = HMDB_add_iso[which(HMDB_add_iso[ ,column_label] >= breaks_fwhm[1] &
+                                      HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ]
+
+    # sort on mass
+    outlist = HMDB_add_iso[order(as.numeric(HMDB_add_iso[,column_label])),]
+
+    n=dim(outlist)[1]
+    sub=20000 # max rows per file
+    end=0
+    min_1_last=sub
+    check=0
+    outlist_part=NULL
+
+
+    if (n < sub) {
+      outlist_part <- outlist
+      save(outlist_part, file = paste0("./", scanmode, "_hmdb.1.RData"))
+    } else {
+
+        if (n >= sub & (floor(n/sub) - 1) >= 2){
+          for (i in 2:floor(n/sub) - 1){
+            start <- -(sub - 1) + i*sub
+            end <- i*sub
+
+            if (i > 1){
+              outlist_i = outlist[c(start:end),]
+
+              n_moved = 0
+
+              # Calculate 3ppm and replace border, avoid cut within peakgroup!
+              while ((as.numeric(outlist_i[1,column_label]) - as.numeric(outlist_part[min_1_last,column_label]))*1e+06/as.numeric(outlist_i[1,column_label]) < ppm) {
+                outlist_part <- rbind(outlist_part, outlist_i[1,])
+                outlist_i <- outlist_i[-1,]
+                n_moved <- n_moved + 1
+              }
+
+              # message(paste("Process", i-1,":", dim(outlist_part)[1]))
+              save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i-1, "RData", sep="."), sep=""))
+              check <- check + dim(outlist_part)[1]
+
+              outlist_part <- outlist_i
+              min_1_last <- dim(outlist_part)[1]
+
+            } else {
+              outlist_part <- outlist[c(start:end),]
+            }
+          }
+        }
+
+        start <- end + 1
+        end <- n
+        outlist_i <- outlist[c(start:end),]
+        n_moved <- 0
+
+        if (!is.null(outlist_part)) {
           # Calculate 3ppm and replace border, avoid cut within peakgroup!
-          while ((as.numeric(outlist_i[1,label]) - as.numeric(outlist_part[min_1_last,label]))*1e+06/as.numeric(outlist_i[1,label]) < ppm) {
-            outlist_part <- rbind(outlist_part, outlist_i[1,])
-            outlist_i <- outlist_i[-1,]
-            n_moved <- n_moved + 1
+          while ((as.numeric(outlist_i[1,column_label]) - as.numeric(outlist_part[min_1_last,column_label]))*1e+06/as.numeric(outlist_i[1,column_label]) < ppm) {
+            outlist_part = rbind(outlist_part, outlist_i[1,])
+            outlist_i = outlist_i[-1,]
+            n_moved = n_moved + 1
           }
-        
-          # message(paste("Process", i-1,":", dim(outlist_part)[1]))
-          save(outlist_part, file = paste(outdir_hmdb, paste(scanmode, paste("hmdb",i-1,"RData", sep="."), sep="_"), sep = "/"))
+
+          # message(paste("Process", i+1-1,":", dim(outlist_part)[1]))
+          save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i, "RData", sep = "."), sep = ""))
           check <- check + dim(outlist_part)[1]
-        
-          outlist_part <- outlist_i
-          min_1_last <- dim(outlist_part)[1]
-        
-        } else {
-          outlist_part <- outlist[c(start:end),]
         }
+
+        outlist_part <- outlist_i
+        # message(paste("Process", i+2-1,":", dim(outlist_part)[1]))
+        save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i + 1, "RData", sep="."), sep=""))
+        check <- check + dim(outlist_part)[1]
+        cat("\n", "Check", check == dim(outlist)[1])
+
       }
     }
-  
-    start <- end + 1
-    end <- n
-    outlist_i <- outlist[c(start:end),]
-    n_moved <- 0
-  
-    if (!is.null(outlist_part)) {
-      # Calculate 3ppm and replace border, avoid cut within peakgroup!
-      while ((as.numeric(outlist_i[1,label]) - as.numeric(outlist_part[min_1_last,label]))*1e+06/as.numeric(outlist_i[1,label]) < ppm) {
-        outlist_part = rbind(outlist_part, outlist_i[1,])
-        outlist_i = outlist_i[-1,]
-        n_moved = n_moved + 1
-      }
-    
-      # message(paste("Process", i+1-1,":", dim(outlist_part)[1]))
-      save(outlist_part, file = paste(scanmode, paste("hmdb", i, "RData", sep = "."), sep = "_"))
-      check <- check + dim(outlist_part)[1]
-    }
-  
-    outlist_part <- outlist_i
-    # message(paste("Process", i+2-1,":", dim(outlist_part)[1]))
-    save(outlist_part, file = paste(scanmode, paste("hmdb", i + 1, "RData", sep="."), sep="_"))
-    check <- check + dim(outlist_part)[1]
-    cat("\n", "Check", check == dim(outlist)[1])
-  
-  }
+
 }
+
diff --git a/DIMS/HMDBparts.nf b/DIMS/HMDBparts.nf
index 8ed9130..7d58bfe 100644
--- a/DIMS/HMDBparts.nf
+++ b/DIMS/HMDBparts.nf
@@ -6,8 +6,11 @@ process HMDBparts {
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
+        // tuple(path(hmdb_db_file), path(breaks_file))
         path(hmdb_db_file)
         path(breaks_file)
+        // val(standard_run)
+        // val(ppm)
 
     output:
         path('*.RData')
@@ -15,6 +18,6 @@ process HMDBparts {
     script:
 
         """
-        Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file
+        Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file $params.standard_run $params.ppm
         """
 }
diff --git a/DIMS/HMDBparts_old.R b/DIMS/HMDBparts_old.R
new file mode 100644
index 0000000..10fdc37
--- /dev/null
+++ b/DIMS/HMDBparts_old.R
@@ -0,0 +1,126 @@
+#!/usr/bin/Rscript
+
+# load required packages 
+# none 
+
+# define parameters 
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) cat("  ", arg, "\n")
+
+# outdir <- cmd_args[1]
+# scanmode <- cmd_args[2]
+db_path <- cmd_args[1] # location of HMDB db file
+breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData
+standard_run  <- "no" # cmd_args[5] # "yes"
+
+# Cut up entire HMDB into small parts based on the new binning/breaks 
+
+# load(paste(outdir, "breaks.fwhm.RData", sep = "/"))
+load(breaks_filepath)
+#outdir_hmdb <- paste(outdir, "hmdb_part", sep = "/")
+#dir.create(outdir_hmdb, showWarnings = FALSE)
+
+# New: in case of a standard run (m/z 69-606) use external HMDB parts
+min_mz <- round(breaks.fwhm[1])
+max_mz <- round(breaks.fwhm[length(breaks.fwhm)])
+# test if standard mz range is used
+if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 600 & max_mz < 610) {
+  # skip generating HMDB parts
+  use_external_HMDB <- TRUE
+  save(use_external_HMDB, file="./using_external_HMDB_parts.RData")
+  hmdb_parts_dir <- cmd_args[6] # "/hpc/dbg_mz/production/DIMS/hmdb_preparts/"
+  hmdb_parts <- list.files(hmdb_parts_dir, pattern=scanmode) # only positive or negative files
+  # save(HMDBstukken, file=paste(outdir, "HMDBstukken.RData", sep="/"))
+  for (hmdb_file in hmdb_parts) {
+    file.copy(paste(hmdb_parts_dir, hmdb_file, sep="/"), outdir_hmdb, recursive = TRUE)
+  }
+} else { 
+  # generate HMDB parts in case of non-standard mz range
+  use_external_HMDB <- FALSE
+  save(use_external_HMDB, file="not_using_external_HMDB_parts.RData")
+  # db <- cmd_args[3]
+  load(db_path)
+  ppm <- as.numeric(cmd_args[4])
+  if (scanmode=="negative"){
+    label = "MNeg"
+    HMDB_add_iso=HMDB_add_iso.Neg
+  } else {
+    label = "Mpos"
+    HMDB_add_iso=HMDB_add_iso.Pos
+  }
+
+  # filter mass range meassured!!!
+  HMDB_add_iso = HMDB_add_iso[which(HMDB_add_iso[,label]>=breaks.fwhm[1] & HMDB_add_iso[,label]<=breaks.fwhm[length(breaks.fwhm)]),]
+
+  # sort on mass
+  outlist = HMDB_add_iso[order(as.numeric(HMDB_add_iso[,label])),]
+
+  n=dim(outlist)[1]
+  sub=20000 # max rows per file
+  end=0
+  min_1_last=sub
+  check=0
+  outlist_part=NULL
+
+
+  if (n < sub) {
+    outlist_part <- outlist
+    save(outlist_part, file = paste(outdir_hmdb, paste0(scanmode, "_hmdb.1.RData"), sep = "/"))
+  } else {
+  
+    if (n >= sub & (floor(n/sub) - 1) >= 2){
+      for (i in 2:floor(n/sub) - 1){
+        start <- -(sub - 1) + i*sub
+        end <- i*sub
+      
+        if (i > 1){
+          outlist_i = outlist[c(start:end),]
+        
+          n_moved = 0
+        
+          # Calculate 3ppm and replace border, avoid cut within peakgroup!
+          while ((as.numeric(outlist_i[1,label]) - as.numeric(outlist_part[min_1_last,label]))*1e+06/as.numeric(outlist_i[1,label]) < ppm) {
+            outlist_part <- rbind(outlist_part, outlist_i[1,])
+            outlist_i <- outlist_i[-1,]
+            n_moved <- n_moved + 1
+          }
+        
+          # message(paste("Process", i-1,":", dim(outlist_part)[1]))
+          save(outlist_part, file = paste(outdir_hmdb, paste(scanmode, paste("hmdb",i-1,"RData", sep="."), sep="_"), sep = "/"))
+          check <- check + dim(outlist_part)[1]
+        
+          outlist_part <- outlist_i
+          min_1_last <- dim(outlist_part)[1]
+        
+        } else {
+          outlist_part <- outlist[c(start:end),]
+        }
+      }
+    }
+  
+    start <- end + 1
+    end <- n
+    outlist_i <- outlist[c(start:end),]
+    n_moved <- 0
+  
+    if (!is.null(outlist_part)) {
+      # Calculate 3ppm and replace border, avoid cut within peakgroup!
+      while ((as.numeric(outlist_i[1,label]) - as.numeric(outlist_part[min_1_last,label]))*1e+06/as.numeric(outlist_i[1,label]) < ppm) {
+        outlist_part = rbind(outlist_part, outlist_i[1,])
+        outlist_i = outlist_i[-1,]
+        n_moved = n_moved + 1
+      }
+    
+      # message(paste("Process", i+1-1,":", dim(outlist_part)[1]))
+      save(outlist_part, file = paste(scanmode, paste("hmdb", i, "RData", sep = "."), sep = "_"))
+      check <- check + dim(outlist_part)[1]
+    }
+  
+    outlist_part <- outlist_i
+    # message(paste("Process", i+2-1,":", dim(outlist_part)[1]))
+    save(outlist_part, file = paste(scanmode, paste("hmdb", i + 1, "RData", sep="."), sep="_"))
+    check <- check + dim(outlist_part)[1]
+    cat("\n", "Check", check == dim(outlist)[1])
+  
+  }
+}
diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R
index 2657b66..2f2a25a 100644
--- a/DIMS/MakeInit.R
+++ b/DIMS/MakeInit.R
@@ -1,10 +1,6 @@
 #!/usr/bin/env Rscript
 ## adapted from makeInit in old pipeline
 
-# used for when init.RData has to be created manually
-# arg1 : path to sampleNames.txt or whatever the name of the samplesheet txt file may be
-# arg2 : amount of technical replicates (usually 3)
-
 args <- commandArgs(trailingOnly=TRUE)
 sample_sheet <- read.csv(args[1], sep="\t") 
 nr_replicates <- as.numeric(args[2])
@@ -14,21 +10,20 @@ nr_sampgrps <- length(sampleNames)/nr_replicates
 groupNames <- trimws(as.vector(unlist(sample_sheet[2])))
 groupNames <- gsub('[^-.[:alnum:]]', '_', groupNames)
 groupNamesUnique <- unique(groupNames)
-#groupNamesNotUnique <- groupNames[duplicated(groupNames)]
 
-repl.pattern <- c()
+repl_pattern <- c()
 for (sampgrp in 1:nr_sampgrps) {
   tmp <- c()
   for (repl in nr_replicates:1) {
     index <- ((sampgrp*nr_replicates) - repl) + 1
     tmp <- c(tmp, sampleNames[index])
   }
-  repl.pattern <- c(repl.pattern, list(tmp))
+  repl_pattern <- c(repl_pattern, list(tmp))
 }
 
-names(repl.pattern) <- groupNamesUnique
+names(repl_pattern) <- groupNamesUnique
 
 # just to preview
-head(repl.pattern)
+head(repl_pattern)
 
-save(repl.pattern, file="init.RData")
+save(repl_pattern, file="./init.RData")
diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R
index 4820ef5..a739257 100644
--- a/DIMS/PeakFinding.R
+++ b/DIMS/PeakFinding.R
@@ -1,68 +1,72 @@
 #!/usr/bin/Rscript
-
-# load required packages 
-# none 
+# adapted from 4-peakFinding.R
 
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
 for (arg in cmd_args) cat("  ", arg, "\n", sep="")
 
 filepath <- cmd_args[1]
-resol <- as.numeric(cmd_args[2])
-scripts <- cmd_args[3]
-breaks_filepath <- cmd_args[4] # location of breaks.fwhm.RData
+breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData
+resol <- as.numeric(cmd_args[3])
+scripts_dir <- cmd_args[4]
 
 thresh <- 2000
 
 # if file extension is .txt, do nothing.
-if (grepl(".txt$", filepath) { stop("not run on txt file") }
+if (grepl(".txt$", filepath)) { stop("not run on txt file") }
 
-# create output folder
-dir.create(paste(outdir, "4-specpks", sep="/"),showWarnings = F)
+# for debugging:
+print(filepath)
+print(breaks_filepath)
+print(resol)
+print(scripts_dir)
+print(paste(scripts_dir, "AddOnFunctions/", sep=""))
 
 # load in function scripts
-source(paste(scripts, "AddOnFunctions/sourceDir.R", sep="/"))
-sourceDir(paste(scripts, "AddOnFunctions", sep="/"))
+# source(paste(scripts_dir, "AddOnFunctions/sourceDir.R", sep="/"))
+# the sourceDir function no longer works
+# sourceDir(paste(scripts_dir, "AddOnFunctions", sep="/"))
+source(paste(scripts_dir, "AddOnFunctions/findPeaks.Gauss.HPC.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/searchMZRange.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/generateGaussian.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/fitGaussian.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/fitGaussianInit.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/getFwhm.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/getSD.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/optimizeGauss.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/fit1Peak.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/fit2peaks.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/fit3peaks.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/fit4peaks.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/fitG.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/fit2G.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/fit3G.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/fit4G.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/getArea.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/getFitQuality.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/checkOverlap.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/sumCurves.R", sep=""))
+source(paste(scripts_dir, "AddOnFunctions/isWithinXppm.R", sep=""))
 
-# load(paste(outdir, "breaks.fwhm.RData", sep="/"))
 load(breaks_filepath)
-load(filepath)
-if (grepl("_pos", filepath)) { scanmode = "positive" } else
-  if (grepl("_neg", filepath)) { scanmode = "negative" }
-
-options(digits=16)
-int.factor=1*10^5 # Number of x used to calc area under Gaussian (is not analytic)
-scale=2 # Initial value used to estimate scaling parameter
-width=1024
-height=768
+# load(filepath)
 
-### fit Gaussian estimate mean and integrate to obtain intensity
-findPeaks.Gauss.HPC <- function(plist, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, plot, thresh, width, height) {
-  sampname <- colnames(plist)[1]
-  
-  range = as.vector(plist)
-  names(range) = rownames(plist)
-  
-  values = list("mean"=NULL,"area"=NULL,"nr"=NULL,"min"=NULL,"max"=NULL,"qual"=NULL,"spikes"=0)
-  
-  values = searchMZRange(range,values,int.factor,scale,resol,outdir,sampname,scanmode,plot,width,height,thresh)
-  
-  outlist.persample=NULL
-  outlist.persample=cbind("samplenr"=values$nr, "mzmed.pkt"=values$mean, "fq"=values$qual, "mzmin.pkt"=values$min, "mzmax.pkt"=values$max, "height.pkt"=values$area)
-
-  index=which(outlist.persample[,"height.pkt"]==0)
-  if (length(index)>0) {
-    outlist.persample=outlist.persample[-index,]
-  }
-  
-  save(outlist.persample, file=paste(outdir, "4-specpks", paste(sampname, "_", scanmode, ".RData", sep=""), sep="/"))
-  
-  cat(paste("There were", values$spikes, "spikes!"))
-}
+# for some reason, the repl_pattern_positive.RData and repl_pattern_negative.RData files are also read in.
+if (!grepl("repl_pattern", filepath)) {
+  # Load output of AverageTechReplicates for a sample
+  sample_avgtechrepl <- get(load(filepath))
+  if (grepl("_pos", filepath)) { scanmode = "positive" } else
+    if (grepl("_neg", filepath)) { scanmode = "negative" }
 
+  # for debugging:
+  print(filepath)
+  print(scanmode)
 
-if (scanmode == "negative") {
-  findPeaks.Gauss.HPC(sum_neg, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height)
-} else {
-  findPeaks.Gauss.HPC(sum_pos, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height)
+  # Initialize
+  options(digits = 16)
+  int.factor <- 1*10^5 # Number of x used to calc area under Gaussian (is not analytic)
+  scale <- 2 # Initial value used to estimate scaling parameter
+  width <- 1024
+  height <- 768
+  findPeaks.Gauss.HPC(sample_avgtechrepl, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height)
 }
diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf
index fb86239..343c4c1 100644
--- a/DIMS/PeakFinding.nf
+++ b/DIMS/PeakFinding.nf
@@ -4,20 +4,15 @@ process PeakFinding {
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       path(RData_file)
-       val(resolution)
-       path(scripts)
-       path(breaks_file)
+       // path(RData_file)
+       // path(breaks_file)
+       tuple(path(RData_file), path(breaks_file))
 
     output:
-       path '*_neg.RData'
-       path '*_pos.RData'
-       path 'repl.pattern.*.RData', emit: pattern
-       path 'miss_infusions_neg.txt'
-       path 'miss_infusions_pos.txt'
+       path '*tive.RData'
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $RData_file $resolution $scripts $breaks_file
+        Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $RData_file $breaks_file $params.resolution $params.scripts_dir
         """
 }
diff --git a/DIMS/PeakGroupingIdentified.R b/DIMS/PeakGroupingIdentified.R
index 7c8d998..e6bf3fc 100644
--- a/DIMS/PeakGroupingIdentified.R
+++ b/DIMS/PeakGroupingIdentified.R
@@ -7,9 +7,9 @@
 cmd_args <- commandArgs(trailingOnly = TRUE)
 for (arg in cmd_args) cat("  ", arg, "\n", sep="")
 
-SpecPeaks_file <- cmd_args[1]
 #outdir <- cmd_args[2]
 #scanmode <- cmd_args[3]
+SpecPeaks_file <- cmd_args[1]
 HMDB_part_file <- cmd_args[2]
 pattern_file <- cmd_args[3]
 resol <- as.numeric(cmd_args[4])
diff --git a/DIMS/PeakGroupingIdentified.nf b/DIMS/PeakGroupingIdentified.nf
index 8100780..2181cae 100644
--- a/DIMS/PeakGroupingIdentified.nf
+++ b/DIMS/PeakGroupingIdentified.nf
@@ -4,15 +4,16 @@ process PeakGroupingIdentified {
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
+       path(SpectrumPeak_file)
        path(HMDBpart_file)
-       val(resolution)
+       path(pattern_file)
 
     output:
-       path 'negative.RData'
-       path 'positive.RData'
+       path '*_negative.RData'
+       path '*_positive.RData'
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/PeakGroupingIdentified.R $HMDBpart_file $resolution $ppm
+        Rscript ${baseDir}/CustomModules/DIMS/PeakGroupingIdentified.R $SpectrumPeak_file $HMDBpart_file $pattern_file $params.resolution $params.ppm
         """
 }
diff --git a/DIMS/SpectrumPeakFinding.R b/DIMS/SpectrumPeakFinding.R
index f85d9c5..175579f 100644
--- a/DIMS/SpectrumPeakFinding.R
+++ b/DIMS/SpectrumPeakFinding.R
@@ -1,77 +1,86 @@
 #!/usr/bin/Rscript
+# adapted from 5-collectSamples.R 
 
 # load required packages 
 # none 
 
 # define parameters 
-cmd_args <- commandArgs(trailingOnly = TRUE)
-for (arg in cmd_args) cat("  ", arg, "\n")
+#cmd_args <- commandArgs(trailingOnly = TRUE)
+#for (arg in cmd_args) cat("  ", arg, "\n")
 
 # outdir <- cmd_args[1]
 # scanmode <- cmd_args[2]
 scanmodes <- c("positive", "negative")
 
-# Check if all jobs terminated correct!
+# Check whether all jobs terminated correct!
 notRun = NULL
 
-# new: run for each scanmode
-for (scanmode in scanmodes) {
-  load(paste0("./repl.pattern.", scanmode, ".RData"))
-  groupNames = names(repl.pattern.filtered)
-
-  indir <- "./"
-  object.files = list.files(indir, full.names=TRUE, pattern="*.RData")
+print("one")
 
+# collect spectrum peaks for each scanmode
+for (scanmode in scanmodes) {
+  print(scanmode)
+  # load peak lists of all biological samples
+  input_dir <- getwd() # "./"
+  peaklist_files = list.files(input_dir, full.names=TRUE, pattern=paste("*_", scanmode, ".RData",sep=""))
+
+  # remove any repl_pattern files from the list
+  #if (sum(grepl("repl_pattern", peaklist_files)i > 0) { 
+  #  peaklist_files <- peaklist_files[-grep("repl_pattern", peaklist_files)] }
+  print(peaklist_files)
+
+  # get sample names
+  load(paste0("./", scanmode, "_repl_pattern", ".RData"))
+  groupNames = names(repl_pattern_filtered)
   for (i in 1:length(groupNames)) {
-    group <- paste0(indir, "/", paste0(paste(groupNames[i], scanmode, sep = "_"), ".RData"))
-    if (!(group %in% object.files)) {
+    group <- paste0(input_dir, "/", paste0(paste(groupNames[i], scanmode, sep = "_"), ".RData"))
+    if (!(group %in% peaklist_files)) {
       notRun = c(notRun, group)
     }
   }
-
-    #if (is.null(notRun)){
-    cat("\nCollecting samples!")
-
-    # negative
-    filepath <- "./")
-    files <- list.files(filepath, recursive=TRUE, full.names=TRUE, pattern=paste("*_", scanmode, ".RData",sep=""))
-
-    outlist.tot=NULL
-    for (i in 1:length(files)) {
-
-      cat("\n", files[i])
-      load(files[i])
-
-      if (is.null(outlist.persample) || (dim(outlist.persample)[1]==0)){
-        tmp=strsplit(files[i], "/")[[1]]
-        fname = tmp[length(tmp)]
-        #fname = strsplit(files[i], "/")[[1]][8]
-        fname = strsplit(fname, ".RData")[[1]][1]
-        fname = substr(fname, 13, nchar(fname))
-
-        if (i == 1) { outlist.tot <- c(fname, rep("-1",5)) } else { outlist.tot <- rbind(outlist.tot, c(fname, rep("-1",5)))}
-      } else {
-        if (i == 1) { outlist.tot <- outlist.persample } else { outlist.tot <- rbind(outlist.tot, outlist.persample)}
+  print("two")
+  cat("\nCollecting samples!")
+
+  outlist.tot=NULL
+  for (i in 1:length(peaklist_files)) {
+
+    cat("\n", peaklist_files[i])
+    load(peaklist_files[i])
+    if (is.null(outlist.persample) || (dim(outlist.persample)[1]==0)) {
+      tmp=strsplit(peaklist_files[i], "/")[[1]]
+      fname = tmp[length(tmp)]
+      #fname = strsplit(peaklist_files[i], "/")[[1]][8]
+      fname = strsplit(fname, ".RData")[[1]][1]
+      fname = substr(fname, 13, nchar(fname))
+      if (i == 1) { 
+        outlist.tot <- c(fname, rep("-1",5)) 
+      } else { 
+	outlist.tot <- rbind(outlist.tot, c(fname, rep("-1",5)))
       }
-    }
-
-    # remove negative values
-    index=which(outlist.tot[,"height.pkt"]<=0)
-    if (length(index)>0) outlist.tot = outlist.tot[-index,]
-    index=which(outlist.tot[,"mzmed.pkt"]<=0)
-    if (length(index)>0) outlist.tot = outlist.tot[-index,]
-
-    # outdir_specpks <- paste(outdir, "5-specpks_all", sep = "/")
-    # dir.create(outdir_specpks, showWarnings = F)
-    # save(outlist.tot, file = paste(outdir_specpks, paste(scanmode, "RData", sep = "."), sep = "/"))
-    save(outlist.tot, file = paste(scanmode, "RData", sep = ".")
-
-
-    if (!is.null(notRun)){
-      for (i in 1:length(notRun)){
-        message(paste(notRun[i], "was not generated"))
+    } else {
+      if (i == 1) { 
+	outlist.tot <- outlist.persample 
+      } else { 
+	outlist.tot <- rbind(outlist.tot, outlist.persample) 
       }
     }
-
+  }
+  print("three")
+  # remove negative values
+  index=which(outlist.tot[,"height.pkt"]<=0)
+  if (length(index)>0) outlist.tot = outlist.tot[-index,]
+  index=which(outlist.tot[,"mzmed.pkt"]<=0)
+  if (length(index)>0) outlist.tot = outlist.tot[-index,]
+
+  # outdir_specpks <- paste(outdir, "5-specpks_all", sep = "/")
+  # dir.create(outdir_specpks, showWarnings = F)
+  # save(outlist.tot, file = paste(outdir_specpks, paste(scanmode, "RData", sep = "."), sep = "/"))
+  save(outlist.tot, file = paste("./", scanmode, ".RData", sep = ""))
+
+  if (!is.null(notRun)){
+    for (i in 1:length(notRun)){
+      message(paste(notRun[i], "was not generated"))
+    }
+  }
 }
 
diff --git a/DIMS/SpectrumPeakFinding.nf b/DIMS/SpectrumPeakFinding.nf
index 66e04a8..6a7c73a 100644
--- a/DIMS/SpectrumPeakFinding.nf
+++ b/DIMS/SpectrumPeakFinding.nf
@@ -5,6 +5,7 @@ process SpectrumPeakFinding {
 
     input:
        path(RData_file)
+       path(replication_pattern)
 
     output:
        path 'negative.RData'
@@ -12,6 +13,6 @@ process SpectrumPeakFinding {
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/SpectrumPeakFinding.R $RData_file
+        Rscript ${baseDir}/CustomModules/DIMS/SpectrumPeakFinding.R $replication_pattern
         """
 }

From 6b6c5316cf273d32a11ae42031adb878f4700098 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 25 Aug 2023 17:16:17 +0200
Subject: [PATCH 09/73] code cleaned up

---
 DIMS/AddOnFunctions/replaceZeros.R | 64 ++++++++-------------
 DIMS/AssignToBins.R                | 57 +++++++++----------
 DIMS/AverageTechReplicates.R       | 73 +++++++++++-------------
 DIMS/GenerateBreaks.R              | 33 ++++++-----
 DIMS/HMDBparts.R                   | 36 ++++--------
 DIMS/MakeInit.R                    | 12 ++--
 DIMS/PeakFinding.R                 | 91 ++++++++++++------------------
 DIMS/SpectrumPeakFinding.R         | 36 +++---------
 DIMS/SpectrumPeakFinding.nf        |  4 +-
 9 files changed, 163 insertions(+), 243 deletions(-)

diff --git a/DIMS/AddOnFunctions/replaceZeros.R b/DIMS/AddOnFunctions/replaceZeros.R
index 4fafc9a..234d961 100644
--- a/DIMS/AddOnFunctions/replaceZeros.R
+++ b/DIMS/AddOnFunctions/replaceZeros.R
@@ -1,4 +1,4 @@
-replaceZeros <- function(file,scanmode,resol,outdir,thresh,scriptDir,ppm){
+replaceZeros <- function(outpgrlist, repl_pattern, scanmode, resol, outdir, thresh, ppm) {
   # file="./results/grouping_rest/negative_1.RData"
   # scanmode= "negative"
   # scriptDir="./scripts"
@@ -6,49 +6,43 @@ replaceZeros <- function(file,scanmode,resol,outdir,thresh,scriptDir,ppm){
   # thresh=2000
   # outdir="./results"
   
-  control_label="C"
+  # control_label="C"
   
-  source(paste(scriptDir, "AddOnFunctions/sourceDir.R", sep="/"))
-  sourceDir(paste(scriptDir, "AddOnFunctions", sep="/"))
+  # source(paste(scriptDir, "AddOnFunctions/sourceDir.R", sep="/"))
+  # sourceDir(paste(scriptDir, "AddOnFunctions", sep="/"))
   
-  dir.create(paste(outdir, "9-samplePeaksFilled", sep="/"), showWarnings = FALSE)
+  # dir.create(paste(outdir, "9-samplePeaksFilled", sep="/"), showWarnings = FALSE)
   
   # int.factor=1*10^5 # Number of x used to calc area under Gaussian (is not analytic)
   # scale=2 # Initial value used to estimate scaling parameter
   # width=1024
   # height=768
   
-  # message(paste("file", file))
-  # message(paste("scanmode", scanmode))
-  # message(paste("resol", resol))
-  # message(paste("outdir", outdir))
-  # message(paste("thresh", thresh))
-  # message(paste("scriptDir", scriptDir))
-  
-  load(paste0(outdir, "/repl.pattern.",scanmode, ".RData"))
-  
-  name = as.vector(unlist(strsplit(file, "/", fixed=TRUE)))
-  name = name[length(name)]
+  # load(paste0(outdir, "/repl.pattern.",scanmode, ".RData"))
+ 
+  # batch_number = strsplit(basename(HMDB_part_file), ".",fixed = TRUE)[[1]][2]
+  # name = as.vector(unlist(strsplit(file, "/", fixed=TRUE)))
+  # name = name[length(name)]
   # message(paste("File name: ", name))
   
   # load samplePeaks
   # load  outpgrlist
-  load(file)
+  # load(file)
   
-  # #################################################################################
   # # filter on at least signal in two control samples
   # int.cols = grep(control_label, colnames(outpgrlist),fixed = TRUE)
   # # barplot(as.numeric(outpgrlist[753, int.cols]))
   # keep = NULL
   # keep = apply(outpgrlist, 1, function(x) if (length(which(as.numeric(x[int.cols]) > 0)) > 1) keep=c(keep,TRUE) else keep=c(keep,FALSE))
   # outpgrlist = outpgrlist[keep,]
-  # #################################################################################
   
-  ################################################################################
-  # For now only replace zeros
+  # replace zeros
   if (!is.null(outpgrlist)) {
-    for (i in 1:length(names(repl.pattern.filtered))){
-      samplePeaks=outpgrlist[,names(repl.pattern.filtered)[i]]
+	  print(dim(outpgrlist))
+	  print(colnames(outpgrlist))
+    for (i in 1:length(names(repl_pattern))){
+	    print(names(repl_pattern)[i])
+      samplePeaks=outpgrlist[,names(repl_pattern)[i]]
       index=which(samplePeaks<=0)
       if (!length(index)){
         next
@@ -56,16 +50,13 @@ replaceZeros <- function(file,scanmode,resol,outdir,thresh,scriptDir,ppm){
       for (j in 1:length(index)){
         area = generateGaussian(outpgrlist[index[j],"mzmed.pgrp"],thresh,resol,FALSE,scanmode,int.factor=1*10^5,1,1)$area
         # for testing purposes, add a fixed random seed
-        set.seed(123)
-        outpgrlist[index[j], names(repl.pattern.filtered)[i]] = rnorm(n=1, mean=area, sd=0.25*area)
+        # set.seed(123)
+        outpgrlist[index[j], names(repl_pattern)[i]] = rnorm(n=1, mean=area, sd=0.25*area)
       }
     }
   
-  ################################################################################
-  
   
-  #################### identification #########################################################
-    # load(paste(scriptDir, "../db/HMDB_add_iso_corrNaCl.RData", sep="/")) # E:\Metabolomics\LargeDataBase\Apr25_2016
+    # Identification 
     
     # Add average column
     outpgrlist = cbind(outpgrlist, "avg.int"=apply(outpgrlist[, 7:(ncol(outpgrlist)-4)], 1, mean))
@@ -84,25 +75,16 @@ replaceZeros <- function(file,scanmode,resol,outdir,thresh,scriptDir,ppm){
       # HMDB_add_iso=HMDB_add_iso.Pos
     }
     
-    # # Identification using large database
-    # final.outlist.idpat = iden.code(outpgrlist, HMDB_add_iso, ppm=2, label)
-    # message(paste(sum(final.outlist.idpat[ , "assi_HMDB"] != ""), "assigned peakgroups"))
-    # message(paste(sum(final.outlist.idpat[ , "iso_HMDB"] != ""), "assigned isomeres"))
-    
     # Identify noise peaks
     noise.MZ <- read.table(file="/hpc/dbg_mz/tools/db/TheoreticalMZ_NegPos_incNaCl.txt", sep="\t", header=TRUE, quote = "")
     noise.MZ <- noise.MZ[(noise.MZ[ , label] != 0), 1:4]
-    
-    # Replace "Negative" by "negative" in ident.hires.noise
     final.outlist.idpat2 = ident.hires.noise.HPC(outpgrlist, allAdducts, scanmode=label2, noise.MZ, look4=look4.add2, resol=resol, slope=0, incpt=0, ppm.fixed=ppm, ppm.iso.fixed=ppm)
-    # message(paste(sum(final.outlist.idpat2[ , "assi"] != ""), "assigned noise peaks"))
     tmp <- final.outlist.idpat2[ , c("assi", "theormz")]
     colnames(tmp) <- c("assi_noise",  "theormz_noise")
     
     final.outlist.idpat3 <- cbind(outpgrlist, tmp)
-    #############################################################################################
-    
-    # message(paste("File saved: ", paste(outdir, "/samplePeaksFilled/", name, sep="")))
-    save(final.outlist.idpat3, file=paste(outdir, "/9-samplePeaksFilled/", name, sep=""))
+   
+    return(final.outlist.idpat3) 
+    # save(final.outlist.idpat3, file=paste("./", name, sep=""))
   }
 }
diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R
index c6fdaba..4e320d3 100644
--- a/DIMS/AssignToBins.R
+++ b/DIMS/AssignToBins.R
@@ -8,34 +8,31 @@ suppressPackageStartupMessages(library("xcms"))
 cmd_args <- commandArgs(trailingOnly = TRUE)
 for (arg in cmd_args) cat("  ", arg, "\n", sep="")
 
-filepath <- cmd_args[1] # location of mzML file
+filepath        <- cmd_args[1] # location of mzML file
 breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData
-resol <- as.numeric(cmd_args[3]) # 140000
-trim <- 0.1
-dimsThresh <- 100
-print(filepath)
-print(breaks_filepath)
-print(resol)
+resol           <- as.numeric(cmd_args[3]) # 140000
+trim            <- 0.1
+dimsThresh      <- 100
 
-sampname <- sub('\\..*$', '', basename(filepath))
+# get sample name
+sample_name <- sub('\\..*$', '', basename(filepath))
 
 options(digits=16)
 
-### process one sample at a time and find peaks FOR BOTH SCAN MODES! #
-int.factor=1*10^5 # Number of x used to calc area under Gaussian (is not analytic)
-scale=2 # Initial value used to estimate scaling parameter
-width=1024
-height=768
-
-# Initiate  
-trimLeft=NULL
-trimRight=NULL
-breaks_fwhm=NULL
-breaks_fwhm_avg=NULL
-bins=NULL
-pos_results=NULL
-neg_results=NULL
+# Initialize
+int.factor <- 1*10^5 # Number of x used to calc area under Gaussian (is not analytic)
+scale      <- 2 # Initial value used to estimate scaling parameter
+width      <- 1024
+height     <- 768
+trimLeft        <- NULL
+trimRight       <- NULL
+breaks_fwhm     <- NULL
+breaks_fwhm_avg <- NULL
+bins            <- NULL
+pos_results     <- NULL
+neg_results     <- NULL
 
+### process one sample at a time and find peaks FOR BOTH SCAN MODES! #
 # read in the data for 1 sample
 raw_data <- suppressMessages(xcmsRaw(filepath))
 
@@ -93,16 +90,16 @@ if (nrow(neg_raw_data_matrix) > 0) {
 pos_bins[pos_bins < dimsThresh] <- 0
 neg_bins[neg_bins < dimsThresh] <- 0
 
-pos_results = cbind(pos_results, pos_bins)
-neg_results = cbind(neg_results, neg_bins)
+pos_results <- cbind(pos_results, pos_bins)
+neg_results <- cbind(neg_results, neg_bins)
 
 # transpose
-pos_results_transpose = t(pos_results)
-neg_results_transpose = t(neg_results)
+pos_results_transpose <- t(pos_results)
+neg_results_transpose <- t(neg_results)
 
 # Add file names as row names
-rownames(pos_results_transpose) = sampname
-rownames(neg_results_transpose) = sampname
+rownames(pos_results_transpose) <- sample_name
+rownames(neg_results_transpose) <- sample_name
 
 # delete the last value of breaks_fwhm_avg to match dimensions of pos_results and neg_results
 breaks_fwhm_avg_minus1 <- breaks_fwhm_avg[-length(breaks_fwhm_avg)]
@@ -117,6 +114,6 @@ colnames(neg_results_transpose) <- breaks_fwhm_avg_minus1
 pos_results_final <- t(pos_results_transpose)
 neg_results_final <- t(neg_results_transpose)
 
-pklist <- list("pos"=pos_results_final, "neg"=neg_results_final, "breaksFwhm"=breaks_fwhm)
+peak_list <- list("pos"=pos_results_final, "neg"=neg_results_final, "breaksFwhm"=breaks_fwhm)
 
-save(pklist, file=paste("./", sampname, ".RData", sep=""))
+save(peak_list, file=paste("./", sample_name, ".RData", sep=""))
diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R
index 4303a99..b880f13 100644
--- a/DIMS/AverageTechReplicates.R
+++ b/DIMS/AverageTechReplicates.R
@@ -1,9 +1,6 @@
 #!/usr/bin/Rscript
 # adapted from 3-AverageTechReplicates.R
 
-# load required functions 
-# source(paste(scripts_dir, "AddOnFunctions/removeFromRepl.patR", sep=""))
-
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
 for (arg in cmd_args) cat("  ", arg, "\n", sep="")
@@ -11,36 +8,32 @@ for (arg in cmd_args) cat("  ", arg, "\n", sep="")
 init_filepath <- cmd_args[1]
 nr_replicates <- as.numeric(cmd_args[2])
 thresh2remove <- 2000
-dimsThresh <- 100
+dimsThresh    <- 100
 
 removeFromRepl.pat <- function(bad_samples, repl_pattern, nr_replicates) {
   # bad_samples=remove_pos
   
-  tmp = repl_pattern
+  tmp <- repl_pattern
   
-  removeFromGroup=NULL
+  removeFromGroup <- NULL
   
   for (i in 1:length(tmp)){
-    tmp2 = repl_pattern[[i]]
+    tmp2 <- repl_pattern[[i]]
     
-    remove=NULL
+    remove <- NULL
     
     for (j in 1:length(tmp2)){
       if (tmp2[j] %in% bad_samples){
-        #cat(tmp2[j])
-        #cat(paste("remove",tmp2[j]))
-        #cat(paste("remove i",i))
-        #cat(paste("remove j",j))
         remove = c(remove, j)
       }
     }
     
-    if (length(remove)==nr_replicates) removeFromGroup=c(removeFromGroup,i)
-    if (!is.null(remove)) repl_pattern[[i]]=repl_pattern[[i]][-remove]
+    if (length(remove) == nr_replicates) removeFromGroup <- c(removeFromGroup,i)
+    if (!is.null(remove)) repl_pattern[[i]] <- repl_pattern[[i]][-remove]
   }
   
   if (length(removeFromGroup)!=0) {
-    repl_pattern=repl_pattern[-removeFromGroup]
+    repl_pattern <- repl_pattern[-removeFromGroup]
   }
   
   return(list("pattern"=repl_pattern))
@@ -50,44 +43,44 @@ removeFromRepl.pat <- function(bad_samples, repl_pattern, nr_replicates) {
 # get repl_pattern
 load("./init.RData")
 
-remove_neg=NULL
-remove_pos=NULL
+remove_neg <- NULL
+remove_pos <- NULL
 cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n")
 for (i in 1:length(repl_pattern)) {
-  techRepsArray.pos = NULL
-  techRepsArray.neg = NULL
+  techRepsArray.pos <- NULL
+  techRepsArray.neg <- NULL
   
-  tech_reps = as.vector(unlist(repl_pattern[i]))
-  sum_neg=0
-  sum_pos=0
-  n_pos=0
-  n_neg=0
+  tech_reps <- as.vector(unlist(repl_pattern[i]))
+  sum_neg <- 0
+  sum_pos <- 0
+  n_pos <- 0
+  n_neg <- 0
   cat("\n\nNow sample ", i, " from replication pattern with length ", length(repl_pattern))
   for (j in 1:length(tech_reps)) {
     load(paste("./", tech_reps[j], ".RData", sep=""))
     cat("\n\nParsing", tech_reps[j])
 
-    cat("\n\tNegative pklist sum",sum(pklist$neg[,1]))
-    if (sum(pklist$neg[,1])<thresh2remove){
+    cat("\n\tNegative peak_list sum", sum(peak_list$neg[,1]))
+    if (sum(peak_list$neg[,1]) < thresh2remove){
       cat(" ... Removed")
-      remove_neg=c(remove_neg, tech_reps[j])
+      remove_neg <- c(remove_neg, tech_reps[j])
     } else {
-      n_neg=n_neg+1
-      sum_neg=sum_neg+pklist$neg
+      n_neg <- n_neg + 1
+      sum_neg <- sum_neg + peak_list$neg
     }
     
-    techRepsArray.neg = cbind(techRepsArray.neg, pklist$neg)
+    techRepsArray.neg <- cbind(techRepsArray.neg, peak_list$neg)
     
-    cat("\n\tPositive pklist sum",sum(pklist$pos[,1]))
-    if (sum(pklist$pos[,1])<thresh2remove){
+    cat("\n\tPositive peak_list sum", sum(peak_list$pos[,1]))
+    if (sum(peak_list$pos[,1]) < thresh2remove){
       cat(" ... Removed")
-      remove_pos=c(remove_pos, tech_reps[j])
+      remove_pos <- c(remove_pos, tech_reps[j])
     } else {
-      n_pos=n_pos+1
-      sum_pos=sum_pos+pklist$pos
+      n_pos <- n_pos + 1
+      sum_pos <- sum_pos + peak_list$pos
     }
     
-    techRepsArray.pos = cbind(techRepsArray.pos, pklist$pos)
+    techRepsArray.pos <- cbind(techRepsArray.pos, peak_list$pos)
   }
   
   # filter within bins on at least signal in more than one tech. rep.!!!
@@ -108,12 +101,12 @@ for (i in 1:length(repl_pattern)) {
 
 retVal <- removeFromRepl.pat(remove_neg, repl_pattern, nr_replicates)
 repl_pattern_filtered <- retVal$pattern
-save(repl_pattern_filtered, file="./negative_repl_pattern.RData")
-write.table(remove_neg, file="./miss_infusions_negative.txt", row.names=FALSE, col.names=FALSE ,sep= "\t")
+save(repl_pattern_filtered, file = "./negative_repl_pattern.RData")
+write.table(remove_neg, file = "./miss_infusions_negative.txt", row.names=FALSE, col.names=FALSE , sep= "\t")
 
 retVal <- removeFromRepl.pat(remove_pos, repl_pattern, nr_replicates)
 repl_pattern_filtered <- retVal$pattern
-save(repl_pattern_filtered, file="./positive_repl_pattern.RData")
-write.table(remove_pos, file="./miss_infusions_positive.txt", row.names=FALSE, col.names=FALSE ,sep= "\t")
+save(repl_pattern_filtered, file = "./positive_repl_pattern.RData")
+write.table(remove_pos, file = "./miss_infusions_positive.txt", row.names=FALSE, col.names=FALSE , sep= "\t")
 
 
diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R
index 285da0d..38161fb 100644
--- a/DIMS/GenerateBreaks.R
+++ b/DIMS/GenerateBreaks.R
@@ -15,28 +15,28 @@ trim     <- as.numeric(cmd_args[3]) # 0.1
 resol    <- as.numeric(cmd_args[4]) # 140000
 
 # initialize
-trimLeft        = NULL
-trimRight       = NULL
-breaks_fwhm     = NULL
-breaks_fwhm_avg = NULL
-bins            = NULL
-posRes          = NULL
-negRes          = NULL
+trimLeft        <- NULL
+trimRight       <- NULL
+breaks_fwhm     <- NULL
+breaks_fwhm_avg <- NULL
+bins            <- NULL
+posRes          <- NULL
+negRes          <- NULL
 
 # read in mzML file
 raw_data <- suppressMessages(xcmsRaw(filepath))
 
 # trim (remove) scans at the start and end
-trimLeft  = round(raw_data@scantime[length(raw_data@scantime)*trim])
-trimRight = round(raw_data@scantime[length(raw_data@scantime)*(1-trim)])
+trimLeft  <- round(raw_data@scantime[length(raw_data@scantime)*trim])
+trimRight <- round(raw_data@scantime[length(raw_data@scantime)*(1-trim)])
 
 # Mass range m/z
-lowMZ = raw_data@mzrange[1]
-highMZ = raw_data@mzrange[2]
+lowMZ  <- raw_data@mzrange[1]
+highMZ <- raw_data@mzrange[2]
 
 # determine number of segments (bins)
-nsegment = 2*(highMZ-lowMZ)
-segment  = seq(from=lowMZ, to=highMZ, length.out=nsegment+1)
+nsegment <- 2*(highMZ-lowMZ)
+segment  <- seq(from=lowMZ, to=highMZ, length.out=nsegment+1)
 
 # determine start and end of each bin.
 for (i in 1:nsegment) {
@@ -46,12 +46,11 @@ for (i in 1:nsegment) {
   fwhmsegm    <- startsegm/resol_mz
   breaks_fwhm <- c(breaks_fwhm, seq(from=(startsegm + fwhmsegm), to=endsegm, by=0.2*fwhmsegm))
   # average the m/z instead of start value
-  range = seq(from=(startsegm + fwhmsegm), to=endsegm, by=0.2*fwhmsegm)
-  deltaMZ = range[2] - range[1]
+  range <- seq(from=(startsegm + fwhmsegm), to=endsegm, by=0.2*fwhmsegm)
+  deltaMZ <- range[2] - range[1]
   breaks_fwhm_avg <- c(breaks_fwhm_avg, range + 0.5*deltaMZ)
 }
 
-# remove one of these:
-# save(breaks_fwhm, breaks_fwhm_avg, trimLeft, trimRight, file=paste(outdir, "breaks.fwhm.RData", sep="/"))
+# generate output file
 save(breaks_fwhm, breaks_fwhm_avg, trimLeft, trimRight, file="./breaks.fwhm.RData")
 
diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R
index 6fb1816..51490b6 100644
--- a/DIMS/HMDBparts.R
+++ b/DIMS/HMDBparts.R
@@ -5,14 +5,11 @@
 cmd_args <- commandArgs(trailingOnly = TRUE)
 for (arg in cmd_args) cat("  ", arg, "\n")
 
-# outdir <- cmd_args[1]
-# scanmode <- cmd_args[2]
 db_path <- cmd_args[1] # location of HMDB db file
 breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData
 standard_run  <- cmd_args[3] # "yes"
 
 # Cut up entire HMDB into small parts based on the new binning/breaks 
-
 load(breaks_filepath)
 
 # In case of a standard run (m/z 69-606) use external HMDB parts
@@ -21,21 +18,13 @@ max_mz <- round(breaks_fwhm[length(breaks_fwhm)])
 # test if standard mz range is used
 if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 600 & max_mz < 610) {
   # skip generating HMDB parts
-  use_external_HMDB <- TRUE
-  save(use_external_HMDB, file="./using_external_HMDB_parts.RData")
   hmdb_parts_dir <- cmd_args[6] # "/hpc/dbg_mz/production/DIMS/hmdb_preparts/"
-  # hmdb_parts <- list.files(hmdb_parts_dir, pattern=scanmode) # only positive or negative files
   hmdb_parts <- list.files(hmdb_parts_dir, pattern=hmdb) # all files containing hmdb in file name
-  # save(HMDBstukken, file=paste(outdir, "HMDBstukken.RData", sep="/"))
   for (hmdb_file in hmdb_parts) {
-    # file.copy(paste(hmdb_parts_dir, hmdb_file, sep="/"), outdir_hmdb, recursive = TRUE)
     file.copy(paste(hmdb_parts_dir, hmdb_file, sep="/"), "./", recursive = TRUE)
   }
 } else { 
   # generate HMDB parts in case of non-standard mz range
-  use_external_HMDB <- FALSE
-  save(use_external_HMDB, file="./not_using_external_HMDB_parts.RData")
-  # db <- cmd_args[3]
   load(db_path)
   ppm <- as.numeric(cmd_args[4])
 
@@ -55,14 +44,14 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 600 & max_mz <
                                       HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ]
 
     # sort on mass
-    outlist = HMDB_add_iso[order(as.numeric(HMDB_add_iso[,column_label])),]
+    outlist <- HMDB_add_iso[order(as.numeric(HMDB_add_iso[,column_label])),]
 
-    n=dim(outlist)[1]
-    sub=20000 # max rows per file
-    end=0
-    min_1_last=sub
-    check=0
-    outlist_part=NULL
+    n            <- dim(outlist)[1]
+    sub          <- 20000 # max rows per file
+    end          <- 0
+    min_1_last   <- sub
+    check        <- 0
+    outlist_part <- NULL
 
 
     if (n < sub) {
@@ -106,26 +95,23 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 600 & max_mz <
         n_moved <- 0
 
         if (!is.null(outlist_part)) {
-          # Calculate 3ppm and replace border, avoid cut within peakgroup!
+          # Calculate ppm and replace border, avoid cut within peakgroup!
           while ((as.numeric(outlist_i[1,column_label]) - as.numeric(outlist_part[min_1_last,column_label]))*1e+06/as.numeric(outlist_i[1,column_label]) < ppm) {
-            outlist_part = rbind(outlist_part, outlist_i[1,])
-            outlist_i = outlist_i[-1,]
-            n_moved = n_moved + 1
+            outlist_part <- rbind(outlist_part, outlist_i[1,])
+            outlist_i <- outlist_i[-1,]
+            n_moved <- n_moved + 1
           }
 
-          # message(paste("Process", i+1-1,":", dim(outlist_part)[1]))
           save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i, "RData", sep = "."), sep = ""))
           check <- check + dim(outlist_part)[1]
         }
 
         outlist_part <- outlist_i
-        # message(paste("Process", i+2-1,":", dim(outlist_part)[1]))
         save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i + 1, "RData", sep="."), sep=""))
         check <- check + dim(outlist_part)[1]
         cat("\n", "Check", check == dim(outlist)[1])
 
       }
     }
-
 }
 
diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R
index 2f2a25a..3b125e3 100644
--- a/DIMS/MakeInit.R
+++ b/DIMS/MakeInit.R
@@ -1,14 +1,14 @@
 #!/usr/bin/env Rscript
 ## adapted from makeInit in old pipeline
 
-args <- commandArgs(trailingOnly=TRUE)
-sample_sheet <- read.csv(args[1], sep="\t") 
+args          <- commandArgs(trailingOnly=TRUE)
+sample_sheet  <- read.csv(args[1], sep="\t") 
 nr_replicates <- as.numeric(args[2])
 
 sampleNames <- trimws(as.vector(unlist(sample_sheet[1])))
 nr_sampgrps <- length(sampleNames)/nr_replicates
-groupNames <- trimws(as.vector(unlist(sample_sheet[2])))
-groupNames <- gsub('[^-.[:alnum:]]', '_', groupNames)
+groupNames  <- trimws(as.vector(unlist(sample_sheet[2])))
+groupNames  <- gsub('[^-.[:alnum:]]', '_', groupNames)
 groupNamesUnique <- unique(groupNames)
 
 repl_pattern <- c()
@@ -23,7 +23,7 @@ for (sampgrp in 1:nr_sampgrps) {
 
 names(repl_pattern) <- groupNamesUnique
 
-# just to preview
-head(repl_pattern)
+# preview the replication pattern
+print(head(repl_pattern))
 
 save(repl_pattern, file="./init.RData")
diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R
index a739257..379a0fa 100644
--- a/DIMS/PeakFinding.R
+++ b/DIMS/PeakFinding.R
@@ -5,68 +5,49 @@
 cmd_args <- commandArgs(trailingOnly = TRUE)
 for (arg in cmd_args) cat("  ", arg, "\n", sep="")
 
-filepath <- cmd_args[1]
+filepath        <- cmd_args[1]
 breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData
-resol <- as.numeric(cmd_args[3])
-scripts_dir <- cmd_args[4]
+resol           <- as.numeric(cmd_args[3])
+scripts_dir     <- cmd_args[4]
 
 thresh <- 2000
 
-# if file extension is .txt, do nothing.
-if (grepl(".txt$", filepath)) { stop("not run on txt file") }
-
-# for debugging:
-print(filepath)
-print(breaks_filepath)
-print(resol)
-print(scripts_dir)
-print(paste(scripts_dir, "AddOnFunctions/", sep=""))
-
 # load in function scripts
-# source(paste(scripts_dir, "AddOnFunctions/sourceDir.R", sep="/"))
-# the sourceDir function no longer works
-# sourceDir(paste(scripts_dir, "AddOnFunctions", sep="/"))
-source(paste(scripts_dir, "AddOnFunctions/findPeaks.Gauss.HPC.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/searchMZRange.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/generateGaussian.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/fitGaussian.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/fitGaussianInit.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/getFwhm.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/getSD.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/optimizeGauss.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/fit1Peak.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/fit2peaks.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/fit3peaks.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/fit4peaks.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/fitG.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/fit2G.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/fit3G.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/fit4G.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/getArea.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/getFitQuality.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/checkOverlap.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/sumCurves.R", sep=""))
-source(paste(scripts_dir, "AddOnFunctions/isWithinXppm.R", sep=""))
+source(paste0(scripts_dir, "AddOnFunctions/findPeaks.Gauss.HPC.R"))
+source(paste0(scripts_dir, "AddOnFunctions/searchMZRange.R"))
+source(paste0(scripts_dir, "AddOnFunctions/generateGaussian.R"))
+source(paste0(scripts_dir, "AddOnFunctions/fitGaussian.R"))
+source(paste0(scripts_dir, "AddOnFunctions/fitGaussianInit.R"))
+source(paste0(scripts_dir, "AddOnFunctions/getFwhm.R"))
+source(paste0(scripts_dir, "AddOnFunctions/getSD.R"))
+source(paste0(scripts_dir, "AddOnFunctions/optimizeGauss.R"))
+source(paste0(scripts_dir, "AddOnFunctions/fit1Peak.R"))
+source(paste0(scripts_dir, "AddOnFunctions/fit2peaks.R"))
+source(paste0(scripts_dir, "AddOnFunctions/fit3peaks.R"))
+source(paste0(scripts_dir, "AddOnFunctions/fit4peaks.R"))
+source(paste0(scripts_dir, "AddOnFunctions/fitG.R"))
+source(paste0(scripts_dir, "AddOnFunctions/fit2G.R"))
+source(paste0(scripts_dir, "AddOnFunctions/fit3G.R"))
+source(paste0(scripts_dir, "AddOnFunctions/fit4G.R"))
+source(paste0(scripts_dir, "AddOnFunctions/getArea.R"))
+source(paste0(scripts_dir, "AddOnFunctions/getFitQuality.R"))
+source(paste0(scripts_dir, "AddOnFunctions/checkOverlap.R"))
+source(paste0(scripts_dir, "AddOnFunctions/sumCurves.R"))
+source(paste0(scripts_dir, "AddOnFunctions/isWithinXppm.R"))
 
 load(breaks_filepath)
-# load(filepath)
 
-# for some reason, the repl_pattern_positive.RData and repl_pattern_negative.RData files are also read in.
-if (!grepl("repl_pattern", filepath)) {
-  # Load output of AverageTechReplicates for a sample
-  sample_avgtechrepl <- get(load(filepath))
-  if (grepl("_pos", filepath)) { scanmode = "positive" } else
-    if (grepl("_neg", filepath)) { scanmode = "negative" }
+# Load output of AverageTechReplicates for a sample
+sample_avgtechrepl <- get(load(filepath))
+if (grepl("_pos", filepath)) { scanmode = "positive" } else
+  if (grepl("_neg", filepath)) { scanmode = "negative" }
 
-  # for debugging:
-  print(filepath)
-  print(scanmode)
+# Initialize
+options(digits = 16)
+int.factor <- 1*10^5 # Number of x used to calc area under Gaussian (is not analytic)
+scale      <- 2 # Initial value used to estimate scaling parameter
+width      <- 1024
+height     <- 768
 
-  # Initialize
-  options(digits = 16)
-  int.factor <- 1*10^5 # Number of x used to calc area under Gaussian (is not analytic)
-  scale <- 2 # Initial value used to estimate scaling parameter
-  width <- 1024
-  height <- 768
-  findPeaks.Gauss.HPC(sample_avgtechrepl, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height)
-}
+# run the findPeaks function
+findPeaks.Gauss.HPC(sample_avgtechrepl, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height)
diff --git a/DIMS/SpectrumPeakFinding.R b/DIMS/SpectrumPeakFinding.R
index 175579f..9935b05 100644
--- a/DIMS/SpectrumPeakFinding.R
+++ b/DIMS/SpectrumPeakFinding.R
@@ -1,15 +1,7 @@
 #!/usr/bin/Rscript
 # adapted from 5-collectSamples.R 
 
-# load required packages 
-# none 
-
 # define parameters 
-#cmd_args <- commandArgs(trailingOnly = TRUE)
-#for (arg in cmd_args) cat("  ", arg, "\n")
-
-# outdir <- cmd_args[1]
-# scanmode <- cmd_args[2]
 scanmodes <- c("positive", "negative")
 
 # Check whether all jobs terminated correct!
@@ -24,11 +16,6 @@ for (scanmode in scanmodes) {
   input_dir <- getwd() # "./"
   peaklist_files = list.files(input_dir, full.names=TRUE, pattern=paste("*_", scanmode, ".RData",sep=""))
 
-  # remove any repl_pattern files from the list
-  #if (sum(grepl("repl_pattern", peaklist_files)i > 0) { 
-  #  peaklist_files <- peaklist_files[-grep("repl_pattern", peaklist_files)] }
-  print(peaklist_files)
-
   # get sample names
   load(paste0("./", scanmode, "_repl_pattern", ".RData"))
   groupNames = names(repl_pattern_filtered)
@@ -38,18 +25,16 @@ for (scanmode in scanmodes) {
       notRun = c(notRun, group)
     }
   }
-  print("two")
-  cat("\nCollecting samples!")
+  
+  cat("\nCollecting samples")
 
   outlist.tot=NULL
   for (i in 1:length(peaklist_files)) {
-
     cat("\n", peaklist_files[i])
     load(peaklist_files[i])
-    if (is.null(outlist.persample) || (dim(outlist.persample)[1]==0)) {
+    if (is.null(outlist.persample) || (dim(outlist.persample)[1] == 0)) {
       tmp=strsplit(peaklist_files[i], "/")[[1]]
       fname = tmp[length(tmp)]
-      #fname = strsplit(peaklist_files[i], "/")[[1]][8]
       fname = strsplit(fname, ".RData")[[1]][1]
       fname = substr(fname, 13, nchar(fname))
       if (i == 1) { 
@@ -65,17 +50,14 @@ for (scanmode in scanmodes) {
       }
     }
   }
-  print("three")
+  
   # remove negative values
-  index=which(outlist.tot[,"height.pkt"]<=0)
-  if (length(index)>0) outlist.tot = outlist.tot[-index,]
-  index=which(outlist.tot[,"mzmed.pkt"]<=0)
-  if (length(index)>0) outlist.tot = outlist.tot[-index,]
+  index <- which(outlist.tot[ ,"height.pkt"] <= 0)
+  if (length(index) > 0) outlist.tot <- outlist.tot[-index, ]
+  index <- which(outlist.tot[ ,"mzmed.pkt"] <= 0)
+  if (length(index) > 0) outlist.tot <- outlist.tot[-index, ]
 
-  # outdir_specpks <- paste(outdir, "5-specpks_all", sep = "/")
-  # dir.create(outdir_specpks, showWarnings = F)
-  # save(outlist.tot, file = paste(outdir_specpks, paste(scanmode, "RData", sep = "."), sep = "/"))
-  save(outlist.tot, file = paste("./", scanmode, ".RData", sep = ""))
+  save(outlist.tot, file = paste0("./SpectrumPeaks_", scanmode, ".RData"))
 
   if (!is.null(notRun)){
     for (i in 1:length(notRun)){
diff --git a/DIMS/SpectrumPeakFinding.nf b/DIMS/SpectrumPeakFinding.nf
index 6a7c73a..786131c 100644
--- a/DIMS/SpectrumPeakFinding.nf
+++ b/DIMS/SpectrumPeakFinding.nf
@@ -8,8 +8,8 @@ process SpectrumPeakFinding {
        path(replication_pattern)
 
     output:
-       path 'negative.RData'
-       path 'positive.RData'
+       path 'SpectrumPeaks_*.RData'
+       // path 'SpectrumPeaks_positive.RData'
 
     script:
         """

From 39e2cb77b49784cec0a2598d5dcd412da1e964a2 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 25 Aug 2023 17:17:30 +0200
Subject: [PATCH 10/73] added functionality

---
 DIMS/FillMissing.R   |  58 ++++++++++++++
 DIMS/FillMissing.nf  |  17 ++++
 DIMS/PeakGrouping.R  | 186 +++++++++++++++++++++++++++++++++++++++++++
 DIMS/PeakGrouping.nf |  19 +++++
 4 files changed, 280 insertions(+)
 create mode 100755 DIMS/FillMissing.R
 create mode 100644 DIMS/FillMissing.nf
 create mode 100644 DIMS/PeakGrouping.R
 create mode 100644 DIMS/PeakGrouping.nf

diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R
new file mode 100755
index 0000000..8c3c86c
--- /dev/null
+++ b/DIMS/FillMissing.R
@@ -0,0 +1,58 @@
+#!/usr/bin/Rscript
+# adapted from 9-runFillMissing.R
+
+# define parameters
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) cat("  ", arg, "\n", sep="")
+
+# define parameters 
+peakgrouplist_file <- cmd_args[1]
+#pattern_file       <- cmd_args[2]
+scripts_dir        <- cmd_args[2]
+#thresh  <- as.numeric(cmd_args[4])
+#resol   <- as.numeric(cmd_args[5])
+#ppm     <- as.numeric(cmd_args[6])
+thresh  <- cmd_args[3]
+resol   <- cmd_args[4]
+ppm     <- cmd_args[5]
+# outdir <- cmd_args[2]
+# scanmode <- cmd_args[3]
+outdir <- "./"
+if (grepl("_pos", peakgrouplist_file)) { scanmode = "positive" } else
+    if (grepl("_neg", peakgrouplist_file)) { scanmode = "negative" }
+
+# load in function scripts
+source(paste0(scripts_dir, "AddOnFunctions/replaceZeros.R"))
+source(paste0(scripts_dir, "AddOnFunctions/generateGaussian.R"))
+source(paste0(scripts_dir, "AddOnFunctions/ident.hires.noise.HPC.R"))
+
+# for debugging:
+print(peakgrouplist_file)
+#print(pattern_file)
+print(scripts_dir)
+print(thresh)
+print(resol)
+print(ppm)
+
+outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file)
+print(outputfile_name)
+
+# get replication pattern for sample names
+pattern_file <- paste0(scanmode, "_repl_pattern.RData")
+repl_pattern <- get(load(pattern_file))
+print(repl_pattern[[1]])
+
+# peakgrouplist <- get(load(peakgrouplist_file))
+load(peakgrouplist_file)
+batch_number <- strsplit(basename(peakgrouplist_file), ".",fixed = TRUE)[[1]][1]
+
+print(dim(outpgrlist.identified))
+print(colnames(outpgrlist.identified))
+print(batch_number)
+
+# replace missing values (zeros) with random noise
+peakgrouplist_filled <- replaceZeros(outpgrlist.identified, repl_pattern, scanmode, resol, outdir, thresh, ppm)
+
+# save output
+save(peakgrouplist_filled, file=paste0("./", outputfile_name)
+save(peakgrouplist_filled, file=paste0("./", batch_number, scanmode, "identified_filled.RData")
diff --git a/DIMS/FillMissing.nf b/DIMS/FillMissing.nf
new file mode 100644
index 0000000..c88ea5c
--- /dev/null
+++ b/DIMS/FillMissing.nf
@@ -0,0 +1,17 @@
+process FillMissing {
+    label 'FillMissing'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(GroupedList_file)
+       path(replication_pattern)
+
+    output:
+       path('.RData')
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/FillMissing.R $GroupedList_file $params.scripts_dir $params.thresh $params.resolution $params.ppm
+        """
+}
diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R
new file mode 100644
index 0000000..b5038e8
--- /dev/null
+++ b/DIMS/PeakGrouping.R
@@ -0,0 +1,186 @@
+#!/usr/bin/Rscript
+# adapted from 6-peakGrouping.R
+
+# define parameters 
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) cat("  ", arg, "\n", sep="")
+
+HMDB_part_file <- cmd_args[1]
+SpecPeaks_file <- cmd_args[2]
+pattern_file   <- cmd_args[3]
+ppm <- as.numeric(cmd_args[4])
+
+options(digits=16)
+
+# load part of the HMDB
+HMDB_add_iso <- get(load(HMDB_part_file)) 
+# load(HMDB_part_file) 
+# HMDB_add_iso <- outlist_part
+
+# determine appropriate scanmode based on HMDB_part_file 
+if (grepl("negative", basename(HMDB_part_file))) { scanmode <- "negative" } else
+  if (grepl("positive", basename(HMDB_part_file))) { scanmode <- "positive" }
+
+# determine batch number of HMDB part file
+batch_number = strsplit(basename(HMDB_part_file), ".",fixed = TRUE)[[1]][2]
+
+# load file with spectrum peaks
+SpecPeaks_file <- paste0("SpectrumPeaks_", scanmode, ".RData")
+load(SpecPeaks_file)
+outlist.copy <- outlist.tot
+rm(outlist.tot)
+
+# load replication pattern
+# load(paste0("./", scanmode, "_repl_pattern", ".RData"))
+pattern_file <- paste0(scanmode, "_repl_pattern.RData")
+load(pattern_file)
+# load("./breaks.fwhm.RData")
+
+# determine appropriate column name in HMDB part 
+if (scanmode=="negative") { column_label <- "MNeg" } else { column_label <- "Mpos" }
+
+# for debugging:
+print(head(HMDB_add_iso))
+print(scanmode)
+print(column_label)
+
+# Initialize
+outpgrlist.identified <- NULL
+outlist.grouped <- NULL
+
+# First find peak groups identified based on HMDB masses
+while (dim(HMDB_add_iso)[1] > 0) { 
+  index <- 1
+ 
+  # take one m/z value from the HMDB part and calculate mass tolerance
+  reference_mass <- as.numeric(HMDB_add_iso[index, column_label])
+  mass_tolerance <- (reference_mass * ppm) / 10^6
+  
+  # find the peaks in the dataset with corresponding m/z
+  mzmed <- as.numeric(outlist.copy[ ,"mzmed.pkt"])
+  selp <- which((mzmed > (reference_mass - mass_tolerance)) & (mzmed < (reference_mass + mass_tolerance)))
+  tmplist <- outlist.copy[selp,,drop=FALSE]
+  outlist.grouped <- rbind(outlist.grouped, tmplist)
+  nrsamples <- length(selp)
+  if (nrsamples > 0) {
+    mzmed.pgrp <- mean(as.numeric(outlist.copy[selp, "mzmed.pkt"]))
+    mzmin.pgrp <- reference_mass - mass_tolerance
+    mzmax.pgrp <- reference_mass + mass_tolerance
+    
+    # determine fit quality fq
+    fq.worst.pgrp <- as.numeric(max(outlist.copy[selp, "fq"]))
+    fq.best.pgrp <- as.numeric(min(outlist.copy[selp, "fq"]))
+    
+    # set up object for intensities for all samples
+    ints.allsamps <- rep(0, length(names(repl.pattern.filtered)))
+    names(ints.allsamps) <- names(repl.pattern.filtered) 
+    
+    # Check for each sample if multiple peaks exist, if so take the sum of the intensities
+    labels <- unique(tmplist[ ,"samplenr"])
+    ints.allsamps[labels] <- as.vector(unlist(lapply(labels, function(x) { sum(as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"])) } )))
+    
+    # Initialize 
+    assi_HMDB <- iso_HMDB <- HMDB_code <- NA
+    tmplist.mass.iso <- tmplist.mass.adduct <- NULL
+    
+    # Identification: find all entries in HMDB part with mass within ppm range
+    mass.all <- as.numeric(HMDB_add_iso[ , column_label])
+    index <- which((mass.all > (reference_mass - mass_tolerance)) & (mass.all < (reference_mass + mass_tolerance)))
+    tmplist.mass <- HMDB_add_iso[index,,drop=FALSE]
+    
+    if (dim(tmplist.mass)[1]>0) {
+      # find isotope entries    
+      index.iso <- grep(" iso ", tmplist.mass[, "CompoundName"], fixed = TRUE)
+      if (length(index.iso) > 0){
+        tmplist.mass.iso <- tmplist.mass[index.iso,,drop=FALSE]
+        tmplist.mass <- tmplist.mass[-index.iso,,drop=FALSE]
+      }
+      
+      if (dim(tmplist.mass)[1] > 0) {        
+	# find adduct entries
+        index.adduct <- grep(" [M", tmplist.mass[, "CompoundName"], fixed = TRUE)
+        if (length(index.adduct) > 0) {
+          tmplist.mass.adduct <- tmplist.mass[index.adduct,,drop=FALSE]
+          tmplist.mass <- tmplist.mass[-index.adduct,,drop=FALSE]
+        }  
+      }  
+      
+      # Compose a list compounds, adducts or isotopes with corresponding m/z
+      if (dim(tmplist.mass)[1]>0) {
+        
+        # metabolites
+        assi_HMDB <- as.character(paste(as.character(tmplist.mass[, "CompoundName"]), collapse = ";"))
+        HMDB_code <- as.character(paste(as.character(rownames(tmplist.mass)), collapse = ";"))
+        theormz_HMDB <- as.numeric(tmplist.mass[1, column_label])
+        
+        # adducts of metabolites
+        if (!is.null(tmplist.mass.adduct)) {
+          if (dim(tmplist.mass.adduct)[1] > 0) {
+            if (is.na(assi_HMDB)){
+              assi_HMDB <- as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";"))
+              HMDB_code <- as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";"))
+            } else {
+              assi_HMDB <- paste(assi_HMDB, as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")), sep = ";")
+              HMDB_code <- paste(HMDB_code, as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")), sep = ";")
+            }
+	  }
+	}
+        
+        # isotopes of metabolites
+        if (!is.null(tmplist.mass.iso)) {
+          if (dim(tmplist.mass.iso)[1]>0) {
+            iso_HMDB <- as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";"))
+          }
+	}
+        
+      # if no metabolites have the correct m/z, look for adducts and isotopes only
+      } else if (!is.null(tmplist.mass.adduct)) {
+        
+        theormz_HMDB <- as.numeric(tmplist.mass.adduct[1, column_label])
+        
+        # adducts of metabolites
+        if (!is.null(tmplist.mass.adduct)) {
+          if (dim(tmplist.mass.adduct)[1] > 0) {
+            if (is.na(assi_HMDB)) {
+              assi_HMDB <- as.character(paste(as.character(tmplist.mass.adduct[ , "CompoundName"]), collapse = ";"))
+              HMDB_code <- as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";"))
+            } else {
+              assi_HMDB <- paste(assi_HMDB, as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")), sep = ";")
+              HMDB_code <- paste(HMDB_code, as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")), sep = ";")
+            }
+	  }
+	}
+        
+        # isotopes of metabolites
+        if (!is.null(tmplist.mass.iso)) {
+          if (dim(tmplist.mass.iso)[1]>0) {
+            iso_HMDB <- as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";"))
+          }
+	}
+        
+      # if no metabolites or adducts can be found, only look for isotopes  
+      } else if (!is.null(tmplist.mass.iso)) {
+        
+        if (dim(tmplist.mass.iso)[1]>0) {
+          theormz_HMDB <- as.numeric(tmplist.mass.iso[1,column_label])
+          iso_HMDB <- as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";"))
+        }
+      }  
+      
+    }
+   
+    # combine all information 
+    outpgrlist.identified <- rbind(outpgrlist.identified, cbind(data.frame(mzmed.pgrp, "fq.best"=fq.best.pgrp, "fq.worst"=fq.worst.pgrp, nrsamples, mzmin.pgrp, mzmax.pgrp),
+                                         t(as.matrix(ints.allsamps)),
+                                         data.frame(assi_HMDB, iso_HMDB, HMDB_code, theormz_HMDB)))
+  }
+ 
+  # remove index metabolite from HMDB part and continue while loop 
+  HMDB_add_iso <- HMDB_add_iso[-index,]
+  
+}
+
+# save peak list corresponding to masses in HMDB part
+# save(outlist.grouped, file=paste0(batch_number, "_", scanmode, "_all.RData"))
+# save peak group list, identified part
+save(outpgrlist.identified, file=paste0(batch_number, "_", scanmode, "_identified.RData"))
diff --git a/DIMS/PeakGrouping.nf b/DIMS/PeakGrouping.nf
new file mode 100644
index 0000000..be9d542
--- /dev/null
+++ b/DIMS/PeakGrouping.nf
@@ -0,0 +1,19 @@
+process PeakGrouping {
+    label 'PeakGrouping'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(HMDBpart_file)
+       path(SpectrumPeak_file)
+       path(pattern_file)
+
+    output:
+       // path '*_all.RData', emit: peaklist_all
+       path '*_identified.RData', emit: grouped_identified
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/PeakGrouping.R $HMDBpart_file $SpectrumPeak_file $pattern_file $params.ppm
+        """
+}

From ab5c4574e7de7df394af9aedd3fece156b17d4a3 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 1 Sep 2023 17:10:46 +0200
Subject: [PATCH 11/73] modified files for Nextflow

---
 DIMS/FillMissing.R             |  44 +++-----
 DIMS/FillMissing.nf            |   2 +-
 DIMS/PeakGrouping.R            |  22 ++--
 DIMS/PeakGrouping.nf           |   4 +-
 DIMS/PeakGroupingIdentified.R  | 180 ---------------------------------
 DIMS/PeakGroupingIdentified.nf |  19 ----
 6 files changed, 33 insertions(+), 238 deletions(-)
 delete mode 100644 DIMS/PeakGroupingIdentified.R
 delete mode 100644 DIMS/PeakGroupingIdentified.nf

diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R
index 8c3c86c..127c557 100755
--- a/DIMS/FillMissing.R
+++ b/DIMS/FillMissing.R
@@ -7,52 +7,36 @@ for (arg in cmd_args) cat("  ", arg, "\n", sep="")
 
 # define parameters 
 peakgrouplist_file <- cmd_args[1]
-#pattern_file       <- cmd_args[2]
 scripts_dir        <- cmd_args[2]
-#thresh  <- as.numeric(cmd_args[4])
-#resol   <- as.numeric(cmd_args[5])
-#ppm     <- as.numeric(cmd_args[6])
-thresh  <- cmd_args[3]
-resol   <- cmd_args[4]
-ppm     <- cmd_args[5]
-# outdir <- cmd_args[2]
-# scanmode <- cmd_args[3]
+thresh  <- as.numeric(cmd_args[3])
+resol   <- as.numeric(cmd_args[4])
+ppm     <- as.numeric(cmd_args[5])
 outdir <- "./"
+
 if (grepl("_pos", peakgrouplist_file)) { scanmode = "positive" } else
     if (grepl("_neg", peakgrouplist_file)) { scanmode = "negative" }
 
 # load in function scripts
 source(paste0(scripts_dir, "AddOnFunctions/replaceZeros.R"))
 source(paste0(scripts_dir, "AddOnFunctions/generateGaussian.R"))
+source(paste0(scripts_dir, "AddOnFunctions/getFwhm.R"))
+source(paste0(scripts_dir, "AddOnFunctions/getSD.R"))
+source(paste0(scripts_dir, "AddOnFunctions/getArea.R"))
+source(paste0(scripts_dir, "AddOnFunctions/optimizeGauss.R"))
 source(paste0(scripts_dir, "AddOnFunctions/ident.hires.noise.HPC.R"))
-
-# for debugging:
-print(peakgrouplist_file)
-#print(pattern_file)
-print(scripts_dir)
-print(thresh)
-print(resol)
-print(ppm)
-
-outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file)
-print(outputfile_name)
+source(paste0(scripts_dir, "AddOnFunctions/elementInfo.R"))
+source(paste0(scripts_dir, "AddOnFunctions/globalAssignments.HPC.R"))
 
 # get replication pattern for sample names
 pattern_file <- paste0(scanmode, "_repl_pattern.RData")
 repl_pattern <- get(load(pattern_file))
-print(repl_pattern[[1]])
 
-# peakgrouplist <- get(load(peakgrouplist_file))
+# load peak group list and determine output file name
 load(peakgrouplist_file)
-batch_number <- strsplit(basename(peakgrouplist_file), ".",fixed = TRUE)[[1]][1]
-
-print(dim(outpgrlist.identified))
-print(colnames(outpgrlist.identified))
-print(batch_number)
+outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file)
 
 # replace missing values (zeros) with random noise
 peakgrouplist_filled <- replaceZeros(outpgrlist.identified, repl_pattern, scanmode, resol, outdir, thresh, ppm)
 
-# save output
-save(peakgrouplist_filled, file=paste0("./", outputfile_name)
-save(peakgrouplist_filled, file=paste0("./", batch_number, scanmode, "identified_filled.RData")
+# save output 
+save(peakgrouplist_filled, file=paste0("./", outputfile_name))
diff --git a/DIMS/FillMissing.nf b/DIMS/FillMissing.nf
index c88ea5c..4749b87 100644
--- a/DIMS/FillMissing.nf
+++ b/DIMS/FillMissing.nf
@@ -8,7 +8,7 @@ process FillMissing {
        path(replication_pattern)
 
     output:
-       path('.RData')
+       path('*_filled.RData')
 
     script:
         """
diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R
index b5038e8..756c0a7 100644
--- a/DIMS/PeakGrouping.R
+++ b/DIMS/PeakGrouping.R
@@ -6,9 +6,9 @@ cmd_args <- commandArgs(trailingOnly = TRUE)
 for (arg in cmd_args) cat("  ", arg, "\n", sep="")
 
 HMDB_part_file <- cmd_args[1]
-SpecPeaks_file <- cmd_args[2]
-pattern_file   <- cmd_args[3]
-ppm <- as.numeric(cmd_args[4])
+# SpecPeaks_file <- cmd_args[2]
+# pattern_file   <- cmd_args[3]
+ppm <- as.numeric(cmd_args[2])
 
 options(digits=16)
 
@@ -30,6 +30,8 @@ load(SpecPeaks_file)
 outlist.copy <- outlist.tot
 rm(outlist.tot)
 
+print(dim(outlist.copy))
+
 # load replication pattern
 # load(paste0("./", scanmode, "_repl_pattern", ".RData"))
 pattern_file <- paste0(scanmode, "_repl_pattern.RData")
@@ -40,9 +42,11 @@ load(pattern_file)
 if (scanmode=="negative") { column_label <- "MNeg" } else { column_label <- "Mpos" }
 
 # for debugging:
-print(head(HMDB_add_iso))
+print(dim(HMDB_add_iso))
 print(scanmode)
 print(column_label)
+print(ppm)
+print(head(repl_pattern_filtered, 1))
 
 # Initialize
 outpgrlist.identified <- NULL
@@ -56,11 +60,14 @@ while (dim(HMDB_add_iso)[1] > 0) {
   reference_mass <- as.numeric(HMDB_add_iso[index, column_label])
   mass_tolerance <- (reference_mass * ppm) / 10^6
   
+  print(paste0("ref_mass ", reference_mass, " mtol ", mass_tolerance))
+
   # find the peaks in the dataset with corresponding m/z
   mzmed <- as.numeric(outlist.copy[ ,"mzmed.pkt"])
   selp <- which((mzmed > (reference_mass - mass_tolerance)) & (mzmed < (reference_mass + mass_tolerance)))
   tmplist <- outlist.copy[selp,,drop=FALSE]
   outlist.grouped <- rbind(outlist.grouped, tmplist)
+
   nrsamples <- length(selp)
   if (nrsamples > 0) {
     mzmed.pgrp <- mean(as.numeric(outlist.copy[selp, "mzmed.pkt"]))
@@ -72,8 +79,8 @@ while (dim(HMDB_add_iso)[1] > 0) {
     fq.best.pgrp <- as.numeric(min(outlist.copy[selp, "fq"]))
     
     # set up object for intensities for all samples
-    ints.allsamps <- rep(0, length(names(repl.pattern.filtered)))
-    names(ints.allsamps) <- names(repl.pattern.filtered) 
+    ints.allsamps <- rep(0, length(names(repl_pattern_filtered)))
+    names(ints.allsamps) <- names(repl_pattern_filtered) 
     
     # Check for each sample if multiple peaks exist, if so take the sum of the intensities
     labels <- unique(tmplist[ ,"samplenr"])
@@ -180,6 +187,9 @@ while (dim(HMDB_add_iso)[1] > 0) {
   
 }
 
+
+print(head(outpgrlist.identified))
+
 # save peak list corresponding to masses in HMDB part
 # save(outlist.grouped, file=paste0(batch_number, "_", scanmode, "_all.RData"))
 # save peak group list, identified part
diff --git a/DIMS/PeakGrouping.nf b/DIMS/PeakGrouping.nf
index be9d542..89328fb 100644
--- a/DIMS/PeakGrouping.nf
+++ b/DIMS/PeakGrouping.nf
@@ -5,7 +5,7 @@ process PeakGrouping {
 
     input:
        path(HMDBpart_file)
-       path(SpectrumPeak_file)
+       path(SpectrumPeak_file) // input files need to be linked, but called within R script
        path(pattern_file)
 
     output:
@@ -14,6 +14,6 @@ process PeakGrouping {
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/PeakGrouping.R $HMDBpart_file $SpectrumPeak_file $pattern_file $params.ppm
+        Rscript ${baseDir}/CustomModules/DIMS/PeakGrouping.R $HMDBpart_file $params.ppm
         """
 }
diff --git a/DIMS/PeakGroupingIdentified.R b/DIMS/PeakGroupingIdentified.R
deleted file mode 100644
index e6bf3fc..0000000
--- a/DIMS/PeakGroupingIdentified.R
+++ /dev/null
@@ -1,180 +0,0 @@
-#!/usr/bin/Rscript
-
-# load required packages 
-# none 
-
-# define parameters 
-cmd_args <- commandArgs(trailingOnly = TRUE)
-for (arg in cmd_args) cat("  ", arg, "\n", sep="")
-
-#outdir <- cmd_args[2]
-#scanmode <- cmd_args[3]
-SpecPeaks_file <- cmd_args[1]
-HMDB_part_file <- cmd_args[2]
-pattern_file <- cmd_args[3]
-resol <- as.numeric(cmd_args[4])
-ppm <- as.numeric(cmd_args[5])
-
-options(digits=16)
-
-cat(paste("File to group:", fileIn))
-
-
-options(digits=16)
-load(HMDB_part_file) # outlist_part(HMDB_add_iso)
-HMDB_add_iso = outlist_part
-
-# Windows and Unix-like
-batch = strsplit(fileIn, "/",fixed = TRUE)[[1]]
-batch = batch[length(batch)] 
-batch = strsplit(batch, ".",fixed = TRUE)[[1]][2]
-
-load(SpecPeaks_file)
-outlist.copy = outlist.tot
-rm(outlist.tot)
-
-load(pattern_file)
-load("./breaks.fwhm.RData")
-
-outpgrlist.identified = NULL
-
-if (scanmode=="negative"){
-  label = "MNeg"
-} else {
-  label = "Mpos"
-}
-
-outlist.grouped <- NULL
-
-# First group on HMDB masses
-while (dim(HMDB_add_iso)[1] > 0) { 
-  index = 1
-  # message(HMDB_add_iso[index,"CompoundName"])
-  
-  mass = as.numeric(HMDB_add_iso[index,label])
-  mtol = (mass*ppm)/10^6
-  
-  mzmed = as.numeric(outlist.copy[,"mzmed.pkt"])
-  selp = which((mzmed > (mass - mtol)) & (mzmed < (mass + mtol)))
-  tmplist = outlist.copy[selp,,drop=FALSE]
-  outlist.grouped <- rbind(outlist.grouped, tmplist)
-  
-  nrsamples = length(selp)
-  if (nrsamples > 0) {
-    # message(paste("Bingo ",n))
-    
-    mzmed.pgrp = mean(as.numeric(outlist.copy[selp, "mzmed.pkt"]))
-    # mzmed.pgrp = median(as.numeric(outlist.copy[selp, "mzmed.pkt"]))
-    mzmin.pgrp = mass - mtol
-    mzmax.pgrp = mass + mtol
-    
-    fq.worst.pgrp = as.numeric(max(outlist.copy[selp, "fq"]))
-    fq.best.pgrp = as.numeric(min(outlist.copy[selp, "fq"]))
-    ints.allsamps = rep(0, length(names(repl.pattern.filtered)))
-    names(ints.allsamps) = names(repl.pattern.filtered) # same order as sample list!!!
-    
-    # # Check for each sample if multiple peaks exists, if so take the sum!
-    labels=unique(tmplist[,"samplenr"])
-    ints.allsamps[labels] = as.vector(unlist(lapply(labels, function(x) {sum(as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"]))})))
-    # ints.allsamps[labels] = as.vector(unlist(lapply(labels, function(x) {as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"])})))
-    
-    # Identification
-    # assi_HMDB = iso_HMDB = HMDB_code = ""
-    assi_HMDB = iso_HMDB = HMDB_code = NA
-    tmplist.mass.iso = tmplist.mass.adduct = NULL
-    # index = which(HMDB_add_iso[,label]==mass)
-    
-    # Consider groups off 4ppm
-    mass.all = as.numeric(HMDB_add_iso[,label])
-    index = which((mass.all > (mass - mtol)) & (mass.all < (mass + mtol)))
-    # index = which(mass.all < (mass + 2*mtol))
-    tmplist.mass = HMDB_add_iso[index,,drop=FALSE]
-    
-    if (dim(tmplist.mass)[1]>0) {
-      
-      index.iso = grep(" iso ", tmplist.mass[, "CompoundName"], fixed = TRUE)
-      if (length(index.iso)>0){
-        tmplist.mass.iso = tmplist.mass[index.iso,,drop=FALSE]
-        tmplist.mass = tmplist.mass[-index.iso,,drop=FALSE]
-      }
-      
-      if (dim(tmplist.mass)[1]>0) {        
-        index.adduct = grep(" [M", tmplist.mass[, "CompoundName"], fixed = TRUE)
-        if (length(index.adduct)>0){
-          tmplist.mass.adduct = tmplist.mass[index.adduct,,drop=FALSE]
-          tmplist.mass = tmplist.mass[-index.adduct,,drop=FALSE]
-        }  
-      }  
-      
-      # First compouds without adducts or isotopes
-      if (dim(tmplist.mass)[1]>0) {
-        
-        # pure compounds
-        assi_HMDB = as.character(paste(as.character(tmplist.mass[, "CompoundName"]), collapse = ";"))
-        HMDB_code = as.character(paste(as.character(rownames(tmplist.mass)), collapse = ";"))
-        theormz_HMDB = as.numeric(tmplist.mass[1,label])
-        
-        # adducts
-        if (!is.null(tmplist.mass.adduct)) {
-          if (dim(tmplist.mass.adduct)[1]>0) {
-            if (is.na(assi_HMDB)){
-              assi_HMDB = as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";"))
-              HMDB_code = as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";"))
-            } else {
-              assi_HMDB = paste(assi_HMDB, as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")), sep = ";")
-              HMDB_code = paste(HMDB_code, as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")), sep = ";")
-            }}}
-        
-        # isotopes
-        if (!is.null(tmplist.mass.iso)) {
-          if (dim(tmplist.mass.iso)[1]>0) {
-            iso_HMDB = as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";"))
-          }}
-        
-        # No pure compounts  
-      } else if (!is.null(tmplist.mass.adduct)) {
-        
-        theormz_HMDB = as.numeric(tmplist.mass.adduct[1,label])
-        
-        # adducts
-        if (!is.null(tmplist.mass.adduct)) {
-          if (dim(tmplist.mass.adduct)[1]>0) {
-            if (is.na(assi_HMDB)){
-              assi_HMDB = as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";"))
-              HMDB_code = as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";"))
-            } else {
-              assi_HMDB = paste(assi_HMDB, as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")), sep = ";")
-              HMDB_code = paste(HMDB_code, as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")), sep = ";")
-            }}}
-        
-        # isotopes
-        if (!is.null(tmplist.mass.iso)) {
-          if (dim(tmplist.mass.iso)[1]>0) {
-            iso_HMDB = as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";"))
-          }}
-        
-        # only isotopes  
-      } else if (!is.null(tmplist.mass.iso)) {
-        
-        if (dim(tmplist.mass.iso)[1]>0) {
-          theormz_HMDB = as.numeric(tmplist.mass.iso[1,label])
-          iso_HMDB = as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";"))
-        }
-      }  
-      
-    }
-    
-    outpgrlist.identified = rbind(outpgrlist.identified, cbind(data.frame(mzmed.pgrp, "fq.best"=fq.best.pgrp, "fq.worst"=fq.worst.pgrp, nrsamples, mzmin.pgrp, mzmax.pgrp),
-                                         t(as.matrix(ints.allsamps)),
-                                         data.frame(assi_HMDB, iso_HMDB, HMDB_code, theormz_HMDB)))
-  }
-  
-  HMDB_add_iso = HMDB_add_iso[-index,]
-  
-}
-
-#dir.create(paste(outdir, "6-grouping_hmdb", sep="/"), showWarnings = FALSE)
-save(outpgrlist.identified, file=paste(paste(outdir, "6-grouping_hmdb", sep="/"), paste(paste(batch, scanmode, sep="_"), "RData", sep="."), sep="/"))
-
-#dir.create(paste(outdir, "6-grouping_hmdb_done", sep="/"), showWarnings = FALSE)
-save(outlist.grouped, file=paste(paste(outdir, "6-grouping_hmdb_grouped", sep="/"), paste(paste(batch, scanmode, sep="_"), "RData", sep="."), sep="/"))
diff --git a/DIMS/PeakGroupingIdentified.nf b/DIMS/PeakGroupingIdentified.nf
deleted file mode 100644
index 2181cae..0000000
--- a/DIMS/PeakGroupingIdentified.nf
+++ /dev/null
@@ -1,19 +0,0 @@
-process PeakGroupingIdentified {
-    label 'PeakGroupingIdentified'
-    container = 'docker://umcugenbioinf/dims:1.3'
-    shell = ['/bin/bash', '-euo', 'pipefail']
-
-    input:
-       path(SpectrumPeak_file)
-       path(HMDBpart_file)
-       path(pattern_file)
-
-    output:
-       path '*_negative.RData'
-       path '*_positive.RData'
-
-    script:
-        """
-        Rscript ${baseDir}/CustomModules/DIMS/PeakGroupingIdentified.R $SpectrumPeak_file $HMDBpart_file $pattern_file $params.resolution $params.ppm
-        """
-}

From 6d54b1405d843cfc9a5a92f0523d641b5250b041 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 1 Sep 2023 17:12:30 +0200
Subject: [PATCH 12/73] extra files for Nextflow pipeline

---
 DIMS/CollectFilled.R      |  83 ++++++
 DIMS/CollectFilled.nf     |  17 ++
 DIMS/CollectSumAdducts.R  |  25 ++
 DIMS/CollectSumAdducts.nf |  17 ++
 DIMS/GenerateExcel.R      | 579 ++++++++++++++++++++++++++++++++++++++
 DIMS/GenerateExcel.nf     |  18 ++
 DIMS/HMDBparts_main.R     |  64 +++++
 DIMS/HMDBparts_main.nf    |  20 ++
 DIMS/SumAdducts.R         | 122 ++++++++
 DIMS/SumAdducts.nf        |  18 ++
 10 files changed, 963 insertions(+)
 create mode 100755 DIMS/CollectFilled.R
 create mode 100644 DIMS/CollectFilled.nf
 create mode 100755 DIMS/CollectSumAdducts.R
 create mode 100644 DIMS/CollectSumAdducts.nf
 create mode 100644 DIMS/GenerateExcel.R
 create mode 100644 DIMS/GenerateExcel.nf
 create mode 100644 DIMS/HMDBparts_main.R
 create mode 100644 DIMS/HMDBparts_main.nf
 create mode 100755 DIMS/SumAdducts.R
 create mode 100644 DIMS/SumAdducts.nf

diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R
new file mode 100755
index 0000000..62be430
--- /dev/null
+++ b/DIMS/CollectFilled.R
@@ -0,0 +1,83 @@
+#!/usr/bin/Rscript
+## adapted from 10-collectSamplesFilled.R
+
+# define parameters 
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) cat("  ", arg, "\n")
+
+scripts_dir        <- cmd_args[1]
+ppm     <- as.numeric(cmd_args[2])
+z_score <- as.numeric(cmd_args[3])
+
+source(paste0(scripts_dir, "AddOnFunctions/mergeDuplicatedRows.R"))
+source(paste0(scripts_dir, "AddOnFunctions/statistics_z.R"))
+# source(paste0(scripts_dir, "AddOnFunctions/normalization_2.1.R"))
+
+# for each scan mode, collect all filled peak group lists
+scanmodes <- c("positive", "negative")
+
+for (scanmode in scanmodes) {
+  # get list of files
+  filled_files <- list.files("./", full.names=TRUE, pattern=scanmode)
+  # load files and combine into one object
+  outlist.tot <- NULL
+  for (i in 1:length(filled_files)) {
+    load(filled_files[i])
+    print(filled_files[i])
+    outlist.tot <- rbind(outlist.tot, peakgrouplist_filled)
+  }
+
+  # remove duplicates; peak groups with exactly the same m/z
+  outlist.tot <- mergeDuplicatedRows(outlist.tot)
+
+  # sort on mass
+  outlist.tot <- outlist.tot[order(outlist.tot[ ,"mzmed.pgrp"]),]
+
+  # load replication pattern
+  pattern_file <- paste0(scanmode, "_repl_pattern.RData")
+  repl_pattern <- get(load(pattern_file))
+
+  # Normalization: not done.
+  # if (normalization != "disabled") {
+  #   outlist.tot = normalization_2.1(outlist.tot, fileName, names(repl.pattern.filtered), on=normalization, assi_label="assi_HMDB")
+  # }
+
+  if (z_score == 1) {
+    outlist.stats <- statistics_z(outlist.tot, sortCol=NULL, adducts=FALSE)
+    nr.removed.samples <- length(which(repl_pattern[] == "character(0)"))
+    order.index.int <- order(colnames(outlist.stats)[8:(length(repl_pattern)-nr.removed.samples+7)])
+    outlist.stats.more <- cbind(outlist.stats[,1:7],
+                               outlist.stats[,(length(repl_pattern)-nr.removed.samples+8):(length(repl_pattern)-nr.removed.samples+8+6)],
+                               outlist.stats[,8:(length(repl_pattern)-nr.removed.samples+7)][order.index.int],
+                               outlist.stats[,(length(repl_pattern)-nr.removed.samples+5+10):ncol(outlist.stats)])
+  
+    tmp.index <- grep("_Zscore", colnames(outlist.stats.more), fixed = TRUE)
+    tmp.index.order <- order(colnames(outlist.stats.more[,tmp.index]))
+    tmp <- outlist.stats.more[,tmp.index[tmp.index.order]]
+    outlist.stats.more <- outlist.stats.more[,-tmp.index]
+    outlist.stats.more <- cbind(outlist.stats.more,tmp)
+    outlist.tot <- outlist.stats.more
+  }
+
+  # filter identified compounds
+  index.1 <- which((outlist.tot[,"assi_HMDB"]!="") & (!is.na(outlist.tot[,"assi_HMDB"])))
+  index.2 <- which((outlist.tot[,"iso_HMDB"]!="") & (!is.na(outlist.tot[,"iso_HMDB"])))
+  index <- union(index.1,index.2)
+  outlist.ident <- outlist.tot[index,]
+  outlist.not.ident <- outlist.tot[-index,]
+
+  if (z_score == 1) {
+    outlist.ident$ppmdev <- as.numeric(outlist.ident$ppmdev)
+    outlist.ident <- outlist.ident[which(outlist.ident["ppmdev"] >= -ppm & outlist.ident["ppmdev"] <= ppm),]
+  }
+  # NAs in theormz_noise 
+  outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] <- 0
+  outlist.ident$theormz_noise <- as.numeric(outlist.ident$theormz_noise)
+  outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] <- 0
+  outlist.ident$theormz_noise <- as.numeric(outlist.ident$theormz_noise)
+
+  # save output for identified peak groups (not.identified later)
+  # save(outlist.not.ident, file="./outlist_not_identified_", scanmode, ".RData")
+  save(outlist.ident, file=paste0("./outlist_identified_", scanmode, ".RData"))
+
+}
diff --git a/DIMS/CollectFilled.nf b/DIMS/CollectFilled.nf
new file mode 100644
index 0000000..c28516c
--- /dev/null
+++ b/DIMS/CollectFilled.nf
@@ -0,0 +1,17 @@
+process CollectFilled {
+    label 'CollectFilled'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(filled_files)
+       path(replication_pattern)
+
+    output:
+       path('outlist*.RData')
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R $params.scripts_dir $params.ppm $params.zscore
+        """
+}
diff --git a/DIMS/CollectSumAdducts.R b/DIMS/CollectSumAdducts.R
new file mode 100755
index 0000000..ae89cbc
--- /dev/null
+++ b/DIMS/CollectSumAdducts.R
@@ -0,0 +1,25 @@
+#!/usr/bin/Rscript
+## adapted from 12-collectSamplesAdded.R
+
+# define parameters 
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) cat("  ", arg, "\n", sep="")
+
+# outdir <- cmd_args[1]
+# scanmode <- cmd_args[2]
+scanmode <- c("positive", "negative")
+
+for (scanmode in scanmodes) {
+  # collect all AdductSums part files for each scanmode
+  adductsum_part_files <- list.files("./", pattern = scanmode)
+
+  outlist.tot <- NULL
+  for (i in 1:length(adductsum_part_files)) {
+    load(adductsum_part_files[i])
+    outlist.tot <- rbind(outlist.tot, adductsum)
+  }
+
+  # save output file
+  save(outlist.tot, file="/AdductSums_", scanmode, ".RData")
+}
+
diff --git a/DIMS/CollectSumAdducts.nf b/DIMS/CollectSumAdducts.nf
new file mode 100644
index 0000000..a72455c
--- /dev/null
+++ b/DIMS/CollectSumAdducts.nf
@@ -0,0 +1,17 @@
+process CollectSumAdducts {
+    tag {"DIMS CollectSumAdducts"}
+    label 'CollectSumAdducts'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(collect_file) // input files need to be linked, but called within R script
+
+    output:
+       path('AdductSums_*.RData')
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/CollectSumAdducts.R 
+        """
+}
diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R
new file mode 100644
index 0000000..7958f95
--- /dev/null
+++ b/DIMS/GenerateExcel.R
@@ -0,0 +1,579 @@
+#!/usr/bin/Rscript
+## adapted from 13-excelExport.R
+
+# define parameters 
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) cat("  ", arg, "\n", sep = "")
+
+# outdir <- cmd_args[1] #"/Users/nunen/Documents/Metab/test_set"
+init_filepath <- cmd_args[1]
+project <- cmd_args[2] #"test"
+dims_matrix <- cmd_args[3] #"DBS"
+hmdb <- cmd_args[4] #"/hpc/dbg_mz/tools/db/HMDB_with_info_relevance_IS_C5OH.RData"
+z_score <- as.numeric(cmd_args[5])
+
+# load required packages 
+library("ggplot2")
+library("reshape2")
+library("openxlsx")
+library("loder")
+suppressMessages(library("dplyr"))
+suppressMessages(library("stringr"))
+
+# Initialise and load data
+outdir <- "../../../"
+plot <- TRUE
+export <- TRUE
+control_label <- "C"
+case_label <- "P"
+imagesize_multiplier <- 2
+# init <- paste0(outdir, "/logs/init.RData")
+
+# load information on samples
+load(init_filepath)
+# load the HMDB file with info on biological relevance of metabolites
+load(hmdb) # load object rlvnc into global environment
+
+rundate <- Sys.Date()
+
+# create a directory for plots in project directory
+plotdir <- paste0(outdir, "/plots/adducts")
+dir.create(paste0(outdir, "/plots"), showWarnings = F)
+dir.create(plotdir, showWarnings = F)
+
+options(digits=16)
+
+# setwd(outdir)
+
+# sum positive and negative adductsums
+
+# Load pos and neg adduct sums
+load("/AdductSums_negative.RData")
+outlist.neg.adducts.HMDB <- outlist.tot
+
+load("/AdductSums_positive.RData")
+outlist.pos.adducts.HMDB <- outlist.tot
+rm(outlist.tot)
+
+# Only continue with patients (columns) that are in both pos and neg, so patients that are in both
+tmp <- intersect(colnames(outlist.neg.adducts.HMDB), colnames(outlist.pos.adducts.HMDB))
+outlist.neg.adducts.HMDB <- outlist.neg.adducts.HMDB[,tmp]
+outlist.pos.adducts.HMDB <- outlist.pos.adducts.HMDB[,tmp]
+
+# Find indexes of neg hmdb code that are also found in pos and vice versa
+index.neg <- which(rownames(outlist.neg.adducts.HMDB) %in% rownames(outlist.pos.adducts.HMDB))
+index.pos <- which(rownames(outlist.pos.adducts.HMDB) %in% rownames(outlist.neg.adducts.HMDB))
+
+# Get number of columns
+# Only continue with HMDB codes (rows) that were found in both pos and neg mode and remove last column (hmdb_name)
+tmp.pos <- outlist.pos.adducts.HMDB[rownames(outlist.pos.adducts.HMDB)[index.pos], 1:(dim(outlist.pos.adducts.HMDB)[2]-1)]
+tmp.hmdb_name.pos <- outlist.pos.adducts.HMDB[rownames(outlist.pos.adducts.HMDB)[index.pos], dim(outlist.pos.adducts.HMDB)[2]]
+tmp.pos.left <- outlist.pos.adducts.HMDB[-index.pos,]
+
+tmp.neg <- outlist.neg.adducts.HMDB[rownames(outlist.pos.adducts.HMDB)[index.pos], 1:(dim(outlist.neg.adducts.HMDB)[2]-1)]
+tmp.neg.left <- outlist.neg.adducts.HMDB[-index.neg,]
+
+# Combine positive and negative numbers and paste back HMDB column
+tmp <- apply(tmp.pos, 2,as.numeric) + apply(tmp.neg, 2,as.numeric)
+rownames(tmp) <- rownames(tmp.pos)
+tmp <- cbind(tmp, "HMDB_name"=tmp.hmdb_name.pos)
+outlist <- rbind(tmp, tmp.pos.left, tmp.neg.left)
+
+# Filter 
+peaksInList <- which(rownames(outlist) %in% rownames(rlvnc))
+outlist <- cbind(outlist[peaksInList,],as.data.frame(rlvnc[rownames(outlist)[peaksInList],]))
+# filter out all irrelevant HMDB's. the tibble::rownames is needed for the older version of dplyr on the HPC (it will reindex the rownames)
+outlist <- outlist %>% 
+  tibble::rownames_to_column('rowname') %>% 
+  filter(!grepl("Exogenous|Drug|exogenous", relevance)) %>%
+  tibble::column_to_rownames('rowname')
+
+# Add HMDB_code column with all the HMDB ID and sort on it
+outlist <- cbind(outlist, "HMDB_code" = rownames(outlist))
+outlist <- outlist[order(outlist[,"HMDB_code"]),]
+
+# Create excel
+filelist <- "AllPeakGroups"
+
+wb <- createWorkbook("SinglePatient")
+addWorksheet(wb, filelist)
+
+# small function for rounding numbers to x digits for numeric values
+round_df <- function(df, digits) {
+  # round all numeric variables
+  # df: data frame 
+  # digits: number of digits to round
+  numeric_columns <- sapply(df, mode) == 'numeric'
+  df[numeric_columns] <- round(df[numeric_columns], digits)
+  df
+}
+
+# Add Z-scores and create plots
+if (z_score == 1) {
+  # Statistics: Z-score
+  outlist <- cbind(plots = NA, outlist)
+  #outlist <- as.data.frame(outlist)
+  
+  startcol <- dim(outlist)[2] + 3 
+  
+  # Get columns with control intensities 
+  control_col_ids <- grep(control_label, colnames(outlist), fixed = TRUE)
+  control_columns <- outlist[, control_col_ids]
+  
+  # Get columns with patient intensities 
+  patient_col_ids <- grep(case_label, colnames(outlist), fixed = TRUE)
+  patient_columns <- outlist[, patient_col_ids]
+  
+  intensity_col_ids <- c(control_col_ids, patient_col_ids)
+  
+  # set intensities of 0 to NA?
+  outlist[,intensity_col_ids][outlist[,intensity_col_ids] == 0] <- NA
+  
+  # calculate mean and sd for Control group
+  outlist$avg.ctrls <- apply(control_columns, 1, function(x) mean(as.numeric(x),na.rm = TRUE))
+  outlist$sd.ctrls <- apply(control_columns, 1, function(x) sd(as.numeric(x),na.rm = TRUE))
+  
+  # Make and add columns with zscores
+  cnames.z <- NULL
+  for (i in intensity_col_ids) {
+    cname <- colnames(outlist)[i]
+    cnames.z <- c(cnames.z, paste(cname, "Zscore", sep="_"))
+    zscores.1col <- (as.numeric(as.vector(unlist(outlist[ , i]))) - outlist$avg.ctrls) / outlist$sd.ctrls
+    outlist <- cbind(outlist, zscores.1col)
+  }
+  colnames(outlist)[startcol:ncol(outlist)] <- cnames.z
+  
+  patient_ids <- unique(as.vector(unlist(lapply(strsplit(colnames(patient_columns), ".", fixed = TRUE), function(x) x[1]))))
+  patient_ids <- patient_ids[order(nchar(patient_ids), patient_ids)] # sorts 
+  
+  temp_png <- NULL
+  
+  # Iterate over every row, make boxplot, insert into excel, and make Zscore for every patient
+  for (p in 1:nrow(outlist)) {
+    # box plot 
+    hmdb_name <- rownames(outlist[p,])
+    
+    intensities <- list(as.numeric(as.vector(unlist(control_columns[p,]))))
+    labels <- c("C", patient_ids)
+    
+    for (i in 1:length(patient_ids)) {
+      id <- patient_ids[i]
+      # get all intensities that start with ex. P18. (so P18.1, P18.2, but not x_P18.1 and not P180.1)
+      p.int <- as.numeric(as.vector(unlist(outlist[p, names(patient_columns[1,])[startsWith(names(patient_columns[1,]), paste0(id, "."))]])))
+      intensities[[i+1]] <- p.int
+    }
+    
+    intensities <- setNames(intensities, labels)
+    
+    plot_width <- length(labels) * 16 + 90
+    
+    plot.new()
+    if (export) {
+      png(filename = paste0(plotdir, "/", hmdb_name, "_box.png"), 
+          width = plot_width, 
+          height = 280)
+    }
+    par(oma=c(2,0,0,0))
+    boxplot(intensities, 
+            col=c("green", rep("red", length(intensities)-1)),
+            names.arg = labels, 
+            las=2, 
+            main = hmdb_name) 
+    dev.off()
+    
+    file_png <- paste0(plotdir, "/", hmdb_name, "_box.png")
+    if (is.null(temp_png)) {
+      temp_png <- readPng(file_png)
+      img_dim <- dim(temp_png)[c(1,2)]
+      cell_dim <- img_dim * imagesize_multiplier
+      setColWidths(wb, filelist, cols = 1, widths = cell_dim[2]/20)
+    }
+    
+    insertImage(wb, 
+                filelist, 
+                file_png, 
+                startRow = p + 1, 
+                startCol = 1, 
+                height = cell_dim[1], 
+                width = cell_dim[2], 
+                units = "px")
+    
+    if (p %% 100 == 0) {
+      cat("at row: ", p, "\n")
+    }
+  }
+  
+  setRowHeights(wb, filelist, rows = c(1:nrow(outlist) + 1), heights = cell_dim[1]/4)
+  setColWidths(wb, filelist, cols = c(2:ncol(outlist)), widths = 20)
+} else {
+  setRowHeights(wb, filelist, rows = c(1:nrow(outlist)), heights = 18)
+  setColWidths(wb, filelist, cols = c(1:ncol(outlist)), widths = 20)
+}
+writeData(wb, sheet = 1, outlist, startCol = 1)
+xlsx_name <- paste0(outdir, "/", project, ".xlsx")
+saveWorkbook(wb,
+             xlsx_name,
+             overwrite = TRUE)
+cat(xlsx_name)
+rm(wb)
+
+write.table(outlist, file = paste(outdir, "allpgrps_stats.txt", sep = "/"))
+
+# INTERNE STANDAARDEN
+IS <- outlist[grep("Internal standard", outlist[,"relevance"], fixed = TRUE),]
+IS_codes <- rownames(IS)
+cat(IS_codes,"\n")
+
+# if all data from one samplename (for example P195.1) is filtered out in 3-averageTechReplicates because of too little data (threshold parameter) the init.RData (repl.pattern) will contain more sample_names then the peak data (IS), 
+# thus this data needs to be removed first, before the retrieval of the summed adducts. Write sample_names to a log file, to let user know that this sample_name contained no data.
+sample_names_nodata <- setdiff(names(repl.pattern),names(IS))
+if (!is.null(sample_names_nodata)) {
+  write.table(sample_names_nodata, file = paste(outdir, "sample_names_nodata.txt", sep = "/"), row.names = FALSE, col.names = FALSE, quote = FALSE)
+  cat(sample_names_nodata,"\n")
+  for (sample_name in sample_names_nodata) {
+    repl.pattern[[sample_name]] <- NULL
+  }}
+
+# Retrieve IS summed adducts
+IS_summed <- IS[c(names(repl.pattern), "HMDB_code")]
+IS_summed$HMDB.name <- IS$name
+IS_summed <- reshape2::melt(IS_summed, id.vars=c('HMDB_code','HMDB.name'))
+colnames(IS_summed) <- c('HMDB.code','HMDB.name','Sample','Intensity')
+IS_summed$Intensity <- as.numeric(IS_summed$Intensity)
+IS_summed$Matrix <- dims_matrix
+IS_summed$Rundate <- rundate
+IS_summed$Project <- project
+IS_summed$Intensity <- as.numeric(as.character(IS_summed$Intensity))
+
+# Retrieve IS positive mode
+IS_pos <- as.data.frame(subset(outlist.pos.adducts.HMDB,rownames(outlist.pos.adducts.HMDB) %in% IS_codes))
+IS_pos$HMDB_name <- IS[match(row.names(IS_pos),IS$HMDB_code,nomatch=NA),'name']
+IS_pos$HMDB.code <- row.names(IS_pos)
+IS_pos <- reshape2::melt(IS_pos, id.vars=c('HMDB.code','HMDB_name'))
+colnames(IS_pos) <- c('HMDB.code','HMDB.name','Sample','Intensity')
+IS_pos$Matrix <- dims_matrix
+IS_pos$Rundate <- rundate
+IS_pos$Project <- project
+IS_pos$Intensity <- as.numeric(as.character(IS_pos$Intensity))
+
+# Retrieve IS negative mode
+IS_neg <- as.data.frame(subset(outlist.neg.adducts.HMDB,rownames(outlist.neg.adducts.HMDB) %in% IS_codes))
+IS_neg$HMDB_name <- IS[match(row.names(IS_neg),IS$HMDB_code,nomatch=NA),'name']
+IS_neg$HMDB.code <- row.names(IS_neg)
+IS_neg <- reshape2::melt(IS_neg, id.vars=c('HMDB.code','HMDB_name'))
+colnames(IS_neg) <- c('HMDB.code','HMDB.name','Sample','Intensity')
+IS_neg$Matrix <- dims_matrix
+IS_neg$Rundate <- rundate
+IS_neg$Project <- project
+IS_neg$Intensity <- as.numeric(as.character(IS_neg$Intensity))
+
+# Save results
+save(IS_pos,IS_neg,IS_summed, file = paste0(outdir, "/", project, '_IS_results.RData'))
+
+# number of samples, for plotting length and width
+sample_count <- length(repl.pattern)
+
+# change the order of the x-axis summed plots to a natural sorted one
+Sample_naturalorder <- unique(as.character(IS_summed$Sample))
+Sample_naturalorder <- str_sort(Sample_naturalorder, numeric = TRUE)
+IS_summed$Sample_level <- factor(IS_summed$Sample, levels = c(Sample_naturalorder))
+IS_pos$Sample_level <- factor(IS_pos$Sample, levels = c(Sample_naturalorder))
+IS_neg$Sample_level <- factor(IS_neg$Sample, levels = c(Sample_naturalorder))
+
+##########
+##### bar plots with all IS
+##########
+
+# function for ggplot theme
+# theme for all IS bar plots
+theme_IS_bar <- function(myPlot) {
+  myPlot + 
+    scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
+    theme(
+      legend.position='none',
+      axis.text.x=element_text(angle = 90, hjust = 1, vjust = 0.5, size=6),
+      axis.text.y=element_text(size=6))
+}
+
+# ggplot functions
+IS_neg_bar_plot <- ggplot(IS_neg, aes(Sample_level, Intensity)) +
+  ggtitle("Interne Standaard (Neg)") +
+  geom_bar(aes(fill=HMDB.name),stat='identity') +
+  labs(x='',y='Intensity') +
+  facet_wrap(~HMDB.name, scales='free_y')
+
+IS_pos_bar_plot <- ggplot(IS_pos, aes(Sample_level, Intensity)) +
+  ggtitle("Interne Standaard (Pos)") +
+  geom_bar(aes(fill=HMDB.name),stat='identity') +
+  labs(x='',y='Intensity') +
+  facet_wrap(~HMDB.name, scales='free_y')
+
+IS_sum_bar_plot <- ggplot(IS_summed, aes(Sample_level, Intensity)) +
+  ggtitle("Interne Standaard (Summed)") +
+  geom_bar(aes(fill=HMDB.name),stat='identity') +
+  labs(x='',y='Intensity') +
+  facet_wrap(~HMDB.name, scales='free_y')
+
+# add theme to ggplot functions
+IS_neg_bar_plot <- theme_IS_bar(IS_neg_bar_plot)
+IS_pos_bar_plot <- theme_IS_bar(IS_pos_bar_plot)
+IS_sum_bar_plot <- theme_IS_bar(IS_sum_bar_plot)
+
+# save plots to disk
+plot_width <- 9 + 0.35 * sample_count
+ggsave(paste0(outdir, "/plots/IS_bar_all_neg.png"), plot=IS_neg_bar_plot, height=plot_width/2.5, width=plot_width, units="in")
+ggsave(paste0(outdir, "/plots/IS_bar_all_pos.png"), plot=IS_pos_bar_plot, height=plot_width/2.5, width=plot_width, units="in")
+ggsave(paste0(outdir, "/plots/IS_bar_all_sum.png"), plot=IS_sum_bar_plot, height=plot_width/2.5, width=plot_width, units="in")
+
+##########
+##### Line plots with all IS
+##########
+
+# function for ggplot theme
+# add smaller legend in the "all IS line plots", otherwise out-of-range when more than 13 IS lines
+theme_IS_line <- function(myPlot) {
+  myPlot + 
+    guides(
+      shape = guide_legend(override.aes = list(size = 0.5)),
+      color = guide_legend(override.aes = list(size = 0.5))) +
+    theme(
+      legend.title = element_text(size = 8), 
+      legend.text  = element_text(size = 6),
+      legend.key.size = unit(0.7, "line"),
+      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8))
+}
+
+# ggplot functions
+IS_neg_line_plot <- ggplot(IS_neg, aes(Sample_level, Intensity)) +
+  ggtitle("Interne Standaard (Neg)") +
+  geom_point(aes(col=HMDB.name)) +
+  geom_line(aes(col=HMDB.name, group=HMDB.name)) +
+  labs(x='',y='Intensity')
+
+IS_pos_line_plot <- ggplot(IS_pos, aes(Sample_level, Intensity)) +
+  ggtitle("Interne Standaard (Pos)") +
+  geom_point(aes(col = HMDB.name)) +
+  geom_line(aes(col = HMDB.name, group = HMDB.name)) +
+  labs(x = '', y = 'Intensity')
+
+IS_sum_line_plot <- ggplot(IS_summed, aes(Sample_level, Intensity)) +
+  ggtitle("Interne Standaard (Sum)") +
+  geom_point(aes(col = HMDB.name)) +
+  geom_line(aes(col = HMDB.name, group = HMDB.name)) +
+  labs(x = '', y = 'Intensity')
+
+# add theme to ggplot functions
+IS_sum_line_plot <- theme_IS_line(IS_sum_line_plot)
+IS_neg_line_plot <- theme_IS_line(IS_neg_line_plot)
+IS_pos_line_plot <- theme_IS_line(IS_pos_line_plot)
+
+# save plots to disk
+plot_width <- 8 + 0.2 * sample_count
+ggsave(paste0(outdir,"/plots/IS_line_all_neg.png"), plot = IS_neg_line_plot, height = plot_width/2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir,"/plots/IS_line_all_pos.png"), plot = IS_pos_line_plot, height = plot_width/2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir,"/plots/IS_line_all_sum.png"), plot = IS_sum_line_plot, height = plot_width/2.5, width = plot_width, units = "in")
+
+##########
+##### bar plots with a selection of IS
+##########
+IS_neg_selection <- c('2H2-Ornithine (IS)', '2H3-Glutamate (IS)', '2H2-Citrulline (IS)', '2H4_13C5-Arginine (IS)', '13C6-Tyrosine (IS)')
+IS_pos_selection <- c('2H4-Alanine (IS)', '13C6-Phenylalanine (IS)', '2H4_13C5-Arginine (IS)', '2H3-Propionylcarnitine (IS)', '2H9-Isovalerylcarnitine (IS)')
+IS_sum_selection <- c('2H8-Valine (IS)', '2H3-Leucine (IS)', '2H3-Glutamate (IS)', '2H4_13C5-Arginine (IS)', '13C6-Tyrosine (IS)')
+
+# add minimal intensity lines based on matrix (DBS or Plasma) and machine mode (neg, pos, sum)
+if (dims_matrix == "DBS"){
+  hline.data.neg <- 
+    data.frame(z = c(15000, 200000, 130000, 18000, 50000),
+               HMDB.name = IS_neg_selection)
+  hline.data.pos <- 
+    data.frame(z = c(150000, 3300000, 1750000, 150000, 270000),
+               HMDB.name = IS_pos_selection)
+  hline.data.sum <- 
+    data.frame(z = c(1300000, 2500000, 500000, 1800000, 1400000),
+               HMDB.name = IS_sum_selection)
+} else if (dims_matrix == "Plasma"){
+  hline.data.neg <- 
+    data.frame(z = c(6500, 100000, 75000, 7500, 25000),
+               HMDB.name = IS_neg_selection)
+  hline.data.pos <- 
+    data.frame(z = c(85000, 1000000, 425000, 70000, 180000),
+               HMDB.name = IS_pos_selection)
+  hline.data.sum <- 
+    data.frame(z = c(700000, 1250000, 150000, 425000, 300000),
+               HMDB.name = IS_sum_selection)
+} 
+
+# function for ggplot theme
+# see bar plots with all IS
+
+# ggplot functions
+IS_neg_selection_barplot <- ggplot(subset(IS_neg, HMDB.name %in% IS_neg_selection), aes(Sample_level, Intensity)) +
+  ggtitle("Interne Standaard (Neg)") +
+  geom_bar(aes(fill=HMDB.name),stat='identity') +
+  labs(x='',y='Intensity') +
+  facet_wrap(~HMDB.name, scales='free', ncol = 2) + 
+  if(exists("hline.data.neg")){geom_hline(aes(yintercept = z), subset(hline.data.neg, HMDB.name %in% IS_neg$HMDB.name))} #subset, if some IS have no data, no empty plots will be generated with a line)
+
+IS_pos_selection_barplot <- ggplot(subset(IS_pos, HMDB.name %in% IS_pos_selection), aes(Sample_level, Intensity)) +
+  ggtitle("Interne Standaard (Pos)") +
+  geom_bar(aes(fill=HMDB.name),stat='identity') +
+  labs(x='',y='Intensity') +
+  facet_wrap(~HMDB.name, scales='free', ncol = 2) +
+  if(exists("hline.data.pos")){geom_hline(aes(yintercept = z), subset(hline.data.pos, HMDB.name %in% IS_pos$HMDB.name))}
+
+IS_sum_selection_barplot <- ggplot(subset(IS_summed, HMDB.name %in% IS_sum_selection), aes(Sample_level, Intensity)) +
+  ggtitle("Interne Standaard (Sum)") +
+  geom_bar(aes(fill=HMDB.name),stat='identity') +
+  labs(x='',y='Intensity') +
+  facet_wrap(~HMDB.name, scales='free', ncol = 2) +
+  if(exists("hline.data.sum")){geom_hline(aes(yintercept = z), subset(hline.data.sum, HMDB.name %in% IS_summed$HMDB.name))}
+
+# add theme to ggplot functions
+IS_neg_selection_barplot <- theme_IS_bar(IS_neg_selection_barplot) 
+IS_pos_selection_barplot <- theme_IS_bar(IS_pos_selection_barplot)
+IS_sum_selection_barplot <- theme_IS_bar(IS_sum_selection_barplot) 
+
+# save plots to disk
+plot_width <- 9 + 0.35 * sample_count
+ggsave(paste0(outdir, "/plots/IS_bar_select_neg.png"), plot = IS_neg_selection_barplot, height = plot_width/2.0, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_bar_select_pos.png"), plot = IS_pos_selection_barplot, height = plot_width/2.0, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_bar_select_sum.png"), plot = IS_sum_selection_barplot, height = plot_width/2.0, width = plot_width, units = "in")
+
+##########
+##### line plots with a selection of IS
+##########
+
+# function for ggplot theme
+# see line plots with all IS
+
+# ggplot functions
+IS_neg_selection_lineplot <- ggplot(subset(IS_neg, HMDB.name %in% IS_neg_selection), aes(Sample_level, Intensity)) +
+  ggtitle("Interne Standaard (Neg)") +
+  geom_point(aes(col=HMDB.name)) +
+  geom_line(aes(col=HMDB.name, group=HMDB.name)) +
+  labs(x='',y='Intensity')
+
+IS_pos_selection_lineplot <- ggplot(subset(IS_pos, HMDB.name %in% IS_pos_selection), aes(Sample_level, Intensity)) +
+  ggtitle("Interne Standaard (Pos)") +
+  geom_point(aes(col=HMDB.name)) +
+  geom_line(aes(col=HMDB.name, group=HMDB.name)) +
+  labs(x='',y='Intensity')
+
+IS_sum_selection_lineplot <- ggplot(subset(IS_summed, HMDB.name %in% IS_sum_selection), aes(Sample_level, Intensity)) +
+  ggtitle("Interne Standaard (Sum)") +
+  geom_point(aes(col=HMDB.name)) +
+  geom_line(aes(col=HMDB.name, group=HMDB.name)) +
+  labs(x='',y='Intensity')
+
+# add theme to ggplot functions
+IS_neg_selection_lineplot <- theme_IS_line(IS_neg_selection_lineplot)
+IS_pos_selection_lineplot <- theme_IS_line(IS_pos_selection_lineplot)
+IS_sum_selection_lineplot <- theme_IS_line(IS_sum_selection_lineplot)
+
+# save plots to disk
+plot_width <- 8 + 0.2 * sample_count
+ggsave(paste0(outdir, "/plots/IS_line_select_neg.png"), plot = IS_neg_selection_lineplot, height = plot_width/2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_line_select_pos.png"), plot = IS_pos_selection_lineplot, height = plot_width/2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_line_select_sum.png"), plot = IS_sum_selection_lineplot, height = plot_width/2.5, width = plot_width, units = "in")
+
+
+### POSITIVE CONTROLS CHECK
+# these positive controls need to be in the samplesheet, in order to make the Pos_Contr.RData file
+# Positive control samples all have the format P1002.x, P1003.x and P1005.x (where x is a number)
+
+column_list <- colnames(outlist)
+patterns <- c("^(P1002\\.)[[:digit:]]+_", "^(P1003\\.)[[:digit:]]+_", "^(P1005\\.)[[:digit:]]+_")
+positive_controls_index <- grepl(pattern=paste(patterns, collapse="|"), column_list)
+positivecontrol_list <- column_list[positive_controls_index]
+
+if (z_score == 1) {
+  # find if one or more positive control samples are missing
+  pos_contr_warning <- c()
+  # any() grep because you get a vector of FALSE's and TRUE's. only one grep match is needed for each positive control
+  if (any(grep("^(P1002\\.)[[:digit:]]+_", positivecontrol_list)) && 
+      any(grep("^(P1003\\.)[[:digit:]]+_", positivecontrol_list)) && 
+      any(grep("^(P1005\\.)[[:digit:]]+_", positivecontrol_list))){
+    cat("All three positive controls are present")
+  } else {
+    pos_contr_warning <- paste0(c("positive controls list is not complete. Only ", positivecontrol_list, " is/are present"), collapse=" ")
+  }
+  # you need all positive control samples, thus starting the script only if all are available
+  if (length(pos_contr_warning) == 0) {
+    ### POSITIVE CONTROLS
+    # make positive control excel with specific HMDB_codes in combination with specific control samples
+    PA_sample_name <- positivecontrol_list[grepl("P1002", positivecontrol_list)] #P1001.x_Zscore
+    PKU_sample_name <- positivecontrol_list[grepl("P1003", positivecontrol_list)] #P1003.x_Zscore
+    LPI_sample_name <- positivecontrol_list[grepl("P1005", positivecontrol_list)] #P1005.x_Zscore
+    
+    PA_codes <- c('HMDB00824', 'HMDB00783', 'HMDB00123')
+    PKU_codes <- c('HMDB00159')
+    LPI_codes <- c('HMDB00904', 'HMDB00641', 'HMDB00182')
+    
+    PA_data <- outlist[PA_codes, c('HMDB_code','name', PA_sample_name)]
+    PA_data <- reshape2::melt(PA_data, id.vars = c('HMDB_code','name'))
+    colnames(PA_data) <- c('HMDB.code','HMDB.name','Sample','Zscore')
+    
+    PKU_data <- outlist[PKU_codes, c('HMDB_code','name', PKU_sample_name)]
+    PKU_data <- reshape2::melt(PKU_data, id.vars = c('HMDB_code','name'))
+    colnames(PKU_data) <- c('HMDB.code','HMDB.name','Sample','Zscore')
+    
+    LPI_data <- outlist[LPI_codes, c('HMDB_code','name', LPI_sample_name)]
+    LPI_data <- reshape2::melt(LPI_data, id.vars = c('HMDB_code','name'))
+    colnames(LPI_data) <- c('HMDB.code','HMDB.name','Sample','Zscore')
+    
+    Pos_Contr <- rbind(PA_data, PKU_data, LPI_data)
+    Pos_Contr$Zscore <- as.numeric(Pos_Contr$Zscore)
+    # extra information added to excel for future reference. made in beginning of this script
+    Pos_Contr$Matrix <- dims_matrix
+    Pos_Contr$Rundate <- rundate
+    Pos_Contr$Project <- project
+    
+    #Save results
+    save(Pos_Contr,file = paste0(outdir, "/", project, '_Pos_Contr.RData'))
+    Pos_Contr$Zscore <- round_df(Pos_Contr$Zscore, 2) # asked by Lab to round the number to 2 digits
+    write.xlsx(Pos_Contr, file = paste0(outdir, "/", project, '_Pos_Contr.xlsx'), sheetName = "Sheet1", col.names = TRUE, row.names = TRUE, append = FALSE)
+    
+  } else {
+    write.table(pos_contr_warning, file = paste(outdir, "positive_controls_warning.txt", sep = "/"), row.names = FALSE, col.names = FALSE, quote = FALSE)
+  }}
+
+### MISSING M/Z CHECK
+# check the outlist_identified_(negative/positive).RData files for missing m/z values and mention in the results mail
+
+# Load the outlist_identified files + remove the loaded files
+load(paste0(outdir,"/outlist_identified_negative.RData"))
+outlist.ident.neg <- outlist.ident
+load(paste0(outdir,"/outlist_identified_positive.RData"))
+outlist.ident.pos <- outlist.ident
+rm(outlist.ident)
+rm(outlist.not.ident)
+
+# check for missing m/z in negative and positive mode
+mode <- c("Negative", "Positive")
+index <- 1
+results_ident <- c() #empty results list
+outlist_ident_list <- list(outlist.ident.neg, outlist.ident.pos)
+for(outlist.ident in outlist_ident_list){
+  current_mode <- mode[index]
+  # retrieve all unique m/z values in whole numbers and check if all are available
+  mz_values <- as.numeric(unique(format(outlist.ident$mzmed.pgrp, digits=0)))
+  mz_range = seq(70, 599, by=1) #change accordingly to the machine m/z range. default = 70-600
+  mz_missing = c()
+  for (mz in mz_range){
+    if (!mz %in% mz_values) {
+      mz_missing <- c(mz_missing, mz)
+    } }
+  y <- mz_missing
+  # check if m/z are missing and make an .txt file with information
+  group_ident <- cumsum(c(1, abs(y[-length(y)] - y[-1]) > 1))
+  if(length(group_ident) > 1){
+    results_ident <- c(results_ident, paste0("Missing m/z values ", current_mode, " mode"))
+    results_ident <- c(results_ident, by(y, group_ident, identity))
+  } else {
+    results_ident <- c(results_ident, paste0(current_mode, " mode did not have missing mz values"))
+  }
+  index <- index + 1 # change to new mode in for loop
+}
+lapply(results_ident, write, file=paste(outdir, "missing_mz_warning.txt", sep = "/"), append=TRUE, ncolumns=1000)
+
+cat("Ready excelExport.R")
diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf
new file mode 100644
index 0000000..326c47e
--- /dev/null
+++ b/DIMS/GenerateExcel.nf
@@ -0,0 +1,18 @@
+process GenerateExcel {
+    tag {"DIMS GenerateExcel"}
+    label 'GenerateExcel'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(collect_file) // input files need to be linked, but called within R script
+       path(init_filepath) 
+
+    output:
+       path('AdductSums_*.RData')
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/GenerateExcel.R $init_filepath $params.analysis_id $params.matrix $params.relevance_file $params.zscore 
+        """
+}
diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R
new file mode 100644
index 0000000..09503a7
--- /dev/null
+++ b/DIMS/HMDBparts_main.R
@@ -0,0 +1,64 @@
+#!/usr/bin/Rscript
+## adapted from hmdb_part_adductSums.R
+
+# define parameters 
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) cat("  ", arg, "\n")
+
+db_path <- cmd_args[1] # location of HMDB db file
+breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData
+
+load(db_path)
+load(breaks_filepath)
+
+# Cut up HMDB minus adducts minus isotopes into small parts 
+
+# load(paste(outdir, "breaks.fwhm.RData", sep = "/"))
+# outdir <- paste(outdir, "hmdb_part_adductSums", sep = "/")
+# dir.create(outdir, showWarnings = FALSE)
+
+scanmodes <- c("positive", "negative")
+
+for (scanmode in scanmodes) {
+  if (scanmode == "negative") {
+    column_label <- "MNeg"
+    HMDB_add_iso <- HMDB_add_iso.Neg
+  } else if (scanmode == "positive") {
+    column_label <- "Mpos"
+    HMDB_add_iso <- HMDB_add_iso.Pos
+  }
+
+  # filter mass range meassured NB: remove the last comma?!
+  outlist <- HMDB_add_iso[which(HMDB_add_iso[ ,column_label] >= breaks_fwhm[1] & HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ]
+
+  # remove adducts and isotopes, put internal standard at the beginning
+  outlist_IS <- outlist[grep("IS", outlist[ , "CompoundName"], fixed=TRUE), ]
+  outlist <- outlist[grep("HMDB", rownames(outlist), fixed=TRUE), ]
+  outlist <- outlist[-grep("_", rownames(outlist), fixed=TRUE), ]
+  outlist <- rbind(outlist_IS, outlist)
+  # sort on m/z value
+  outlist <- outlist[order(outlist[ ,column_label]), ]
+
+  n <- dim(outlist)[1]
+  # size of hmdb parts in lines:
+  sub <- 2000
+  end <- 0
+  check <- 0
+
+  # generate hmdb parts
+  if (n >= sub & (floor(n/sub)) >= 2) {
+    for (i in 1:floor(n/sub)){
+      start <- -(sub-1) + i*sub
+      end <- i*sub
+      outlist_part <- outlist[c(start:end), ]
+      save(outlist_part, file=paste0(scanmode, "_hmdb.", i, ".RData"))
+    }
+  }
+}
+
+# finish last hmdb part
+start = end + 1
+end = n
+
+outlist_part = outlist[c(start:end),]
+save(outlist_part, file=paste0(scanmode, "_hmdb.", i+1, ".RData"))
diff --git a/DIMS/HMDBparts_main.nf b/DIMS/HMDBparts_main.nf
new file mode 100644
index 0000000..72fedfc
--- /dev/null
+++ b/DIMS/HMDBparts_main.nf
@@ -0,0 +1,20 @@
+process HMDBparts_main {
+    // Custom process to cut HMDB db into parts for main entry only, no adducts, no isotopes
+    tag {"DIMS HMDBparts_main"}
+    label 'HMDBparts_main'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+        path(hmdb_db_file)
+        path(breaks_file)
+
+    output:
+        path('*.RData')
+
+    script:
+
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/HMDBparts_main.R $hmdb_db_file $breaks_file 
+        """
+}
diff --git a/DIMS/SumAdducts.R b/DIMS/SumAdducts.R
new file mode 100755
index 0000000..8b08408
--- /dev/null
+++ b/DIMS/SumAdducts.R
@@ -0,0 +1,122 @@
+#!/usr/bin/Rscript
+## adapted from 11-runSumAdducts.R
+
+# define parameters 
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) cat("  ", arg, "\n", sep="")
+
+# collect_file <- cmd_args[1]
+hmdbpart_main_file <- cmd_args[1]
+scripts_dir <- cmd_args[2]
+z_score <- as.numeric(cmd_args[3])
+# outdir <- cmd_args[2]
+# scanmode <- cmd_args[3]
+# adducts <- cmd_args[4]
+
+# for debugging:
+print(hmdbpart_main_file)
+# NB: scripts_dir not used yet, but function SumAdducts needs to be placed in AddOnFunctions folder
+print(scripts_dir)
+print(z_score)
+
+if (grepl("positive_hmdb", hmdbpart_main_file)) { 
+  scanmode <- "positive" 
+  # for the adduct sum: include adducts M+Na (1) and M+K (2)
+  adducts = c(1,2)
+} else {
+    if (grepl("negative_hmdb", hmdbpart_main_file)) { 
+       scanmode <- "negative" 
+       # for the adduct sum: include adduct M+Cl (1) 
+       adducts <- c(1)
+    }
+}
+
+# load input files
+collect_file <- paste0("outlist_identified_", scanmode, ".RData")
+load(collect_file)
+repl_file <- paste0(scanmode, "_repl_pattern.RData")
+load(repl_file)
+outlist_part <- get(load(hmdbpart_main_file))
+
+# adducts=as.vector(unlist(strsplit(adducts, ",",fixed = TRUE)))
+
+# load(paste(outdir, "/outlist_identified_", scanmode, ".RData", sep=""))
+
+# Local and on HPC
+#batch = strsplit(file, "/",fixed = TRUE)[[1]]
+#batch = batch[length(batch)]
+#batch = strsplit(batch, ".",fixed = TRUE)[[1]][2]
+batch_number = strsplit(basename(hmdbpart_main_file), ".",fixed = TRUE)[[1]][2]
+print(batch_number)
+
+outlist.tot <- unique(outlist.ident)
+
+sumAdducts <- function(peaklist, theor_MZ, grpnames.long, adducts, batch_number, scanmode, outdir, z_score){
+  #theor_MZ = outlist_part
+  #grpnames.long = names(repl.pattern.filtered)
+  #peaklist = outlist.ident
+  #adducts = c(1) #for neg or c(1,2) for pos
+  #batch <- 300
+  #outdir <- "/Users/nunen/Documents/Metab/processed/zebrafish"
+  #scanmode <- "negative"
+  #z_score <- 0
+  
+  hmdb_codes <- rownames(theor_MZ)
+  hmdb_names <- theor_MZ[,1, drop=FALSE]
+  hmdb_names[] <- lapply(hmdb_names, as.character)
+  
+  # remove isotopes!!!
+  index <- grep("HMDB", hmdb_codes, fixed=TRUE)
+  hmdb_codes <- hmdb_codes[index]
+  hmdb_names <- hmdb_names[index,]
+  index = grep("_", rownames(hmdb_codes), fixed=TRUE)
+  if (length(index) > 0) hmdb_codes <- hmdb_codes[-index]
+  if (length(index) > 0) hmdb_names <- hmdb_names[-index]
+  
+  # negative
+  names <- NULL
+  adductsum <- NULL
+  names_long <- NULL
+  
+  if (length(hmdb_codes) != 0) {
+    
+    for(i in 1:length(hmdb_codes)){
+      #compound="HMDB00045"
+      compound <- hmdb_codes[i]
+      compound_plus <- c(compound,paste(compound, adducts, sep = "_"))
+      
+      # x=peaklist$HMDB_code[1]
+      metab <- unlist(lapply(peaklist$HMDB_code, function(x) {(length(intersect(unlist(strsplit(as.vector(x),";")),compound_plus))>0)}))
+      # peaklist[metab, "assi.hmdb"]
+      # which(metab==TRUE)
+      
+      total <- c()
+      
+      # peaklist[metab, c("mzmed.pgrp", "HMDB_code", "C34.1")]
+      # ints=peaklist[metab, c(7:(length(grpnames.long)+6))]
+      if (z_score == 1) {
+        ints <- peaklist[metab, c(15:(length(grpnames.long)+14))]
+      } else {
+        ints <- peaklist[metab, c(7:(length(grpnames.long)+6))]
+      }
+      total <- apply(ints, 2, sum)
+      
+      if (sum(total)!=0) {
+        #message(i)
+        names <- c(names, compound)
+        adductsum <- rbind(adductsum,total)
+        names_long <- c(names_long, hmdb_names[i])
+      }
+    }
+    
+    if (!is.null(adductsum)){ 
+      rownames(adductsum) <- names
+      adductsum <- cbind(adductsum, "HMDB_name"=names_long)
+      save(adductsum, file = paste(scanmode, "_", batch_number, "_SummedAdducts.RData", sep=""))
+    }
+  }  
+}
+
+
+sumAdducts(outlist.tot, outlist_part, names(repl_pattern_filtered), adducts, batch_number, scanmode, outdir, z_score)
+
diff --git a/DIMS/SumAdducts.nf b/DIMS/SumAdducts.nf
new file mode 100644
index 0000000..6a7be46
--- /dev/null
+++ b/DIMS/SumAdducts.nf
@@ -0,0 +1,18 @@
+process SumAdducts {
+    label 'SumAdducts'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(collect_file) // input files need to be linked, but called within R script
+       path(replication_pattern)
+       path(HMDBpart_main_file)
+
+    output:
+       path('*_SummedAdducts.RData')
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/SumAdducts.R $HMDBpart_main_file $params.scripts_dir $params.zscore
+        """
+}

From 03edd3295abca1740371b45ffb3f8dd5ed32ad74 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 20 Oct 2023 17:19:14 +0200
Subject: [PATCH 13/73] modified files and added new files

---
 DIMS/AssignToBins.R              |   1 -
 DIMS/AssignToBins.nf             |   3 +-
 DIMS/AverageTechReplicates.R     |   5 -
 DIMS/AverageTechReplicates.nf    |   9 +-
 DIMS/CollectFilled.R             |  22 +-
 DIMS/CollectFilled.nf            |   6 +-
 DIMS/CollectSumAdducts.R         |   7 +-
 DIMS/FillMissing.R               |  16 +-
 DIMS/FillMissing.nf              |   3 +-
 DIMS/GenerateBreaks.R            |   1 -
 DIMS/GenerateBreaks.nf           |   5 +-
 DIMS/GenerateExcel.R             | 135 ++++++-----
 DIMS/GenerateExcel.nf            |   4 +-
 DIMS/GenerateViolinPlots.R       | 381 +++++++++++++++++++++++++++++++
 DIMS/GenerateViolinPlots.nf      |  17 ++
 DIMS/HMDBparts.R                 |  18 +-
 DIMS/HMDBparts.nf                |   7 +-
 DIMS/HMDBparts_main.R            |  15 +-
 DIMS/HMDBparts_main.nf           |   2 +-
 DIMS/MakeInit.nf                 |   5 +-
 DIMS/PeakFinding.R               |   1 -
 DIMS/PeakFinding.nf              |   3 +-
 DIMS/PeakGrouping.R              |  28 +--
 DIMS/PeakGrouping.nf             |   3 +-
 DIMS/SpectrumPeakFinding.R       |   3 -
 DIMS/SpectrumPeakFinding.nf      |   8 +-
 DIMS/SumAdducts.R                |  46 +---
 DIMS/SumAdducts.nf               |   3 +-
 DIMS/ThermoRawFileParser.nf      |   6 +-
 DIMS/UnidentifiedCalcZscores.R   |  67 ++++++
 DIMS/UnidentifiedCalcZscores.nf  |  18 ++
 DIMS/UnidentifiedCollectPeaks.R  | 102 +++++++++
 DIMS/UnidentifiedCollectPeaks.nf |  20 ++
 DIMS/UnidentifiedFillMissing.R   |  50 ++++
 DIMS/UnidentifiedFillMissing.nf  |  18 ++
 DIMS/UnidentifiedPeakGrouping.R  |  93 ++++++++
 DIMS/UnidentifiedPeakGrouping.nf |  19 ++
 37 files changed, 954 insertions(+), 196 deletions(-)
 create mode 100644 DIMS/GenerateViolinPlots.R
 create mode 100644 DIMS/GenerateViolinPlots.nf
 create mode 100755 DIMS/UnidentifiedCalcZscores.R
 create mode 100644 DIMS/UnidentifiedCalcZscores.nf
 create mode 100755 DIMS/UnidentifiedCollectPeaks.R
 create mode 100644 DIMS/UnidentifiedCollectPeaks.nf
 create mode 100755 DIMS/UnidentifiedFillMissing.R
 create mode 100644 DIMS/UnidentifiedFillMissing.nf
 create mode 100755 DIMS/UnidentifiedPeakGrouping.R
 create mode 100644 DIMS/UnidentifiedPeakGrouping.nf

diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R
index 4e320d3..ae9c8b1 100644
--- a/DIMS/AssignToBins.R
+++ b/DIMS/AssignToBins.R
@@ -6,7 +6,6 @@ suppressPackageStartupMessages(library("xcms"))
 
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
-for (arg in cmd_args) cat("  ", arg, "\n", sep="")
 
 filepath        <- cmd_args[1] # location of mzML file
 breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData
diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf
index c7103cd..fbbb48e 100644
--- a/DIMS/AssignToBins.nf
+++ b/DIMS/AssignToBins.nf
@@ -1,10 +1,11 @@
 process AssignToBins {
+    tag "DIMS AssignToBins ${file_id}"
     label 'AssignToBins'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       tuple(file_id, path(mzML_filename), path(breaks_file))
+       tuple val(file_id), path(mzML_filename) , path(breaks_file)
 
     output:
        path("${file_id}.RData")
diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R
index b880f13..4001937 100644
--- a/DIMS/AverageTechReplicates.R
+++ b/DIMS/AverageTechReplicates.R
@@ -3,7 +3,6 @@
 
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
-for (arg in cmd_args) cat("  ", arg, "\n", sep="")
 
 init_filepath <- cmd_args[1]
 nr_replicates <- as.numeric(cmd_args[2])
@@ -12,16 +11,12 @@ dimsThresh    <- 100
 
 removeFromRepl.pat <- function(bad_samples, repl_pattern, nr_replicates) {
   # bad_samples=remove_pos
-  
   tmp <- repl_pattern
-  
   removeFromGroup <- NULL
   
   for (i in 1:length(tmp)){
     tmp2 <- repl_pattern[[i]]
-    
     remove <- NULL
-    
     for (j in 1:length(tmp2)){
       if (tmp2[j] %in% bad_samples){
         remove = c(remove, j)
diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf
index 26f9211..33567a3 100644
--- a/DIMS/AverageTechReplicates.nf
+++ b/DIMS/AverageTechReplicates.nf
@@ -1,4 +1,5 @@
 process AverageTechReplicates {
+    tag "DIMS AverageTechReplicates"
     label 'AverageTechReplicates'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
@@ -8,10 +9,10 @@ process AverageTechReplicates {
        path(init_filepath)
 
     output:
-       path '*_repl_pattern.RData', emit: patterns
-       path '*_avg.RData', emit: binned
-       path 'miss_infusions_negative.txt'
-       path 'miss_infusions_positive.txt'
+       path('*_repl_pattern.RData'), emit: patterns
+       path('*_avg.RData'), emit: binned
+       path('miss_infusions_negative.txt')
+       path('miss_infusions_positive.txt')
 
     script:
         """
diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R
index 62be430..e927a65 100755
--- a/DIMS/CollectFilled.R
+++ b/DIMS/CollectFilled.R
@@ -3,7 +3,6 @@
 
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
-for (arg in cmd_args) cat("  ", arg, "\n")
 
 scripts_dir        <- cmd_args[1]
 ppm     <- as.numeric(cmd_args[2])
@@ -23,7 +22,6 @@ for (scanmode in scanmodes) {
   outlist.tot <- NULL
   for (i in 1:length(filled_files)) {
     load(filled_files[i])
-    print(filled_files[i])
     outlist.tot <- rbind(outlist.tot, peakgrouplist_filled)
   }
 
@@ -44,7 +42,7 @@ for (scanmode in scanmodes) {
 
   if (z_score == 1) {
     outlist.stats <- statistics_z(outlist.tot, sortCol=NULL, adducts=FALSE)
-    nr.removed.samples <- length(which(repl_pattern[] == "character(0)"))
+    nr.removed.samples <- length(which(repl_pattern[]=="character(0)"))
     order.index.int <- order(colnames(outlist.stats)[8:(length(repl_pattern)-nr.removed.samples+7)])
     outlist.stats.more <- cbind(outlist.stats[,1:7],
                                outlist.stats[,(length(repl_pattern)-nr.removed.samples+8):(length(repl_pattern)-nr.removed.samples+8+6)],
@@ -59,25 +57,23 @@ for (scanmode in scanmodes) {
     outlist.tot <- outlist.stats.more
   }
 
-  # filter identified compounds
-  index.1 <- which((outlist.tot[,"assi_HMDB"]!="") & (!is.na(outlist.tot[,"assi_HMDB"])))
-  index.2 <- which((outlist.tot[,"iso_HMDB"]!="") & (!is.na(outlist.tot[,"iso_HMDB"])))
-  index <- union(index.1,index.2)
-  outlist.ident <- outlist.tot[index,]
-  outlist.not.ident <- outlist.tot[-index,]
+  outlist.ident <- outlist.tot
 
   if (z_score == 1) {
     outlist.ident$ppmdev <- as.numeric(outlist.ident$ppmdev)
     outlist.ident <- outlist.ident[which(outlist.ident["ppmdev"] >= -ppm & outlist.ident["ppmdev"] <= ppm),]
   }
-  # NAs in theormz_noise 
+  # take care of NAs in theormz_noise
   outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] <- 0
   outlist.ident$theormz_noise <- as.numeric(outlist.ident$theormz_noise)
   outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] <- 0
   outlist.ident$theormz_noise <- as.numeric(outlist.ident$theormz_noise)
 
-  # save output for identified peak groups (not.identified later)
-  # save(outlist.not.ident, file="./outlist_not_identified_", scanmode, ".RData")
-  save(outlist.ident, file=paste0("./outlist_identified_", scanmode, ".RData"))
+  # Extra output in Excel-readable format:
+  remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp")
+  remove_colindex <- which(colnames(outlist.ident) %in% remove_columns)
+  outlist.ident <- outlist.ident[ , -remove_colindex]
+  write.table(outlist.ident, file=paste0("outlist_identified_", scanmode, ".txt"), sep="\t", row.names = FALSE)
+  save(outlist.ident, file=paste0("outlist_identified_", scanmode, ".RData"))
 
 }
diff --git a/DIMS/CollectFilled.nf b/DIMS/CollectFilled.nf
index c28516c..a7819f4 100644
--- a/DIMS/CollectFilled.nf
+++ b/DIMS/CollectFilled.nf
@@ -1,14 +1,16 @@
 process CollectFilled {
+    tag "DIMS CollectFilled"
     label 'CollectFilled'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
        path(filled_files)
-       path(replication_pattern)
+       path(replication_pattern)  // input files need to be linked, but called within R script
 
     output:
-       path('outlist*.RData')
+       path('outlist*.txt')
+       path('outlist*.RData'), emit: filled_pgrlist
 
     script:
         """
diff --git a/DIMS/CollectSumAdducts.R b/DIMS/CollectSumAdducts.R
index ae89cbc..e935eb8 100755
--- a/DIMS/CollectSumAdducts.R
+++ b/DIMS/CollectSumAdducts.R
@@ -3,11 +3,8 @@
 
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
-for (arg in cmd_args) cat("  ", arg, "\n", sep="")
 
-# outdir <- cmd_args[1]
-# scanmode <- cmd_args[2]
-scanmode <- c("positive", "negative")
+scanmodes <- c("positive", "negative")
 
 for (scanmode in scanmodes) {
   # collect all AdductSums part files for each scanmode
@@ -20,6 +17,6 @@ for (scanmode in scanmodes) {
   }
 
   # save output file
-  save(outlist.tot, file="/AdductSums_", scanmode, ".RData")
+  save(outlist.tot, file=paste0("AdductSums_", scanmode, ".RData"))
 }
 
diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R
index 127c557..a2e29ff 100755
--- a/DIMS/FillMissing.R
+++ b/DIMS/FillMissing.R
@@ -13,9 +13,14 @@ resol   <- as.numeric(cmd_args[4])
 ppm     <- as.numeric(cmd_args[5])
 outdir <- "./"
 
+print(peakgrouplist_file)
+
 if (grepl("_pos", peakgrouplist_file)) { scanmode = "positive" } else
     if (grepl("_neg", peakgrouplist_file)) { scanmode = "negative" }
 
+print(scanmode)
+print(scripts_dir)
+
 # load in function scripts
 source(paste0(scripts_dir, "AddOnFunctions/replaceZeros.R"))
 source(paste0(scripts_dir, "AddOnFunctions/generateGaussian.R"))
@@ -31,12 +36,19 @@ source(paste0(scripts_dir, "AddOnFunctions/globalAssignments.HPC.R"))
 pattern_file <- paste0(scanmode, "_repl_pattern.RData")
 repl_pattern <- get(load(pattern_file))
 
+print(head(repl_pattern))
+
 # load peak group list and determine output file name
-load(peakgrouplist_file)
+outpgrlist_identified <- get(load(peakgrouplist_file))
+
+print(head(outpgrlist_identified))
+
 outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file)
 
+print(outputfile_name)
+
 # replace missing values (zeros) with random noise
-peakgrouplist_filled <- replaceZeros(outpgrlist.identified, repl_pattern, scanmode, resol, outdir, thresh, ppm)
+peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm)
 
 # save output 
 save(peakgrouplist_filled, file=paste0("./", outputfile_name))
diff --git a/DIMS/FillMissing.nf b/DIMS/FillMissing.nf
index 4749b87..3a1f61b 100644
--- a/DIMS/FillMissing.nf
+++ b/DIMS/FillMissing.nf
@@ -1,11 +1,12 @@
 process FillMissing {
+    tag "DIMS FillMissing ${GroupedList_file}"
     label 'FillMissing'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
        path(GroupedList_file)
-       path(replication_pattern)
+       path(replication_pattern) // input files need to be linked, but called within R script
 
     output:
        path('*_filled.RData')
diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R
index 38161fb..7aeb8c4 100644
--- a/DIMS/GenerateBreaks.R
+++ b/DIMS/GenerateBreaks.R
@@ -7,7 +7,6 @@ suppressPackageStartupMessages(library("xcms"))
 
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
-for (arg in cmd_args) cat("  ", arg, "\n", sep="")
 
 filepath <- cmd_args[1] # 1 of the mzML files
 outdir   <- cmd_args[2] 
diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf
index f60d5cc..593014b 100644
--- a/DIMS/GenerateBreaks.nf
+++ b/DIMS/GenerateBreaks.nf
@@ -1,14 +1,15 @@
 process GenerateBreaks {
+    tag "DIMS GenerateBreaks"
     label 'GenerateBreaks'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       tuple(file_id, path(mzML_file))
+       tuple val(file_id), path(mzML_file)
 
 
     output:
-       path 'breaks.fwhm.RData'
+       path('breaks.fwhm.RData')
 
     script:
         """
diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R
index 7958f95..a84c9d2 100644
--- a/DIMS/GenerateExcel.R
+++ b/DIMS/GenerateExcel.R
@@ -3,7 +3,6 @@
 
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
-for (arg in cmd_args) cat("  ", arg, "\n", sep = "")
 
 # outdir <- cmd_args[1] #"/Users/nunen/Documents/Metab/test_set"
 init_filepath <- cmd_args[1]
@@ -21,13 +20,15 @@ suppressMessages(library("dplyr"))
 suppressMessages(library("stringr"))
 
 # Initialise and load data
-outdir <- "../../../"
 plot <- TRUE
 export <- TRUE
 control_label <- "C"
 case_label <- "P"
 imagesize_multiplier <- 2
-# init <- paste0(outdir, "/logs/init.RData")
+# setting outdir to export files to the project directory
+outdir <- "../../../"
+# percentage of outliers to remove from calculation of robust scaler
+perc <- 5
 
 # load information on samples
 load(init_filepath)
@@ -43,15 +44,11 @@ dir.create(plotdir, showWarnings = F)
 
 options(digits=16)
 
-# setwd(outdir)
-
 # sum positive and negative adductsums
-
 # Load pos and neg adduct sums
-load("/AdductSums_negative.RData")
+load("AdductSums_negative.RData")
 outlist.neg.adducts.HMDB <- outlist.tot
-
-load("/AdductSums_positive.RData")
+load("AdductSums_positive.RData")
 outlist.pos.adducts.HMDB <- outlist.tot
 rm(outlist.tot)
 
@@ -113,7 +110,6 @@ if (z_score == 1) {
   # Statistics: Z-score
   outlist <- cbind(plots = NA, outlist)
   #outlist <- as.data.frame(outlist)
-  
   startcol <- dim(outlist)[2] + 3 
   
   # Get columns with control intensities 
@@ -123,12 +119,14 @@ if (z_score == 1) {
   # Get columns with patient intensities 
   patient_col_ids <- grep(case_label, colnames(outlist), fixed = TRUE)
   patient_columns <- outlist[, patient_col_ids]
-  
   intensity_col_ids <- c(control_col_ids, patient_col_ids)
   
   # set intensities of 0 to NA?
   outlist[,intensity_col_ids][outlist[,intensity_col_ids] == 0] <- NA
-  
+
+  # Extra output: save outlist as it is and use it to calculate robust scaler
+  outlist.noZ <- outlist
+
   # calculate mean and sd for Control group
   outlist$avg.ctrls <- apply(control_columns, 1, function(x) mean(as.numeric(x),na.rm = TRUE))
   outlist$sd.ctrls <- apply(control_columns, 1, function(x) sd(as.numeric(x),na.rm = TRUE))
@@ -142,13 +140,47 @@ if (z_score == 1) {
     outlist <- cbind(outlist, zscores.1col)
   }
   colnames(outlist)[startcol:ncol(outlist)] <- cnames.z
-  
+
+  # Extra output: calculate robust scaler (Zscores minus outliers in Controls)
+  # calculate mean and sd for Control group without outliers
+  outlist.noZ$avg.ctrls <- 0
+  outlist.noZ$sd.ctrls  <- 0
+
+  robust_scaler <- function(control_intensities, control_col_ids, perc=5) {
+    nr_toremove <- ceiling(length(control_col_ids)*perc/100)
+    sorted_control_intensities <- sort(as.numeric(control_intensities))
+    trimmed_control_intensities <- sorted_control_intensities[(nr_toremove+1):(length(sorted_control_intensities)-nr_toremove)]
+    return(trimmed_control_intensities)
+  }
+
+  # only calculate robust Z-scores if there are enough Controls
+  if (length(control_col_ids) > 10) {
+    for (metabolite_index in 1:nrow(outlist)) {
+      outlist.noZ$avg.ctrls[metabolite_index] <- mean(robust_scaler(outlist.noZ[metabolite_index, control_col_ids], control_col_ids, perc))
+      outlist.noZ$sd.ctrls[metabolite_index]  <-   sd(robust_scaler(outlist.noZ[metabolite_index, control_col_ids], control_col_ids, perc))
+    }
+  }
+
+  # Make and add columns with robust zscores
+  cnames.robust <- gsub("_Zscore", "_RobustZscore", cnames.z)
+  for (i in intensity_col_ids) {
+    zscores.1col <- (as.numeric(as.vector(unlist(outlist.noZ[ , i]))) - outlist.noZ$avg.ctrls) / outlist.noZ$sd.ctrls
+    outlist.noZ <- cbind(outlist.noZ, zscores.1col)
+  }
+  colnames(outlist.noZ)[startcol:ncol(outlist.noZ)] <- cnames.robust
+
+  # Extra output: metabolites filtered on relevance
+  save(outlist, file=paste0("AdductSums_filtered_Zscores.RData"))
+  write.table(outlist, file=paste0("AdductSums_filtered_Zscores.txt"), sep="\t", row.names = FALSE)
+  # Extra output: filtered metabolites with robust scaled Zscores
+  write.table(outlist.noZ, file=paste0("AdductSums_filtered_robustZ.txt"), sep="\t", row.names = FALSE)
+
+  # get the IDs of the patients 
   patient_ids <- unique(as.vector(unlist(lapply(strsplit(colnames(patient_columns), ".", fixed = TRUE), function(x) x[1]))))
   patient_ids <- patient_ids[order(nchar(patient_ids), patient_ids)] # sorts 
   
-  temp_png <- NULL
-  
   # Iterate over every row, make boxplot, insert into excel, and make Zscore for every patient
+  temp_png <- NULL
   for (p in 1:nrow(outlist)) {
     # box plot 
     hmdb_name <- rownames(outlist[p,])
@@ -224,18 +256,18 @@ IS <- outlist[grep("Internal standard", outlist[,"relevance"], fixed = TRUE),]
 IS_codes <- rownames(IS)
 cat(IS_codes,"\n")
 
-# if all data from one samplename (for example P195.1) is filtered out in 3-averageTechReplicates because of too little data (threshold parameter) the init.RData (repl.pattern) will contain more sample_names then the peak data (IS), 
+# if all data from one samplename (for example P195.1) is filtered out in 3-averageTechReplicates because of too little data (threshold parameter) the init.RData (repl_pattern) will contain more sample_names then the peak data (IS), 
 # thus this data needs to be removed first, before the retrieval of the summed adducts. Write sample_names to a log file, to let user know that this sample_name contained no data.
-sample_names_nodata <- setdiff(names(repl.pattern),names(IS))
+sample_names_nodata <- setdiff(names(repl_pattern),names(IS))
 if (!is.null(sample_names_nodata)) {
   write.table(sample_names_nodata, file = paste(outdir, "sample_names_nodata.txt", sep = "/"), row.names = FALSE, col.names = FALSE, quote = FALSE)
   cat(sample_names_nodata,"\n")
   for (sample_name in sample_names_nodata) {
-    repl.pattern[[sample_name]] <- NULL
+    repl_pattern[[sample_name]] <- NULL
   }}
 
 # Retrieve IS summed adducts
-IS_summed <- IS[c(names(repl.pattern), "HMDB_code")]
+IS_summed <- IS[c(names(repl_pattern), "HMDB_code")]
 IS_summed$HMDB.name <- IS$name
 IS_summed <- reshape2::melt(IS_summed, id.vars=c('HMDB_code','HMDB.name'))
 colnames(IS_summed) <- c('HMDB.code','HMDB.name','Sample','Intensity')
@@ -268,10 +300,10 @@ IS_neg$Project <- project
 IS_neg$Intensity <- as.numeric(as.character(IS_neg$Intensity))
 
 # Save results
-save(IS_pos,IS_neg,IS_summed, file = paste0(outdir, "/", project, '_IS_results.RData'))
+save(IS_pos, IS_neg, IS_summed, file = paste0(outdir, "/", project, '_IS_results.RData'))
 
 # number of samples, for plotting length and width
-sample_count <- length(repl.pattern)
+sample_count <- length(repl_pattern)
 
 # change the order of the x-axis summed plots to a natural sorted one
 Sample_naturalorder <- unique(as.character(IS_summed$Sample))
@@ -280,9 +312,7 @@ IS_summed$Sample_level <- factor(IS_summed$Sample, levels = c(Sample_naturalorde
 IS_pos$Sample_level <- factor(IS_pos$Sample, levels = c(Sample_naturalorder))
 IS_neg$Sample_level <- factor(IS_neg$Sample, levels = c(Sample_naturalorder))
 
-##########
-##### bar plots with all IS
-##########
+## bar plots with all IS
 
 # function for ggplot theme
 # theme for all IS bar plots
@@ -290,28 +320,28 @@ theme_IS_bar <- function(myPlot) {
   myPlot + 
     scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
     theme(
-      legend.position='none',
-      axis.text.x=element_text(angle = 90, hjust = 1, vjust = 0.5, size=6),
-      axis.text.y=element_text(size=6))
+      legend.position = 'none',
+      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size=6),
+      axis.text.y = element_text(size=6))
 }
 
 # ggplot functions
 IS_neg_bar_plot <- ggplot(IS_neg, aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Neg)") +
-  geom_bar(aes(fill=HMDB.name),stat='identity') +
-  labs(x='',y='Intensity') +
+  geom_bar(aes(fill=HMDB.name), stat='identity') +
+  labs(x='', y='Intensity') +
   facet_wrap(~HMDB.name, scales='free_y')
 
 IS_pos_bar_plot <- ggplot(IS_pos, aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Pos)") +
-  geom_bar(aes(fill=HMDB.name),stat='identity') +
-  labs(x='',y='Intensity') +
+  geom_bar(aes(fill=HMDB.name), stat='identity') +
+  labs(x='', y='Intensity') +
   facet_wrap(~HMDB.name, scales='free_y')
 
 IS_sum_bar_plot <- ggplot(IS_summed, aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Summed)") +
-  geom_bar(aes(fill=HMDB.name),stat='identity') +
-  labs(x='',y='Intensity') +
+  geom_bar(aes(fill=HMDB.name), stat='identity') +
+  labs(x='', y='Intensity') +
   facet_wrap(~HMDB.name, scales='free_y')
 
 # add theme to ggplot functions
@@ -325,9 +355,7 @@ ggsave(paste0(outdir, "/plots/IS_bar_all_neg.png"), plot=IS_neg_bar_plot, height
 ggsave(paste0(outdir, "/plots/IS_bar_all_pos.png"), plot=IS_pos_bar_plot, height=plot_width/2.5, width=plot_width, units="in")
 ggsave(paste0(outdir, "/plots/IS_bar_all_sum.png"), plot=IS_sum_bar_plot, height=plot_width/2.5, width=plot_width, units="in")
 
-##########
-##### Line plots with all IS
-##########
+## Line plots with all IS
 
 # function for ggplot theme
 # add smaller legend in the "all IS line plots", otherwise out-of-range when more than 13 IS lines
@@ -348,7 +376,7 @@ IS_neg_line_plot <- ggplot(IS_neg, aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Neg)") +
   geom_point(aes(col=HMDB.name)) +
   geom_line(aes(col=HMDB.name, group=HMDB.name)) +
-  labs(x='',y='Intensity')
+  labs(x='', y='Intensity')
 
 IS_pos_line_plot <- ggplot(IS_pos, aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Pos)") +
@@ -373,9 +401,7 @@ ggsave(paste0(outdir,"/plots/IS_line_all_neg.png"), plot = IS_neg_line_plot, hei
 ggsave(paste0(outdir,"/plots/IS_line_all_pos.png"), plot = IS_pos_line_plot, height = plot_width/2.5, width = plot_width, units = "in")
 ggsave(paste0(outdir,"/plots/IS_line_all_sum.png"), plot = IS_sum_line_plot, height = plot_width/2.5, width = plot_width, units = "in")
 
-##########
-##### bar plots with a selection of IS
-##########
+## bar plots with a selection of IS
 IS_neg_selection <- c('2H2-Ornithine (IS)', '2H3-Glutamate (IS)', '2H2-Citrulline (IS)', '2H4_13C5-Arginine (IS)', '13C6-Tyrosine (IS)')
 IS_pos_selection <- c('2H4-Alanine (IS)', '13C6-Phenylalanine (IS)', '2H4_13C5-Arginine (IS)', '2H3-Propionylcarnitine (IS)', '2H9-Isovalerylcarnitine (IS)')
 IS_sum_selection <- c('2H8-Valine (IS)', '2H3-Leucine (IS)', '2H3-Glutamate (IS)', '2H4_13C5-Arginine (IS)', '13C6-Tyrosine (IS)')
@@ -409,22 +435,22 @@ if (dims_matrix == "DBS"){
 # ggplot functions
 IS_neg_selection_barplot <- ggplot(subset(IS_neg, HMDB.name %in% IS_neg_selection), aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Neg)") +
-  geom_bar(aes(fill=HMDB.name),stat='identity') +
-  labs(x='',y='Intensity') +
+  geom_bar(aes(fill=HMDB.name), stat='identity') +
+  labs(x='', y='Intensity') +
   facet_wrap(~HMDB.name, scales='free', ncol = 2) + 
   if(exists("hline.data.neg")){geom_hline(aes(yintercept = z), subset(hline.data.neg, HMDB.name %in% IS_neg$HMDB.name))} #subset, if some IS have no data, no empty plots will be generated with a line)
 
 IS_pos_selection_barplot <- ggplot(subset(IS_pos, HMDB.name %in% IS_pos_selection), aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Pos)") +
-  geom_bar(aes(fill=HMDB.name),stat='identity') +
-  labs(x='',y='Intensity') +
+  geom_bar(aes(fill=HMDB.name), stat='identity') +
+  labs(x='', y='Intensity') +
   facet_wrap(~HMDB.name, scales='free', ncol = 2) +
   if(exists("hline.data.pos")){geom_hline(aes(yintercept = z), subset(hline.data.pos, HMDB.name %in% IS_pos$HMDB.name))}
 
 IS_sum_selection_barplot <- ggplot(subset(IS_summed, HMDB.name %in% IS_sum_selection), aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Sum)") +
-  geom_bar(aes(fill=HMDB.name),stat='identity') +
-  labs(x='',y='Intensity') +
+  geom_bar(aes(fill=HMDB.name), stat='identity') +
+  labs(x='', y='Intensity') +
   facet_wrap(~HMDB.name, scales='free', ncol = 2) +
   if(exists("hline.data.sum")){geom_hline(aes(yintercept = z), subset(hline.data.sum, HMDB.name %in% IS_summed$HMDB.name))}
 
@@ -439,9 +465,7 @@ ggsave(paste0(outdir, "/plots/IS_bar_select_neg.png"), plot = IS_neg_selection_b
 ggsave(paste0(outdir, "/plots/IS_bar_select_pos.png"), plot = IS_pos_selection_barplot, height = plot_width/2.0, width = plot_width, units = "in")
 ggsave(paste0(outdir, "/plots/IS_bar_select_sum.png"), plot = IS_sum_selection_barplot, height = plot_width/2.0, width = plot_width, units = "in")
 
-##########
-##### line plots with a selection of IS
-##########
+## line plots with a selection of IS
 
 # function for ggplot theme
 # see line plots with all IS
@@ -451,19 +475,19 @@ IS_neg_selection_lineplot <- ggplot(subset(IS_neg, HMDB.name %in% IS_neg_selecti
   ggtitle("Interne Standaard (Neg)") +
   geom_point(aes(col=HMDB.name)) +
   geom_line(aes(col=HMDB.name, group=HMDB.name)) +
-  labs(x='',y='Intensity')
+  labs(x='', y='Intensity')
 
 IS_pos_selection_lineplot <- ggplot(subset(IS_pos, HMDB.name %in% IS_pos_selection), aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Pos)") +
   geom_point(aes(col=HMDB.name)) +
   geom_line(aes(col=HMDB.name, group=HMDB.name)) +
-  labs(x='',y='Intensity')
+  labs(x='', y='Intensity')
 
 IS_sum_selection_lineplot <- ggplot(subset(IS_summed, HMDB.name %in% IS_sum_selection), aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Sum)") +
   geom_point(aes(col=HMDB.name)) +
   geom_line(aes(col=HMDB.name, group=HMDB.name)) +
-  labs(x='',y='Intensity')
+  labs(x='', y='Intensity')
 
 # add theme to ggplot functions
 IS_neg_selection_lineplot <- theme_IS_line(IS_neg_selection_lineplot)
@@ -539,15 +563,14 @@ if (z_score == 1) {
 
 ### MISSING M/Z CHECK
 # check the outlist_identified_(negative/positive).RData files for missing m/z values and mention in the results mail
-
+print("Nu in missing m/z check")
 # Load the outlist_identified files + remove the loaded files
-load(paste0(outdir,"/outlist_identified_negative.RData"))
+load(paste0(outdir, "RData/outlist_identified_negative.RData"))
 outlist.ident.neg <- outlist.ident
-load(paste0(outdir,"/outlist_identified_positive.RData"))
+load(paste0(outdir, "RData/outlist_identified_positive.RData"))
 outlist.ident.pos <- outlist.ident
 rm(outlist.ident)
 rm(outlist.not.ident)
-
 # check for missing m/z in negative and positive mode
 mode <- c("Negative", "Positive")
 index <- 1
diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf
index 326c47e..655ce0d 100644
--- a/DIMS/GenerateExcel.nf
+++ b/DIMS/GenerateExcel.nf
@@ -7,12 +7,14 @@ process GenerateExcel {
     input:
        path(collect_file) // input files need to be linked, but called within R script
        path(init_filepath) 
+       val(analysis_id) 
+       path(relevance_file)
 
     output:
        path('AdductSums_*.RData')
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/GenerateExcel.R $init_filepath $params.analysis_id $params.matrix $params.relevance_file $params.zscore 
+        Rscript ${baseDir}/CustomModules/DIMS/GenerateExcel.R $init_filepath $analysis_id $params.matrix $relevance_file $params.zscore 
         """
 }
diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R
new file mode 100644
index 0000000..bf2abe8
--- /dev/null
+++ b/DIMS/GenerateViolinPlots.R
@@ -0,0 +1,381 @@
+# For untargeted metabolomics, this tool calculates probability scores for 
+# metabolic disorders. In addition, it provides visual support with violin plots 
+# of the DIMS measurements for the lab specialists.
+# Input needed: 
+# 1. Excel file in which metabolites are listed with their intensities for
+#    controls (with C in samplename) and patients (with P in samplename) and their
+#    corresponding Z-scores. 
+# 2. All files from github: https://github.com/UMCUGenetics/dIEM
+
+library(dplyr) # tidytable is for other_isobaric.R (left_join)
+library(reshape2) # used in prepare_data.R
+library(data.table) # for function setDT
+library(openxlsx) # for opening Excel file
+library(ggplot2) # for plotting
+#library(gridExtra) # for table top highest/lowest
+
+# define parameters - check after addition to run.sh
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) {
+  cat("  ", arg, "\n", sep = "")
+}
+
+scripts_dir <- cmd_args[1] 
+run_name <- cmd_args[2] # run_name <- "Test_run_5"
+z_score <- as.numeric(cmd_args[3]) # calculate Z-scores or not? z_score <- 1
+
+# load functions
+source(paste0(scripts_dir, "AddOnFunctions/same_samplename.R"))
+source(paste0(scripts_dir, "AddOnFunctions/prepare_data.R"))
+source(paste0(scripts_dir, "AddOnFunctions/prepare_data_perpage.R"))
+source(paste0(scripts_dir, "AddOnFunctions/prepare_toplist.R"))
+source(paste0(scripts_dir, "AddOnFunctions/violin_plots.R"))
+source(paste0(scripts_dir, "AddOnFunctions/prepare_alarmvalues.R"))
+
+# The list of parameters can be shortened for HPC. Leave for now.
+top_nr_IEM <- 5 # number of diseases that score highest in algorithm to plot
+integer_list <- c(1:top_nr_IEM) # indices of top diseases
+threshold_IEM <- 5 # probability score cut-off for plotting the top diseases
+ratios_cutoff <- -5 # z-score cutoff of axis on the left for top diseases
+nr_plots_perpage <- 20 # number of violin plots per page in PDF
+
+# Settings from config.R
+# binary variable: run function, yes(1) or no(0). Can be removed at later stage
+if (z_score == 1) { algorithm <- ratios <- violin <- 1 } # ? Or put if statement in run.sh?
+# integer: are the sample names headers on row 1 or row 2 in the DIMS excel? (default 1)
+header_row = 1
+# column name where the data starts (default B)
+col_start <- "B"
+zscore_cutoff <- 5
+xaxis_cutoff <- 20
+
+# path to DIMS excel file
+# path_DIMSfile = "/Users/ihoek3/Documents/dIEM/voorbeeld_PL_Diagn17_RUN10.xlsx"
+path_DIMSfile <- paste0("./", run_name, ".xlsx") 
+
+# path: output folder 
+output_dir <- paste0("./dIEM") 
+dir.create(output_dir, showWarnings = F) 
+
+# folder in which all metabolite lists are (.txt)
+path_metabolite_groups <- "/hpc/dbg_mz/tools/db/metabolite_groups"
+### file for ratios step 3
+file_ratios_metabolites <- "/hpc/dbg_mz/tools/db/dIEM/Ratios_between_metabolites.csv"
+### file for algorithm step 4
+file_expected_biomarkers_IEM <- "/hpc/dbg_mz/tools/db/dIEM/Expected_biomarkers_IEM.csv"
+
+
+#### STEP 1: Preparation #### 
+# in: run_name, path_DIMSfile, header_row ||| out: output_dir, DIMS
+
+# Load the excel file.
+dims_xls <- readWorkbook(xlsxFile = path_DIMSfile, sheet = 1, startRow = header_row)
+if (exists("dims_xls")) {
+  cat(paste0("\nThe excel file is succesfully loaded:\n -> ",path_DIMSfile))
+} else {
+  cat(paste0("\n\n**** Error: Could not find an excel file. Please check if path to excel file is correct in config.R:\n -> ",path_DIMSfile,"\n"))
+}
+
+#### STEP 2: Edit DIMS data #####      
+# in: dims_xls ||| out: Data, nr_contr, nr_pat
+# Input: the xlsx file that comes out of the pipeline with format:
+# [plots] [C] [P] [summary columns] [C_Zscore] [P_Zscore]
+# Output: "_CSV.csv" file that is suited for the algorithm in shiny.
+
+# Determine the number of Contols and Patients in column names:
+nr_contr <- length(grep("C",names(dims_xls)))/2   # Number of control samples
+nr_pat <- length(grep("P",names(dims_xls)))/2     # Number of patient samples
+# total number of samples
+nrsamples <- nr_contr + nr_pat
+# check whether the number of intensity columns equals the number of Zscore columns
+if (nr_contr + nr_pat != length(grep("_Zscore", names(dims_xls)))) {
+  cat("\n**** Error: there aren't as many intensities listed as Zscores")
+}
+cat(paste0("\n\n------------\n", nr_contr, " controls \n", nr_pat, " patients\n------------\n\n"))
+
+# Move the columns HMDB_code and HMDB_name to the beginning. 
+HMDB_info_cols <- c(which(colnames(dims_xls) == "HMDB_code"), which(colnames(dims_xls) == "HMDB_name"))
+other_cols <- seq_along(1:ncol(dims_xls))[-HMDB_info_cols]
+dims_xls_copy <- dims_xls[ , c(HMDB_info_cols, other_cols)]
+# Remove the columns from 'name' to 'pathway'
+from_col <- which(colnames(dims_xls_copy) == "name")
+to_col   <- which(colnames(dims_xls_copy) == "pathway")
+dims_xls_copy <- dims_xls_copy[ , -c(from_col:to_col)]
+# in case the excel had an empty "plots" column, remove it
+if ("plots" %in% colnames(dims_xls_copy)) { 
+  dims_xls_copy <- dims_xls_copy[ , -grep("plots", colnames(dims_xls_copy))]
+} 
+# Rename columns 
+names(dims_xls_copy) <- gsub("avg.ctrls", "Mean_controls", names(dims_xls_copy))
+names(dims_xls_copy) <- gsub("sd.ctrls",  "SD_controls", names(dims_xls_copy))
+names(dims_xls_copy) <- gsub("HMDB_code", "HMDB.code", names(dims_xls_copy))
+names(dims_xls_copy) <- gsub("HMDB_name", "HMDB.name", names(dims_xls_copy))
+
+# intensity columns and mean and standard deviation of controls
+numeric_cols <- c(3:ncol(dims_xls_copy))
+# make sure all values are numeric
+dims_xls_copy[ , numeric_cols] <- sapply(dims_xls_copy[ , numeric_cols], as.numeric)
+
+if (exists("dims_xls_copy") & (length(dims_xls_copy) < length(dims_xls))) {
+  cat("\n### Step 2 # Edit dims data is done.\n")
+} else {
+  cat("\n**** Error: Could not execute step 2 \n")
+}
+
+#### STEP 3: Calculate ratios of intensities for metabolites ####      
+# in: ratios, file_ratios_metabolites, dims_xls_copy, nr_contr, nr_pat ||| out: Zscore (+file)
+# This script loads the file with Ratios (file_ratios_metabolites) and calculates 
+# the ratios of the intensities of the given metabolites. It also calculates
+# Zs-cores based on the avg and sd of the ratios of the controls.
+
+# Input: dataframe with intenstities and Zscores of controls and patients:
+# [HMDB.code] [HMDB.name] [C] [P] [Mean_controls] [SD_controls] [C_Zscore] [P_Zscore]
+
+# Output: "_CSV.csv" file that is suited for the algorithm, with format:
+# "_Ratios_CSV.csv" file, same file as above, but with ratio rows added.
+
+if (ratios == 1) { 
+  cat(paste0("\nloading ratios file:\n ->  ", file_ratios_metabolites, "\n"))
+  ratio_input <- read.csv(file_ratios_metabolites, sep=';', stringsAsFactors=FALSE)
+  
+  # Prepare empty data frame to fill with ratios
+  ratio_list <- setNames(data.frame(matrix(
+    ncol=ncol(dims_xls_copy),
+    nrow=nrow(ratio_input)
+  )), colnames(dims_xls_copy))
+
+  # put HMDB info into first two columns of ratio_list
+  ratio_list[ ,1:2] <- ratio_input[ ,1:2]
+
+  # look for intensity columns (exclude Zscore columns)
+  control_cols <- grep("C", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")])
+  patient_cols <- grep("P", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")])
+  intensity_cols <- c(control_cols, patient_cols)
+  # calculate each of the ratios of intensities 
+  for (ratio_index in 1:nrow(ratio_input)) {
+    ratio_numerator   <- ratio_input[ratio_index, "HMDB_numerator"] 
+    ratio_numerator   <- strsplit(ratio_numerator, "plus")[[1]]
+    ratio_denominator <- ratio_input[ratio_index, "HMDB_denominator"] 
+    ratio_denominator <- strsplit(ratio_denominator, "plus")[[1]]
+    # find these HMDB IDs in dataset. Could be a sum of multiple metabolites
+    sel_denominator <- sel_numerator <- c()
+    for (numerator_index in 1:length(ratio_numerator)) { 
+      sel_numerator <- c(sel_numerator, which(dims_xls_copy[ , "HMDB.code"] == ratio_numerator[numerator_index])) 
+    }
+    for (denominator_index in 1:length(ratio_denominator)) { 
+      sel_denominator <- c(sel_denominator, which(dims_xls_copy[ , "HMDB.code"] == ratio_denominator[denominator_index])) 
+    }
+    # calculate ratio
+    ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) /
+      apply(dims_xls_copy[sel_denominator, intensity_cols], 2, sum)
+    # calculate log of ratio
+    ratio_list[ratio_index, intensity_cols]<- log2(ratio_list[ratio_index, intensity_cols])
+  }
+
+  # Calculate means and SD's of the calculated ratios for Controls
+  ratio_list[ , "Mean_controls"] <- apply(ratio_list[ , control_cols], 1, mean)
+  ratio_list[ , "SD_controls"]   <- apply(ratio_list[ , control_cols], 1, sd)
+
+  # Calc z-scores with the means and SD's of Controls
+  zscore_cols <- grep("Zscore", colnames(ratio_list))
+  for (sample_index in 1:length(zscore_cols)) {
+    zscore_col <- zscore_cols[sample_index] 
+    # matching intensity column
+    int_col <- intensity_cols[sample_index]
+    # add test on column names
+    if (same_samplename(colnames(ratio_list)[int_col], colnames(ratio_list)[zscore_col])) {
+      # calculate Z-scores
+      ratio_list[ , zscore_col] <- (ratio_list[ , int_col] - ratio_list[ , "Mean_controls"]) / ratio_list[ , "SD_controls"]
+    }
+  }
+
+  # Add rows of the ratio hmdb codes to the data of zscores from the pipeline.
+  dims_xls_ratios <- rbind(ratio_list, dims_xls_copy)
+
+  # Edit the DIMS output Zscores of all patients in format:
+  # HMDB_code patientname1  patientname2
+  names(dims_xls_ratios) <- gsub("HMDB.code","HMDB_code", names(dims_xls_ratios))
+  names(dims_xls_ratios) <- gsub("HMDB.name", "HMDB_name", names(dims_xls_ratios))
+  # remove the string "_Zscore" from column names; this causes problems in step 4
+  # names(dims_xls_ratios) <- gsub("_Zscore", "", names(dims_xls_ratios)) 
+  
+  # Select only the cols with zscores of the patients 
+  zscore_patients <- dims_xls_ratios[ , c(1, 2, zscore_cols[grep("P", colnames(dims_xls_ratios)[zscore_cols])])]
+  # Select only the cols with zscores of the controls
+  zscore_controls <- dims_xls_ratios[ , c(1, 2, zscore_cols[grep("C", colnames(dims_xls_ratios)[zscore_cols])])]
+
+}
+
+#### STEP 4: Run the IEM algorithm #########      
+# in: algorithm, file_expected_biomarkers_IEM, zscore_patients ||| out: prob_score (+file)
+# algorithm taken from DOI: 10.3390/ijms21030979
+
+if (algorithm == 1) {
+  # Load data
+  cat(paste0("\nloading expected file:\n ->  ", file_expected_biomarkers_IEM, "\n"))
+  expected_biomarkers <- read.csv(file_expected_biomarkers_IEM, sep=';', stringsAsFactors=FALSE)
+  # modify column names
+  names(expected_biomarkers) <- gsub("HMDB.code", "HMDB_code",  names(expected_biomarkers))  
+  names(expected_biomarkers) <- gsub("Metabolite", "HMDB_name", names(expected_biomarkers))
+  
+  # prepare dataframe scaffold rank_patients
+  rank_patients <- zscore_patients
+  # Fill df rank_patients with the ranks for each patient
+  for (patient_index in 3:ncol(zscore_patients)) {
+    # number of positive zscores in patient
+    pos <- sum(zscore_patients[ , patient_index] > 0) 
+    # sort the column on zscore; NB: this sorts the entire object, not just one column
+    rank_patients <- rank_patients[order(-rank_patients[patient_index]), ]
+    # Rank all positive zscores highest to lowest
+    rank_patients[1:pos, patient_index] <- as.numeric(ordered(-rank_patients[1:pos, patient_index]))
+    # Rank all negative zscores lowest to highest
+    rank_patients[(pos+1):nrow(rank_patients), patient_index] <- as.numeric(ordered(rank_patients[(pos+1):nrow(rank_patients), patient_index]))
+  }
+  # NB: Warning message: In xtfrm.data.frame(x) : cannot xtfrm data frames. Ignore for now.
+
+  # Calculate metabolite score, using the dataframes with only values, and later add the cols without values (1&2).
+  expected_zscores <- merge(x=expected_biomarkers, y=zscore_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code"))
+  expected_zscores_original <- expected_zscores # necessary copy?
+
+  # determine which columns contain Z-scores and which contain disease info
+  select_zscore_cols <- grep("_Zscore", colnames(expected_zscores))
+  select_info_cols <- 1:(min(select_zscore_cols) -1)
+  # set some zscores to zero
+  select_incr_indisp <- which(expected_zscores$Change=="Increase" & expected_zscores$Dispensability=="Indispensable")
+  expected_zscores[select_incr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_incr_indisp, select_zscore_cols], function(x) ifelse (x <= 1.6 , 0, x))
+  select_decr_indisp <- which(expected_zscores$Change=="Decrease" & expected_zscores$Dispensability=="Indispensable")
+  expected_zscores[select_decr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_decr_indisp, select_zscore_cols], function(x) ifelse (x >= -1.2 , 0, x))
+
+  # calculate rank score: 
+  expected_ranks <- merge(x=expected_biomarkers, y=rank_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code"))
+  rank_scores <- expected_zscores[order(expected_zscores$HMDB_code), select_zscore_cols]/(expected_ranks[order(expected_ranks$HMDB_code), select_zscore_cols]*0.9)
+  # combine disease info with rank scores
+  expected_metabscore <- cbind(expected_ranks[order(expected_zscores$HMDB_code), select_info_cols], rank_scores)
+
+  # multiply weight score and rank score
+  weight_score <- expected_zscores
+  weight_score[ , select_zscore_cols] <- expected_metabscore$Total_Weight * expected_metabscore[ , select_zscore_cols]
+
+  # sort table on Disease and Absolute_Weight
+  weight_score <- weight_score[order(weight_score$Disease, weight_score$Absolute_Weight, decreasing = TRUE), ]
+
+  # select columns to check duplicates
+  dup <- weight_score[ , c('Disease', 'M.z')] 
+  uni <- weight_score[!duplicated(dup) | !duplicated(dup, fromLast=FALSE),]
+
+  # calculate probability score
+  prob_score <-  aggregate(uni[ , select_zscore_cols], uni["Disease"], sum)
+
+  # list of all diseases that have at least one metabolite Zscore at 0
+  for (patient_index in 2:ncol(prob_score)) {
+    patient_zscore_colname <- colnames(prob_score)[patient_index]
+    matching_colname_expected <- which(colnames(expected_zscores) == patient_zscore_colname)
+    # determine which Zscores are 0 for this patient
+    zscores_zero <- which(expected_zscores[ , matching_colname_expected] == 0)
+    # get Disease for these 
+    disease_zero <- unique(expected_zscores[zscores_zero, "Disease"])
+    # set the probability score of these diseases to 0 
+    prob_score[which(prob_score$Disease %in% disease_zero), patient_index]<- 0
+  }
+
+  # determine disease rank per patient
+  disease_rank <- prob_score 
+  # rank diseases in decreasing order
+  disease_rank[2:ncol(disease_rank)] <- lapply(2:ncol(disease_rank), function(x) as.numeric(ordered(-disease_rank[1:nrow(disease_rank), x])))
+  # modify column names, Zscores have now been converted to probability scores
+  colnames(prob_score) <- gsub("_Zscore","_prob_score", colnames(prob_score)) # redundant?
+  colnames(disease_rank) <- gsub("_Zscore","", colnames(disease_rank))
+
+  # Create conditional formatting for output excel sheet. Colors according to values.
+  wb <- createWorkbook()
+  addWorksheet(wb, "Probability Scores")
+  writeData(wb, "Probability Scores", prob_score)
+  conditionalFormatting(wb, "Probability Scores", cols = 2:ncol(prob_score), rows = 1:nrow(prob_score), type = "colourScale", style = c("white","#FFFDA2","red"), rule = c(1, 10, 100))
+  saveWorkbook(wb, file = paste0(output_dir,"/algoritme_output_", run_name, ".xlsx"), overwrite = TRUE)
+  # check whether prob_score df exists and has expected dimensions.
+  if (exists("expected_biomarkers") & (length(disease_rank) == length(prob_score))) {
+    cat("\n### Step 4 # Running the IEM algorithm is done.\n\n")
+  } else {
+    cat("\n**** Error: Could not run IEM algorithm. Check if path to expected_biomarkers csv-file is correct. \n")
+  }
+  
+  rm(wb)
+}
+
+#### STEP 5: Make violin plots #####      
+# in: algorithm / zscore_patients, violin, nr_contr, nr_pat, Data, path_textfiles, zscore_cutoff, xaxis_cutoff, top_diseases, top_metab, output_dir ||| out: pdf file
+
+if (violin == 1) { # make violin plots
+  
+  # preparation
+  # isobarics_txt <- c()
+  zscore_patients_copy <- zscore_patients 
+  # keep the original for testing purposes, remove later. 
+  colnames(zscore_patients) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_patients)) # for robust scaler
+  colnames(zscore_patients) <- gsub("_Zscore", "", colnames(zscore_patients))
+  colnames(zscore_controls) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_controls)) # for robust scaler
+  colnames(zscore_controls) <- gsub("_Zscore", "", colnames(zscore_controls))
+  
+  # Make patient list for violin plots
+  patient_list <- names(zscore_patients)[-c(1,2)]
+
+  # from table expected_biomarkers, choose selected columns 
+  select_columns <- c("Disease", "HMDB_code", "HMDB_name")
+  select_col_nrs <- which(colnames(expected_biomarkers) %in% select_columns)
+  expected_biomarkers_select <- expected_biomarkers[ , select_col_nrs]
+  # remove duplicates
+  expected_biomarkers_select <- expected_biomarkers_select[!duplicated(expected_biomarkers_select[ , c(1,2)]), ]
+  
+  # first step: normal violin plots
+  # Find all text files in the given folder, which contain metabolite lists of which
+  # each file will be a page in the pdf with violin plots.
+  # Make a PDF file for each of the categories in metabolite_dirs
+  metabolite_dirs <- list.files(path=path_metabolite_groups, full.names=FALSE, recursive=FALSE)
+  for (metabolite_dir in metabolite_dirs) {
+    # create a directory for the output PDFs
+    pdf_dir <- paste(output_dir, metabolite_dir, sep="/")
+    dir.create(pdf_dir, showWarnings=FALSE)
+    cat("making plots in category:", metabolite_dir, "\n")
+    
+    # get a list of all metabolite files
+    metabolite_files <- list.files(path=paste(path_metabolite_groups, metabolite_dir, sep="/"), pattern="*.txt", full.names=FALSE, recursive=FALSE)
+    # put all metabolites into one list
+    metab_list_all <- list()
+    metab_list_names <- c()
+    cat("making plots from the input files:")
+    # open the text files and add each to a list of dataframes (metab_list_all)
+    for (file_index in seq_along(metabolite_files)) {
+      infile <- metabolite_files[file_index]
+      metab_list <- read.table(paste(path_metabolite_groups, metabolite_dir, infile, sep="/"), sep = "\t", header = TRUE, quote="")
+      # put into list of all lists
+      metab_list_all[[file_index]] <- metab_list
+      metab_list_names <- c(metab_list_names, strsplit(infile, ".txt")[[1]][1])
+      cat(paste0("\n", infile))
+    } 
+    # include list of classes in metabolite list
+    names(metab_list_all) <- metab_list_names
+    
+    # prepare list of metabolites; max nr_plots_perpage on one page
+    metab_interest_sorted <- prepare_data(metab_list_all, zscore_patients)
+    metab_interest_controls <- prepare_data(metab_list_all, zscore_controls)
+    metab_perpage <- prepare_data_perpage(metab_interest_sorted, metab_interest_controls, nr_plots_perpage, nr_pat, nr_contr)
+    
+    # make violin plots per patient
+    for (pt_nr in 1:length(patient_list)) {
+      pt_name <- patient_list[pt_nr]
+      # for category Diagnostics, make list of metabolites that exceed alarm values for this patient
+      # for category Other, make list of top highest and lowest Z-scores for this patient
+      if (grepl("Diagnost", pdf_dir)) {
+        top_metab_pt <- prepare_alarmvalues(pt_name, metab_interest_sorted)
+      } else {
+        top_metab_pt <- prepare_toplist(pt_name, zscore_patients)
+      }
+
+      # generate normal violin plots
+      violin_plots(pdf_dir, pt_name, metab_perpage, top_metab_pt)
+      
+    } # end for pt_nr
+
+  } # end for metabolite_dir
+   
+} # end if violin = 1
diff --git a/DIMS/GenerateViolinPlots.nf b/DIMS/GenerateViolinPlots.nf
new file mode 100644
index 0000000..e580feb
--- /dev/null
+++ b/DIMS/GenerateViolinPlots.nf
@@ -0,0 +1,17 @@
+process GenerateViolinPlots {
+    tag {"DIMS GenerateViolinPlots"}
+    label 'GenerateViolinPlots'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(excel_file) // input files need to be linked, but called within R script
+
+    output:
+       path('*.pdf')
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/GenerateViolinPlots.R $params.scripts_dir $params.analysis_id $params.zscore 
+        """
+}
diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R
index 51490b6..06e0f2c 100644
--- a/DIMS/HMDBparts.R
+++ b/DIMS/HMDBparts.R
@@ -3,11 +3,11 @@
 
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
-for (arg in cmd_args) cat("  ", arg, "\n")
 
+#  Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file $params.hmdb_parts_files $params.standard_run $params.ppm
 db_path <- cmd_args[1] # location of HMDB db file
 breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData
-standard_run  <- cmd_args[3] # "yes"
+standard_run  <- cmd_args[4] # "yes"
 
 # Cut up entire HMDB into small parts based on the new binning/breaks 
 load(breaks_filepath)
@@ -15,18 +15,20 @@ load(breaks_filepath)
 # In case of a standard run (m/z 69-606) use external HMDB parts
 min_mz <- round(breaks_fwhm[1])
 max_mz <- round(breaks_fwhm[length(breaks_fwhm)])
+
 # test if standard mz range is used
-if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 600 & max_mz < 610) {
+if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < 610) {
   # skip generating HMDB parts
-  hmdb_parts_dir <- cmd_args[6] # "/hpc/dbg_mz/production/DIMS/hmdb_preparts/"
-  hmdb_parts <- list.files(hmdb_parts_dir, pattern=hmdb) # all files containing hmdb in file name
+  hmdb_parts_files <- cmd_args[3] 
+
+  hmdb_parts <- list.files(hmdb_parts_files, pattern="hmdb") # all files containing hmdb in file name
   for (hmdb_file in hmdb_parts) {
-    file.copy(paste(hmdb_parts_dir, hmdb_file, sep="/"), "./", recursive = TRUE)
+    file.copy(paste(hmdb_parts_files, hmdb_file, sep="/"), "./", recursive = TRUE)
   }
 } else { 
   # generate HMDB parts in case of non-standard mz range
   load(db_path)
-  ppm <- as.numeric(cmd_args[4])
+  ppm <- as.numeric(cmd_args[5])
 
   scanmodes <- c("positive", "negative")
 
@@ -109,7 +111,7 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 600 & max_mz <
         outlist_part <- outlist_i
         save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i + 1, "RData", sep="."), sep=""))
         check <- check + dim(outlist_part)[1]
-        cat("\n", "Check", check == dim(outlist)[1])
+        # cat("\n", "Check", check == dim(outlist)[1])
 
       }
     }
diff --git a/DIMS/HMDBparts.nf b/DIMS/HMDBparts.nf
index 7d58bfe..833b94d 100644
--- a/DIMS/HMDBparts.nf
+++ b/DIMS/HMDBparts.nf
@@ -1,16 +1,13 @@
 process HMDBparts {
-    // Custom process to cut HMDB db into parts
     tag {"DIMS HMDBparts"}
+    // Custom process to cut HMDB db into parts
     label 'HMDBparts'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-        // tuple(path(hmdb_db_file), path(breaks_file))
         path(hmdb_db_file)
         path(breaks_file)
-        // val(standard_run)
-        // val(ppm)
 
     output:
         path('*.RData')
@@ -18,6 +15,6 @@ process HMDBparts {
     script:
 
         """
-        Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file $params.standard_run $params.ppm
+        Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file $params.hmdb_parts_files $params.standard_run $params.ppm
         """
 }
diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R
index 09503a7..941d45f 100644
--- a/DIMS/HMDBparts_main.R
+++ b/DIMS/HMDBparts_main.R
@@ -3,7 +3,6 @@
 
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
-for (arg in cmd_args) cat("  ", arg, "\n")
 
 db_path <- cmd_args[1] # location of HMDB db file
 breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData
@@ -13,10 +12,6 @@ load(breaks_filepath)
 
 # Cut up HMDB minus adducts minus isotopes into small parts 
 
-# load(paste(outdir, "breaks.fwhm.RData", sep = "/"))
-# outdir <- paste(outdir, "hmdb_part_adductSums", sep = "/")
-# dir.create(outdir, showWarnings = FALSE)
-
 scanmodes <- c("positive", "negative")
 
 for (scanmode in scanmodes) {
@@ -41,7 +36,7 @@ for (scanmode in scanmodes) {
 
   n <- dim(outlist)[1]
   # size of hmdb parts in lines:
-  sub <- 2000
+  sub <- 1000
   end <- 0
   check <- 0
 
@@ -57,8 +52,8 @@ for (scanmode in scanmodes) {
 }
 
 # finish last hmdb part
-start = end + 1
-end = n
+start <- end + 1
+end <- n
 
-outlist_part = outlist[c(start:end),]
-save(outlist_part, file=paste0(scanmode, "_hmdb.", i+1, ".RData"))
+outlist_part <- outlist[c(start:end),]
+save(outlist_part, file = paste0(scanmode, "_hmdb.", i+1, ".RData"))
diff --git a/DIMS/HMDBparts_main.nf b/DIMS/HMDBparts_main.nf
index 72fedfc..3f9845b 100644
--- a/DIMS/HMDBparts_main.nf
+++ b/DIMS/HMDBparts_main.nf
@@ -1,6 +1,6 @@
 process HMDBparts_main {
+    tag "DIMS HMDBparts_main"
     // Custom process to cut HMDB db into parts for main entry only, no adducts, no isotopes
-    tag {"DIMS HMDBparts_main"}
     label 'HMDBparts_main'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf
index 4908047..48b1c9b 100644
--- a/DIMS/MakeInit.nf
+++ b/DIMS/MakeInit.nf
@@ -1,13 +1,14 @@
 process MakeInit {
+    tag "DIMS MakeInit"
     label 'MakeInit'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       tuple(path(samplesheet), val(nr_replicates))
+       tuple path(samplesheet), val(nr_replicates)
 
     output:
-       path 'init.RData'
+       path('init.RData')
 
     script:
         """
diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R
index 379a0fa..926cb77 100644
--- a/DIMS/PeakFinding.R
+++ b/DIMS/PeakFinding.R
@@ -3,7 +3,6 @@
 
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
-for (arg in cmd_args) cat("  ", arg, "\n", sep="")
 
 filepath        <- cmd_args[1]
 breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData
diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf
index 343c4c1..1752a74 100644
--- a/DIMS/PeakFinding.nf
+++ b/DIMS/PeakFinding.nf
@@ -1,11 +1,10 @@
 process PeakFinding {
+    tag "DIMS PeakFinding ${RData_file}"
     label 'PeakFinding'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       // path(RData_file)
-       // path(breaks_file)
        tuple(path(RData_file), path(breaks_file))
 
     output:
diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R
index 756c0a7..423fb69 100644
--- a/DIMS/PeakGrouping.R
+++ b/DIMS/PeakGrouping.R
@@ -3,19 +3,14 @@
 
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
-for (arg in cmd_args) cat("  ", arg, "\n", sep="")
 
 HMDB_part_file <- cmd_args[1]
-# SpecPeaks_file <- cmd_args[2]
-# pattern_file   <- cmd_args[3]
 ppm <- as.numeric(cmd_args[2])
 
 options(digits=16)
 
 # load part of the HMDB
 HMDB_add_iso <- get(load(HMDB_part_file)) 
-# load(HMDB_part_file) 
-# HMDB_add_iso <- outlist_part
 
 # determine appropriate scanmode based on HMDB_part_file 
 if (grepl("negative", basename(HMDB_part_file))) { scanmode <- "negative" } else
@@ -30,27 +25,16 @@ load(SpecPeaks_file)
 outlist.copy <- outlist.tot
 rm(outlist.tot)
 
-print(dim(outlist.copy))
-
 # load replication pattern
-# load(paste0("./", scanmode, "_repl_pattern", ".RData"))
 pattern_file <- paste0(scanmode, "_repl_pattern.RData")
 load(pattern_file)
-# load("./breaks.fwhm.RData")
 
 # determine appropriate column name in HMDB part 
 if (scanmode=="negative") { column_label <- "MNeg" } else { column_label <- "Mpos" }
 
-# for debugging:
-print(dim(HMDB_add_iso))
-print(scanmode)
-print(column_label)
-print(ppm)
-print(head(repl_pattern_filtered, 1))
-
 # Initialize
 outpgrlist.identified <- NULL
-outlist.grouped <- NULL
+list_of_peaks_used_in_peak_groups_identified <- NULL
 
 # First find peak groups identified based on HMDB masses
 while (dim(HMDB_add_iso)[1] > 0) { 
@@ -60,13 +44,11 @@ while (dim(HMDB_add_iso)[1] > 0) {
   reference_mass <- as.numeric(HMDB_add_iso[index, column_label])
   mass_tolerance <- (reference_mass * ppm) / 10^6
   
-  print(paste0("ref_mass ", reference_mass, " mtol ", mass_tolerance))
-
   # find the peaks in the dataset with corresponding m/z
   mzmed <- as.numeric(outlist.copy[ ,"mzmed.pkt"])
   selp <- which((mzmed > (reference_mass - mass_tolerance)) & (mzmed < (reference_mass + mass_tolerance)))
   tmplist <- outlist.copy[selp,,drop=FALSE]
-  outlist.grouped <- rbind(outlist.grouped, tmplist)
+  list_of_peaks_used_in_peak_groups_identified <- rbind(list_of_peaks_used_in_peak_groups_identified, tmplist)
 
   nrsamples <- length(selp)
   if (nrsamples > 0) {
@@ -188,9 +170,7 @@ while (dim(HMDB_add_iso)[1] > 0) {
 }
 
 
-print(head(outpgrlist.identified))
-
 # save peak list corresponding to masses in HMDB part
-# save(outlist.grouped, file=paste0(batch_number, "_", scanmode, "_all.RData"))
+save(list_of_peaks_used_in_peak_groups_identified, file = paste0(batch_number, "_", scanmode, "_peaks_used.RData"))
 # save peak group list, identified part
-save(outpgrlist.identified, file=paste0(batch_number, "_", scanmode, "_identified.RData"))
+save(outpgrlist.identified, file = paste0(batch_number, "_", scanmode, "_identified.RData"))
diff --git a/DIMS/PeakGrouping.nf b/DIMS/PeakGrouping.nf
index 89328fb..8da1e76 100644
--- a/DIMS/PeakGrouping.nf
+++ b/DIMS/PeakGrouping.nf
@@ -1,4 +1,5 @@
 process PeakGrouping {
+    tag "DIMS PeakGrouping ${HMDBpart_file}"
     label 'PeakGrouping'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
@@ -9,7 +10,7 @@ process PeakGrouping {
        path(pattern_file)
 
     output:
-       // path '*_all.RData', emit: peaklist_all
+       path '*_peaks_used.RData', emit: peaks_used
        path '*_identified.RData', emit: grouped_identified
 
     script:
diff --git a/DIMS/SpectrumPeakFinding.R b/DIMS/SpectrumPeakFinding.R
index 9935b05..a8f33f9 100644
--- a/DIMS/SpectrumPeakFinding.R
+++ b/DIMS/SpectrumPeakFinding.R
@@ -7,11 +7,8 @@ scanmodes <- c("positive", "negative")
 # Check whether all jobs terminated correct!
 notRun = NULL
 
-print("one")
-
 # collect spectrum peaks for each scanmode
 for (scanmode in scanmodes) {
-  print(scanmode)
   # load peak lists of all biological samples
   input_dir <- getwd() # "./"
   peaklist_files = list.files(input_dir, full.names=TRUE, pattern=paste("*_", scanmode, ".RData",sep=""))
diff --git a/DIMS/SpectrumPeakFinding.nf b/DIMS/SpectrumPeakFinding.nf
index 786131c..775cd2c 100644
--- a/DIMS/SpectrumPeakFinding.nf
+++ b/DIMS/SpectrumPeakFinding.nf
@@ -1,18 +1,18 @@
 process SpectrumPeakFinding {
+    tag "DIMS SpectrumPeakFinding"
     label 'SpectrumPeakFinding'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       path(RData_file)
-       path(replication_pattern)
+       path(RData_file) // input files need to be linked, but called within R script
+       path(replication_pattern) // input files need to be linked, but called within R script
 
     output:
        path 'SpectrumPeaks_*.RData'
-       // path 'SpectrumPeaks_positive.RData'
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/SpectrumPeakFinding.R $replication_pattern
+        Rscript ${baseDir}/CustomModules/DIMS/SpectrumPeakFinding.R 
         """
 }
diff --git a/DIMS/SumAdducts.R b/DIMS/SumAdducts.R
index 8b08408..716b473 100755
--- a/DIMS/SumAdducts.R
+++ b/DIMS/SumAdducts.R
@@ -3,21 +3,12 @@
 
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
-for (arg in cmd_args) cat("  ", arg, "\n", sep="")
 
 # collect_file <- cmd_args[1]
 hmdbpart_main_file <- cmd_args[1]
 scripts_dir <- cmd_args[2]
 z_score <- as.numeric(cmd_args[3])
-# outdir <- cmd_args[2]
-# scanmode <- cmd_args[3]
-# adducts <- cmd_args[4]
-
-# for debugging:
-print(hmdbpart_main_file)
 # NB: scripts_dir not used yet, but function SumAdducts needs to be placed in AddOnFunctions folder
-print(scripts_dir)
-print(z_score)
 
 if (grepl("positive_hmdb", hmdbpart_main_file)) { 
   scanmode <- "positive" 
@@ -38,29 +29,11 @@ repl_file <- paste0(scanmode, "_repl_pattern.RData")
 load(repl_file)
 outlist_part <- get(load(hmdbpart_main_file))
 
-# adducts=as.vector(unlist(strsplit(adducts, ",",fixed = TRUE)))
-
-# load(paste(outdir, "/outlist_identified_", scanmode, ".RData", sep=""))
-
-# Local and on HPC
-#batch = strsplit(file, "/",fixed = TRUE)[[1]]
-#batch = batch[length(batch)]
-#batch = strsplit(batch, ".",fixed = TRUE)[[1]][2]
-batch_number = strsplit(basename(hmdbpart_main_file), ".",fixed = TRUE)[[1]][2]
-print(batch_number)
+batch_number <- strsplit(basename(hmdbpart_main_file), ".",fixed = TRUE)[[1]][2]
 
 outlist.tot <- unique(outlist.ident)
 
 sumAdducts <- function(peaklist, theor_MZ, grpnames.long, adducts, batch_number, scanmode, outdir, z_score){
-  #theor_MZ = outlist_part
-  #grpnames.long = names(repl.pattern.filtered)
-  #peaklist = outlist.ident
-  #adducts = c(1) #for neg or c(1,2) for pos
-  #batch <- 300
-  #outdir <- "/Users/nunen/Documents/Metab/processed/zebrafish"
-  #scanmode <- "negative"
-  #z_score <- 0
-  
   hmdb_codes <- rownames(theor_MZ)
   hmdb_names <- theor_MZ[,1, drop=FALSE]
   hmdb_names[] <- lapply(hmdb_names, as.character)
@@ -81,28 +54,27 @@ sumAdducts <- function(peaklist, theor_MZ, grpnames.long, adducts, batch_number,
   if (length(hmdb_codes) != 0) {
     
     for(i in 1:length(hmdb_codes)){
-      #compound="HMDB00045"
       compound <- hmdb_codes[i]
       compound_plus <- c(compound,paste(compound, adducts, sep = "_"))
       
-      # x=peaklist$HMDB_code[1]
       metab <- unlist(lapply(peaklist$HMDB_code, function(x) {(length(intersect(unlist(strsplit(as.vector(x),";")),compound_plus))>0)}))
-      # peaklist[metab, "assi.hmdb"]
-      # which(metab==TRUE)
       
       total <- c()
       
-      # peaklist[metab, c("mzmed.pgrp", "HMDB_code", "C34.1")]
-      # ints=peaklist[metab, c(7:(length(grpnames.long)+6))]
+      # get the intensities for selected metabolite.
+      # NB: column numbers in previous version of code are incorrect 
       if (z_score == 1) {
-        ints <- peaklist[metab, c(15:(length(grpnames.long)+14))]
+        int_cols_C <- grep("C", colnames(peaklist)[1:which(colnames(peaklist) == "avg.ctrls")])
+        int_cols_P <- grep("P", colnames(peaklist)[1:which(colnames(peaklist) == "avg.ctrls")])
+        int_cols <- c(int_cols_C, int_cols_P)
+        ints <- peaklist[metab, int_cols]
+        # ints <- peaklist[metab, c(15:(length(grpnames.long)+14))]
       } else {
         ints <- peaklist[metab, c(7:(length(grpnames.long)+6))]
       }
       total <- apply(ints, 2, sum)
-      
+ 
       if (sum(total)!=0) {
-        #message(i)
         names <- c(names, compound)
         adductsum <- rbind(adductsum,total)
         names_long <- c(names_long, hmdb_names[i])
diff --git a/DIMS/SumAdducts.nf b/DIMS/SumAdducts.nf
index 6a7be46..531308e 100644
--- a/DIMS/SumAdducts.nf
+++ b/DIMS/SumAdducts.nf
@@ -1,11 +1,12 @@
 process SumAdducts {
+    tag "DIMS SumAdducts ${collect_file}"
     label 'SumAdducts'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
        path(collect_file) // input files need to be linked, but called within R script
-       path(replication_pattern)
+       path(replication_pattern) // input files need to be linked, but called within R script
        path(HMDBpart_main_file)
 
     output:
diff --git a/DIMS/ThermoRawFileParser.nf b/DIMS/ThermoRawFileParser.nf
index af86aa3..df91279 100644
--- a/DIMS/ThermoRawFileParser.nf
+++ b/DIMS/ThermoRawFileParser.nf
@@ -1,14 +1,14 @@
 process ConvertRawFile {
-    // Custom process to convert raw file to mzML format
     tag {"DIMS ConvertRawFile ${file_id}"}
+    // Custom process to convert raw file to mzML format
     label 'ThermoRawFileParser_1_1_11'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-        tuple(file_id, path(raw_file))
+        tuple val(file_id), path(raw_file)
 
     output:
-        tuple(file_id, path("${file_id}.mzML"))
+        tuple val(file_id), path("${file_id}.mzML")
 
     script:
 
diff --git a/DIMS/UnidentifiedCalcZscores.R b/DIMS/UnidentifiedCalcZscores.R
new file mode 100755
index 0000000..525bfc0
--- /dev/null
+++ b/DIMS/UnidentifiedCalcZscores.R
@@ -0,0 +1,67 @@
+#!/usr/bin/Rscript
+## adapted from 10-collectSamplesFilled.R
+
+# define parameters 
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) cat("  ", arg, "\n")
+
+scripts_dir        <- cmd_args[1]
+ppm     <- as.numeric(cmd_args[2])
+z_score <- as.numeric(cmd_args[3])
+
+source(paste0(scripts_dir, "AddOnFunctions/mergeDuplicatedRows.R"))
+source(paste0(scripts_dir, "AddOnFunctions/statistics_z.R"))
+# source(paste0(scripts_dir, "AddOnFunctions/normalization_2.1.R"))
+
+# for each scan mode, collect all filled peak group lists
+scanmodes <- c("positive", "negative")
+
+for (scanmode in scanmodes) {
+  # get list of files
+  # filled_files <- list.files("./", full.names=TRUE, pattern=scanmode)
+  filled_file <- paste0("./PeakGroupList_", scanmode, "_Unidentified_filled.RData")
+  print(filled_file)
+  # load file 
+  outlist.tot <- get(load(filled_file))
+
+  # remove duplicates; peak groups with exactly the same m/z
+  outlist.tot <- mergeDuplicatedRows(outlist.tot)
+
+  # sort on mass
+  outlist.tot <- outlist.tot[order(outlist.tot[ ,"mzmed.pgrp"]),]
+
+  # load replication pattern
+  pattern_file <- paste0(scanmode, "_repl_pattern.RData")
+  repl_pattern <- get(load(pattern_file))
+
+  # Normalization: not done.
+  # if (normalization != "disabled") {
+  #   outlist.tot = normalization_2.1(outlist.tot, fileName, names(repl.pattern.filtered), on=normalization, assi_label="assi_HMDB")
+  # }
+
+  if (z_score == 1) {
+    outlist.stats <- statistics_z(outlist.tot, sortCol=NULL, adducts=FALSE)
+    nr.removed.samples <- length(which(repl_pattern[]=="character(0)"))
+    order.index.int <- order(colnames(outlist.stats)[8:(length(repl_pattern)-nr.removed.samples+7)])
+    outlist.stats.more <- cbind(outlist.stats[,1:7],
+                               outlist.stats[,(length(repl_pattern)-nr.removed.samples+8):(length(repl_pattern)-nr.removed.samples+8+6)],
+                               outlist.stats[,8:(length(repl_pattern)-nr.removed.samples+7)][order.index.int],
+                               outlist.stats[,(length(repl_pattern)-nr.removed.samples+5+10):ncol(outlist.stats)])
+  
+    tmp.index <- grep("_Zscore", colnames(outlist.stats.more), fixed = TRUE)
+    tmp.index.order <- order(colnames(outlist.stats.more[,tmp.index]))
+    tmp <- outlist.stats.more[,tmp.index[tmp.index.order]]
+    outlist.stats.more <- outlist.stats.more[,-tmp.index]
+    outlist.stats.more <- cbind(outlist.stats.more,tmp)
+    outlist.tot <- outlist.stats.more
+  }
+
+  outlist.not.ident = outlist.tot
+
+  # Extra output in Excel-readable format:
+  remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp")
+  remove_colindex <- which(colnames(outlist.not.ident) %in% remove_columns)
+  outlist.not.ident <- outlist.not.ident[ , -remove_colindex]
+  write.table(outlist.not.ident, file=paste0("unidentified_outlist_", scanmode, ".txt"), sep="\t", row.names = FALSE)
+
+}
diff --git a/DIMS/UnidentifiedCalcZscores.nf b/DIMS/UnidentifiedCalcZscores.nf
new file mode 100644
index 0000000..1687b76
--- /dev/null
+++ b/DIMS/UnidentifiedCalcZscores.nf
@@ -0,0 +1,18 @@
+process UnidentifiedCalcZscores {
+    tag "DIMS UnidentifiedCalcZscores"
+    label 'UnidentifiedCalcZscores'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(unidentified_filled_files)
+       path(replication_pattern)  // input files need to be linked, but called within R script
+
+    output:
+       path('unidentified_outlist*.txt')
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedCalcZscores.R $params.scripts_dir $params.ppm $params.zscore
+        """
+}
diff --git a/DIMS/UnidentifiedCollectPeaks.R b/DIMS/UnidentifiedCollectPeaks.R
new file mode 100755
index 0000000..322a1c1
--- /dev/null
+++ b/DIMS/UnidentifiedCollectPeaks.R
@@ -0,0 +1,102 @@
+#!/usr/bin/Rscript
+## adapted from 7-collectSamplesGroupedHMDB.R
+
+# load required packages 
+# none 
+
+# define parameters 
+cmd_args <- commandArgs(trailingOnly = TRUE)
+
+ppm <- as.numeric(cmd_args[1])
+outdir <- "./" 
+
+scanmodes <- c("positive", "negative")
+
+for (scanmode in scanmodes) {
+  # get list of all files that contain lists of peaks that were used in identified peak grouping
+  files <- list.files("./", pattern = paste(scanmode, "_peaks_used.RData", sep=""))
+  # load the list of all peaks
+  load(paste0("SpectrumPeaks_", scanmode, ".RData")) #outlist.tot
+
+  # Make a list of indexes of peaks that have been identified, then remove these from the peaklist.
+  remove <- NULL
+  for (i in 1:length(files)) {
+    load(files[i]) # outlist.grouped, now called list_of_peaks_used_in_peak_groups_identified
+    remove <- c(remove, which(outlist.tot[ ,"mzmed.pkt"] %in% list_of_peaks_used_in_peak_groups_identified[i ,"mzmed.pkt"]))
+  }
+  outlist.rest <- outlist.tot[-remove, ]
+
+  # sort on mass
+  outlist <- outlist.rest[order(as.numeric(outlist.rest[ ,"mzmed.pkt"])),]
+
+  save(outlist, file=paste0(outdir, "/SpectrumPeaks_", scanmode, "_Unidentified.RData"))
+
+  # cut the unidentified peak list in parts of part_len length for parallel processing
+  # NB: the while statement below gives an error message. Skip the cutting into parts.
+  # part_len <- 10000
+  # nr_peaks <- dim(outlist)[1]
+  # end <- 0
+  # min_1_last <- part_len
+  # check <- 0
+  # outlist_i_min_1 <- NULL
+  # part_nr <- 0
+
+  # if (nr_peaks >= part_len & (floor(nr_peaks/part_len) - 1) >= 2) {
+  #   for (part_nr in 2:floor(nr_peaks/part_len) -1 ) {
+  #     start <- -(part_len-1) + part_nr*part_len
+  #     end <- part_nr*part_len
+  #     
+  #     if (part_nr > 1) {
+  #       outlist_i <- outlist[c(start:end),]
+  # 
+  #         n_moved <- 0
+       
+  #      # Calculate ppm and replace border, avoid cut within peakgroup!
+  #      while ((as.numeric(outlist_i[1, "mzmed.pkt"]) - as.numeric(outlist_i_min_1[min_1_last, "mzmed.pkt"])) * 1e+06/as.numeric(outlist_i[1, "mzmed.pkt"]) < ppm) {
+  #        outlist_i_min_1 <- rbind(outlist_i_min_1, outlist_i[1,])
+  #        outlist_i <- outlist_i[-1, ]
+  #        n_moved <- n_moved + 1
+  #      }
+      
+  #      # message(paste("Process", i-1,":", dim(outlist_i_min_1)[1]))
+  #      save(outlist_i_min_1, file = paste(outdir, paste(scanmode, paste("outlist_i_min_1", part_nr-1, "RData", sep="."), sep="_"), sep="/"))
+  #      check <- check + dim(outlist_i_min_1)[1]
+      
+  #      outlist_i_min_1 <- outlist_i
+  #      min_1_last <- dim(outlist_i_min_1)[1]
+      
+  #    } else {
+  #      outlist_i_min_1 = outlist[c(start:end), ]
+  #    }
+  #  }
+  #}
+
+  #start <- end + 1
+  #end <- nr_peaks
+  #outlist_i <- outlist[c(start:end), ]
+  #n_moved <- 0
+
+  #if(!is.null(outlist_i_min_1)){
+  #  # Calculate ppm and replace border, avoid cut within peakgroup!
+  #  while ((as.numeric(outlist_i[1,"mzmed.pkt"]) - as.numeric(outlist_i_min_1[min_1_last,"mzmed.pkt"]))*1e+06/as.numeric(outlist_i[1,"mzmed.pkt"]) < ppm) {
+  #    outlist_i_min_1 = rbind(outlist_i_min_1, outlist_i[1,])
+  #    outlist_i = outlist_i[-1,]
+  #    n_moved = n_moved + 1
+  #  }
+  
+  #  cat(paste("Process", i+1-1,":", dim(outlist_i_min_1)[1]))
+  #  save(outlist_i_min_1, file=paste(outdir, paste(scanmode, paste("outlist_i_min_1",i,"RData", sep="."), sep="_"), sep="/"))
+  #  check=check+dim(outlist_i_min_1)[1]
+  #}
+
+  #outlist_i_min_1=outlist_i
+  #cat("Process", i+2-1,":", dim(outlist_i_min_1)[1], "\n")
+  #save(outlist_i_min_1, file=paste(outdir, paste(scanmode, paste("outlist_i_min_1",i+1,"RData", sep="."), sep="_"), sep="/"))
+  #}
+
+  #check <- check + dim(outlist_i_min_1)[1]
+  #if (check==dim(outlist)[1]){
+  #  cat("Check is oke!\n")
+  #} else {
+  #  cat("Check is failed!\n")
+}
diff --git a/DIMS/UnidentifiedCollectPeaks.nf b/DIMS/UnidentifiedCollectPeaks.nf
new file mode 100644
index 0000000..475b0a4
--- /dev/null
+++ b/DIMS/UnidentifiedCollectPeaks.nf
@@ -0,0 +1,20 @@
+process UnidentifiedCollectPeaks {
+    tag "DIMS UnidentifiedCollectPeaks"
+    label 'UnidentifiedCollectPeaks'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(SpectrumPeaks_file)
+       path(PeakList_identified)
+
+    output:
+       path('SpectrumPeaks_*_Unidentified.RData')
+       // path('SpectrumPeaks_negative_Unidentified.RData')
+       // path('SpectrumPeaks_positive_Unidentified.RData')
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedCollectPeaks.R $SpectrumPeaks_file $params.ppm
+        """
+}
diff --git a/DIMS/UnidentifiedFillMissing.R b/DIMS/UnidentifiedFillMissing.R
new file mode 100755
index 0000000..edfc8a7
--- /dev/null
+++ b/DIMS/UnidentifiedFillMissing.R
@@ -0,0 +1,50 @@
+#!/usr/bin/Rscript
+# adapted from 9-runFillMissing.R
+
+# define parameters
+cmd_args <- commandArgs(trailingOnly = TRUE)
+for (arg in cmd_args) cat("  ", arg, "\n", sep="")
+
+# define parameters 
+peakgrouplist_file1 <- cmd_args[1]
+peakgrouplist_file2 <- cmd_args[2]
+scripts_dir        <- cmd_args[3]
+thresh  <- as.numeric(cmd_args[4])
+resol   <- as.numeric(cmd_args[5])
+ppm     <- as.numeric(cmd_args[6])
+outdir <- "./"
+
+print(scripts_dir)
+
+# load in function scripts
+source(paste0(scripts_dir, "AddOnFunctions/replaceZeros.R"))
+source(paste0(scripts_dir, "AddOnFunctions/generateGaussian.R"))
+source(paste0(scripts_dir, "AddOnFunctions/getFwhm.R"))
+source(paste0(scripts_dir, "AddOnFunctions/getSD.R"))
+source(paste0(scripts_dir, "AddOnFunctions/getArea.R"))
+source(paste0(scripts_dir, "AddOnFunctions/optimizeGauss.R"))
+source(paste0(scripts_dir, "AddOnFunctions/ident.hires.noise.HPC.R"))
+source(paste0(scripts_dir, "AddOnFunctions/elementInfo.R"))
+source(paste0(scripts_dir, "AddOnFunctions/globalAssignments.HPC.R"))
+
+peakgrouplist_files <- c(peakgrouplist_file1, peakgrouplist_file2)
+for (peakgrouplist_file in peakgrouplist_files) {
+
+  if (grepl("_pos", peakgrouplist_file)) { scanmode = "positive" } else
+      if (grepl("_neg", peakgrouplist_file)) { scanmode = "negative" }
+
+  # get replication pattern for sample names
+  pattern_file <- paste0(scanmode, "_repl_pattern.RData")
+  repl_pattern <- get(load(pattern_file))
+
+  # load peak group list and determine output file name
+  outpgrlist_identified <- get(load(peakgrouplist_file))
+
+  outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file)
+
+  # replace missing values (zeros) with random noise
+  peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm)
+
+  # save output 
+  save(peakgrouplist_filled, file=paste0("./", outputfile_name))
+}
diff --git a/DIMS/UnidentifiedFillMissing.nf b/DIMS/UnidentifiedFillMissing.nf
new file mode 100644
index 0000000..50c9783
--- /dev/null
+++ b/DIMS/UnidentifiedFillMissing.nf
@@ -0,0 +1,18 @@
+process UnidentifiedFillMissing {
+    tag "DIMS UnidentifiedFillMissing ${GroupedList_file}"
+    label 'UnidentifiedFillMissing'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(GroupedList_file)
+       path(replication_pattern) // input files need to be linked, but called within R script
+
+    output:
+       path('*_filled.RData')
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedFillMissing.R $GroupedList_file $params.scripts_dir $params.thresh $params.resolution $params.ppm
+        """
+}
diff --git a/DIMS/UnidentifiedPeakGrouping.R b/DIMS/UnidentifiedPeakGrouping.R
new file mode 100755
index 0000000..d7a3dee
--- /dev/null
+++ b/DIMS/UnidentifiedPeakGrouping.R
@@ -0,0 +1,93 @@
+#!/usr/bin/Rscript
+## adapted from 8-peakGrouping.rest.R
+
+# load required packages 
+# none 
+
+# define parameters 
+cmd_args <- commandArgs(trailingOnly = TRUE)
+
+resol <- as.numeric(cmd_args[1])
+ppm <- as.numeric(cmd_args[2])
+outdir <- "./"
+options(digits=16)
+
+print(list.files(outdir, pattern="RData"))
+
+scanmodes <- c("positive", "negative")
+
+# function for grouping unidentified peaks
+groupingRest <- function(outdir, unidentified_peaklist, scanmode, ppm) {
+  outlist.copy <- get(load(unidentified_peaklist))
+  # batch = strsplit(unidentified_peaklist, ".",fixed = TRUE)[[1]][2] 
+  load(paste0("./", scanmode, "_repl_pattern.RData"))
+  
+  outpgrlist <- NULL
+  
+  # Then group on highest peaks
+  range <- ppm*1e-06
+  startcol <- 7
+  
+  # while (max(as.numeric(outlist.copy[ , "height.pkt"])) > 0 ) {
+  while (dim(outlist.copy)[1] > 0) {
+    
+    sel <- which(as.numeric(outlist.copy[ , "height.pkt"]) == max(as.numeric(outlist.copy[ , "height.pkt"])))[1]
+    
+    # ppm range around max
+    mzref <- as.numeric(outlist.copy[sel, "mzmed.pkt"])
+    pkmin <- -(range*mzref - mzref)
+    pkmax <- 2*mzref-pkmin
+    
+    selp <- as.numeric(outlist.copy[ , "mzmed.pkt"]) > pkmin & as.numeric(outlist.copy[ , "mzmed.pkt"]) < pkmax
+    tmplist <- outlist.copy[selp,,drop=FALSE]
+    
+    nrsamples <- length(unique(tmplist[,"samplenr"]))
+    if (nrsamples > 0) {
+      
+      mzmed.pgrp <- mean(as.numeric(outlist.copy[selp, "mzmed.pkt"]))
+      mzmin.pgrp <- -(range*mzmed.pgrp - mzmed.pgrp)
+      mzmax.pgrp <- 2*mzmed.pgrp - mzmin.pgrp
+      
+      selp <- as.numeric(outlist.copy[ , "mzmed.pkt"]) > mzmin.pgrp & as.numeric(outlist.copy[ , "mzmed.pkt"]) < mzmax.pgrp
+      tmplist <- outlist.copy[selp,,drop=FALSE]
+      
+      # remove used peaks!!!
+      tmp <- as.vector(which(tmplist[,"height.pkt"]==-1))
+      if (length(tmp)>0) tmplist<-tmplist[-tmp,,drop=FALSE]
+      
+      nrsamples <- length(unique(tmplist[,"samplenr"]))
+      
+      fq.worst.pgrp <- as.numeric(max(outlist.copy[selp, "fq"]))
+      fq.best.pgrp <- as.numeric(min(outlist.copy[selp, "fq"]))
+      ints.allsamps <- rep(0, length(names(repl_pattern_filtered)))
+      names(ints.allsamps) <- names(repl_pattern_filtered) # same order as sample list!!!
+      
+      # Check for each sample if multiple peaks exists, if so take the sum!
+      labels <- unique(tmplist[,"samplenr"])
+      ints.allsamps[labels] <- as.vector(unlist(lapply(labels, function(x) {sum(as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"]))})))
+      
+      outpgrlist <- rbind(outpgrlist, c(mzmed.pgrp, fq.best.pgrp, fq.worst.pgrp, nrsamples, mzmin.pgrp, mzmax.pgrp, ints.allsamps,NA,NA,NA,NA))
+    }
+   
+    outlist.copy <- outlist.copy[-which(selp==TRUE),,drop=FALSE]
+  }
+  
+  outpgrlist <- as.data.frame(outpgrlist)  # ignore warnings of duplicate row names
+  colnames(outpgrlist)[1:6] <- c("mzmed.pgrp", "fq.best", "fq.worst", "nrsamples", "mzmin.pgrp", "mzmax.pgrp")
+  colnames(outpgrlist)[(length(repl_pattern_filtered)+7):ncol(outpgrlist)] <- c("assi_HMDB", "iso_HMDB", "HMDB_code", "theormz_HMDB")
+ 
+  return(outpgrlist) 
+  
+}
+
+
+for (scanmode in scanmodes) {
+  unidentified_peaklist <- paste0("./SpectrumPeaks_", scanmode, "_Unidentified.RData") 
+  # generate peak group lists of the unidentified peaks
+  outpgrlist <- groupingRest(outdir, unidentified_peaklist, scanmode, ppm=ppm)
+
+  # save output in RData format for further processing  
+  save(outpgrlist, file=paste0("PeakGroupList_", scanmode, "_Unidentified.RData"))
+  write.table(outpgrlist, file=paste0("PeakGroupList_", scanmode, "_Unidentified.txt"))
+}
+
diff --git a/DIMS/UnidentifiedPeakGrouping.nf b/DIMS/UnidentifiedPeakGrouping.nf
new file mode 100644
index 0000000..00fc576
--- /dev/null
+++ b/DIMS/UnidentifiedPeakGrouping.nf
@@ -0,0 +1,19 @@
+process UnidentifiedPeakGrouping {
+    tag "DIMS UnidentifiedPeakGrouping ${UnidentifiedSpectrumPeaks_file}"
+    label 'UnidentifiedPeakGrouping'
+    container = 'docker://umcugenbioinf/dims:1.3'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+       path(UnidentifiedSpectrumPeaks_file) // input files need to be linked, but called within R script
+       path(replication_pattern)  // input files need to be linked, but called within R script
+
+    output:
+       path('*_Unidentified.txt')
+       path('*_Unidentified.RData'), emit: grouped_unidentified
+
+    script:
+        """
+        Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedPeakGrouping.R $params.resolution $params.ppm
+        """
+}

From 657a7cdca8d9db879352e7177efbc676f5b04ad5 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Thu, 21 Dec 2023 08:24:06 +0100
Subject: [PATCH 14/73] first working version of full NF pipeline

---
 DIMS/AddOnFunctions/check_same_samplename.R |   4 +
 DIMS/AddOnFunctions/create_violin_plots.R   |  98 +++++++++++
 DIMS/AddOnFunctions/prepare_alarmvalues.R   |  32 ++++
 DIMS/AddOnFunctions/prepare_data.R          |  42 +++++
 DIMS/AddOnFunctions/prepare_data_perpage.R  |  47 +++++
 DIMS/AddOnFunctions/prepare_toplist.R       |  29 +++
 DIMS/AssignToBins.R                         |   5 +
 DIMS/AssignToBins.nf                        |   3 +-
 DIMS/AverageTechReplicates.R                |  61 ++++++-
 DIMS/AverageTechReplicates.nf               |   7 +-
 DIMS/CollectSumAdducts.nf                   |   2 +-
 DIMS/GenerateExcel.R                        |  22 ++-
 DIMS/GenerateExcel.nf                       |   3 +-
 DIMS/GenerateViolinPlots.R                  | 184 ++++++++++++++------
 DIMS/GenerateViolinPlots.nf                 |   8 +-
 DIMS/HMDBparts.nf                           |   2 +-
 DIMS/ThermoRawFileParser.nf                 |   2 +-
 17 files changed, 476 insertions(+), 75 deletions(-)
 create mode 100644 DIMS/AddOnFunctions/check_same_samplename.R
 create mode 100644 DIMS/AddOnFunctions/create_violin_plots.R
 create mode 100644 DIMS/AddOnFunctions/prepare_alarmvalues.R
 create mode 100644 DIMS/AddOnFunctions/prepare_data.R
 create mode 100644 DIMS/AddOnFunctions/prepare_data_perpage.R
 create mode 100644 DIMS/AddOnFunctions/prepare_toplist.R

diff --git a/DIMS/AddOnFunctions/check_same_samplename.R b/DIMS/AddOnFunctions/check_same_samplename.R
new file mode 100644
index 0000000..16ae9cf
--- /dev/null
+++ b/DIMS/AddOnFunctions/check_same_samplename.R
@@ -0,0 +1,4 @@
+# function to test whether intensity and Z-score columns match
+check_same_samplename <- function(int_col_name, zscore_col_name) {
+  paste0(int_col_name, "_Zscore") == zscore_col_name
+}
diff --git a/DIMS/AddOnFunctions/create_violin_plots.R b/DIMS/AddOnFunctions/create_violin_plots.R
new file mode 100644
index 0000000..35e6afb
--- /dev/null
+++ b/DIMS/AddOnFunctions/create_violin_plots.R
@@ -0,0 +1,98 @@
+create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt=NULL) {
+
+  # set parameters for plots
+  plot_height <- 9.6 
+  plot_width <- 6
+  fontsize <- 1 
+  nr_plots_perpage <- 20
+  circlesize <- 0.8 
+  colors_4plot <- c("#22E4AC", "#00B0F0", "#504FFF","#A704FD","#F36265","#DA0641")
+  #                   green     blue      blue/purple purple    orange    red
+  
+  # patient plots, create the PDF device
+  if (grepl("Diagnostics", pdf_dir)) { 
+    prefix <- "Dx_" 
+  } else if (grepl("IEM", pdf_dir)) { 
+    prefix <- "IEM_" 
+  } else { 
+    prefix <- "R_" 
+  }
+
+  pdf(paste0(pdf_dir, "/", prefix, pt_name, ".pdf"), 
+      onefile = TRUE,
+      width = plot_width, 
+      height = plot_height) 
+  
+  # page headers:
+  page_headers <- names(metab_perpage)
+  
+  # put table into PDF file, if not empty
+  if (!is.null(dim(top_metab_pt))) {
+    plot.new()
+    # get the names and numbers in the table aligned
+    table_theme <- ttheme_default(core = list(fg_params = list(hjust=0, x=0.05, fontsize=6)),
+                                  colhead = list(fg_params = list(fontsize=8, fontface="bold")))
+    grid.table(top_metab_pt, theme = table_theme, rows = NULL)
+    # g <- tableGrob(top_metab_pt)
+    # grid.draw(g)
+    text(x=0.45, y=1.02, paste0("Top deviating metabolites for patient: ", pt_name), font=1, cex=1)
+  }
+  
+  # violin plots
+  for (page_index in 1:length(metab_perpage)) {
+    # extract list of metabolites to plot on a page
+    metab_list_2plot <- metab_perpage[[page_index]]
+    # cut off Z-scores higher than 20 or lower than -5 (for nicer plots)
+    metab_list_2plot$value[metab_list_2plot$value >  20] <-  20
+    metab_list_2plot$value[metab_list_2plot$value <  -5] <-  -5
+    # extract data for patient of interest (pt_name)
+    pt_list_2plot <- metab_list_2plot[which(metab_list_2plot$variable == pt_name), ]
+    # remove patient of interest (pt_name) from list; violins will be made up of controls and other patients
+    metab_list_2plot <- metab_list_2plot[-which(metab_list_2plot$variable == pt_name), ]
+    # subtitle per page
+    sub_perpage <- gsub("_", " ", page_headers[page_index])
+    # for IEM plots, put subtitle on two lines
+    sub_perpage <- gsub("probability", "\nprobability", sub_perpage)
+    
+    # draw violin plot. shape=22 gives square for patient of interest
+    ggplot_object <- ggplot(metab_list_2plot, aes(x=value, y=HMDB_name)) +
+      theme(axis.text.y=element_text(size=rel(fontsize)), plot.caption = element_text(size=rel(fontsize))) +
+      xlim(-5, 20) +
+      geom_violin(scale="width") +
+      geom_point(data = pt_list_2plot, aes(color=value), size = 3.5*circlesize, shape=22, fill="white") +
+      scale_fill_gradientn(colors = colors_4plot, values = NULL, space = "Lab", na.value = "grey50", guide = "colourbar", aesthetics = "colour") +
+      # add labels. Use font Courier to get all the plots in the same location.
+      labs(x = "Z-scores", y = "Metabolites", subtitle = sub_perpage, color = "z-score") + 
+      theme(axis.text.y = element_text(family = "Courier", size=6)) +
+      # do not show legend
+      theme(legend.position="none") +
+      # add title 
+      ggtitle(label = paste0("Results for patient ", pt_name)) + 
+      # labs(x = "Z-scores", y = "Metabolites", title = paste0("Results for patient ", pt_name), subtitle = sub_perpage, color = "z-score") + 
+      # add vertical lines
+      geom_vline(xintercept = 2, col = "grey", lwd = 0.5, lty=2) +
+      geom_vline(xintercept = -2, col = "grey", lwd = 0.5, lty=2)
+      
+    suppressWarnings(print(ggplot_object))
+    
+  }
+
+  # add explanation of violin plots, version number etc.
+  # plot.new()
+  plot(NA, xlim=c(0,5), ylim=c(0,5), bty='n', xaxt='n', yaxt='n', xlab='', ylab='')
+  if (length(explanation) > 0) {
+    text(0.2, 5, explanation[1], pos=4, cex=0.8)
+    for (line_index in 2:length(explanation)) {
+      text_y_position <- 5 - (line_index*0.2)
+      text(-0.2, text_y_position, explanation[line_index], pos=4, cex=0.5)
+    }
+    # full_explanation <- paste(explanation[2:length(explanation)], sep="  \n")
+    # text(0.2, 4, full_explanation, pos=4, cex=0.6)
+    #explanation_grob=textGrob(apply(full_explanation, 2, paste, collapse="\n"))
+    #grid.arrange(explanation_grob)
+  }
+
+  # close the PDF file
+  dev.off()
+  
+}
diff --git a/DIMS/AddOnFunctions/prepare_alarmvalues.R b/DIMS/AddOnFunctions/prepare_alarmvalues.R
new file mode 100644
index 0000000..ff94fce
--- /dev/null
+++ b/DIMS/AddOnFunctions/prepare_alarmvalues.R
@@ -0,0 +1,32 @@
+prepare_alarmvalues <- function(pt_name, metab_interest_sorted) {
+  # set parameters for table
+  high_zscore_cutoff <- 5
+  low_zscore_cutoff <- -3
+  
+  # make table of all metabolites
+  all_metab <- c()
+  for (page_nr in 1:length(metab_interest_sorted)) {
+    all_metab <- rbind(all_metab, metab_interest_sorted[[page_nr]])
+  }
+  # extract data for patient of interest (pt_name)
+  pt_list <- all_metab[which(all_metab$variable==pt_name), ]
+  # remove column with patient name
+  pt_list <- pt_list[ , -2]
+  # round off Z-scores
+  pt_list$value <- round(as.numeric(pt_list$value), 2)
+  
+  # determine alarms for this patient: Z-score above 5 or below -3
+  pt_list_high <- pt_list[pt_list$value > high_zscore_cutoff, ]
+  pt_list_low <- pt_list[pt_list$value < low_zscore_cutoff, ]
+  # add lines for increased, decreased
+  extra_line1 <- c("Increased", "") 
+  extra_line2 <- c("Decreased", "")
+  # combine the two lists
+  top_metab_pt <- rbind(extra_line1, pt_list_high, extra_line2, pt_list_low)
+  # remove row names
+  rownames(top_metab_pt) <- NULL
+  # change column names for display
+  colnames(top_metab_pt) <- c("Metabolite", "Z-score")
+
+  return(top_metab_pt)
+}
diff --git a/DIMS/AddOnFunctions/prepare_data.R b/DIMS/AddOnFunctions/prepare_data.R
new file mode 100644
index 0000000..07a8805
--- /dev/null
+++ b/DIMS/AddOnFunctions/prepare_data.R
@@ -0,0 +1,42 @@
+prepare_data <- function(metab_list_all, zscore_patients_local) {
+  # remove "_Zscore" from column (patient) names
+  colnames(zscore_patients_local) <- gsub("_Zscore", "", colnames(zscore_patients_local))
+  # put data into pages, max 20 violin plots per page in PDF
+  metab_interest_sorted <- list()
+  metab_category <- c()
+  for (metab_class_index in 1:length(metab_list_all)) { # "acyl_carnitines" "amino_acids" "crea_gua"
+    metab_class <- names(metab_list_all)[metab_class_index]
+    metab_list <- metab_list_all[[metab_class_index]]
+    if (ncol(metab_list) > 2) {
+      # third column are the alarm values, so reduce the data frame to 2 columns and save list
+      metab_list_alarm <- metab_list
+      metab_list <- metab_list[ , c(1,2)]
+    }
+    # make sure that all HMDB_names have 45 characters 
+    for (metab_index in 1:length(metab_list$HMDB_name)) {
+      if (is.character(metab_list$HMDB_name[metab_index])) { 
+        HMDB_name_separated <- strsplit(metab_list$HMDB_name[metab_index], "")[[1]]
+      } else { HMDB_name_separated <- "strspliterror" }
+      if (length(HMDB_name_separated) <= 45) {
+        HMDB_name_separated <- c(HMDB_name_separated, rep(" ", 45-length(HMDB_name_separated)))
+      } else {
+        HMDB_name_separated <- c(HMDB_name_separated[1:42], "...")
+      }
+      metab_list$HMDB_name[metab_index] <- paste0(HMDB_name_separated, collapse = "")
+    }
+    # find metabolites and ratios in data frame zscore_patients_local
+    metab_interest <- inner_join(metab_list, zscore_patients_local[-2], by = "HMDB_code")
+    # remove column "HMDB_code"
+    metab_interest <- metab_interest[ , -which(colnames(metab_interest) == "HMDB_code")]
+    # put the data frame in long format
+    metab_interest_melt <- reshape2::melt(metab_interest, id.vars = "HMDB_name")
+    # sort on metabolite names (HMDB_name)
+    sort_order <- order(metab_interest_melt$HMDB_name)
+    metab_interest_sorted[[metab_class_index]] <- metab_interest_melt[sort_order, ]
+    metab_category <- c(metab_category, metab_class)
+  }
+  names(metab_interest_sorted) <- metab_category
+ 
+  return(metab_interest_sorted)
+   
+} 
diff --git a/DIMS/AddOnFunctions/prepare_data_perpage.R b/DIMS/AddOnFunctions/prepare_data_perpage.R
new file mode 100644
index 0000000..9f112f3
--- /dev/null
+++ b/DIMS/AddOnFunctions/prepare_data_perpage.R
@@ -0,0 +1,47 @@
+prepare_data_perpage <- function(metab_interest_sorted, metab_interest_contr, nr_plots_perpage, nr_pat=20, nr_contr=30) {
+  total_nr_pages <- 0
+  metab_perpage <- list()
+  metab_category <- c()
+  for (metab_class_index in 1:length(metab_interest_sorted)) { # "acyl_carnitines" "amino_acids" "crea_gua"
+    # split list into pages, each page containing max nr_plots_perpage (20) compounds
+    metab_interest_perclass <- metab_interest_sorted[[metab_class_index]]
+    metab_class <- names(metab_interest_sorted)[metab_class_index]
+    # add controls
+    metab_interest_contr_perclass <- metab_interest_contr[[metab_class_index]]
+    # number of pages for this class
+    nr_pages <- ceiling(length(unique(metab_interest_perclass$HMDB_name)) / nr_plots_perpage)
+    for (page_nr in 1:nr_pages) {
+      total_nr_pages <- total_nr_pages + 1
+      select_rows_start <- (nr_pat * nr_plots_perpage * (page_nr-1)) + 1
+      select_rows_end   <-  nr_pat * nr_plots_perpage * page_nr
+      metab_onepage_pat <- metab_interest_perclass[select_rows_start:select_rows_end, ]
+      # same for controls
+      select_rows_start_contr <- (nr_contr * nr_plots_perpage * (page_nr-1)) + 1
+      select_rows_end_contr   <-  nr_contr * nr_plots_perpage * page_nr
+      metab_onepage_pcontr <- metab_interest_contr_perclass[select_rows_start_contr:select_rows_end_contr, ]
+      # add controls
+      metab_onepage <- rbind(metab_onepage_pat, metab_onepage_pcontr)
+      # if a page has fewer than nr_plots_perpage plots, fill page with empty plots
+      NA_rows <- which(is.na(metab_onepage$HMDB_name))
+      if (length(NA_rows) > 0) {
+        # repeat the patient and control variables
+        metab_onepage$variable[NA_rows] <- metab_onepage$variable[1:(nr_pat + nr_contr)]
+        # for HMDB name, substitute a number of spaces
+        for (row_nr in NA_rows) {
+          metab_onepage$HMDB_name[row_nr] <- paste0(rep("_", ceiling(row_nr/(nr_pat + nr_contr))), collapse = "")
+        }
+        metab_onepage$HMDB_name <- gsub("_", " ", metab_onepage$HMDB_name)
+        # leave the values at NA
+      }
+      # put data for one page into object with data for all pages
+      metab_perpage[[total_nr_pages]] <- metab_onepage
+      # create list of page headers
+      metab_category <- c(metab_category, paste(metab_class, page_nr, sep="_"))
+    }
+  }
+  # add page headers to list
+  names(metab_perpage) <- metab_category
+  
+  return(metab_perpage)
+  
+}
diff --git a/DIMS/AddOnFunctions/prepare_toplist.R b/DIMS/AddOnFunctions/prepare_toplist.R
new file mode 100644
index 0000000..369f887
--- /dev/null
+++ b/DIMS/AddOnFunctions/prepare_toplist.R
@@ -0,0 +1,29 @@
+prepare_toplist <- function(pt_name, zscore_patients_copy) {
+  # set parameters for table
+  top_highest <- 20
+  top_lowest <- 10
+  
+  # extract data for patient of interest (pt_name)
+  pt_list <- zscore_patients_copy[ , c(1,2, which(colnames(zscore_patients_copy) == pt_name))]
+  # sort metabolites on Z-scores for this patient
+  pt_list_sort <- sort(pt_list[ , 3], index.return=TRUE)
+  pt_list_sorted <- pt_list[pt_list_sort$ix, ]
+  # determine top highest and lowest Z-scores for this patient
+  pt_list_sort <- sort(pt_list[ , 3], index.return=TRUE)
+  pt_list_low  <- pt_list[pt_list_sort$ix[1:top_lowest], ]
+  pt_list_high <- pt_list[pt_list_sort$ix[length(pt_list_sort$ix):(length(pt_list_sort$ix)-top_highest+1)], ]
+  # round off Z-scores
+  pt_list_low[ , 3]  <- round(as.numeric(pt_list_low[ , 3]), 2)
+  pt_list_high[ , 3] <- round(as.numeric(pt_list_high[ , 3]), 2)
+  # add lines for increased, decreased
+  extra_line1 <- c("Increased", "", "") 
+  extra_line2 <- c("Decreased", "", "")
+  top_metab_pt <- rbind(extra_line1, pt_list_high, extra_line2, pt_list_low)
+  # remove row names
+  rownames(top_metab_pt) <- NULL
+  
+  # change column names for display
+  colnames(top_metab_pt) <- c("HMDB_ID", "Metabolite", "Z-score")
+  
+  return(top_metab_pt)
+}
diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R
index ae9c8b1..cfbe15c 100644
--- a/DIMS/AssignToBins.R
+++ b/DIMS/AssignToBins.R
@@ -35,6 +35,11 @@ neg_results     <- NULL
 # read in the data for 1 sample
 raw_data <- suppressMessages(xcmsRaw(filepath))
 
+# generate TIC plots. Prepare txt files with data for plots
+TIC_intensity_persample <- cbind(round(raw_data@scantime, 2), raw_data@tic)
+colnames(TIC_intensity_persample) <- c("retentionTime", "TIC")
+write.table(TIC_intensity_persample, file = paste0("./", sample_name, "_TIC.txt"))
+
 # load breaks_fwhm
 load(breaks_filepath)
 
diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf
index fbbb48e..25b3891 100644
--- a/DIMS/AssignToBins.nf
+++ b/DIMS/AssignToBins.nf
@@ -8,7 +8,8 @@ process AssignToBins {
        tuple val(file_id), path(mzML_filename) , path(breaks_file)
 
     output:
-       path("${file_id}.RData")
+       path("${file_id}.RData"), emit: RData_files
+       path("${file_id}_TIC.txt"), emit: TIC_txt_files
 
     script:
         """
diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R
index 4001937..098651a 100644
--- a/DIMS/AverageTechReplicates.R
+++ b/DIMS/AverageTechReplicates.R
@@ -6,8 +6,20 @@ cmd_args <- commandArgs(trailingOnly = TRUE)
 
 init_filepath <- cmd_args[1]
 nr_replicates <- as.numeric(cmd_args[2])
-thresh2remove <- 2000
+thresh2remove <- 1000000000
 dimsThresh    <- 100
+run_name      <- cmd_args[3]
+dims_matrix   <- cmd_args[4]
+
+print(run_name)
+print(dims_matrix)
+
+if (dims_matrix == "DBS") {
+	thresh2remove <- 50000000
+}
+
+library("ggplot2")
+library("gridExtra")
 
 removeFromRepl.pat <- function(bad_samples, repl_pattern, nr_replicates) {
   # bad_samples=remove_pos
@@ -50,11 +62,8 @@ for (i in 1:length(repl_pattern)) {
   sum_pos <- 0
   n_pos <- 0
   n_neg <- 0
-  cat("\n\nNow sample ", i, " from replication pattern with length ", length(repl_pattern))
   for (j in 1:length(tech_reps)) {
     load(paste("./", tech_reps[j], ".RData", sep=""))
-    cat("\n\nParsing", tech_reps[j])
-
     cat("\n\tNegative peak_list sum", sum(peak_list$neg[,1]))
     if (sum(peak_list$neg[,1]) < thresh2remove){
       cat(" ... Removed")
@@ -104,4 +113,48 @@ repl_pattern_filtered <- retVal$pattern
 save(repl_pattern_filtered, file = "./positive_repl_pattern.RData")
 write.table(remove_pos, file = "./miss_infusions_positive.txt", row.names=FALSE, col.names=FALSE , sep= "\t")
 
+# New: generate TIC plots
+# get all txt files
+TIC_files = list.files("./", full.names=TRUE, pattern="*TIC.txt")
+all_samps <- sub('_TIC\\..*$', '', basename(TIC_files))
+
+# determine maximum intensity
+highest_tic_max <- 0
+for (file in TIC_files) {
+  tic <- read.table(file)
+  this_tic_max <- max(tic$TIC)
+  if (this_tic_max > highest_tic_max) {
+    highest_tic_max <- this_tic_max
+    max_sample <- sub('_TIC\\..*$', '', basename(file))
+    }
+}
+
+tic_plot_list <- list()
+k = 0
+for (i in c(1:length(repl_pattern))) { # change after test-phase !!!
+  tech_reps <- as.vector(unlist(repl_pattern[i]))
+  sampleName <- names(repl_pattern)[i]
+  for (j in 1:length(tech_reps)) {
+    k = k + 1
+    repl1_nr <- read.table(TIC_files[j])
+    bad_color_pos <- tech_reps[j] %in% remove_pos[[1]]
+    bad_color_neg <- tech_reps[j] %in% remove_neg[[1]]
+    if (bad_color_neg & bad_color_pos) {plotcolor = '#F8766D'} else if (bad_color_pos) {plotcolor = "#ED8141"} else if (bad_color_neg) {plotcolor = "#BF80FF"} else {plotcolor = 'white'}
+    tic_plot <- ggplot(repl1_nr, aes(retentionTime, TIC)) +
+      geom_line(linewidth = 0.3) +
+      geom_hline(yintercept = highest_tic_max, col = "grey", linetype = 2, linewidth = 0.3) +
+      labs(x = 't (s)', y = 'TIC', title = paste0(tech_reps[j], "  ||  ", sampleName)) +
+      theme(plot.background = element_rect(fill = plotcolor), axis.text = element_text(size = 4), axis.title = element_text(size = 4), plot.title = element_text(size = 6))
+    tic_plot_list[[k]] <- tic_plot
+  }
+
+}
+# create a layout matrix dependent on numer of replicates
+layout <- matrix(1:(10 * nr_replicates), 10, nr_replicates, TRUE)
 
+tic_plot_pdf <- marrangeGrob(grobs = tic_plot_list,
+			     nrow = 10,
+			     ncol = nr_replicates,
+			     layout_matrix = layout,
+			     top = quote(paste("TICs of run", run_name," \n colors: red = both modes misinjection, orange = pos mode misinjection, purple = neg mode misinjection \n ", g, "/", npages)))
+ggsave(filename = paste0("./../../../", run_name, "_TICplots.pdf"), tic_plot_pdf, width = 21, height = 29.7, units = "cm")
diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf
index 33567a3..aa89477 100644
--- a/DIMS/AverageTechReplicates.nf
+++ b/DIMS/AverageTechReplicates.nf
@@ -5,8 +5,11 @@ process AverageTechReplicates {
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       path(RData_file)
+       path(RData_file)     // input files need to be linked, but called within R script
+       path(TIC_txt_files)  // input files need to be linked, but called within R script
        path(init_filepath)
+       val(analysis_id)
+       val(matrix)
 
     output:
        path('*_repl_pattern.RData'), emit: patterns
@@ -16,7 +19,7 @@ process AverageTechReplicates {
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_filepath $params.nr_replicates
+        Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_filepath $params.nr_replicates $analysis_id $matrix
         """
 }
 
diff --git a/DIMS/CollectSumAdducts.nf b/DIMS/CollectSumAdducts.nf
index a72455c..da7ab0c 100644
--- a/DIMS/CollectSumAdducts.nf
+++ b/DIMS/CollectSumAdducts.nf
@@ -1,5 +1,5 @@
 process CollectSumAdducts {
-    tag {"DIMS CollectSumAdducts"}
+    tag "DIMS CollectSumAdducts"
     label 'CollectSumAdducts'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R
index a84c9d2..0fd394f 100644
--- a/DIMS/GenerateExcel.R
+++ b/DIMS/GenerateExcel.R
@@ -243,14 +243,12 @@ if (z_score == 1) {
 }
 writeData(wb, sheet = 1, outlist, startCol = 1)
 xlsx_name <- paste0(outdir, "/", project, ".xlsx")
-saveWorkbook(wb,
-             xlsx_name,
-             overwrite = TRUE)
+saveWorkbook(wb, xlsx_name, overwrite = TRUE)
+# save a local copy in work directory as well
+saveWorkbook(wb, paste0("./", project, ".xlsx"), overwrite = TRUE)
 cat(xlsx_name)
 rm(wb)
 
-write.table(outlist, file = paste(outdir, "allpgrps_stats.txt", sep = "/"))
-
 # INTERNE STANDAARDEN
 IS <- outlist[grep("Internal standard", outlist[,"relevance"], fixed = TRUE),]
 IS_codes <- rownames(IS)
@@ -570,26 +568,26 @@ outlist.ident.neg <- outlist.ident
 load(paste0(outdir, "RData/outlist_identified_positive.RData"))
 outlist.ident.pos <- outlist.ident
 rm(outlist.ident)
-rm(outlist.not.ident)
 # check for missing m/z in negative and positive mode
 mode <- c("Negative", "Positive")
 index <- 1
 results_ident <- c() #empty results list
 outlist_ident_list <- list(outlist.ident.neg, outlist.ident.pos)
-for(outlist.ident in outlist_ident_list){
+for (outlist.ident in outlist_ident_list) {
   current_mode <- mode[index]
   # retrieve all unique m/z values in whole numbers and check if all are available
   mz_values <- as.numeric(unique(format(outlist.ident$mzmed.pgrp, digits=0)))
-  mz_range = seq(70, 599, by=1) #change accordingly to the machine m/z range. default = 70-600
-  mz_missing = c()
-  for (mz in mz_range){
+  mz_range <- seq(70, 599, by=1) #change accordingly to the machine m/z range. default = 70-600
+  mz_missing <- c()
+  for (mz in mz_range) {
     if (!mz %in% mz_values) {
       mz_missing <- c(mz_missing, mz)
-    } }
+    } 
+  }
   y <- mz_missing
   # check if m/z are missing and make an .txt file with information
   group_ident <- cumsum(c(1, abs(y[-length(y)] - y[-1]) > 1))
-  if(length(group_ident) > 1){
+  if (length(group_ident) > 1) {
     results_ident <- c(results_ident, paste0("Missing m/z values ", current_mode, " mode"))
     results_ident <- c(results_ident, by(y, group_ident, identity))
   } else {
diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf
index 655ce0d..9e861e2 100644
--- a/DIMS/GenerateExcel.nf
+++ b/DIMS/GenerateExcel.nf
@@ -1,5 +1,5 @@
 process GenerateExcel {
-    tag {"DIMS GenerateExcel"}
+    tag "DIMS GenerateExcel"
     label 'GenerateExcel'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
@@ -12,6 +12,7 @@ process GenerateExcel {
 
     output:
        path('AdductSums_*.RData')
+       path('*.xlsx'), emit: excel_file 
 
     script:
         """
diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R
index bf2abe8..a8b819c 100644
--- a/DIMS/GenerateViolinPlots.R
+++ b/DIMS/GenerateViolinPlots.R
@@ -5,14 +5,15 @@
 # 1. Excel file in which metabolites are listed with their intensities for
 #    controls (with C in samplename) and patients (with P in samplename) and their
 #    corresponding Z-scores. 
-# 2. All files from github: https://github.com/UMCUGenetics/dIEM
+# 2. All files from github: https://github.com/UMCUGenetics/DIMS
+
+## adapted from 15-dIEM_violin.R
 
 library(dplyr) # tidytable is for other_isobaric.R (left_join)
 library(reshape2) # used in prepare_data.R
-library(data.table) # for function setDT
 library(openxlsx) # for opening Excel file
 library(ggplot2) # for plotting
-#library(gridExtra) # for table top highest/lowest
+library(gridExtra) # for table top highest/lowest
 
 # define parameters - check after addition to run.sh
 cmd_args <- commandArgs(trailingOnly = TRUE)
@@ -20,50 +21,56 @@ for (arg in cmd_args) {
   cat("  ", arg, "\n", sep = "")
 }
 
-scripts_dir <- cmd_args[1] 
-run_name <- cmd_args[2] # run_name <- "Test_run_5"
-z_score <- as.numeric(cmd_args[3]) # calculate Z-scores or not? z_score <- 1
+run_name <- cmd_args[1] 
+scripts_dir <- cmd_args[2]
+z_score <- as.numeric(cmd_args[3])
 
 # load functions
-source(paste0(scripts_dir, "AddOnFunctions/same_samplename.R"))
+source(paste0(scripts_dir, "AddOnFunctions/check_same_samplename.R"))
 source(paste0(scripts_dir, "AddOnFunctions/prepare_data.R"))
 source(paste0(scripts_dir, "AddOnFunctions/prepare_data_perpage.R"))
 source(paste0(scripts_dir, "AddOnFunctions/prepare_toplist.R"))
-source(paste0(scripts_dir, "AddOnFunctions/violin_plots.R"))
+source(paste0(scripts_dir, "AddOnFunctions/create_violin_plots.R"))
 source(paste0(scripts_dir, "AddOnFunctions/prepare_alarmvalues.R"))
 
 # The list of parameters can be shortened for HPC. Leave for now.
-top_nr_IEM <- 5 # number of diseases that score highest in algorithm to plot
-integer_list <- c(1:top_nr_IEM) # indices of top diseases
-threshold_IEM <- 5 # probability score cut-off for plotting the top diseases
-ratios_cutoff <- -5 # z-score cutoff of axis on the left for top diseases
+top_nr_IEM       <- 5 # number of diseases that score highest in algorithm to plot
+threshold_IEM    <- 5 # probability score cut-off for plotting the top diseases
+ratios_cutoff    <- -5 # z-score cutoff of axis on the left for top diseases
 nr_plots_perpage <- 20 # number of violin plots per page in PDF
 
 # Settings from config.R
 # binary variable: run function, yes(1) or no(0). Can be removed at later stage
-if (z_score == 1) { algorithm <- ratios <- violin <- 1 } # ? Or put if statement in run.sh?
+if (z_score == 1) { 
+	algorithm <- ratios <- violin <- 1 
+} 
 # integer: are the sample names headers on row 1 or row 2 in the DIMS excel? (default 1)
-header_row = 1
+header_row <- 1
 # column name where the data starts (default B)
-col_start <- "B"
+col_start     <- "B"
 zscore_cutoff <- 5
-xaxis_cutoff <- 20
+xaxis_cutoff  <- 20
 
 # path to DIMS excel file
-# path_DIMSfile = "/Users/ihoek3/Documents/dIEM/voorbeeld_PL_Diagn17_RUN10.xlsx"
-path_DIMSfile <- paste0("./", run_name, ".xlsx") 
+path_DIMSfile <- paste0("./", run_name, ".xlsx")
 
-# path: output folder 
-output_dir <- paste0("./dIEM") 
+# path: output folder for dIEM and violin plots 
+output_dir <- paste0("../../../dIEM")
 dir.create(output_dir, showWarnings = F) 
+print(getwd())
+print(path_DIMSfile)
 
 # folder in which all metabolite lists are (.txt)
 path_metabolite_groups <- "/hpc/dbg_mz/tools/db/metabolite_groups"
-### file for ratios step 3
+# file for ratios step 3
 file_ratios_metabolites <- "/hpc/dbg_mz/tools/db/dIEM/Ratios_between_metabolites.csv"
-### file for algorithm step 4
+# file for algorithm step 4
 file_expected_biomarkers_IEM <- "/hpc/dbg_mz/tools/db/dIEM/Expected_biomarkers_IEM.csv"
+# explanation: file with text to be included in violin plots
+file_explanation <- "/hpc/dbg_mz/tools/Explanation_violin_plots.txt"
 
+# copy list of isomers to project folder.
+file.copy("/hpc/dbg_mz/tools/isomers.txt", output_dir)
 
 #### STEP 1: Preparation #### 
 # in: run_name, path_DIMSfile, header_row ||| out: output_dir, DIMS
@@ -84,7 +91,7 @@ if (exists("dims_xls")) {
 
 # Determine the number of Contols and Patients in column names:
 nr_contr <- length(grep("C",names(dims_xls)))/2   # Number of control samples
-nr_pat <- length(grep("P",names(dims_xls)))/2     # Number of patient samples
+nr_pat   <- length(grep("P",names(dims_xls)))/2   # Number of patient samples
 # total number of samples
 nrsamples <- nr_contr + nr_pat
 # check whether the number of intensity columns equals the number of Zscore columns
@@ -95,11 +102,11 @@ cat(paste0("\n\n------------\n", nr_contr, " controls \n", nr_pat, " patients\n-
 
 # Move the columns HMDB_code and HMDB_name to the beginning. 
 HMDB_info_cols <- c(which(colnames(dims_xls) == "HMDB_code"), which(colnames(dims_xls) == "HMDB_name"))
-other_cols <- seq_along(1:ncol(dims_xls))[-HMDB_info_cols]
-dims_xls_copy <- dims_xls[ , c(HMDB_info_cols, other_cols)]
+other_cols     <- seq_along(1:ncol(dims_xls))[-HMDB_info_cols]
+dims_xls_copy  <- dims_xls[ , c(HMDB_info_cols, other_cols)]
 # Remove the columns from 'name' to 'pathway'
-from_col <- which(colnames(dims_xls_copy) == "name")
-to_col   <- which(colnames(dims_xls_copy) == "pathway")
+from_col      <- which(colnames(dims_xls_copy) == "name")
+to_col        <- which(colnames(dims_xls_copy) == "pathway")
 dims_xls_copy <- dims_xls_copy[ , -c(from_col:to_col)]
 # in case the excel had an empty "plots" column, remove it
 if ("plots" %in% colnames(dims_xls_copy)) { 
@@ -143,13 +150,13 @@ if (ratios == 1) {
     ncol=ncol(dims_xls_copy),
     nrow=nrow(ratio_input)
   )), colnames(dims_xls_copy))
-
+  
   # put HMDB info into first two columns of ratio_list
   ratio_list[ ,1:2] <- ratio_input[ ,1:2]
 
   # look for intensity columns (exclude Zscore columns)
-  control_cols <- grep("C", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")])
-  patient_cols <- grep("P", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")])
+  control_cols   <- grep("C", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")])
+  patient_cols   <- grep("P", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")])
   intensity_cols <- c(control_cols, patient_cols)
   # calculate each of the ratios of intensities 
   for (ratio_index in 1:nrow(ratio_input)) {
@@ -163,15 +170,23 @@ if (ratios == 1) {
       sel_numerator <- c(sel_numerator, which(dims_xls_copy[ , "HMDB.code"] == ratio_numerator[numerator_index])) 
     }
     for (denominator_index in 1:length(ratio_denominator)) { 
-      sel_denominator <- c(sel_denominator, which(dims_xls_copy[ , "HMDB.code"] == ratio_denominator[denominator_index])) 
+      # special case for sum of metabolites (dividing by one)  
+      if (ratio_denominator[denominator_index] != "one") {
+        sel_denominator <- c(sel_denominator, which(dims_xls_copy[ , "HMDB.code"] == ratio_denominator[denominator_index])) 
+      }
     }
     # calculate ratio
-    ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) /
-      apply(dims_xls_copy[sel_denominator, intensity_cols], 2, sum)
+    if (ratio_denominator[denominator_index] != "one") {
+      ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) /
+        apply(dims_xls_copy[sel_denominator, intensity_cols], 2, sum)
+    } else {
+      # special case for sum of metabolites (dividing by one)
+      ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum)
+    }
     # calculate log of ratio
     ratio_list[ratio_index, intensity_cols]<- log2(ratio_list[ratio_index, intensity_cols])
   }
-
+  
   # Calculate means and SD's of the calculated ratios for Controls
   ratio_list[ , "Mean_controls"] <- apply(ratio_list[ , control_cols], 1, mean)
   ratio_list[ , "SD_controls"]   <- apply(ratio_list[ , control_cols], 1, sd)
@@ -182,13 +197,13 @@ if (ratios == 1) {
     zscore_col <- zscore_cols[sample_index] 
     # matching intensity column
     int_col <- intensity_cols[sample_index]
-    # add test on column names
-    if (same_samplename(colnames(ratio_list)[int_col], colnames(ratio_list)[zscore_col])) {
+    # test on column names
+    if (check_same_samplename(colnames(ratio_list)[int_col], colnames(ratio_list)[zscore_col])) {
       # calculate Z-scores
       ratio_list[ , zscore_col] <- (ratio_list[ , int_col] - ratio_list[ , "Mean_controls"]) / ratio_list[ , "SD_controls"]
     }
   }
-
+  
   # Add rows of the ratio hmdb codes to the data of zscores from the pipeline.
   dims_xls_ratios <- rbind(ratio_list, dims_xls_copy)
 
@@ -196,9 +211,10 @@ if (ratios == 1) {
   # HMDB_code patientname1  patientname2
   names(dims_xls_ratios) <- gsub("HMDB.code","HMDB_code", names(dims_xls_ratios))
   names(dims_xls_ratios) <- gsub("HMDB.name", "HMDB_name", names(dims_xls_ratios))
-  # remove the string "_Zscore" from column names; this causes problems in step 4
-  # names(dims_xls_ratios) <- gsub("_Zscore", "", names(dims_xls_ratios)) 
-  
+
+  # for debugging:
+  write.table(dims_xls_ratios, file=paste0(output_dir, "/ratios.txt"), sep="\t")
+
   # Select only the cols with zscores of the patients 
   zscore_patients <- dims_xls_ratios[ , c(1, 2, zscore_cols[grep("P", colnames(dims_xls_ratios)[zscore_cols])])]
   # Select only the cols with zscores of the controls
@@ -231,12 +247,11 @@ if (algorithm == 1) {
     # Rank all negative zscores lowest to highest
     rank_patients[(pos+1):nrow(rank_patients), patient_index] <- as.numeric(ordered(rank_patients[(pos+1):nrow(rank_patients), patient_index]))
   }
-  # NB: Warning message: In xtfrm.data.frame(x) : cannot xtfrm data frames. Ignore for now.
-
+  
   # Calculate metabolite score, using the dataframes with only values, and later add the cols without values (1&2).
   expected_zscores <- merge(x=expected_biomarkers, y=zscore_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code"))
   expected_zscores_original <- expected_zscores # necessary copy?
-
+  
   # determine which columns contain Z-scores and which contain disease info
   select_zscore_cols <- grep("_Zscore", colnames(expected_zscores))
   select_info_cols <- 1:(min(select_zscore_cols) -1)
@@ -251,18 +266,18 @@ if (algorithm == 1) {
   rank_scores <- expected_zscores[order(expected_zscores$HMDB_code), select_zscore_cols]/(expected_ranks[order(expected_ranks$HMDB_code), select_zscore_cols]*0.9)
   # combine disease info with rank scores
   expected_metabscore <- cbind(expected_ranks[order(expected_zscores$HMDB_code), select_info_cols], rank_scores)
-
+  
   # multiply weight score and rank score
   weight_score <- expected_zscores
   weight_score[ , select_zscore_cols] <- expected_metabscore$Total_Weight * expected_metabscore[ , select_zscore_cols]
-
+  
   # sort table on Disease and Absolute_Weight
   weight_score <- weight_score[order(weight_score$Disease, weight_score$Absolute_Weight, decreasing = TRUE), ]
 
   # select columns to check duplicates
   dup <- weight_score[ , c('Disease', 'M.z')] 
   uni <- weight_score[!duplicated(dup) | !duplicated(dup, fromLast=FALSE),]
-
+  
   # calculate probability score
   prob_score <-  aggregate(uni[ , select_zscore_cols], uni["Disease"], sum)
 
@@ -277,7 +292,7 @@ if (algorithm == 1) {
     # set the probability score of these diseases to 0 
     prob_score[which(prob_score$Disease %in% disease_zero), patient_index]<- 0
   }
-
+  
   # determine disease rank per patient
   disease_rank <- prob_score 
   # rank diseases in decreasing order
@@ -285,7 +300,7 @@ if (algorithm == 1) {
   # modify column names, Zscores have now been converted to probability scores
   colnames(prob_score) <- gsub("_Zscore","_prob_score", colnames(prob_score)) # redundant?
   colnames(disease_rank) <- gsub("_Zscore","", colnames(disease_rank))
-
+  
   # Create conditional formatting for output excel sheet. Colors according to values.
   wb <- createWorkbook()
   addWorksheet(wb, "Probability Scores")
@@ -326,6 +341,12 @@ if (violin == 1) { # make violin plots
   # remove duplicates
   expected_biomarkers_select <- expected_biomarkers_select[!duplicated(expected_biomarkers_select[ , c(1,2)]), ]
   
+  # load file with explanatory information to be included in PDF.
+  explanation <- readLines(file_explanation)
+  
+  # for debugging:
+  #write.table(explanation, file=paste0(outdir, "explanation_read_in.txt"), sep="\t")
+
   # first step: normal violin plots
   # Find all text files in the given folder, which contain metabolite lists of which
   # each file will be a page in the pdf with violin plots.
@@ -367,15 +388,80 @@ if (violin == 1) { # make violin plots
       # for category Other, make list of top highest and lowest Z-scores for this patient
       if (grepl("Diagnost", pdf_dir)) {
         top_metab_pt <- prepare_alarmvalues(pt_name, metab_interest_sorted)
+        # save(top_metab_pt, file=paste0(outdir, "/start_15_prepare_alarmvalues.RData"))
       } else {
         top_metab_pt <- prepare_toplist(pt_name, zscore_patients)
+        # save(top_metab_pt, file=paste0(outdir, "/start_15_prepare_toplist.RData"))
       }
 
       # generate normal violin plots
-      violin_plots(pdf_dir, pt_name, metab_perpage, top_metab_pt)
+      create_violin_plots(pdf_dir, pt_name, metab_perpage, top_metab_pt)
       
     } # end for pt_nr
 
   } # end for metabolite_dir
    
+  # Second step: dIEM plots in separate directory
+  dIEM_plot_dir <- paste(output_dir, "dIEM_plots", sep="/")
+  dir.create(dIEM_plot_dir)
+
+  # Select the metabolites that are associated with the top highest scoring IEM, for each patient
+  # disease_rank is from step 4: the dIEM algorithm. The lower the value, the more likely.
+  for (pt_nr in 1:length(patient_list)) {
+    pt_name <- patient_list[pt_nr]
+    # get top diseases for this patient
+    pt_colnr <- which(colnames(disease_rank) == pt_name)
+    pt_top_indices <- which(disease_rank[ , pt_colnr] <= top_nr_IEM)
+    pt_IEMs <- disease_rank[pt_top_indices, "Disease"]
+    pt_top_IEMs <- pt_prob_score_top_IEMs <- c()
+    for (single_IEM in pt_IEMs) {
+      # get the probability score
+      prob_score_IEM <- prob_score[which(prob_score$Disease == single_IEM), pt_colnr]
+      # use only diseases for which probability score is above threshold
+      if (prob_score_IEM >= threshold_IEM) {
+        pt_top_IEMs <- c(pt_top_IEMs, single_IEM)
+        pt_prob_score_top_IEMs <- c(pt_prob_score_top_IEMs, prob_score_IEM)
+      }
+    }
+
+    # prepare data for plotting dIEM violin plots
+    # If prob_score_top_IEM is an empty list, don't make a plot
+    if (length(pt_top_IEMs) > 0) {
+      # Sorting from high to low, both prob_score_top_IEMs and pt_top_IEMs.
+      pt_prob_score_order <- order(-pt_prob_score_top_IEMs)
+      pt_prob_score_top_IEMs <- round(pt_prob_score_top_IEMs, 1)
+      pt_prob_score_top_IEM_sorted <- pt_prob_score_top_IEMs[pt_prob_score_order]
+      pt_top_IEM_sorted <- pt_top_IEMs[pt_prob_score_order]
+      # getting metabolites for each top_IEM disease exactly like in metab_list_all
+      metab_IEM_all <- list()
+      metab_IEM_names <- c()
+      for (single_IEM_index in 1:length(pt_top_IEM_sorted)) {
+        single_IEM <- pt_top_IEM_sorted[single_IEM_index]
+        single_prob_score <- pt_prob_score_top_IEM_sorted[single_IEM_index]
+        select_rows <- which(expected_biomarkers_select$Disease == single_IEM)
+        metab_list <- expected_biomarkers_select[select_rows, ]
+        metab_IEM_names <- c(metab_IEM_names, paste0(single_IEM, ", probability score ", single_prob_score))
+        metab_list <- metab_list[ , -1]
+        metab_IEM_all[[single_IEM_index]] <- metab_list
+      }
+      # put all metabolites into one list
+      names(metab_IEM_all) <- metab_IEM_names
+
+      # get Zscore information from zscore_patients_copy, similar to normal violin plots
+      metab_IEM_sorted <- prepare_data(metab_IEM_all, zscore_patients_copy)
+      metab_IEM_controls <- prepare_data(metab_IEM_all, zscore_controls)
+      # make sure every page has 20 metabolites
+      dIEM_metab_perpage <- prepare_data_perpage(metab_IEM_sorted, metab_IEM_controls, nr_plots_perpage, nr_pat)
+
+      # generate dIEM violin plots
+      create_violin_plots(dIEM_plot_dir, pt_name, dIEM_metab_perpage, top_metab_pt)
+
+    } else {
+      cat(paste0("\n\n**** This patient had no prob_scores higher than ", threshold_IEM,".
+                   Therefore, this pdf was not made:\t ", pt_name ,"_IEM \n"))
+    }
+
+  } # end for pt_nr
+
 } # end if violin = 1
+
diff --git a/DIMS/GenerateViolinPlots.nf b/DIMS/GenerateViolinPlots.nf
index e580feb..f002341 100644
--- a/DIMS/GenerateViolinPlots.nf
+++ b/DIMS/GenerateViolinPlots.nf
@@ -1,17 +1,19 @@
 process GenerateViolinPlots {
-    tag {"DIMS GenerateViolinPlots"}
+    tag "DIMS GenerateViolinPlots"
     label 'GenerateViolinPlots'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
        path(excel_file) // input files need to be linked, but called within R script
+       val(analysis_id)
 
     output:
-       path('*.pdf')
+       // path('*.pdf') // pdf files are generated, but in different directory
+       path('*.xlsx')
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/GenerateViolinPlots.R $params.scripts_dir $params.analysis_id $params.zscore 
+        Rscript ${baseDir}/CustomModules/DIMS/GenerateViolinPlots.R $analysis_id $params.scripts_dir $params.zscore
         """
 }
diff --git a/DIMS/HMDBparts.nf b/DIMS/HMDBparts.nf
index 833b94d..50786af 100644
--- a/DIMS/HMDBparts.nf
+++ b/DIMS/HMDBparts.nf
@@ -1,5 +1,5 @@
 process HMDBparts {
-    tag {"DIMS HMDBparts"}
+    tag "DIMS HMDBparts"
     // Custom process to cut HMDB db into parts
     label 'HMDBparts'
     container = 'docker://umcugenbioinf/dims:1.3'
diff --git a/DIMS/ThermoRawFileParser.nf b/DIMS/ThermoRawFileParser.nf
index df91279..17cee27 100644
--- a/DIMS/ThermoRawFileParser.nf
+++ b/DIMS/ThermoRawFileParser.nf
@@ -1,5 +1,5 @@
 process ConvertRawFile {
-    tag {"DIMS ConvertRawFile ${file_id}"}
+    tag "DIMS ConvertRawFile ${file_id}"
     // Custom process to convert raw file to mzML format
     label 'ThermoRawFileParser_1_1_11'
     shell = ['/bin/bash', '-euo', 'pipefail']

From 1aa51e0efc846e87533ded407ed6931a080ad202 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Mon, 15 Jan 2024 17:15:49 +0100
Subject: [PATCH 15/73] added parameter for measurements with high m/z

---
 DIMS/GenerateBreaks.R  | 2 +-
 DIMS/GenerateBreaks.nf | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R
index 7aeb8c4..8a3ca07 100644
--- a/DIMS/GenerateBreaks.R
+++ b/DIMS/GenerateBreaks.R
@@ -52,4 +52,4 @@ for (i in 1:nsegment) {
 
 # generate output file
 save(breaks_fwhm, breaks_fwhm_avg, trimLeft, trimRight, file="./breaks.fwhm.RData")
-
+write(highMZ, file="./highest_mz.txt")
diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf
index 593014b..8f37438 100644
--- a/DIMS/GenerateBreaks.nf
+++ b/DIMS/GenerateBreaks.nf
@@ -10,6 +10,7 @@ process GenerateBreaks {
 
     output:
        path('breaks.fwhm.RData')
+       path('highest_mz.txt'), emit: highest_mz
 
     script:
         """

From f973a23279162bc6a5ed80a4c58a35723ac83353 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Mon, 15 Jan 2024 17:20:45 +0100
Subject: [PATCH 16/73] added modification of threshold in case of high m/z

---
 DIMS/AverageTechReplicates.R  | 21 +++++++++++++++++----
 DIMS/AverageTechReplicates.nf |  1 +
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R
index 098651a..f84c6ba 100644
--- a/DIMS/AverageTechReplicates.R
+++ b/DIMS/AverageTechReplicates.R
@@ -10,12 +10,17 @@ thresh2remove <- 1000000000
 dimsThresh    <- 100
 run_name      <- cmd_args[3]
 dims_matrix   <- cmd_args[4]
+highest_mz    <- as.numeric(cmd_args[5])
 
 print(run_name)
 print(dims_matrix)
+print(highest_mz)
 
 if (dims_matrix == "DBS") {
-	thresh2remove <- 50000000
+    thresh2remove <- 50000000
+}
+if (highest_mz > 700) {
+    thresh2remove <- 1000000
 }
 
 library("ggplot2")
@@ -133,17 +138,25 @@ tic_plot_list <- list()
 k = 0
 for (i in c(1:length(repl_pattern))) { # change after test-phase !!!
   tech_reps <- as.vector(unlist(repl_pattern[i]))
-  sampleName <- names(repl_pattern)[i]
+  sample_name <- names(repl_pattern)[i]
   for (j in 1:length(tech_reps)) {
     k = k + 1
     repl1_nr <- read.table(TIC_files[j])
     bad_color_pos <- tech_reps[j] %in% remove_pos[[1]]
     bad_color_neg <- tech_reps[j] %in% remove_neg[[1]]
-    if (bad_color_neg & bad_color_pos) {plotcolor = '#F8766D'} else if (bad_color_pos) {plotcolor = "#ED8141"} else if (bad_color_neg) {plotcolor = "#BF80FF"} else {plotcolor = 'white'}
+    if (bad_color_neg & bad_color_pos) {
+	plotcolor = '#F8766D'
+    } else if (bad_color_pos) {
+	plotcolor = "#ED8141"
+    } else if (bad_color_neg) {
+	plotcolor = "#BF80FF"
+    } else {
+	plotcolor = 'white'
+    }
     tic_plot <- ggplot(repl1_nr, aes(retentionTime, TIC)) +
       geom_line(linewidth = 0.3) +
       geom_hline(yintercept = highest_tic_max, col = "grey", linetype = 2, linewidth = 0.3) +
-      labs(x = 't (s)', y = 'TIC', title = paste0(tech_reps[j], "  ||  ", sampleName)) +
+      labs(x = 't (s)', y = 'TIC', title = paste0(tech_reps[j], "  ||  ", sample_name)) +
       theme(plot.background = element_rect(fill = plotcolor), axis.text = element_text(size = 4), axis.title = element_text(size = 4), plot.title = element_text(size = 6))
     tic_plot_list[[k]] <- tic_plot
   }
diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf
index aa89477..8775976 100644
--- a/DIMS/AverageTechReplicates.nf
+++ b/DIMS/AverageTechReplicates.nf
@@ -10,6 +10,7 @@ process AverageTechReplicates {
        path(init_filepath)
        val(analysis_id)
        val(matrix)
+       val(highest_mz)
 
     output:
        path('*_repl_pattern.RData'), emit: patterns

From e41ea88d4d17b60a632aa45d339158edf7f87d85 Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Tue, 20 Feb 2024 08:53:58 +0100
Subject: [PATCH 17/73] Add EditSummaryFileHappy

---
 Utils/EditSummaryFileHappy.nf | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 Utils/EditSummaryFileHappy.nf

diff --git a/Utils/EditSummaryFileHappy.nf b/Utils/EditSummaryFileHappy.nf
new file mode 100644
index 0000000..20693ee
--- /dev/null
+++ b/Utils/EditSummaryFileHappy.nf
@@ -0,0 +1,26 @@
+process EditSummaryFileHappy {
+    tag {"EditSummaryFileHappy"}
+    label 'EditSummaryFileHappy'
+    shell = ['/bin/bash', '-euo', 'pipefail']
+
+    input:
+        tuple(val(meta), path(summary_csv))
+
+    output:
+        path("INDEL_PASS_summary.txt", includeInputs: false), emit: indel_pass_summary_txt
+        path("INDEL_ALL_summary.txt", includeInputs: false), emit: indel_all_summary_txt
+        path("SNP_PASS_summary.txt", includeInputs: false), emit: snp_pass_summary_txt
+        path("SNP_ALL_summary.txt", includeInputs: false), emit: snpp_all_summary_txt
+
+    script:
+        """
+        # Add samplenames as columns
+        sed '1s/$/,sample_query,sample_truth/; 2,$s/$/,${meta.query},${meta.truth}' ${summary_csv} > ${summary_csv}.tmp
+        
+        # Split file and add column
+        awk '{print $0 > $3_$4"_summary.txt"}' ${summary_csv}.tmp
+        
+        # Remove tmp files
+        rm *tmp     
+        """
+}
\ No newline at end of file

From a70501c60cd7cfec638716f81a391e8e7e49bdd7 Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Tue, 20 Feb 2024 11:32:43 +0100
Subject: [PATCH 18/73] refactor script in EditSummaryFileHappy

---
 Utils/EditSummaryFileHappy.nf | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/Utils/EditSummaryFileHappy.nf b/Utils/EditSummaryFileHappy.nf
index 20693ee..d2219c2 100644
--- a/Utils/EditSummaryFileHappy.nf
+++ b/Utils/EditSummaryFileHappy.nf
@@ -4,23 +4,24 @@ process EditSummaryFileHappy {
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
+        // meta should have the keys 'id', 'query' and 'truth'
         tuple(val(meta), path(summary_csv))
 
     output:
-        path("INDEL_PASS_summary.txt", includeInputs: false), emit: indel_pass_summary_txt
-        path("INDEL_ALL_summary.txt", includeInputs: false), emit: indel_all_summary_txt
-        path("SNP_PASS_summary.txt", includeInputs: false), emit: snp_pass_summary_txt
-        path("SNP_ALL_summary.txt", includeInputs: false), emit: snpp_all_summary_txt
+        path("INDEL_PASS_summary.csv"), emit: indel_pass_summary_csv
+        path("INDEL_ALL_summary.csv"), emit: indel_all_summary_csv
+        path("SNP_PASS_summary.csv"), emit: snp_pass_summary_csv
+        path("SNP_ALL_summary.csv"), emit: snp_all_summary_csv
 
     script:
         """
-        # Add samplenames as columns
-        sed '1s/$/,sample_query,sample_truth/; 2,$s/$/,${meta.query},${meta.truth}' ${summary_csv} > ${summary_csv}.tmp
+        # Add samplenames as columns (header and row values) at start of line
+        sed '1s/^/sample_query,sample_truth,/; 2,\$s/^/${meta.query},${meta.truth},/' ${summary_csv} > ${summary_csv}.tmp
         
-        # Split file and add column
-        awk '{print $0 > $3_$4"_summary.txt"}' ${summary_csv}.tmp
+        # Split file including header (first line)
+        awk -F',' 'FNR==1{hdr=\$0;next} {print hdr>\$3"_"\$4"_summary.csv"; print \$0>>\$3"_"\$4"_summary.csv"}' ${summary_csv}.tmp
         
         # Remove tmp files
-        rm *tmp     
+        rm ${summary_csv}.tmp     
         """
 }
\ No newline at end of file

From c20d8f8d111a498c535904c0646f45c644cf4636 Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Tue, 20 Feb 2024 12:52:18 +0100
Subject: [PATCH 19/73] add joined column of truth and query samples

---
 Utils/EditSummaryFileHappy.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Utils/EditSummaryFileHappy.nf b/Utils/EditSummaryFileHappy.nf
index d2219c2..8ddb472 100644
--- a/Utils/EditSummaryFileHappy.nf
+++ b/Utils/EditSummaryFileHappy.nf
@@ -16,7 +16,7 @@ process EditSummaryFileHappy {
     script:
         """
         # Add samplenames as columns (header and row values) at start of line
-        sed '1s/^/sample_query,sample_truth,/; 2,\$s/^/${meta.query},${meta.truth},/' ${summary_csv} > ${summary_csv}.tmp
+        sed '1s/^/samples,sample_truth,sample_query,/; 2,\$s/^/${meta.truth}_${meta.query},${meta.truth},${meta.query},/' ${summary_csv} > ${summary_csv}.tmp
         
         # Split file including header (first line)
         awk -F',' 'FNR==1{hdr=\$0;next} {print hdr>\$3"_"\$4"_summary.csv"; print \$0>>\$3"_"\$4"_summary.csv"}' ${summary_csv}.tmp

From d8c0bc084c7f829223b451cf471e2084b7fea486 Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Tue, 20 Feb 2024 12:52:28 +0100
Subject: [PATCH 20/73] add delim

---
 CheckQC/check_qc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py
index a47b14a..8e9a5d1 100644
--- a/CheckQC/check_qc.py
+++ b/CheckQC/check_qc.py
@@ -177,7 +177,7 @@ def create_and_write_output(qc_output, output_path, output_prefix):
 
 def read_and_judge_metrics(qc, metrics):
     for qc_file in metrics:
-        qc_metric_raw = read_csv(qc_file, comment=qc.get("comment", None), delimiter="\t", quotechar='"')
+        qc_metric_raw = read_csv(qc_file, comment=qc.get("comment", None), delimiter=qc.get("sep", "\t"), quotechar='"')
         report_cols = get_columns_to_report(qc["report_cols"], qc_metric_raw.columns.to_list(), qc["qc_col"])
         qc_metric_edit = add_and_rename_columns(qc_metric_raw, qc["title"], qc["qc_col"], qc["operator"], qc["threshold"])
         failed_rows = get_failed_rows(qc_metric_edit, "qc_value", qc["operator"], qc["threshold"])

From 78c04e7762133b824f792d605eaada33643e85b7 Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Tue, 20 Feb 2024 12:59:45 +0100
Subject: [PATCH 21/73] change column to Type and Filter

---
 Utils/EditSummaryFileHappy.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Utils/EditSummaryFileHappy.nf b/Utils/EditSummaryFileHappy.nf
index 8ddb472..20f29fc 100644
--- a/Utils/EditSummaryFileHappy.nf
+++ b/Utils/EditSummaryFileHappy.nf
@@ -19,7 +19,7 @@ process EditSummaryFileHappy {
         sed '1s/^/samples,sample_truth,sample_query,/; 2,\$s/^/${meta.truth}_${meta.query},${meta.truth},${meta.query},/' ${summary_csv} > ${summary_csv}.tmp
         
         # Split file including header (first line)
-        awk -F',' 'FNR==1{hdr=\$0;next} {print hdr>\$3"_"\$4"_summary.csv"; print \$0>>\$3"_"\$4"_summary.csv"}' ${summary_csv}.tmp
+        awk -F',' 'FNR==1{hdr=\$0;next} {print hdr>\$4"_"\$5"_summary.csv"; print \$0>>\$4"_"\$5"_summary.csv"}' ${summary_csv}.tmp
         
         # Remove tmp files
         rm ${summary_csv}.tmp     

From eb55d6b09b48e2dc684b8216e901f61f83564b5b Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Tue, 20 Feb 2024 13:45:26 +0100
Subject: [PATCH 22/73] use meta.id as tag

---
 Utils/EditSummaryFileHappy.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Utils/EditSummaryFileHappy.nf b/Utils/EditSummaryFileHappy.nf
index 20f29fc..83ef755 100644
--- a/Utils/EditSummaryFileHappy.nf
+++ b/Utils/EditSummaryFileHappy.nf
@@ -1,5 +1,5 @@
 process EditSummaryFileHappy {
-    tag {"EditSummaryFileHappy"}
+    tag "$meta.id"
     label 'EditSummaryFileHappy'
     shell = ['/bin/bash', '-euo', 'pipefail']
 

From 240008c4d5239c586e65aa472500987fa389af8e Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Tue, 20 Feb 2024 14:00:31 +0100
Subject: [PATCH 23/73] add stageAs to CheckQC

---
 CheckQC/CheckQC.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CheckQC/CheckQC.nf b/CheckQC/CheckQC.nf
index 19dbddd..2fe48db 100644
--- a/CheckQC/CheckQC.nf
+++ b/CheckQC/CheckQC.nf
@@ -6,7 +6,7 @@ process CheckQC {
 
     input:
         val(identifier)
-        path(input_files)
+        path(input_files, stageAs: "?/*")
      
     output:
         path("${identifier}_summary.csv", emit: qc_output)

From 56093d452f5a3ae1031f3a159aaf8cb6565636cf Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Tue, 20 Feb 2024 14:24:30 +0100
Subject: [PATCH 24/73] typo

---
 CheckQC/check_qc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py
index 8e9a5d1..db94a4b 100644
--- a/CheckQC/check_qc.py
+++ b/CheckQC/check_qc.py
@@ -177,7 +177,7 @@ def create_and_write_output(qc_output, output_path, output_prefix):
 
 def read_and_judge_metrics(qc, metrics):
     for qc_file in metrics:
-        qc_metric_raw = read_csv(qc_file, comment=qc.get("comment", None), delimiter=qc.get("sep", "\t"), quotechar='"')
+        qc_metric_raw = read_csv(qc_file, comment=qc.get("comment", None), delimiter=qc.get("delim", "\t"), quotechar='"')
         report_cols = get_columns_to_report(qc["report_cols"], qc_metric_raw.columns.to_list(), qc["qc_col"])
         qc_metric_edit = add_and_rename_columns(qc_metric_raw, qc["title"], qc["qc_col"], qc["operator"], qc["threshold"])
         failed_rows = get_failed_rows(qc_metric_edit, "qc_value", qc["operator"], qc["threshold"])

From 1036d5769fc04a024639b54a978ee4b560abe5ec Mon Sep 17 00:00:00 2001
From: mraves2 <M.L.Raves-2@umcutrecht.nl>
Date: Tue, 20 Feb 2024 17:01:32 +0100
Subject: [PATCH 25/73] used lintr and styler to clean up code

---
 DIMS/AssignToBins.R             |  71 ++--
 DIMS/AverageTechReplicates.R    | 198 +++++------
 DIMS/CollectFilled.R            |  75 ++---
 DIMS/FillMissing.R              |  24 +-
 DIMS/GenerateBreaks.R           |  46 ++-
 DIMS/GenerateExcel.R            | 574 +++++++++++++++++---------------
 DIMS/GenerateViolinPlots.R      | 295 ++++++++--------
 DIMS/HMDBparts.R                |  72 ++--
 DIMS/HMDBparts_main.R           |  33 +-
 DIMS/MakeInit.R                 |  22 +-
 DIMS/PeakFinding.R              |  16 +-
 DIMS/PeakGrouping.R             | 244 +++++++-------
 DIMS/SpectrumPeakFinding.R      |  71 ++--
 DIMS/SumAdducts.R               |  80 ++---
 DIMS/UnidentifiedCalcZscores.R  |  56 ++--
 DIMS/UnidentifiedCollectPeaks.R |  92 +----
 DIMS/UnidentifiedFillMissing.R  |  26 +-
 DIMS/UnidentifiedPeakGrouping.R | 127 ++++---
 18 files changed, 1019 insertions(+), 1103 deletions(-)

diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R
index cfbe15c..ed6984a 100644
--- a/DIMS/AssignToBins.R
+++ b/DIMS/AssignToBins.R
@@ -1,44 +1,43 @@
 #!/usr/bin/Rscript
 ## adapted from 2-DIMS.R
 
-# load required packages 
+# load required packages
 suppressPackageStartupMessages(library("xcms"))
 
-# define parameters 
+# define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-filepath        <- cmd_args[1] # location of mzML file
-breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData
-resol           <- as.numeric(cmd_args[3]) # 140000
+filepath        <- cmd_args[1]
+breaks_filepath <- cmd_args[2]
+resol           <- as.numeric(cmd_args[3])
 trim            <- 0.1
-dimsThresh      <- 100
+dims_thresh     <- 100
 
 # get sample name
-sample_name <- sub('\\..*$', '', basename(filepath))
+sample_name <- sub("\\..*$", "", basename(filepath))
 
-options(digits=16)
+options(digits = 16)
 
 # Initialize
-int.factor <- 1*10^5 # Number of x used to calc area under Gaussian (is not analytic)
+int_factor <- 1 * 10^5 # Number used to calculate area under Gaussian curve
 scale      <- 2 # Initial value used to estimate scaling parameter
 width      <- 1024
 height     <- 768
-trimLeft        <- NULL
-trimRight       <- NULL
+trim_left       <- NULL
+trim_right      <- NULL
 breaks_fwhm     <- NULL
 breaks_fwhm_avg <- NULL
 bins            <- NULL
 pos_results     <- NULL
 neg_results     <- NULL
 
-### process one sample at a time and find peaks FOR BOTH SCAN MODES! #
 # read in the data for 1 sample
 raw_data <- suppressMessages(xcmsRaw(filepath))
 
-# generate TIC plots. Prepare txt files with data for plots
-TIC_intensity_persample <- cbind(round(raw_data@scantime, 2), raw_data@tic)
-colnames(TIC_intensity_persample) <- c("retentionTime", "TIC")
-write.table(TIC_intensity_persample, file = paste0("./", sample_name, "_TIC.txt"))
+# for TIC plots: prepare txt files with data for plots
+tic_intensity_persample <- cbind(round(raw_data@scantime, 2), raw_data@tic)
+colnames(tic_intensity_persample) <- c("retention_time", "tic_intensity")
+write.table(tic_intensity_persample, file = paste0("./", sample_name, "_TIC.txt"))
 
 # load breaks_fwhm
 load(breaks_filepath)
@@ -54,45 +53,47 @@ raw_data_matrix <- rawMat(raw_data)
 # Get time values for positive and negative scans
 pos_times <- raw_data@scantime[raw_data@polarity == "positive"]
 neg_times <- raw_data@scantime[raw_data@polarity == "negative"]
-# Select scans between trimLeft and trimRight
-pos_times <- pos_times[pos_times > trimLeft & pos_times < trimRight]
-neg_times <- neg_times[neg_times > trimLeft & neg_times < trimRight]
+# Select scans between trim_left and trim_right
+pos_times <- pos_times[pos_times > trim_left & pos_times < trim_right]
+neg_times <- neg_times[neg_times > trim_left & neg_times < trim_right]
 
 # Generate an index with which to select values for each mode
-pos_index <- which(raw_data_matrix[ ,"time"] %in% pos_times)
-neg_index <- which(raw_data_matrix[ ,"time"] %in% neg_times)
+pos_index <- which(raw_data_matrix[, "time"] %in% pos_times)
+neg_index <- which(raw_data_matrix[, "time"] %in% neg_times)
 # Separate each mode into its own matrix
 pos_raw_data_matrix <- raw_data_matrix[pos_index, ]
 neg_raw_data_matrix <- raw_data_matrix[neg_index, ]
 
 # Get index for binning intensity values
-bin_indices_pos <- cut(pos_raw_data_matrix[ ,"mz"], breaks_fwhm, include.lowest=TRUE, right=TRUE, labels=FALSE)
-bin_indices_neg <- cut(neg_raw_data_matrix[ ,"mz"], breaks_fwhm, include.lowest=TRUE, right=TRUE, labels=FALSE)
+bin_indices_pos <- cut(pos_raw_data_matrix[, "mz"], breaks_fwhm,
+                       include.lowest = TRUE, right = TRUE, labels = FALSE)
+bin_indices_neg <- cut(neg_raw_data_matrix[, "mz"], breaks_fwhm,
+                       include.lowest = TRUE, right = TRUE, labels = FALSE)
 
 # Get the list of intensity values for each bin, and add the
 # intensity values which are in the same bin
 if (nrow(pos_raw_data_matrix) > 0) {
   # set NA in intensities to zero
-  pos_raw_data_matrix[is.na(pos_raw_data_matrix[,"intensity"]), "intensity"] <- 0
-  # use only values above dimsThresh
-  pos_intensity_above_threshold <- pos_raw_data_matrix[which(pos_raw_data_matrix[ ,"intensity"] > dimsThresh), "intensity"]
+  pos_raw_data_matrix[is.na(pos_raw_data_matrix[, "intensity"]), "intensity"] <- 0
+  # use only values above dims_thresh
+  pos_intensity_above_threshold <- pos_raw_data_matrix[which(pos_raw_data_matrix[, "intensity"] > dims_thresh), "intensity"]
   # aggregate intensities, calculate mean
   aggr_int_pos <- stats::aggregate(pos_intensity_above_threshold, list(bin_indices_pos), mean)
-  pos_bins[aggr_int_pos[ ,1]] <- aggr_int_pos[ ,2]
+  pos_bins[aggr_int_pos[, 1]] <- aggr_int_pos[, 2]
 }
 if (nrow(neg_raw_data_matrix) > 0) {
   # set NA in intensities to zero
-  neg_raw_data_matrix[is.na(neg_raw_data_matrix[,"intensity"]), "intensity"] <- 0
-  # use only values above dimsThresh
-  neg_intensity_above_threshold <- neg_raw_data_matrix[which(neg_raw_data_matrix[ ,"intensity"] > dimsThresh), "intensity"]
+  neg_raw_data_matrix[is.na(neg_raw_data_matrix[, "intensity"]), "intensity"] <- 0
+  # use only values above dims_thresh
+  neg_intensity_above_threshold <- neg_raw_data_matrix[which(neg_raw_data_matrix[, "intensity"] > dims_thresh), "intensity"]
   # aggregate intensities, calculate mean
   aggr_int_neg <- stats::aggregate(neg_intensity_above_threshold, list(bin_indices_neg), mean)
-  neg_bins[aggr_int_neg[ ,1]] <- aggr_int_neg[ ,2]
+  neg_bins[aggr_int_neg[, 1]] <- aggr_int_neg[, 2]
 }
 
 # Zero any values that are below the threshold
-pos_bins[pos_bins < dimsThresh] <- 0
-neg_bins[neg_bins < dimsThresh] <- 0
+pos_bins[pos_bins < dims_thresh] <- 0
+neg_bins[neg_bins < dims_thresh] <- 0
 
 pos_results <- cbind(pos_results, pos_bins)
 neg_results <- cbind(neg_results, neg_bins)
@@ -118,6 +119,6 @@ colnames(neg_results_transpose) <- breaks_fwhm_avg_minus1
 pos_results_final <- t(pos_results_transpose)
 neg_results_final <- t(neg_results_transpose)
 
-peak_list <- list("pos"=pos_results_final, "neg"=neg_results_final, "breaksFwhm"=breaks_fwhm)
+peak_list <- list("pos" = pos_results_final, "neg" = neg_results_final, "breaksFwhm" = breaks_fwhm)
 
-save(peak_list, file=paste("./", sample_name, ".RData", sep=""))
+save(peak_list, file = paste0("./", sample_name, ".RData"))
diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R
index f84c6ba..4c3d49b 100644
--- a/DIMS/AverageTechReplicates.R
+++ b/DIMS/AverageTechReplicates.R
@@ -1,173 +1,173 @@
 #!/usr/bin/Rscript
 # adapted from 3-AverageTechReplicates.R
 
-# define parameters 
+# define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
 init_filepath <- cmd_args[1]
 nr_replicates <- as.numeric(cmd_args[2])
 thresh2remove <- 1000000000
-dimsThresh    <- 100
+dims_thresh    <- 100
 run_name      <- cmd_args[3]
 dims_matrix   <- cmd_args[4]
 highest_mz    <- as.numeric(cmd_args[5])
 
-print(run_name)
-print(dims_matrix)
-print(highest_mz)
-
+# lower the threshold below which a sample will be removed for DBS and for high m/z
 if (dims_matrix == "DBS") {
-    thresh2remove <- 50000000
+  thresh2remove <- 50000000
 }
 if (highest_mz > 700) {
-    thresh2remove <- 1000000
+  thresh2remove <- 1000000
 }
 
 library("ggplot2")
 library("gridExtra")
 
-removeFromRepl.pat <- function(bad_samples, repl_pattern, nr_replicates) {
-  # bad_samples=remove_pos
+remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) {
   tmp <- repl_pattern
-  removeFromGroup <- NULL
-  
-  for (i in 1:length(tmp)){
-    tmp2 <- repl_pattern[[i]]
+  remove_from_group <- NULL
+  for (sample_nr in 1:length(tmp)){
+    tmp2 <- repl_pattern[[sample_nr]]
     remove <- NULL
-    for (j in 1:length(tmp2)){
-      if (tmp2[j] %in% bad_samples){
-        remove = c(remove, j)
+    for (file_name in 1:length(tmp2)) {
+      if (tmp2[file_name] %in% bad_samples) {
+        remove <- c(remove, file_name)
       }
     }
-    
-    if (length(remove) == nr_replicates) removeFromGroup <- c(removeFromGroup,i)
-    if (!is.null(remove)) repl_pattern[[i]] <- repl_pattern[[i]][-remove]
+    if (length(remove) == nr_replicates) {
+      remove_from_group <- c(remove_from_group, sample_nr)
+    }
+    if (!is.null(remove)) {
+      repl_pattern[[sample_nr]] <- repl_pattern[[sample_nr]][-remove]
+    }
   }
-  
-  if (length(removeFromGroup)!=0) {
-    repl_pattern <- repl_pattern[-removeFromGroup]
+  if (length(remove_from_group) != 0) {
+    repl_pattern <- repl_pattern[-remove_from_group]
   }
-  
-  return(list("pattern"=repl_pattern))
+  return(list("pattern" = repl_pattern))
 }
 
-
 # get repl_pattern
 load("./init.RData")
 
 remove_neg <- NULL
 remove_pos <- NULL
 cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n")
-for (i in 1:length(repl_pattern)) {
-  techRepsArray.pos <- NULL
-  techRepsArray.neg <- NULL
-  
-  tech_reps <- as.vector(unlist(repl_pattern[i]))
+for (sample_nr in 1:length(repl_pattern)) {
+  tech_reps_array_pos <- NULL
+  tech_reps_array_neg <- NULL
+  tech_reps <- as.vector(unlist(repl_pattern[sample_nr]))
   sum_neg <- 0
   sum_pos <- 0
-  n_pos <- 0
-  n_neg <- 0
-  for (j in 1:length(tech_reps)) {
-    load(paste("./", tech_reps[j], ".RData", sep=""))
-    cat("\n\tNegative peak_list sum", sum(peak_list$neg[,1]))
-    if (sum(peak_list$neg[,1]) < thresh2remove){
+  nr_pos <- 0
+  nr_neg <- 0
+  for (file_name in 1:length(tech_reps)) {
+    load(paste("./", tech_reps[file_name], ".RData", sep = ""))
+    # negative scanmode
+    cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1]))
+    if (sum(peak_list$neg[, 1]) < thresh2remove) {
       cat(" ... Removed")
-      remove_neg <- c(remove_neg, tech_reps[j])
+      remove_neg <- c(remove_neg, tech_reps[file_name])
     } else {
-      n_neg <- n_neg + 1
+      nr_neg <- nr_neg + 1
       sum_neg <- sum_neg + peak_list$neg
     }
-    
-    techRepsArray.neg <- cbind(techRepsArray.neg, peak_list$neg)
-    
-    cat("\n\tPositive peak_list sum", sum(peak_list$pos[,1]))
-    if (sum(peak_list$pos[,1]) < thresh2remove){
+    tech_reps_array_neg <- cbind(tech_reps_array_neg, peak_list$neg)
+    # positive scanmode
+    cat("\n\tPositive peak_list sum", sum(peak_list$pos[, 1]))
+    if (sum(peak_list$pos[, 1]) < thresh2remove) {
       cat(" ... Removed")
-      remove_pos <- c(remove_pos, tech_reps[j])
+      remove_pos <- c(remove_pos, tech_reps[file_name])
     } else {
-      n_pos <- n_pos + 1
+      nr_pos <- nr_pos + 1
       sum_pos <- sum_pos + peak_list$pos
     }
-    
-    techRepsArray.pos <- cbind(techRepsArray.pos, peak_list$pos)
+    tech_reps_array_pos <- cbind(tech_reps_array_pos, peak_list$pos)
   }
-  
-  # filter within bins on at least signal in more than one tech. rep.!!!
-  if (!is.null(dim(sum_pos))) sum_pos[apply(techRepsArray.pos,1,function(x) length(which(x>dimsThresh))==1),1]=0
-  if (!is.null(dim(sum_neg))) sum_neg[apply(techRepsArray.neg,1,function(x) length(which(x>dimsThresh))==1),1]=0
-  
-  if (n_neg != 0){
-    sum_neg[,1] <- sum_neg[,1]/n_neg
-    colnames(sum_neg) <- names(repl_pattern)[i]
-    save(sum_neg, file=paste("./", names(repl_pattern)[i], "_neg_avg.RData", sep=""))
+  # save to file  
+  if (nr_neg != 0) {
+    sum_neg[, 1] <- sum_neg[, 1] / nr_neg
+    colnames(sum_neg) <- names(repl_pattern)[sample_nr]
+    save(sum_neg, file = paste0("./", names(repl_pattern)[sample_nr], "_neg_avg.RData"))
   }
-  if (n_pos != 0) {
-    sum_pos[,1] <- sum_pos[,1]/n_pos
-    colnames(sum_pos) <- names(repl_pattern)[i]
-    save(sum_pos, file=paste("./", names(repl_pattern)[i], "_pos_avg.RData", sep=""))
+  if (nr_pos != 0) {
+    sum_pos[, 1] <- sum_pos[, 1] / nr_pos
+    colnames(sum_pos) <- names(repl_pattern)[sample_nr]
+    save(sum_pos, file = paste0("./", names(repl_pattern)[sample_nr], "_pos_avg.RData"))
   }
 }
 
-retVal <- removeFromRepl.pat(remove_neg, repl_pattern, nr_replicates)
-repl_pattern_filtered <- retVal$pattern
+pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates)
+repl_pattern_filtered <- pattern_list$pattern
 save(repl_pattern_filtered, file = "./negative_repl_pattern.RData")
-write.table(remove_neg, file = "./miss_infusions_negative.txt", row.names=FALSE, col.names=FALSE , sep= "\t")
+write.table(remove_neg, file = "./miss_infusions_negative.txt",
+            row.names = FALSE, col.names = FALSE, sep = "\t")
 
-retVal <- removeFromRepl.pat(remove_pos, repl_pattern, nr_replicates)
-repl_pattern_filtered <- retVal$pattern
+pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates)
+repl_pattern_filtered <- pattern_list$pattern
 save(repl_pattern_filtered, file = "./positive_repl_pattern.RData")
-write.table(remove_pos, file = "./miss_infusions_positive.txt", row.names=FALSE, col.names=FALSE , sep= "\t")
+write.table(remove_pos, file = "./miss_infusions_positive.txt",
+            row.names = FALSE, col.names = FALSE, sep = "\t")
 
 # New: generate TIC plots
 # get all txt files
-TIC_files = list.files("./", full.names=TRUE, pattern="*TIC.txt")
-all_samps <- sub('_TIC\\..*$', '', basename(TIC_files))
+tic_files <- list.files("./", full.names = TRUE, pattern = "*TIC.txt")
+all_samps <- sub("_TIC\\..*$", "", basename(tic_files))
 
 # determine maximum intensity
 highest_tic_max <- 0
-for (file in TIC_files) {
+for (file in tic_files) {
   tic <- read.table(file)
-  this_tic_max <- max(tic$TIC)
+  this_tic_max <- max(tic$tic_intensity)
   if (this_tic_max > highest_tic_max) {
     highest_tic_max <- this_tic_max
-    max_sample <- sub('_TIC\\..*$', '', basename(file))
-    }
+    max_sample <- sub("_TIC\\..*$", "", basename(file))
+  }
 }
 
 tic_plot_list <- list()
-k = 0
-for (i in c(1:length(repl_pattern))) { # change after test-phase !!!
-  tech_reps <- as.vector(unlist(repl_pattern[i]))
-  sample_name <- names(repl_pattern)[i]
-  for (j in 1:length(tech_reps)) {
-    k = k + 1
-    repl1_nr <- read.table(TIC_files[j])
-    bad_color_pos <- tech_reps[j] %in% remove_pos[[1]]
-    bad_color_neg <- tech_reps[j] %in% remove_neg[[1]]
+plot_nr <-  0
+for (sample_nr in c(1:length(repl_pattern))) {
+  tech_reps <- as.vector(unlist(repl_pattern[sample_nr]))
+  sample_name <- names(repl_pattern)[sample_nr]
+  for (file_name in 1:length(tech_reps)) {
+    plot_nr <- plot_nr + 1
+    repl1_nr <- read.table(tic_files[file_name])
+    bad_color_pos <- tech_reps[file_name] %in% remove_pos[[1]]
+    bad_color_neg <- tech_reps[file_name] %in% remove_neg[[1]]
     if (bad_color_neg & bad_color_pos) {
-	plotcolor = '#F8766D'
+      plot_color <- "#F8766D"
     } else if (bad_color_pos) {
-	plotcolor = "#ED8141"
+      plot_color <- "#ED8141"
     } else if (bad_color_neg) {
-	plotcolor = "#BF80FF"
+      plot_color <- "#BF80FF"
     } else {
-	plotcolor = 'white'
+      plot_color <- "white"
     }
-    tic_plot <- ggplot(repl1_nr, aes(retentionTime, TIC)) +
+    tic_plot <- ggplot(repl1_nr, aes(retention_time, tic_intensity)) +
       geom_line(linewidth = 0.3) +
       geom_hline(yintercept = highest_tic_max, col = "grey", linetype = 2, linewidth = 0.3) +
-      labs(x = 't (s)', y = 'TIC', title = paste0(tech_reps[j], "  ||  ", sample_name)) +
-      theme(plot.background = element_rect(fill = plotcolor), axis.text = element_text(size = 4), axis.title = element_text(size = 4), plot.title = element_text(size = 6))
-    tic_plot_list[[k]] <- tic_plot
+      labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[j], "  ||  ", sample_name)) +
+      theme(plot.background = element_rect(fill = plot_color),
+            axis.text = element_text(size = 4),
+            axis.title = element_text(size = 4),
+            plot.title = element_text(size = 6))
+    tic_plot_list[[plot_nr]] <- tic_plot
   }
-
 }
-# create a layout matrix dependent on numer of replicates
+# create a layout matrix dependent on number of replicates
 layout <- matrix(1:(10 * nr_replicates), 10, nr_replicates, TRUE)
-
-tic_plot_pdf <- marrangeGrob(grobs = tic_plot_list,
-			     nrow = 10,
-			     ncol = nr_replicates,
-			     layout_matrix = layout,
-			     top = quote(paste("TICs of run", run_name," \n colors: red = both modes misinjection, orange = pos mode misinjection, purple = neg mode misinjection \n ", g, "/", npages)))
-ggsave(filename = paste0("./../../../", run_name, "_TICplots.pdf"), tic_plot_pdf, width = 21, height = 29.7, units = "cm")
+# put TIC plots in matrix
+tic_plot_pdf <- marrangeGrob(
+  grobs = tic_plot_list,
+  nrow = 10, ncol = nr_replicates,
+  layout_matrix = layout,
+  top = quote(paste(
+    "TICs of run", run_name,
+    " \n colors: red = both modes misinjection, orange = pos mode misinjection, purple = neg mode misinjection \n ",
+    g, "/", npages
+  ))
+)
+# save to file
+ggsave(filename = paste0("./../../../Bioinformatics/", run_name, "_TICplots.pdf"),
+       tic_plot_pdf, width = 21, height = 29.7, units = "cm")
diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R
index e927a65..9e97db2 100755
--- a/DIMS/CollectFilled.R
+++ b/DIMS/CollectFilled.R
@@ -1,7 +1,7 @@
 #!/usr/bin/Rscript
 ## adapted from 10-collectSamplesFilled.R
 
-# define parameters 
+# define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
 scripts_dir        <- cmd_args[1]
@@ -10,70 +10,65 @@ z_score <- as.numeric(cmd_args[3])
 
 source(paste0(scripts_dir, "AddOnFunctions/mergeDuplicatedRows.R"))
 source(paste0(scripts_dir, "AddOnFunctions/statistics_z.R"))
-# source(paste0(scripts_dir, "AddOnFunctions/normalization_2.1.R"))
 
 # for each scan mode, collect all filled peak group lists
 scanmodes <- c("positive", "negative")
 
 for (scanmode in scanmodes) {
   # get list of files
-  filled_files <- list.files("./", full.names=TRUE, pattern=scanmode)
+  filled_files <- list.files("./", full.names = TRUE, pattern = scanmode)
   # load files and combine into one object
-  outlist.tot <- NULL
-  for (i in 1:length(filled_files)) {
-    load(filled_files[i])
-    outlist.tot <- rbind(outlist.tot, peakgrouplist_filled)
+  outlist_total <- NULL
+  for (file_nr in 1:length(filled_files)) {
+    load(filled_files[file_nr])
+    outlist_total <- rbind(outlist_total, peakgrouplist_filled)
   }
 
   # remove duplicates; peak groups with exactly the same m/z
-  outlist.tot <- mergeDuplicatedRows(outlist.tot)
+  outlist_total <- mergeDuplicatedRows(outlist_total)
 
   # sort on mass
-  outlist.tot <- outlist.tot[order(outlist.tot[ ,"mzmed.pgrp"]),]
+  outlist_total <- outlist_total[order(outlist_total[, "mzmed.pgrp"]), ]
 
   # load replication pattern
   pattern_file <- paste0(scanmode, "_repl_pattern.RData")
   repl_pattern <- get(load(pattern_file))
 
-  # Normalization: not done.
-  # if (normalization != "disabled") {
-  #   outlist.tot = normalization_2.1(outlist.tot, fileName, names(repl.pattern.filtered), on=normalization, assi_label="assi_HMDB")
-  # }
-
   if (z_score == 1) {
-    outlist.stats <- statistics_z(outlist.tot, sortCol=NULL, adducts=FALSE)
-    nr.removed.samples <- length(which(repl_pattern[]=="character(0)"))
-    order.index.int <- order(colnames(outlist.stats)[8:(length(repl_pattern)-nr.removed.samples+7)])
-    outlist.stats.more <- cbind(outlist.stats[,1:7],
-                               outlist.stats[,(length(repl_pattern)-nr.removed.samples+8):(length(repl_pattern)-nr.removed.samples+8+6)],
-                               outlist.stats[,8:(length(repl_pattern)-nr.removed.samples+7)][order.index.int],
-                               outlist.stats[,(length(repl_pattern)-nr.removed.samples+5+10):ncol(outlist.stats)])
-  
-    tmp.index <- grep("_Zscore", colnames(outlist.stats.more), fixed = TRUE)
-    tmp.index.order <- order(colnames(outlist.stats.more[,tmp.index]))
-    tmp <- outlist.stats.more[,tmp.index[tmp.index.order]]
-    outlist.stats.more <- outlist.stats.more[,-tmp.index]
-    outlist.stats.more <- cbind(outlist.stats.more,tmp)
-    outlist.tot <- outlist.stats.more
+    outlist_stats <- statistics_z(outlist_total, sortCol = NULL, adducts = FALSE)
+    nr_removed_samples <- length(which(repl_pattern[] == "character(0)"))
+    order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)])
+    outlist_stats_more <- cbind(
+      outlist_stats[, 1:7],
+      outlist_stats[, (length(repl_pattern) - nr_removed_samples + 8):(length(repl_pattern) - nr_removed_samples + 8 + 6)],
+      outlist_stats[, 8:(length(repl_pattern) - nr_removed_samples + 7)][order_index_int],
+      outlist_stats[, (length(repl_pattern) - nr_removed_samples + 5 + 10):ncol(outlist_stats)]
+    )
+
+    tmp_index <- grep("_Zscore", colnames(outlist_stats.more), fixed = TRUE)
+    tmp_index_order <- order(colnames(outlist_stats.more[, tmp_index]))
+    tmp <- outlist_stats.more[, tmp_index[tmp_index_order]]
+    outlist_stats_more <- outlist_stats.more[, -tmp_index]
+    outlist_stats_more <- cbind(outlist_stats_more, tmp)
+    outlist_total <- outlist_stats_more
   }
 
-  outlist.ident <- outlist.tot
+  outlist_ident <- outlist_total
 
   if (z_score == 1) {
-    outlist.ident$ppmdev <- as.numeric(outlist.ident$ppmdev)
-    outlist.ident <- outlist.ident[which(outlist.ident["ppmdev"] >= -ppm & outlist.ident["ppmdev"] <= ppm),]
+    outlist_ident$ppmdev <- as.numeric(outlist_ident$ppmdev)
+    outlist_ident <- outlist_ident[which(outlist_ident["ppmdev"] >= -ppm & outlist_ident["ppmdev"] <= ppm), ]
   }
   # take care of NAs in theormz_noise
-  outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] <- 0
-  outlist.ident$theormz_noise <- as.numeric(outlist.ident$theormz_noise)
-  outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] <- 0
-  outlist.ident$theormz_noise <- as.numeric(outlist.ident$theormz_noise)
+  outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0
+  outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise)
+  outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0
+  outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise)
 
   # Extra output in Excel-readable format:
   remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp")
-  remove_colindex <- which(colnames(outlist.ident) %in% remove_columns)
-  outlist.ident <- outlist.ident[ , -remove_colindex]
-  write.table(outlist.ident, file=paste0("outlist_identified_", scanmode, ".txt"), sep="\t", row.names = FALSE)
-  save(outlist.ident, file=paste0("outlist_identified_", scanmode, ".RData"))
-
+  remove_colindex <- which(colnames(outlist_ident) %in% remove_columns)
+  outlist_ident <- outlist_ident[, -remove_colindex]
+  write.table(outlist_ident, file = paste0("outlist_identified_", scanmode, ".txt"), sep = "\t", row.names = FALSE)
+  save(outlist_ident, file = paste0("outlist_identified_", scanmode, ".RData"))
 }
diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R
index a2e29ff..0364ad2 100755
--- a/DIMS/FillMissing.R
+++ b/DIMS/FillMissing.R
@@ -3,9 +3,9 @@
 
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
-for (arg in cmd_args) cat("  ", arg, "\n", sep="")
+for (arg in cmd_args) cat("  ", arg, "\n", sep = "")
 
-# define parameters 
+# define parameters
 peakgrouplist_file <- cmd_args[1]
 scripts_dir        <- cmd_args[2]
 thresh  <- as.numeric(cmd_args[3])
@@ -13,13 +13,11 @@ resol   <- as.numeric(cmd_args[4])
 ppm     <- as.numeric(cmd_args[5])
 outdir <- "./"
 
-print(peakgrouplist_file)
-
-if (grepl("_pos", peakgrouplist_file)) { scanmode = "positive" } else
-    if (grepl("_neg", peakgrouplist_file)) { scanmode = "negative" }
-
-print(scanmode)
-print(scripts_dir)
+if (grepl("_pos", peakgrouplist_file)) {
+  scanmode <- "positive"
+} else if (grepl("_neg", peakgrouplist_file)) {
+  scanmode <- "negative"
+}
 
 # load in function scripts
 source(paste0(scripts_dir, "AddOnFunctions/replaceZeros.R"))
@@ -41,14 +39,10 @@ print(head(repl_pattern))
 # load peak group list and determine output file name
 outpgrlist_identified <- get(load(peakgrouplist_file))
 
-print(head(outpgrlist_identified))
-
 outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file)
 
-print(outputfile_name)
-
 # replace missing values (zeros) with random noise
 peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm)
 
-# save output 
-save(peakgrouplist_filled, file=paste0("./", outputfile_name))
+# save output
+save(peakgrouplist_filled, file = paste0("./", outputfile_name))
diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R
index 8a3ca07..fd60723 100644
--- a/DIMS/GenerateBreaks.R
+++ b/DIMS/GenerateBreaks.R
@@ -2,54 +2,52 @@
 ## adapted from 1-generateBreaksFwhm.HPC.R ##
 #!/usr/bin/Rscript
 
-# load required package 
+# load required package
 suppressPackageStartupMessages(library("xcms"))
 
-# define parameters 
+# define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
 filepath <- cmd_args[1] # 1 of the mzML files
-outdir   <- cmd_args[2] 
+outdir   <- cmd_args[2]
 trim     <- as.numeric(cmd_args[3]) # 0.1
 resol    <- as.numeric(cmd_args[4]) # 140000
 
 # initialize
-trimLeft        <- NULL
-trimRight       <- NULL
+trim_left        <- NULL
+trim_right       <- NULL
 breaks_fwhm     <- NULL
 breaks_fwhm_avg <- NULL
 bins            <- NULL
-posRes          <- NULL
-negRes          <- NULL
 
 # read in mzML file
 raw_data <- suppressMessages(xcmsRaw(filepath))
 
 # trim (remove) scans at the start and end
-trimLeft  <- round(raw_data@scantime[length(raw_data@scantime)*trim])
-trimRight <- round(raw_data@scantime[length(raw_data@scantime)*(1-trim)])
+trim_left  <- round(raw_data@scantime[length(raw_data@scantime) * trim])
+trim_right <- round(raw_data@scantime[length(raw_data@scantime) * (1 - trim)])
 
 # Mass range m/z
-lowMZ  <- raw_data@mzrange[1]
-highMZ <- raw_data@mzrange[2]
+low_mz  <- raw_data@mzrange[1]
+high_mz <- raw_data@mzrange[2]
 
 # determine number of segments (bins)
-nsegment <- 2*(highMZ-lowMZ)
-segment  <- seq(from=lowMZ, to=highMZ, length.out=nsegment+1)
+nr_segments <- 2 * (high_mz - low_mz)
+segment  <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1)
 
 # determine start and end of each bin.
-for (i in 1:nsegment) {
-  startsegm   <- segment[i]
-  endsegm     <- segment[i+1]
-  resol_mz    <- resol*(1/sqrt(2)^(log2(startsegm/200)))
-  fwhmsegm    <- startsegm/resol_mz
-  breaks_fwhm <- c(breaks_fwhm, seq(from=(startsegm + fwhmsegm), to=endsegm, by=0.2*fwhmsegm))
+for (i in 1:nr_segments) {
+  start_segment   <- segment[i]
+  end_segment     <- segment[i+1]
+  resol_mz        <- resol*(1 / sqrt(2) ^ (log2(start_segment / 200)))
+  fwhm_segment    <- start_segment / resol_mz
+  breaks_fwhm     <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment))
   # average the m/z instead of start value
-  range <- seq(from=(startsegm + fwhmsegm), to=endsegm, by=0.2*fwhmsegm)
-  deltaMZ <- range[2] - range[1]
-  breaks_fwhm_avg <- c(breaks_fwhm_avg, range + 0.5*deltaMZ)
+  range <- seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)
+  delta_mz <- range[2] - range[1]
+  breaks_fwhm_avg <- c(breaks_fwhm_avg, range + 0.5 * delta_mz)
 }
 
 # generate output file
-save(breaks_fwhm, breaks_fwhm_avg, trimLeft, trimRight, file="./breaks.fwhm.RData")
-write(highMZ, file="./highest_mz.txt")
+save(breaks_fwhm, breaks_fwhm_avg, trim_left, trim_right, file = "./breaks.fwhm.RData")
+write(high_mz, file = "./highest_mz.txt")
diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R
index 0fd394f..029498f 100644
--- a/DIMS/GenerateExcel.R
+++ b/DIMS/GenerateExcel.R
@@ -1,17 +1,16 @@
 #!/usr/bin/Rscript
 ## adapted from 13-excelExport.R
 
-# define parameters 
+# define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-# outdir <- cmd_args[1] #"/Users/nunen/Documents/Metab/test_set"
 init_filepath <- cmd_args[1]
-project <- cmd_args[2] #"test"
-dims_matrix <- cmd_args[3] #"DBS"
-hmdb <- cmd_args[4] #"/hpc/dbg_mz/tools/db/HMDB_with_info_relevance_IS_C5OH.RData"
-z_score <- as.numeric(cmd_args[5])
+project       <- cmd_args[2]
+dims_matrix   <- cmd_args[3]
+hmdb          <- cmd_args[4]
+z_score       <- as.numeric(cmd_args[5])
 
-# load required packages 
+# load required packages
 library("ggplot2")
 library("reshape2")
 library("openxlsx")
@@ -19,256 +18,264 @@ library("loder")
 suppressMessages(library("dplyr"))
 suppressMessages(library("stringr"))
 
+round_df <- function(df, digits) {
+  #' function for rounding numbers to x digits for numeric values
+  #'
+  #' @param df dataframe
+  #' @param digits integer number of digits to round off to
+  #'
+  #' @return df
+  numeric_columns <- sapply(df, mode) == "numeric"
+  df[numeric_columns] <- round(df[numeric_columns], digits)
+  return(df)
+}
+
+robust_scaler <- function(control_intensities, control_col_ids, perc = 5) {
+  #' calculate robust scaler: Z-score based on controls without outliers
+  #'
+  #' @param control_intensities matrix with intensities for control samples
+  #' @param control_col_ids vector with column names for control samples
+  #' @param perc float percentage of outliers which will be removed from controls
+  #'
+  #' @return trimmed_control_intensities
+  nr_toremove <- ceiling(length(control_col_ids) * perc / 100)
+  sorted_control_intensities <- sort(as.numeric(control_intensities))
+  trimmed_control_intensities <- sorted_control_intensities[(nr_toremove + 1) : (length(sorted_control_intensities) - nr_toremove)]
+  return(trimmed_control_intensities)
+}
+
 # Initialise and load data
-plot <- TRUE
+plot   <- TRUE
 export <- TRUE
 control_label <- "C"
-case_label <- "P"
+case_label    <- "P"
 imagesize_multiplier <- 2
 # setting outdir to export files to the project directory
-outdir <- "../../../"
+outdir <- "./"
 # percentage of outliers to remove from calculation of robust scaler
 perc <- 5
 
 # load information on samples
 load(init_filepath)
 # load the HMDB file with info on biological relevance of metabolites
-load(hmdb) # load object rlvnc into global environment
+load(hmdb)
 
+# get current date
 rundate <- Sys.Date()
 
 # create a directory for plots in project directory
 plotdir <- paste0(outdir, "/plots/adducts")
-dir.create(paste0(outdir, "/plots"), showWarnings = F)
-dir.create(plotdir, showWarnings = F)
+dir.create(paste0(outdir, "/plots"), showWarnings = FALSE)
+dir.create(plotdir, showWarnings = FALSE)
 
-options(digits=16)
+# set the number of digits for floats
+options(digits = 16)
 
-# sum positive and negative adductsums
-# Load pos and neg adduct sums
+# load positive and negative adduct sums
 load("AdductSums_negative.RData")
-outlist.neg.adducts.HMDB <- outlist.tot
+outlist_neg_adducts_hmdb <- outlist.tot
 load("AdductSums_positive.RData")
-outlist.pos.adducts.HMDB <- outlist.tot
+outlist_pos_adducts_hmdb <- outlist.tot
 rm(outlist.tot)
 
 # Only continue with patients (columns) that are in both pos and neg, so patients that are in both
-tmp <- intersect(colnames(outlist.neg.adducts.HMDB), colnames(outlist.pos.adducts.HMDB))
-outlist.neg.adducts.HMDB <- outlist.neg.adducts.HMDB[,tmp]
-outlist.pos.adducts.HMDB <- outlist.pos.adducts.HMDB[,tmp]
+tmp <- intersect(colnames(outlist_neg_adducts_hmdb), colnames(outlist_pos_adducts_hmdb))
+outlist_neg_adducts_hmdb <- outlist_neg_adducts_hmdb[, tmp]
+outlist_pos_adducts_hmdb <- outlist_pos_adducts_hmdb[, tmp]
 
 # Find indexes of neg hmdb code that are also found in pos and vice versa
-index.neg <- which(rownames(outlist.neg.adducts.HMDB) %in% rownames(outlist.pos.adducts.HMDB))
-index.pos <- which(rownames(outlist.pos.adducts.HMDB) %in% rownames(outlist.neg.adducts.HMDB))
+index_neg <- which(rownames(outlist_neg_adducts_hmdb) %in% rownames(outlist_pos_adducts_hmdb))
+index_pos <- which(rownames(outlist_pos_adducts_hmdb) %in% rownames(outlist_neg_adducts_hmdb))
 
-# Get number of columns
-# Only continue with HMDB codes (rows) that were found in both pos and neg mode and remove last column (hmdb_name)
-tmp.pos <- outlist.pos.adducts.HMDB[rownames(outlist.pos.adducts.HMDB)[index.pos], 1:(dim(outlist.pos.adducts.HMDB)[2]-1)]
-tmp.hmdb_name.pos <- outlist.pos.adducts.HMDB[rownames(outlist.pos.adducts.HMDB)[index.pos], dim(outlist.pos.adducts.HMDB)[2]]
-tmp.pos.left <- outlist.pos.adducts.HMDB[-index.pos,]
-
-tmp.neg <- outlist.neg.adducts.HMDB[rownames(outlist.pos.adducts.HMDB)[index.pos], 1:(dim(outlist.neg.adducts.HMDB)[2]-1)]
-tmp.neg.left <- outlist.neg.adducts.HMDB[-index.neg,]
+# Only continue with HMDB codes (rows) that were found in both positive mode and remove last column (hmdb_name)
+tmp_pos <- outlist_pos_adducts_hmdb[rownames(outlist_pos_adducts_hmdb)[index_pos], 1:(dim(outlist_pos_adducts_hmdb)[2] - 1)]
+tmp_hmdb_name_pos <- outlist_pos_adducts_hmdb[rownames(outlist_pos_adducts_hmdb)[index_pos], dim(outlist_pos_adducts_hmdb)[2]]
+tmp_pos_left <- outlist_pos_adducts_hmdb[-index_pos, ]
+# same for negative mode
+tmp_neg <- outlist_neg_adducts_hmdb[rownames(outlist_pos_adducts_hmdb)[index_pos], 1:(dim(outlist_neg_adducts_hmdb)[2] - 1)]
+tmp_neg_left <- outlist_neg_adducts_hmdb[-index_neg, ]
 
 # Combine positive and negative numbers and paste back HMDB column
-tmp <- apply(tmp.pos, 2,as.numeric) + apply(tmp.neg, 2,as.numeric)
-rownames(tmp) <- rownames(tmp.pos)
-tmp <- cbind(tmp, "HMDB_name"=tmp.hmdb_name.pos)
-outlist <- rbind(tmp, tmp.pos.left, tmp.neg.left)
-
-# Filter 
-peaksInList <- which(rownames(outlist) %in% rownames(rlvnc))
-outlist <- cbind(outlist[peaksInList,],as.data.frame(rlvnc[rownames(outlist)[peaksInList],]))
-# filter out all irrelevant HMDB's. the tibble::rownames is needed for the older version of dplyr on the HPC (it will reindex the rownames)
-outlist <- outlist %>% 
-  tibble::rownames_to_column('rowname') %>% 
+tmp <- apply(tmp_pos, 2, as.numeric) + apply(tmp_neg, 2, as.numeric)
+rownames(tmp) <- rownames(tmp_pos)
+tmp <- cbind(tmp, "HMDB_name" = tmp_hmdb_name_pos)
+outlist <- rbind(tmp, tmp_pos_left, tmp_neg_left)
+
+# Filter for biological relevance
+peaks_in_list <- which(rownames(outlist) %in% rownames(rlvnc))
+outlist <- cbind(outlist[peaks_in_list, ], as.data.frame(rlvnc[rownames(outlist)[peaks_in_list], ]))
+# filter out all irrelevant HMDBs
+outlist <- outlist %>%
+  tibble::rownames_to_column("rowname") %>%
   filter(!grepl("Exogenous|Drug|exogenous", relevance)) %>%
-  tibble::column_to_rownames('rowname')
+  tibble::column_to_rownames("rowname")
 
 # Add HMDB_code column with all the HMDB ID and sort on it
 outlist <- cbind(outlist, "HMDB_code" = rownames(outlist))
-outlist <- outlist[order(outlist[,"HMDB_code"]),]
+outlist <- outlist[order(outlist[, "HMDB_code"]), ]
 
 # Create excel
 filelist <- "AllPeakGroups"
-
 wb <- createWorkbook("SinglePatient")
 addWorksheet(wb, filelist)
 
-# small function for rounding numbers to x digits for numeric values
-round_df <- function(df, digits) {
-  # round all numeric variables
-  # df: data frame 
-  # digits: number of digits to round
-  numeric_columns <- sapply(df, mode) == 'numeric'
-  df[numeric_columns] <- round(df[numeric_columns], digits)
-  df
-}
-
 # Add Z-scores and create plots
 if (z_score == 1) {
-  # Statistics: Z-score
+  # add a column for plots
   outlist <- cbind(plots = NA, outlist)
-  #outlist <- as.data.frame(outlist)
-  startcol <- dim(outlist)[2] + 3 
-  
-  # Get columns with control intensities 
+  # two columns will be added for mean and stdev of controls; Z-scores start at ncol + 3
+  startcol <- ncol(outlist) + 3
+
+  # Get columns with control intensities
   control_col_ids <- grep(control_label, colnames(outlist), fixed = TRUE)
   control_columns <- outlist[, control_col_ids]
-  
-  # Get columns with patient intensities 
+
+  # Get columns with patient intensities
   patient_col_ids <- grep(case_label, colnames(outlist), fixed = TRUE)
   patient_columns <- outlist[, patient_col_ids]
   intensity_col_ids <- c(control_col_ids, patient_col_ids)
-  
-  # set intensities of 0 to NA?
-  outlist[,intensity_col_ids][outlist[,intensity_col_ids] == 0] <- NA
 
-  # Extra output: save outlist as it is and use it to calculate robust scaler
-  outlist.noZ <- outlist
+  # if there are any intensities of 0 left, set them to NA for stats
+  outlist[, intensity_col_ids][outlist[, intensity_col_ids] == 0] <- NA
+
+  # save outlist as it is and use it to calculate robust scaler
+  outlist_noZ <- outlist
 
   # calculate mean and sd for Control group
-  outlist$avg.ctrls <- apply(control_columns, 1, function(x) mean(as.numeric(x),na.rm = TRUE))
-  outlist$sd.ctrls <- apply(control_columns, 1, function(x) sd(as.numeric(x),na.rm = TRUE))
-  
+  outlist$avg.ctrls <- apply(control_columns, 1, function(x) mean(as.numeric(x), na.rm = TRUE))
+  outlist$sd.ctrls  <- apply(control_columns, 1, function(x) sd(as.numeric(x), na.rm = TRUE))
+
   # Make and add columns with zscores
-  cnames.z <- NULL
+  colnames_z <- NULL
   for (i in intensity_col_ids) {
     cname <- colnames(outlist)[i]
-    cnames.z <- c(cnames.z, paste(cname, "Zscore", sep="_"))
-    zscores.1col <- (as.numeric(as.vector(unlist(outlist[ , i]))) - outlist$avg.ctrls) / outlist$sd.ctrls
-    outlist <- cbind(outlist, zscores.1col)
-  }
-  colnames(outlist)[startcol:ncol(outlist)] <- cnames.z
-
-  # Extra output: calculate robust scaler (Zscores minus outliers in Controls)
-  # calculate mean and sd for Control group without outliers
-  outlist.noZ$avg.ctrls <- 0
-  outlist.noZ$sd.ctrls  <- 0
-
-  robust_scaler <- function(control_intensities, control_col_ids, perc=5) {
-    nr_toremove <- ceiling(length(control_col_ids)*perc/100)
-    sorted_control_intensities <- sort(as.numeric(control_intensities))
-    trimmed_control_intensities <- sorted_control_intensities[(nr_toremove+1):(length(sorted_control_intensities)-nr_toremove)]
-    return(trimmed_control_intensities)
+    colnames_z <- c(colnames_z, paste(cname, "Zscore", sep = "_"))
+    zscores_1col <- (as.numeric(as.vector(unlist(outlist[, i]))) - outlist$avg.ctrls) / outlist$sd.ctrls
+    outlist <- cbind(outlist, zscores_1col)
   }
+  colnames(outlist)[startcol:ncol(outlist)] <- colnames_z
+
+  # calculate robust scaler (Zscores minus outliers in Controls)
+  outlist_noZ$avg.ctrls <- 0
+  outlist_noZ$sd.ctrls  <- 0
 
   # only calculate robust Z-scores if there are enough Controls
   if (length(control_col_ids) > 10) {
     for (metabolite_index in 1:nrow(outlist)) {
-      outlist.noZ$avg.ctrls[metabolite_index] <- mean(robust_scaler(outlist.noZ[metabolite_index, control_col_ids], control_col_ids, perc))
-      outlist.noZ$sd.ctrls[metabolite_index]  <-   sd(robust_scaler(outlist.noZ[metabolite_index, control_col_ids], control_col_ids, perc))
+      outlist_noZ$avg.ctrls[metabolite_index] <- mean(robust_scaler(outlist_noZ[metabolite_index, control_col_ids],
+                                                                    control_col_ids, perc))
+      outlist_noZ$sd.ctrls[metabolite_index]  <-   sd(robust_scaler(outlist_noZ[metabolite_index, control_col_ids],
+                                                                    control_col_ids, perc))
     }
   }
 
   # Make and add columns with robust zscores
-  cnames.robust <- gsub("_Zscore", "_RobustZscore", cnames.z)
+  cnames_robust <- gsub("_Zscore", "_RobustZscore", colnames_z)
   for (i in intensity_col_ids) {
-    zscores.1col <- (as.numeric(as.vector(unlist(outlist.noZ[ , i]))) - outlist.noZ$avg.ctrls) / outlist.noZ$sd.ctrls
-    outlist.noZ <- cbind(outlist.noZ, zscores.1col)
+    zscores_1col <- (as.numeric(as.vector(unlist(outlist_noZ[, i]))) - outlist_noZ$avg.ctrls) / outlist_noZ$sd.ctrls
+    outlist_noZ <- cbind(outlist_noZ, zscores_1col)
   }
-  colnames(outlist.noZ)[startcol:ncol(outlist.noZ)] <- cnames.robust
+  colnames(outlist_noZ)[startcol:ncol(outlist_noZ)] <- cnames_robust
 
-  # Extra output: metabolites filtered on relevance
-  save(outlist, file=paste0("AdductSums_filtered_Zscores.RData"))
-  write.table(outlist, file=paste0("AdductSums_filtered_Zscores.txt"), sep="\t", row.names = FALSE)
-  # Extra output: filtered metabolites with robust scaled Zscores
-  write.table(outlist.noZ, file=paste0("AdductSums_filtered_robustZ.txt"), sep="\t", row.names = FALSE)
+  # output metabolites filtered on relevance into tab-separated file
+  write.table(outlist, file = paste0("AdductSums_filtered_Zscores.txt"), sep = "\t", row.names = FALSE)
+  # output filtered metabolites with robust scaled Zscores
+  write.table(outlist_noZ, file = paste0("AdductSums_filtered_robustZ.txt"), sep = "\t", row.names = FALSE)
 
-  # get the IDs of the patients 
+  # get the IDs of the patients and sort
   patient_ids <- unique(as.vector(unlist(lapply(strsplit(colnames(patient_columns), ".", fixed = TRUE), function(x) x[1]))))
-  patient_ids <- patient_ids[order(nchar(patient_ids), patient_ids)] # sorts 
-  
-  # Iterate over every row, make boxplot, insert into excel, and make Zscore for every patient
+  patient_ids <- patient_ids[order(nchar(patient_ids), patient_ids)]
+
+  # for every row, make boxplot, insert into excel, and calculate Zscore for every patient
   temp_png <- NULL
   for (p in 1:nrow(outlist)) {
-    # box plot 
-    hmdb_name <- rownames(outlist[p,])
-    
-    intensities <- list(as.numeric(as.vector(unlist(control_columns[p,]))))
+    # get HMDB ID
+    hmdb_name <- rownames(outlist[p, ])
+    # get intensities per metabolite for box plot for control samples
+    intensities <- list(as.numeric(as.vector(unlist(control_columns[p, ]))))
     labels <- c("C", patient_ids)
-    
+    # get intensities per metabolite for box plot for patient samples
     for (i in 1:length(patient_ids)) {
       id <- patient_ids[i]
-      # get all intensities that start with ex. P18. (so P18.1, P18.2, but not x_P18.1 and not P180.1)
-      p.int <- as.numeric(as.vector(unlist(outlist[p, names(patient_columns[1,])[startsWith(names(patient_columns[1,]), paste0(id, "."))]])))
-      intensities[[i+1]] <- p.int
+      # combine all intensities that start with the same string for patients
+      patient_int <- as.numeric(as.vector(unlist(outlist[p, names(patient_columns[1, ])[startsWith(names(patient_columns[1, ]), paste0(id, "."))]])))
+      intensities[[i + 1]] <- patient_int
     }
-    
     intensities <- setNames(intensities, labels)
-    
+
     plot_width <- length(labels) * 16 + 90
-    
+
     plot.new()
     if (export) {
-      png(filename = paste0(plotdir, "/", hmdb_name, "_box.png"), 
-          width = plot_width, 
+      png(filename = paste0(plotdir, "/", hmdb_name, "_box.png"),
+          width = plot_width,
           height = 280)
     }
-    par(oma=c(2,0,0,0))
-    boxplot(intensities, 
-            col=c("green", rep("red", length(intensities)-1)),
-            names.arg = labels, 
-            las=2, 
-            main = hmdb_name) 
+    # set margins
+    par(oma = c(2, 0, 0, 0))
+    boxplot(intensities,
+            col = c("green", rep("red", length(intensities) - 1)),
+            names.arg = labels,
+            las = 2,
+            main = hmdb_name)
     dev.off()
-    
+
     file_png <- paste0(plotdir, "/", hmdb_name, "_box.png")
     if (is.null(temp_png)) {
       temp_png <- readPng(file_png)
-      img_dim <- dim(temp_png)[c(1,2)]
+      img_dim <- dim(temp_png)[c(1, 2)]
       cell_dim <- img_dim * imagesize_multiplier
-      setColWidths(wb, filelist, cols = 1, widths = cell_dim[2]/20)
+      setColWidths(wb, filelist, cols = 1, widths = cell_dim[2] / 20)
     }
-    
-    insertImage(wb, 
-                filelist, 
-                file_png, 
-                startRow = p + 1, 
-                startCol = 1, 
-                height = cell_dim[1], 
-                width = cell_dim[2], 
+
+    insertImage(wb,
+                filelist,
+                file_png,
+                startRow = p + 1,
+                startCol = 1,
+                height = cell_dim[1],
+                width = cell_dim[2],
                 units = "px")
-    
+
     if (p %% 100 == 0) {
       cat("at row: ", p, "\n")
     }
   }
-  
-  setRowHeights(wb, filelist, rows = c(1:nrow(outlist) + 1), heights = cell_dim[1]/4)
+
+  setRowHeights(wb, filelist, rows = c(1:nrow(outlist) + 1), heights = cell_dim[1] / 4)
   setColWidths(wb, filelist, cols = c(2:ncol(outlist)), widths = 20)
 } else {
   setRowHeights(wb, filelist, rows = c(1:nrow(outlist)), heights = 18)
   setColWidths(wb, filelist, cols = c(1:ncol(outlist)), widths = 20)
 }
+# write Excel file
 writeData(wb, sheet = 1, outlist, startCol = 1)
 xlsx_name <- paste0(outdir, "/", project, ".xlsx")
 saveWorkbook(wb, xlsx_name, overwrite = TRUE)
-# save a local copy in work directory as well
-saveWorkbook(wb, paste0("./", project, ".xlsx"), overwrite = TRUE)
 cat(xlsx_name)
 rm(wb)
 
-# INTERNE STANDAARDEN
-IS <- outlist[grep("Internal standard", outlist[,"relevance"], fixed = TRUE),]
+#### INTERNE STANDAARDEN ####
+IS <- outlist[grep("Internal standard", outlist[, "relevance"], fixed = TRUE), ]
 IS_codes <- rownames(IS)
-cat(IS_codes,"\n")
+cat(IS_codes, "\n")
 
-# if all data from one samplename (for example P195.1) is filtered out in 3-averageTechReplicates because of too little data (threshold parameter) the init.RData (repl_pattern) will contain more sample_names then the peak data (IS), 
+# if all data from one samplename (for example P195.1) is filtered out in 3-averageTechReplicates because of too little data (threshold parameter) the init.RData (repl_pattern) will contain more sample_names then the peak data (IS),
 # thus this data needs to be removed first, before the retrieval of the summed adducts. Write sample_names to a log file, to let user know that this sample_name contained no data.
-sample_names_nodata <- setdiff(names(repl_pattern),names(IS))
+sample_names_nodata <- setdiff(names(repl_pattern), names(IS))
 if (!is.null(sample_names_nodata)) {
   write.table(sample_names_nodata, file = paste(outdir, "sample_names_nodata.txt", sep = "/"), row.names = FALSE, col.names = FALSE, quote = FALSE)
-  cat(sample_names_nodata,"\n")
+  cat(sample_names_nodata, "\n")
   for (sample_name in sample_names_nodata) {
     repl_pattern[[sample_name]] <- NULL
-  }}
+  }
+}
 
 # Retrieve IS summed adducts
 IS_summed <- IS[c(names(repl_pattern), "HMDB_code")]
 IS_summed$HMDB.name <- IS$name
-IS_summed <- reshape2::melt(IS_summed, id.vars=c('HMDB_code','HMDB.name'))
-colnames(IS_summed) <- c('HMDB.code','HMDB.name','Sample','Intensity')
+IS_summed <- reshape2::melt(IS_summed, id.vars = c("HMDB_code", "HMDB.name"))
+colnames(IS_summed) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity")
 IS_summed$Intensity <- as.numeric(IS_summed$Intensity)
 IS_summed$Matrix <- dims_matrix
 IS_summed$Rundate <- rundate
@@ -276,29 +283,29 @@ IS_summed$Project <- project
 IS_summed$Intensity <- as.numeric(as.character(IS_summed$Intensity))
 
 # Retrieve IS positive mode
-IS_pos <- as.data.frame(subset(outlist.pos.adducts.HMDB,rownames(outlist.pos.adducts.HMDB) %in% IS_codes))
-IS_pos$HMDB_name <- IS[match(row.names(IS_pos),IS$HMDB_code,nomatch=NA),'name']
+IS_pos <- as.data.frame(subset(outlist_pos_adducts_hmdb, rownames(outlist_pos_adducts_hmdb) %in% IS_codes))
+IS_pos$HMDB_name <- IS[match(row.names(IS_pos), IS$HMDB_code, nomatch = NA), "name"]
 IS_pos$HMDB.code <- row.names(IS_pos)
-IS_pos <- reshape2::melt(IS_pos, id.vars=c('HMDB.code','HMDB_name'))
-colnames(IS_pos) <- c('HMDB.code','HMDB.name','Sample','Intensity')
+IS_pos <- reshape2::melt(IS_pos, id.vars = c("HMDB.code", "HMDB_name"))
+colnames(IS_pos) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity")
 IS_pos$Matrix <- dims_matrix
 IS_pos$Rundate <- rundate
 IS_pos$Project <- project
 IS_pos$Intensity <- as.numeric(as.character(IS_pos$Intensity))
 
 # Retrieve IS negative mode
-IS_neg <- as.data.frame(subset(outlist.neg.adducts.HMDB,rownames(outlist.neg.adducts.HMDB) %in% IS_codes))
-IS_neg$HMDB_name <- IS[match(row.names(IS_neg),IS$HMDB_code,nomatch=NA),'name']
+IS_neg <- as.data.frame(subset(outlist_neg_adducts_hmdb, rownames(outlist_neg_adducts_hmdb) %in% IS_codes))
+IS_neg$HMDB_name <- IS[match(row.names(IS_neg), IS$HMDB_code, nomatch = NA), "name"]
 IS_neg$HMDB.code <- row.names(IS_neg)
-IS_neg <- reshape2::melt(IS_neg, id.vars=c('HMDB.code','HMDB_name'))
-colnames(IS_neg) <- c('HMDB.code','HMDB.name','Sample','Intensity')
+IS_neg <- reshape2::melt(IS_neg, id.vars = c("HMDB.code", "HMDB_name"))
+colnames(IS_neg) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity")
 IS_neg$Matrix <- dims_matrix
 IS_neg$Rundate <- rundate
 IS_neg$Project <- project
 IS_neg$Intensity <- as.numeric(as.character(IS_neg$Intensity))
 
 # Save results
-save(IS_pos, IS_neg, IS_summed, file = paste0(outdir, "/", project, '_IS_results.RData'))
+save(IS_pos, IS_neg, IS_summed, file = paste0(outdir, "/", project, "_IS_results.RData"))
 
 # number of samples, for plotting length and width
 sample_count <- length(repl_pattern)
@@ -315,32 +322,33 @@ IS_neg$Sample_level <- factor(IS_neg$Sample, levels = c(Sample_naturalorder))
 # function for ggplot theme
 # theme for all IS bar plots
 theme_IS_bar <- function(myPlot) {
-  myPlot + 
+  myPlot +
     scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
     theme(
-      legend.position = 'none',
-      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size=6),
-      axis.text.y = element_text(size=6))
+      legend.position = "none",
+      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6),
+      axis.text.y = element_text(size = 6)
+    )
 }
 
 # ggplot functions
 IS_neg_bar_plot <- ggplot(IS_neg, aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Neg)") +
-  geom_bar(aes(fill=HMDB.name), stat='identity') +
-  labs(x='', y='Intensity') +
-  facet_wrap(~HMDB.name, scales='free_y')
+  geom_bar(aes(fill = HMDB.name), stat = "identity") +
+  labs(x = "", y = "Intensity") +
+  facet_wrap(~HMDB.name, scales = "free_y")
 
 IS_pos_bar_plot <- ggplot(IS_pos, aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Pos)") +
-  geom_bar(aes(fill=HMDB.name), stat='identity') +
-  labs(x='', y='Intensity') +
-  facet_wrap(~HMDB.name, scales='free_y')
+  geom_bar(aes(fill = HMDB.name), stat = "identity") +
+  labs(x = "", y = "Intensity") +
+  facet_wrap(~HMDB.name, scales = "free_y")
 
 IS_sum_bar_plot <- ggplot(IS_summed, aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Summed)") +
-  geom_bar(aes(fill=HMDB.name), stat='identity') +
-  labs(x='', y='Intensity') +
-  facet_wrap(~HMDB.name, scales='free_y')
+  geom_bar(aes(fill = HMDB.name), stat = "identity") +
+  labs(x = "", y = "Intensity") +
+  facet_wrap(~HMDB.name, scales = "free_y")
 
 # add theme to ggplot functions
 IS_neg_bar_plot <- theme_IS_bar(IS_neg_bar_plot)
@@ -349,44 +357,46 @@ IS_sum_bar_plot <- theme_IS_bar(IS_sum_bar_plot)
 
 # save plots to disk
 plot_width <- 9 + 0.35 * sample_count
-ggsave(paste0(outdir, "/plots/IS_bar_all_neg.png"), plot=IS_neg_bar_plot, height=plot_width/2.5, width=plot_width, units="in")
-ggsave(paste0(outdir, "/plots/IS_bar_all_pos.png"), plot=IS_pos_bar_plot, height=plot_width/2.5, width=plot_width, units="in")
-ggsave(paste0(outdir, "/plots/IS_bar_all_sum.png"), plot=IS_sum_bar_plot, height=plot_width/2.5, width=plot_width, units="in")
+ggsave(paste0(outdir, "/plots/IS_bar_all_neg.png"), plot = IS_neg_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_bar_all_pos.png"), plot = IS_pos_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_bar_all_sum.png"), plot = IS_sum_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in")
 
 ## Line plots with all IS
 
 # function for ggplot theme
 # add smaller legend in the "all IS line plots", otherwise out-of-range when more than 13 IS lines
 theme_IS_line <- function(myPlot) {
-  myPlot + 
+  myPlot +
     guides(
       shape = guide_legend(override.aes = list(size = 0.5)),
-      color = guide_legend(override.aes = list(size = 0.5))) +
+      color = guide_legend(override.aes = list(size = 0.5))
+    ) +
     theme(
-      legend.title = element_text(size = 8), 
-      legend.text  = element_text(size = 6),
+      legend.title = element_text(size = 8),
+      legend.text = element_text(size = 6),
       legend.key.size = unit(0.7, "line"),
-      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8))
+      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8)
+    )
 }
 
 # ggplot functions
 IS_neg_line_plot <- ggplot(IS_neg, aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Neg)") +
-  geom_point(aes(col=HMDB.name)) +
-  geom_line(aes(col=HMDB.name, group=HMDB.name)) +
-  labs(x='', y='Intensity')
+  geom_point(aes(col = HMDB.name)) +
+  geom_line(aes(col = HMDB.name, group = HMDB.name)) +
+  labs(x = "", y = "Intensity")
 
 IS_pos_line_plot <- ggplot(IS_pos, aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Pos)") +
   geom_point(aes(col = HMDB.name)) +
   geom_line(aes(col = HMDB.name, group = HMDB.name)) +
-  labs(x = '', y = 'Intensity')
+  labs(x = "", y = "Intensity")
 
 IS_sum_line_plot <- ggplot(IS_summed, aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Sum)") +
   geom_point(aes(col = HMDB.name)) +
   geom_line(aes(col = HMDB.name, group = HMDB.name)) +
-  labs(x = '', y = 'Intensity')
+  labs(x = "", y = "Intensity")
 
 # add theme to ggplot functions
 IS_sum_line_plot <- theme_IS_line(IS_sum_line_plot)
@@ -395,37 +405,49 @@ IS_pos_line_plot <- theme_IS_line(IS_pos_line_plot)
 
 # save plots to disk
 plot_width <- 8 + 0.2 * sample_count
-ggsave(paste0(outdir,"/plots/IS_line_all_neg.png"), plot = IS_neg_line_plot, height = plot_width/2.5, width = plot_width, units = "in")
-ggsave(paste0(outdir,"/plots/IS_line_all_pos.png"), plot = IS_pos_line_plot, height = plot_width/2.5, width = plot_width, units = "in")
-ggsave(paste0(outdir,"/plots/IS_line_all_sum.png"), plot = IS_sum_line_plot, height = plot_width/2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_line_all_neg.png"), plot = IS_neg_line_plot, height = plot_width / 2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_line_all_pos.png"), plot = IS_pos_line_plot, height = plot_width / 2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_line_all_sum.png"), plot = IS_sum_line_plot, height = plot_width / 2.5, width = plot_width, units = "in")
 
 ## bar plots with a selection of IS
-IS_neg_selection <- c('2H2-Ornithine (IS)', '2H3-Glutamate (IS)', '2H2-Citrulline (IS)', '2H4_13C5-Arginine (IS)', '13C6-Tyrosine (IS)')
-IS_pos_selection <- c('2H4-Alanine (IS)', '13C6-Phenylalanine (IS)', '2H4_13C5-Arginine (IS)', '2H3-Propionylcarnitine (IS)', '2H9-Isovalerylcarnitine (IS)')
-IS_sum_selection <- c('2H8-Valine (IS)', '2H3-Leucine (IS)', '2H3-Glutamate (IS)', '2H4_13C5-Arginine (IS)', '13C6-Tyrosine (IS)')
+IS_neg_selection <- c("2H2-Ornithine (IS)", "2H3-Glutamate (IS)", "2H2-Citrulline (IS)", "2H4_13C5-Arginine (IS)", "13C6-Tyrosine (IS)")
+IS_pos_selection <- c("2H4-Alanine (IS)", "13C6-Phenylalanine (IS)", "2H4_13C5-Arginine (IS)", "2H3-Propionylcarnitine (IS)", "2H9-Isovalerylcarnitine (IS)")
+IS_sum_selection <- c("2H8-Valine (IS)", "2H3-Leucine (IS)", "2H3-Glutamate (IS)", "2H4_13C5-Arginine (IS)", "13C6-Tyrosine (IS)")
 
 # add minimal intensity lines based on matrix (DBS or Plasma) and machine mode (neg, pos, sum)
-if (dims_matrix == "DBS"){
-  hline.data.neg <- 
-    data.frame(z = c(15000, 200000, 130000, 18000, 50000),
-               HMDB.name = IS_neg_selection)
-  hline.data.pos <- 
-    data.frame(z = c(150000, 3300000, 1750000, 150000, 270000),
-               HMDB.name = IS_pos_selection)
-  hline.data.sum <- 
-    data.frame(z = c(1300000, 2500000, 500000, 1800000, 1400000),
-               HMDB.name = IS_sum_selection)
-} else if (dims_matrix == "Plasma"){
-  hline.data.neg <- 
-    data.frame(z = c(6500, 100000, 75000, 7500, 25000),
-               HMDB.name = IS_neg_selection)
-  hline.data.pos <- 
-    data.frame(z = c(85000, 1000000, 425000, 70000, 180000),
-               HMDB.name = IS_pos_selection)
-  hline.data.sum <- 
-    data.frame(z = c(700000, 1250000, 150000, 425000, 300000),
-               HMDB.name = IS_sum_selection)
-} 
+if (dims_matrix == "DBS") {
+  hline.data.neg <-
+    data.frame(
+      z = c(15000, 200000, 130000, 18000, 50000),
+      HMDB.name = IS_neg_selection
+    )
+  hline.data.pos <-
+    data.frame(
+      z = c(150000, 3300000, 1750000, 150000, 270000),
+      HMDB.name = IS_pos_selection
+    )
+  hline.data.sum <-
+    data.frame(
+      z = c(1300000, 2500000, 500000, 1800000, 1400000),
+      HMDB.name = IS_sum_selection
+    )
+} else if (dims_matrix == "Plasma") {
+  hline.data.neg <-
+    data.frame(
+      z = c(6500, 100000, 75000, 7500, 25000),
+      HMDB.name = IS_neg_selection
+    )
+  hline.data.pos <-
+    data.frame(
+      z = c(85000, 1000000, 425000, 70000, 180000),
+      HMDB.name = IS_pos_selection
+    )
+  hline.data.sum <-
+    data.frame(
+      z = c(700000, 1250000, 150000, 425000, 300000),
+      HMDB.name = IS_sum_selection
+    )
+}
 
 # function for ggplot theme
 # see bar plots with all IS
@@ -433,35 +455,41 @@ if (dims_matrix == "DBS"){
 # ggplot functions
 IS_neg_selection_barplot <- ggplot(subset(IS_neg, HMDB.name %in% IS_neg_selection), aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Neg)") +
-  geom_bar(aes(fill=HMDB.name), stat='identity') +
-  labs(x='', y='Intensity') +
-  facet_wrap(~HMDB.name, scales='free', ncol = 2) + 
-  if(exists("hline.data.neg")){geom_hline(aes(yintercept = z), subset(hline.data.neg, HMDB.name %in% IS_neg$HMDB.name))} #subset, if some IS have no data, no empty plots will be generated with a line)
+  geom_bar(aes(fill = HMDB.name), stat = "identity") +
+  labs(x = "", y = "Intensity") +
+  facet_wrap(~HMDB.name, scales = "free", ncol = 2) +
+  if (exists("hline.data.neg")) {
+    geom_hline(aes(yintercept = z), subset(hline.data.neg, HMDB.name %in% IS_neg$HMDB.name))
+  } # subset, if some IS have no data, no empty plots will be generated with a line)
 
 IS_pos_selection_barplot <- ggplot(subset(IS_pos, HMDB.name %in% IS_pos_selection), aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Pos)") +
-  geom_bar(aes(fill=HMDB.name), stat='identity') +
-  labs(x='', y='Intensity') +
-  facet_wrap(~HMDB.name, scales='free', ncol = 2) +
-  if(exists("hline.data.pos")){geom_hline(aes(yintercept = z), subset(hline.data.pos, HMDB.name %in% IS_pos$HMDB.name))}
+  geom_bar(aes(fill = HMDB.name), stat = "identity") +
+  labs(x = "", y = "Intensity") +
+  facet_wrap(~HMDB.name, scales = "free", ncol = 2) +
+  if (exists("hline.data.pos")) {
+    geom_hline(aes(yintercept = z), subset(hline.data.pos, HMDB.name %in% IS_pos$HMDB.name))
+  }
 
 IS_sum_selection_barplot <- ggplot(subset(IS_summed, HMDB.name %in% IS_sum_selection), aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Sum)") +
-  geom_bar(aes(fill=HMDB.name), stat='identity') +
-  labs(x='', y='Intensity') +
-  facet_wrap(~HMDB.name, scales='free', ncol = 2) +
-  if(exists("hline.data.sum")){geom_hline(aes(yintercept = z), subset(hline.data.sum, HMDB.name %in% IS_summed$HMDB.name))}
+  geom_bar(aes(fill = HMDB.name), stat = "identity") +
+  labs(x = "", y = "Intensity") +
+  facet_wrap(~HMDB.name, scales = "free", ncol = 2) +
+  if (exists("hline.data.sum")) {
+    geom_hline(aes(yintercept = z), subset(hline.data.sum, HMDB.name %in% IS_summed$HMDB.name))
+  }
 
 # add theme to ggplot functions
-IS_neg_selection_barplot <- theme_IS_bar(IS_neg_selection_barplot) 
+IS_neg_selection_barplot <- theme_IS_bar(IS_neg_selection_barplot)
 IS_pos_selection_barplot <- theme_IS_bar(IS_pos_selection_barplot)
-IS_sum_selection_barplot <- theme_IS_bar(IS_sum_selection_barplot) 
+IS_sum_selection_barplot <- theme_IS_bar(IS_sum_selection_barplot)
 
 # save plots to disk
 plot_width <- 9 + 0.35 * sample_count
-ggsave(paste0(outdir, "/plots/IS_bar_select_neg.png"), plot = IS_neg_selection_barplot, height = plot_width/2.0, width = plot_width, units = "in")
-ggsave(paste0(outdir, "/plots/IS_bar_select_pos.png"), plot = IS_pos_selection_barplot, height = plot_width/2.0, width = plot_width, units = "in")
-ggsave(paste0(outdir, "/plots/IS_bar_select_sum.png"), plot = IS_sum_selection_barplot, height = plot_width/2.0, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_bar_select_neg.png"), plot = IS_neg_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_bar_select_pos.png"), plot = IS_pos_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_bar_select_sum.png"), plot = IS_sum_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in")
 
 ## line plots with a selection of IS
 
@@ -471,21 +499,21 @@ ggsave(paste0(outdir, "/plots/IS_bar_select_sum.png"), plot = IS_sum_selection_b
 # ggplot functions
 IS_neg_selection_lineplot <- ggplot(subset(IS_neg, HMDB.name %in% IS_neg_selection), aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Neg)") +
-  geom_point(aes(col=HMDB.name)) +
-  geom_line(aes(col=HMDB.name, group=HMDB.name)) +
-  labs(x='', y='Intensity')
+  geom_point(aes(col = HMDB.name)) +
+  geom_line(aes(col = HMDB.name, group = HMDB.name)) +
+  labs(x = "", y = "Intensity")
 
 IS_pos_selection_lineplot <- ggplot(subset(IS_pos, HMDB.name %in% IS_pos_selection), aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Pos)") +
-  geom_point(aes(col=HMDB.name)) +
-  geom_line(aes(col=HMDB.name, group=HMDB.name)) +
-  labs(x='', y='Intensity')
+  geom_point(aes(col = HMDB.name)) +
+  geom_line(aes(col = HMDB.name, group = HMDB.name)) +
+  labs(x = "", y = "Intensity")
 
 IS_sum_selection_lineplot <- ggplot(subset(IS_summed, HMDB.name %in% IS_sum_selection), aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Sum)") +
-  geom_point(aes(col=HMDB.name)) +
-  geom_line(aes(col=HMDB.name, group=HMDB.name)) +
-  labs(x='', y='Intensity')
+  geom_point(aes(col = HMDB.name)) +
+  geom_line(aes(col = HMDB.name, group = HMDB.name)) +
+  labs(x = "", y = "Intensity")
 
 # add theme to ggplot functions
 IS_neg_selection_lineplot <- theme_IS_line(IS_neg_selection_lineplot)
@@ -494,9 +522,9 @@ IS_sum_selection_lineplot <- theme_IS_line(IS_sum_selection_lineplot)
 
 # save plots to disk
 plot_width <- 8 + 0.2 * sample_count
-ggsave(paste0(outdir, "/plots/IS_line_select_neg.png"), plot = IS_neg_selection_lineplot, height = plot_width/2.5, width = plot_width, units = "in")
-ggsave(paste0(outdir, "/plots/IS_line_select_pos.png"), plot = IS_pos_selection_lineplot, height = plot_width/2.5, width = plot_width, units = "in")
-ggsave(paste0(outdir, "/plots/IS_line_select_sum.png"), plot = IS_sum_selection_lineplot, height = plot_width/2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_line_select_neg.png"), plot = IS_neg_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_line_select_pos.png"), plot = IS_pos_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_line_select_sum.png"), plot = IS_sum_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in")
 
 
 ### POSITIVE CONTROLS CHECK
@@ -505,59 +533,59 @@ ggsave(paste0(outdir, "/plots/IS_line_select_sum.png"), plot = IS_sum_selection_
 
 column_list <- colnames(outlist)
 patterns <- c("^(P1002\\.)[[:digit:]]+_", "^(P1003\\.)[[:digit:]]+_", "^(P1005\\.)[[:digit:]]+_")
-positive_controls_index <- grepl(pattern=paste(patterns, collapse="|"), column_list)
+positive_controls_index <- grepl(pattern = paste(patterns, collapse = "|"), column_list)
 positivecontrol_list <- column_list[positive_controls_index]
 
 if (z_score == 1) {
   # find if one or more positive control samples are missing
   pos_contr_warning <- c()
   # any() grep because you get a vector of FALSE's and TRUE's. only one grep match is needed for each positive control
-  if (any(grep("^(P1002\\.)[[:digit:]]+_", positivecontrol_list)) && 
-      any(grep("^(P1003\\.)[[:digit:]]+_", positivecontrol_list)) && 
-      any(grep("^(P1005\\.)[[:digit:]]+_", positivecontrol_list))){
+  if (any(grep("^(P1002\\.)[[:digit:]]+_", positivecontrol_list)) &&
+    any(grep("^(P1003\\.)[[:digit:]]+_", positivecontrol_list)) &&
+    any(grep("^(P1005\\.)[[:digit:]]+_", positivecontrol_list))) {
     cat("All three positive controls are present")
   } else {
-    pos_contr_warning <- paste0(c("positive controls list is not complete. Only ", positivecontrol_list, " is/are present"), collapse=" ")
+    pos_contr_warning <- paste0(c("positive controls list is not complete. Only ", positivecontrol_list, " is/are present"), collapse = " ")
   }
   # you need all positive control samples, thus starting the script only if all are available
   if (length(pos_contr_warning) == 0) {
     ### POSITIVE CONTROLS
     # make positive control excel with specific HMDB_codes in combination with specific control samples
-    PA_sample_name <- positivecontrol_list[grepl("P1002", positivecontrol_list)] #P1001.x_Zscore
-    PKU_sample_name <- positivecontrol_list[grepl("P1003", positivecontrol_list)] #P1003.x_Zscore
-    LPI_sample_name <- positivecontrol_list[grepl("P1005", positivecontrol_list)] #P1005.x_Zscore
-    
-    PA_codes <- c('HMDB00824', 'HMDB00783', 'HMDB00123')
-    PKU_codes <- c('HMDB00159')
-    LPI_codes <- c('HMDB00904', 'HMDB00641', 'HMDB00182')
-    
-    PA_data <- outlist[PA_codes, c('HMDB_code','name', PA_sample_name)]
-    PA_data <- reshape2::melt(PA_data, id.vars = c('HMDB_code','name'))
-    colnames(PA_data) <- c('HMDB.code','HMDB.name','Sample','Zscore')
-    
-    PKU_data <- outlist[PKU_codes, c('HMDB_code','name', PKU_sample_name)]
-    PKU_data <- reshape2::melt(PKU_data, id.vars = c('HMDB_code','name'))
-    colnames(PKU_data) <- c('HMDB.code','HMDB.name','Sample','Zscore')
-    
-    LPI_data <- outlist[LPI_codes, c('HMDB_code','name', LPI_sample_name)]
-    LPI_data <- reshape2::melt(LPI_data, id.vars = c('HMDB_code','name'))
-    colnames(LPI_data) <- c('HMDB.code','HMDB.name','Sample','Zscore')
-    
+    PA_sample_name <- positivecontrol_list[grepl("P1002", positivecontrol_list)] # P1001.x_Zscore
+    PKU_sample_name <- positivecontrol_list[grepl("P1003", positivecontrol_list)] # P1003.x_Zscore
+    LPI_sample_name <- positivecontrol_list[grepl("P1005", positivecontrol_list)] # P1005.x_Zscore
+
+    PA_codes <- c("HMDB00824", "HMDB00783", "HMDB00123")
+    PKU_codes <- c("HMDB00159")
+    LPI_codes <- c("HMDB00904", "HMDB00641", "HMDB00182")
+
+    PA_data <- outlist[PA_codes, c("HMDB_code", "name", PA_sample_name)]
+    PA_data <- reshape2::melt(PA_data, id.vars = c("HMDB_code", "name"))
+    colnames(PA_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore")
+
+    PKU_data <- outlist[PKU_codes, c("HMDB_code", "name", PKU_sample_name)]
+    PKU_data <- reshape2::melt(PKU_data, id.vars = c("HMDB_code", "name"))
+    colnames(PKU_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore")
+
+    LPI_data <- outlist[LPI_codes, c("HMDB_code", "name", LPI_sample_name)]
+    LPI_data <- reshape2::melt(LPI_data, id.vars = c("HMDB_code", "name"))
+    colnames(LPI_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore")
+
     Pos_Contr <- rbind(PA_data, PKU_data, LPI_data)
     Pos_Contr$Zscore <- as.numeric(Pos_Contr$Zscore)
     # extra information added to excel for future reference. made in beginning of this script
     Pos_Contr$Matrix <- dims_matrix
     Pos_Contr$Rundate <- rundate
     Pos_Contr$Project <- project
-    
-    #Save results
-    save(Pos_Contr,file = paste0(outdir, "/", project, '_Pos_Contr.RData'))
+
+    # Save results
+    save(Pos_Contr, file = paste0(outdir, "/", project, "_Pos_Contr.RData"))
     Pos_Contr$Zscore <- round_df(Pos_Contr$Zscore, 2) # asked by Lab to round the number to 2 digits
-    write.xlsx(Pos_Contr, file = paste0(outdir, "/", project, '_Pos_Contr.xlsx'), sheetName = "Sheet1", col.names = TRUE, row.names = TRUE, append = FALSE)
-    
+    write.xlsx(Pos_Contr, file = paste0(outdir, "/", project, "_Pos_Contr.xlsx"), sheetName = "Sheet1", col.names = TRUE, row.names = TRUE, append = FALSE)
   } else {
     write.table(pos_contr_warning, file = paste(outdir, "positive_controls_warning.txt", sep = "/"), row.names = FALSE, col.names = FALSE, quote = FALSE)
-  }}
+  }
+}
 
 ### MISSING M/Z CHECK
 # check the outlist_identified_(negative/positive).RData files for missing m/z values and mention in the results mail
@@ -571,18 +599,18 @@ rm(outlist.ident)
 # check for missing m/z in negative and positive mode
 mode <- c("Negative", "Positive")
 index <- 1
-results_ident <- c() #empty results list
+results_ident <- c() # empty results list
 outlist_ident_list <- list(outlist.ident.neg, outlist.ident.pos)
 for (outlist.ident in outlist_ident_list) {
   current_mode <- mode[index]
   # retrieve all unique m/z values in whole numbers and check if all are available
-  mz_values <- as.numeric(unique(format(outlist.ident$mzmed.pgrp, digits=0)))
-  mz_range <- seq(70, 599, by=1) #change accordingly to the machine m/z range. default = 70-600
+  mz_values <- as.numeric(unique(format(outlist.ident$mzmed.pgrp, digits = 0)))
+  mz_range <- seq(70, 599, by = 1) # change accordingly to the machine m/z range. default = 70-600
   mz_missing <- c()
   for (mz in mz_range) {
     if (!mz %in% mz_values) {
       mz_missing <- c(mz_missing, mz)
-    } 
+    }
   }
   y <- mz_missing
   # check if m/z are missing and make an .txt file with information
@@ -595,6 +623,6 @@ for (outlist.ident in outlist_ident_list) {
   }
   index <- index + 1 # change to new mode in for loop
 }
-lapply(results_ident, write, file=paste(outdir, "missing_mz_warning.txt", sep = "/"), append=TRUE, ncolumns=1000)
+lapply(results_ident, write, file = paste(outdir, "missing_mz_warning.txt", sep = "/"), append = TRUE, ncolumns = 1000)
 
 cat("Ready excelExport.R")
diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R
index a8b819c..896918f 100644
--- a/DIMS/GenerateViolinPlots.R
+++ b/DIMS/GenerateViolinPlots.R
@@ -1,10 +1,10 @@
-# For untargeted metabolomics, this tool calculates probability scores for 
-# metabolic disorders. In addition, it provides visual support with violin plots 
+# For untargeted metabolomics, this tool calculates probability scores for
+# metabolic disorders. In addition, it provides visual support with violin plots
 # of the DIMS measurements for the lab specialists.
-# Input needed: 
+# Input needed:
 # 1. Excel file in which metabolites are listed with their intensities for
 #    controls (with C in samplename) and patients (with P in samplename) and their
-#    corresponding Z-scores. 
+#    corresponding Z-scores.
 # 2. All files from github: https://github.com/UMCUGenetics/DIMS
 
 ## adapted from 15-dIEM_violin.R
@@ -17,13 +17,10 @@ library(gridExtra) # for table top highest/lowest
 
 # define parameters - check after addition to run.sh
 cmd_args <- commandArgs(trailingOnly = TRUE)
-for (arg in cmd_args) {
-  cat("  ", arg, "\n", sep = "")
-}
 
-run_name <- cmd_args[1] 
+run_name    <- cmd_args[1]
 scripts_dir <- cmd_args[2]
-z_score <- as.numeric(cmd_args[3])
+z_score     <- as.numeric(cmd_args[3])
 
 # load functions
 source(paste0(scripts_dir, "AddOnFunctions/check_same_samplename.R"))
@@ -33,18 +30,20 @@ source(paste0(scripts_dir, "AddOnFunctions/prepare_toplist.R"))
 source(paste0(scripts_dir, "AddOnFunctions/create_violin_plots.R"))
 source(paste0(scripts_dir, "AddOnFunctions/prepare_alarmvalues.R"))
 
-# The list of parameters can be shortened for HPC. Leave for now.
-top_nr_IEM       <- 5 # number of diseases that score highest in algorithm to plot
-threshold_IEM    <- 5 # probability score cut-off for plotting the top diseases
-ratios_cutoff    <- -5 # z-score cutoff of axis on the left for top diseases
-nr_plots_perpage <- 20 # number of violin plots per page in PDF
-
-# Settings from config.R
-# binary variable: run function, yes(1) or no(0). Can be removed at later stage
-if (z_score == 1) { 
-	algorithm <- ratios <- violin <- 1 
-} 
-# integer: are the sample names headers on row 1 or row 2 in the DIMS excel? (default 1)
+# number of diseases that score highest in algorithm to plot
+top_nr_IEM       <- 5
+# probability score cut-off for plotting the top diseases
+threshold_IEM    <- 5
+# z-score cutoff of axis on the left for top diseases
+ratios_cutoff    <- -5
+# number of violin plots per page in PDF
+nr_plots_perpage <- 20
+
+# binary variable: run function, yes(1) or no(0)
+if (z_score == 1) {
+  algorithm <- ratios <- violin <- 1
+}
+# are the sample names headers on row 1 or row 2 in the DIMS excel? (default 1)
 header_row <- 1
 # column name where the data starts (default B)
 col_start     <- "B"
@@ -52,15 +51,12 @@ zscore_cutoff <- 5
 xaxis_cutoff  <- 20
 
 # path to DIMS excel file
-path_DIMSfile <- paste0("./", run_name, ".xlsx")
+path_dims_file <- paste0("./", run_name, ".xlsx")
 
-# path: output folder for dIEM and violin plots 
-output_dir <- paste0("../../../dIEM")
-dir.create(output_dir, showWarnings = F) 
-print(getwd())
-print(path_DIMSfile)
+# path: output folder for dIEM and violin plots
+output_dir <- "./"
 
-# folder in which all metabolite lists are (.txt)
+# folder for all metabolite lists (.txt)
 path_metabolite_groups <- "/hpc/dbg_mz/tools/db/metabolite_groups"
 # file for ratios step 3
 file_ratios_metabolites <- "/hpc/dbg_mz/tools/db/dIEM/Ratios_between_metabolites.csv"
@@ -72,26 +68,26 @@ file_explanation <- "/hpc/dbg_mz/tools/Explanation_violin_plots.txt"
 # copy list of isomers to project folder.
 file.copy("/hpc/dbg_mz/tools/isomers.txt", output_dir)
 
-#### STEP 1: Preparation #### 
-# in: run_name, path_DIMSfile, header_row ||| out: output_dir, DIMS
+#### STEP 1: Preparation ####
+# in: run_name, path_dims_file, header_row ||| out: output_dir, DIMS
 
 # Load the excel file.
-dims_xls <- readWorkbook(xlsxFile = path_DIMSfile, sheet = 1, startRow = header_row)
+dims_xls <- readWorkbook(xlsxFile = path_dims_file, sheet = 1, startRow = header_row)
 if (exists("dims_xls")) {
-  cat(paste0("\nThe excel file is succesfully loaded:\n -> ",path_DIMSfile))
+  cat(paste0("\nThe excel file is succesfully loaded:\n -> ", path_dims_file))
 } else {
-  cat(paste0("\n\n**** Error: Could not find an excel file. Please check if path to excel file is correct in config.R:\n -> ",path_DIMSfile,"\n"))
+  cat(paste0("\n\n**** Error: Could not find an Excel file. Please check location of file:\n -> ", path_dims_file, "\n"))
 }
 
-#### STEP 2: Edit DIMS data #####      
+#### STEP 2: Edit DIMS data #####
 # in: dims_xls ||| out: Data, nr_contr, nr_pat
 # Input: the xlsx file that comes out of the pipeline with format:
 # [plots] [C] [P] [summary columns] [C_Zscore] [P_Zscore]
 # Output: "_CSV.csv" file that is suited for the algorithm in shiny.
 
 # Determine the number of Contols and Patients in column names:
-nr_contr <- length(grep("C",names(dims_xls)))/2   # Number of control samples
-nr_pat   <- length(grep("P",names(dims_xls)))/2   # Number of patient samples
+nr_contr <- length(grep("C", names(dims_xls))) / 2
+nr_pat   <- length(grep("P", names(dims_xls))) / 2
 # total number of samples
 nrsamples <- nr_contr + nr_pat
 # check whether the number of intensity columns equals the number of Zscore columns
@@ -100,19 +96,19 @@ if (nr_contr + nr_pat != length(grep("_Zscore", names(dims_xls)))) {
 }
 cat(paste0("\n\n------------\n", nr_contr, " controls \n", nr_pat, " patients\n------------\n\n"))
 
-# Move the columns HMDB_code and HMDB_name to the beginning. 
-HMDB_info_cols <- c(which(colnames(dims_xls) == "HMDB_code"), which(colnames(dims_xls) == "HMDB_name"))
-other_cols     <- seq_along(1:ncol(dims_xls))[-HMDB_info_cols]
-dims_xls_copy  <- dims_xls[ , c(HMDB_info_cols, other_cols)]
+# Move the columns HMDB_code and HMDB_name to the beginning.
+hmdb_info_cols <- c(which(colnames(dims_xls) == "HMDB_code"), which(colnames(dims_xls) == "HMDB_name"))
+other_cols     <- seq_along(1:ncol(dims_xls))[-hmdb_info_cols]
+dims_xls_copy  <- dims_xls[, c(hmdb_info_cols, other_cols)]
 # Remove the columns from 'name' to 'pathway'
 from_col      <- which(colnames(dims_xls_copy) == "name")
 to_col        <- which(colnames(dims_xls_copy) == "pathway")
-dims_xls_copy <- dims_xls_copy[ , -c(from_col:to_col)]
+dims_xls_copy <- dims_xls_copy[, -c(from_col:to_col)]
 # in case the excel had an empty "plots" column, remove it
-if ("plots" %in% colnames(dims_xls_copy)) { 
-  dims_xls_copy <- dims_xls_copy[ , -grep("plots", colnames(dims_xls_copy))]
-} 
-# Rename columns 
+if ("plots" %in% colnames(dims_xls_copy)) {
+  dims_xls_copy <- dims_xls_copy[, -grep("plots", colnames(dims_xls_copy))]
+}
+# Rename columns
 names(dims_xls_copy) <- gsub("avg.ctrls", "Mean_controls", names(dims_xls_copy))
 names(dims_xls_copy) <- gsub("sd.ctrls",  "SD_controls", names(dims_xls_copy))
 names(dims_xls_copy) <- gsub("HMDB_code", "HMDB.code", names(dims_xls_copy))
@@ -121,7 +117,7 @@ names(dims_xls_copy) <- gsub("HMDB_name", "HMDB.name", names(dims_xls_copy))
 # intensity columns and mean and standard deviation of controls
 numeric_cols <- c(3:ncol(dims_xls_copy))
 # make sure all values are numeric
-dims_xls_copy[ , numeric_cols] <- sapply(dims_xls_copy[ , numeric_cols], as.numeric)
+dims_xls_copy[, numeric_cols] <- sapply(dims_xls_copy[, numeric_cols], as.numeric)
 
 if (exists("dims_xls_copy") & (length(dims_xls_copy) < length(dims_xls))) {
   cat("\n### Step 2 # Edit dims data is done.\n")
@@ -129,9 +125,9 @@ if (exists("dims_xls_copy") & (length(dims_xls_copy) < length(dims_xls))) {
   cat("\n**** Error: Could not execute step 2 \n")
 }
 
-#### STEP 3: Calculate ratios of intensities for metabolites ####      
+#### STEP 3: Calculate ratios of intensities for metabolites ####
 # in: ratios, file_ratios_metabolites, dims_xls_copy, nr_contr, nr_pat ||| out: Zscore (+file)
-# This script loads the file with Ratios (file_ratios_metabolites) and calculates 
+# This script loads the file with Ratios (file_ratios_metabolites) and calculates
 # the ratios of the intensities of the given metabolites. It also calculates
 # Zs-cores based on the avg and sd of the ratios of the controls.
 
@@ -141,38 +137,38 @@ if (exists("dims_xls_copy") & (length(dims_xls_copy) < length(dims_xls))) {
 # Output: "_CSV.csv" file that is suited for the algorithm, with format:
 # "_Ratios_CSV.csv" file, same file as above, but with ratio rows added.
 
-if (ratios == 1) { 
+if (ratios == 1) {
   cat(paste0("\nloading ratios file:\n ->  ", file_ratios_metabolites, "\n"))
-  ratio_input <- read.csv(file_ratios_metabolites, sep=';', stringsAsFactors=FALSE)
-  
+  ratio_input <- read.csv(file_ratios_metabolites, sep = ";", stringsAsFactors = FALSE)
+
   # Prepare empty data frame to fill with ratios
   ratio_list <- setNames(data.frame(matrix(
-    ncol=ncol(dims_xls_copy),
-    nrow=nrow(ratio_input)
+    ncol = ncol(dims_xls_copy),
+    nrow = nrow(ratio_input)
   )), colnames(dims_xls_copy))
-  
+
   # put HMDB info into first two columns of ratio_list
-  ratio_list[ ,1:2] <- ratio_input[ ,1:2]
+  ratio_list[, 1:2] <- ratio_input[, 1:2]
 
   # look for intensity columns (exclude Zscore columns)
   control_cols   <- grep("C", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")])
   patient_cols   <- grep("P", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")])
   intensity_cols <- c(control_cols, patient_cols)
-  # calculate each of the ratios of intensities 
+  # calculate each of the ratios of intensities
   for (ratio_index in 1:nrow(ratio_input)) {
-    ratio_numerator   <- ratio_input[ratio_index, "HMDB_numerator"] 
+    ratio_numerator   <- ratio_input[ratio_index, "HMDB_numerator"]
     ratio_numerator   <- strsplit(ratio_numerator, "plus")[[1]]
-    ratio_denominator <- ratio_input[ratio_index, "HMDB_denominator"] 
+    ratio_denominator <- ratio_input[ratio_index, "HMDB_denominator"]
     ratio_denominator <- strsplit(ratio_denominator, "plus")[[1]]
     # find these HMDB IDs in dataset. Could be a sum of multiple metabolites
     sel_denominator <- sel_numerator <- c()
-    for (numerator_index in 1:length(ratio_numerator)) { 
-      sel_numerator <- c(sel_numerator, which(dims_xls_copy[ , "HMDB.code"] == ratio_numerator[numerator_index])) 
+    for (numerator_index in 1:length(ratio_numerator)) {
+      sel_numerator <- c(sel_numerator, which(dims_xls_copy[, "HMDB.code"] == ratio_numerator[numerator_index]))
     }
-    for (denominator_index in 1:length(ratio_denominator)) { 
-      # special case for sum of metabolites (dividing by one)  
+    for (denominator_index in 1:length(ratio_denominator)) {
+      # special case for sum of metabolites (dividing by one)
       if (ratio_denominator[denominator_index] != "one") {
-        sel_denominator <- c(sel_denominator, which(dims_xls_copy[ , "HMDB.code"] == ratio_denominator[denominator_index])) 
+        sel_denominator <- c(sel_denominator, which(dims_xls_copy[, "HMDB.code"] == ratio_denominator[denominator_index]))
       }
     }
     # calculate ratio
@@ -184,182 +180,178 @@ if (ratios == 1) {
       ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum)
     }
     # calculate log of ratio
-    ratio_list[ratio_index, intensity_cols]<- log2(ratio_list[ratio_index, intensity_cols])
+    ratio_list[ratio_index, intensity_cols] <- log2(ratio_list[ratio_index, intensity_cols])
   }
-  
+
   # Calculate means and SD's of the calculated ratios for Controls
-  ratio_list[ , "Mean_controls"] <- apply(ratio_list[ , control_cols], 1, mean)
-  ratio_list[ , "SD_controls"]   <- apply(ratio_list[ , control_cols], 1, sd)
+  ratio_list[, "Mean_controls"] <- apply(ratio_list[, control_cols], 1, mean)
+  ratio_list[, "SD_controls"]   <- apply(ratio_list[, control_cols], 1, sd)
 
   # Calc z-scores with the means and SD's of Controls
   zscore_cols <- grep("Zscore", colnames(ratio_list))
   for (sample_index in 1:length(zscore_cols)) {
-    zscore_col <- zscore_cols[sample_index] 
+    zscore_col <- zscore_cols[sample_index]
     # matching intensity column
     int_col <- intensity_cols[sample_index]
     # test on column names
     if (check_same_samplename(colnames(ratio_list)[int_col], colnames(ratio_list)[zscore_col])) {
       # calculate Z-scores
-      ratio_list[ , zscore_col] <- (ratio_list[ , int_col] - ratio_list[ , "Mean_controls"]) / ratio_list[ , "SD_controls"]
+      ratio_list[, zscore_col] <- (ratio_list[, int_col] - ratio_list[, "Mean_controls"]) / ratio_list[, "SD_controls"]
     }
   }
-  
+
   # Add rows of the ratio hmdb codes to the data of zscores from the pipeline.
   dims_xls_ratios <- rbind(ratio_list, dims_xls_copy)
 
   # Edit the DIMS output Zscores of all patients in format:
   # HMDB_code patientname1  patientname2
-  names(dims_xls_ratios) <- gsub("HMDB.code","HMDB_code", names(dims_xls_ratios))
+  names(dims_xls_ratios) <- gsub("HMDB.code", "HMDB_code", names(dims_xls_ratios))
   names(dims_xls_ratios) <- gsub("HMDB.name", "HMDB_name", names(dims_xls_ratios))
 
   # for debugging:
-  write.table(dims_xls_ratios, file=paste0(output_dir, "/ratios.txt"), sep="\t")
+  write.table(dims_xls_ratios, file = paste0(output_dir, "/ratios.txt"), sep = "\t")
 
-  # Select only the cols with zscores of the patients 
-  zscore_patients <- dims_xls_ratios[ , c(1, 2, zscore_cols[grep("P", colnames(dims_xls_ratios)[zscore_cols])])]
+  # Select only the cols with zscores of the patients
+  zscore_patients <- dims_xls_ratios[, c(1, 2, zscore_cols[grep("P", colnames(dims_xls_ratios)[zscore_cols])])]
   # Select only the cols with zscores of the controls
-  zscore_controls <- dims_xls_ratios[ , c(1, 2, zscore_cols[grep("C", colnames(dims_xls_ratios)[zscore_cols])])]
+  zscore_controls <- dims_xls_ratios[, c(1, 2, zscore_cols[grep("C", colnames(dims_xls_ratios)[zscore_cols])])]
 
 }
 
-#### STEP 4: Run the IEM algorithm #########      
+#### STEP 4: Run the IEM algorithm #########
 # in: algorithm, file_expected_biomarkers_IEM, zscore_patients ||| out: prob_score (+file)
 # algorithm taken from DOI: 10.3390/ijms21030979
 
 if (algorithm == 1) {
   # Load data
   cat(paste0("\nloading expected file:\n ->  ", file_expected_biomarkers_IEM, "\n"))
-  expected_biomarkers <- read.csv(file_expected_biomarkers_IEM, sep=';', stringsAsFactors=FALSE)
+  expected_biomarkers <- read.csv(file_expected_biomarkers_IEM, sep = ";", stringsAsFactors = FALSE)
   # modify column names
-  names(expected_biomarkers) <- gsub("HMDB.code", "HMDB_code",  names(expected_biomarkers))  
+  names(expected_biomarkers) <- gsub("HMDB.code", "HMDB_code",  names(expected_biomarkers))
   names(expected_biomarkers) <- gsub("Metabolite", "HMDB_name", names(expected_biomarkers))
-  
+
   # prepare dataframe scaffold rank_patients
   rank_patients <- zscore_patients
   # Fill df rank_patients with the ranks for each patient
   for (patient_index in 3:ncol(zscore_patients)) {
     # number of positive zscores in patient
-    pos <- sum(zscore_patients[ , patient_index] > 0) 
+    pos <- sum(zscore_patients[, patient_index] > 0)
     # sort the column on zscore; NB: this sorts the entire object, not just one column
     rank_patients <- rank_patients[order(-rank_patients[patient_index]), ]
     # Rank all positive zscores highest to lowest
     rank_patients[1:pos, patient_index] <- as.numeric(ordered(-rank_patients[1:pos, patient_index]))
     # Rank all negative zscores lowest to highest
-    rank_patients[(pos+1):nrow(rank_patients), patient_index] <- as.numeric(ordered(rank_patients[(pos+1):nrow(rank_patients), patient_index]))
+    rank_patients[(pos + 1):nrow(rank_patients), patient_index] <- as.numeric(ordered(rank_patients[(pos + 1):nrow(rank_patients), patient_index]))
   }
-  
+
   # Calculate metabolite score, using the dataframes with only values, and later add the cols without values (1&2).
-  expected_zscores <- merge(x=expected_biomarkers, y=zscore_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code"))
-  expected_zscores_original <- expected_zscores # necessary copy?
-  
+  expected_zscores <- merge(x = expected_biomarkers, y = zscore_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code"))
+  expected_zscores_original <- expected_zscores
+
   # determine which columns contain Z-scores and which contain disease info
   select_zscore_cols <- grep("_Zscore", colnames(expected_zscores))
-  select_info_cols <- 1:(min(select_zscore_cols) -1)
+  select_info_cols <- 1:(min(select_zscore_cols) - 1)
   # set some zscores to zero
-  select_incr_indisp <- which(expected_zscores$Change=="Increase" & expected_zscores$Dispensability=="Indispensable")
-  expected_zscores[select_incr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_incr_indisp, select_zscore_cols], function(x) ifelse (x <= 1.6 , 0, x))
-  select_decr_indisp <- which(expected_zscores$Change=="Decrease" & expected_zscores$Dispensability=="Indispensable")
-  expected_zscores[select_decr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_decr_indisp, select_zscore_cols], function(x) ifelse (x >= -1.2 , 0, x))
-
-  # calculate rank score: 
-  expected_ranks <- merge(x=expected_biomarkers, y=rank_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code"))
-  rank_scores <- expected_zscores[order(expected_zscores$HMDB_code), select_zscore_cols]/(expected_ranks[order(expected_ranks$HMDB_code), select_zscore_cols]*0.9)
+  select_incr_indisp <- which(expected_zscores$Change == "Increase" & expected_zscores$Dispensability == "Indispensable")
+  expected_zscores[select_incr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_incr_indisp, select_zscore_cols], function(x) ifelse (x <= 1.6, 0, x))
+  select_decr_indisp <- which(expected_zscores$Change == "Decrease" & expected_zscores$Dispensability == "Indispensable")
+  expected_zscores[select_decr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_decr_indisp, select_zscore_cols], function(x) ifelse (x >= -1.2, 0, x))
+
+  # calculate rank score:
+  expected_ranks <- merge(x = expected_biomarkers, y = rank_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code"))
+  rank_scores <- expected_zscores[order(expected_zscores$HMDB_code), select_zscore_cols] / (expected_ranks[order(expected_ranks$HMDB_code), select_zscore_cols] * 0.9)
   # combine disease info with rank scores
   expected_metabscore <- cbind(expected_ranks[order(expected_zscores$HMDB_code), select_info_cols], rank_scores)
-  
+
   # multiply weight score and rank score
   weight_score <- expected_zscores
-  weight_score[ , select_zscore_cols] <- expected_metabscore$Total_Weight * expected_metabscore[ , select_zscore_cols]
-  
+  weight_score[, select_zscore_cols] <- expected_metabscore$Total_Weight * expected_metabscore[, select_zscore_cols]
+
   # sort table on Disease and Absolute_Weight
   weight_score <- weight_score[order(weight_score$Disease, weight_score$Absolute_Weight, decreasing = TRUE), ]
 
   # select columns to check duplicates
-  dup <- weight_score[ , c('Disease', 'M.z')] 
-  uni <- weight_score[!duplicated(dup) | !duplicated(dup, fromLast=FALSE),]
-  
+  dup <- weight_score[, c("Disease", "M.z")]
+  uni <- weight_score[!duplicated(dup) | !duplicated(dup, fromLast = FALSE), ]
+
   # calculate probability score
-  prob_score <-  aggregate(uni[ , select_zscore_cols], uni["Disease"], sum)
+  prob_score <-  aggregate(uni[, select_zscore_cols], uni["Disease"], sum)
 
   # list of all diseases that have at least one metabolite Zscore at 0
   for (patient_index in 2:ncol(prob_score)) {
     patient_zscore_colname <- colnames(prob_score)[patient_index]
     matching_colname_expected <- which(colnames(expected_zscores) == patient_zscore_colname)
     # determine which Zscores are 0 for this patient
-    zscores_zero <- which(expected_zscores[ , matching_colname_expected] == 0)
-    # get Disease for these 
+    zscores_zero <- which(expected_zscores[, matching_colname_expected] == 0)
+    # get Disease for these
     disease_zero <- unique(expected_zscores[zscores_zero, "Disease"])
-    # set the probability score of these diseases to 0 
-    prob_score[which(prob_score$Disease %in% disease_zero), patient_index]<- 0
+    # set the probability score of these diseases to 0
+    prob_score[which(prob_score$Disease %in% disease_zero), patient_index] <- 0
   }
-  
+
   # determine disease rank per patient
-  disease_rank <- prob_score 
+  disease_rank <- prob_score
   # rank diseases in decreasing order
   disease_rank[2:ncol(disease_rank)] <- lapply(2:ncol(disease_rank), function(x) as.numeric(ordered(-disease_rank[1:nrow(disease_rank), x])))
   # modify column names, Zscores have now been converted to probability scores
-  colnames(prob_score) <- gsub("_Zscore","_prob_score", colnames(prob_score)) # redundant?
-  colnames(disease_rank) <- gsub("_Zscore","", colnames(disease_rank))
-  
-  # Create conditional formatting for output excel sheet. Colors according to values.
+  colnames(prob_score) <- gsub("_Zscore", "_prob_score", colnames(prob_score))
+  colnames(disease_rank) <- gsub("_Zscore", "", colnames(disease_rank))
+
+  # Create conditional formatting for output Excel sheet. Colors according to values.
   wb <- createWorkbook()
   addWorksheet(wb, "Probability Scores")
   writeData(wb, "Probability Scores", prob_score)
-  conditionalFormatting(wb, "Probability Scores", cols = 2:ncol(prob_score), rows = 1:nrow(prob_score), type = "colourScale", style = c("white","#FFFDA2","red"), rule = c(1, 10, 100))
-  saveWorkbook(wb, file = paste0(output_dir,"/algoritme_output_", run_name, ".xlsx"), overwrite = TRUE)
+  conditionalFormatting(wb, "Probability Scores", cols = 2:ncol(prob_score), rows = 1:nrow(prob_score), type = "colourScale", style = c("white", "#FFFDA2", "red"), rule = c(1, 10, 100))
+  saveWorkbook(wb, file = paste0(output_dir, "/algoritme_output_", run_name, ".xlsx"), overwrite = TRUE)
   # check whether prob_score df exists and has expected dimensions.
   if (exists("expected_biomarkers") & (length(disease_rank) == length(prob_score))) {
     cat("\n### Step 4 # Running the IEM algorithm is done.\n\n")
   } else {
     cat("\n**** Error: Could not run IEM algorithm. Check if path to expected_biomarkers csv-file is correct. \n")
   }
-  
+
   rm(wb)
 }
 
-#### STEP 5: Make violin plots #####      
+#### STEP 5: Make violin plots #####
 # in: algorithm / zscore_patients, violin, nr_contr, nr_pat, Data, path_textfiles, zscore_cutoff, xaxis_cutoff, top_diseases, top_metab, output_dir ||| out: pdf file
 
 if (violin == 1) { # make violin plots
-  
+
   # preparation
-  # isobarics_txt <- c()
-  zscore_patients_copy <- zscore_patients 
-  # keep the original for testing purposes, remove later. 
-  colnames(zscore_patients) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_patients)) # for robust scaler
+  zscore_patients_copy <- zscore_patients
+  # for robust scaler, rename Z-score columns
+  colnames(zscore_patients) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_patients))
+  colnames(zscore_controls) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_controls))
   colnames(zscore_patients) <- gsub("_Zscore", "", colnames(zscore_patients))
-  colnames(zscore_controls) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_controls)) # for robust scaler
   colnames(zscore_controls) <- gsub("_Zscore", "", colnames(zscore_controls))
-  
+
   # Make patient list for violin plots
-  patient_list <- names(zscore_patients)[-c(1,2)]
+  patient_list <- names(zscore_patients)[-c(1, 2)]
 
-  # from table expected_biomarkers, choose selected columns 
+  # from table expected_biomarkers, choose selected columns
   select_columns <- c("Disease", "HMDB_code", "HMDB_name")
   select_col_nrs <- which(colnames(expected_biomarkers) %in% select_columns)
-  expected_biomarkers_select <- expected_biomarkers[ , select_col_nrs]
+  expected_biomarkers_select <- expected_biomarkers[, select_col_nrs]
   # remove duplicates
-  expected_biomarkers_select <- expected_biomarkers_select[!duplicated(expected_biomarkers_select[ , c(1,2)]), ]
-  
+  expected_biomarkers_select <- expected_biomarkers_select[!duplicated(expected_biomarkers_select[, c(1, 2)]), ]
+
   # load file with explanatory information to be included in PDF.
   explanation <- readLines(file_explanation)
-  
-  # for debugging:
-  #write.table(explanation, file=paste0(outdir, "explanation_read_in.txt"), sep="\t")
 
   # first step: normal violin plots
   # Find all text files in the given folder, which contain metabolite lists of which
   # each file will be a page in the pdf with violin plots.
   # Make a PDF file for each of the categories in metabolite_dirs
-  metabolite_dirs <- list.files(path=path_metabolite_groups, full.names=FALSE, recursive=FALSE)
+  metabolite_dirs <- list.files(path = path_metabolite_groups, full.names = FALSE, recursive = FALSE)
   for (metabolite_dir in metabolite_dirs) {
     # create a directory for the output PDFs
-    pdf_dir <- paste(output_dir, metabolite_dir, sep="/")
-    dir.create(pdf_dir, showWarnings=FALSE)
+    pdf_dir <- paste(output_dir, metabolite_dir, sep = "/")
+    dir.create(pdf_dir, showWarnings = FALSE)
     cat("making plots in category:", metabolite_dir, "\n")
-    
+
     # get a list of all metabolite files
-    metabolite_files <- list.files(path=paste(path_metabolite_groups, metabolite_dir, sep="/"), pattern="*.txt", full.names=FALSE, recursive=FALSE)
+    metabolite_files <- list.files(path = paste(path_metabolite_groups, metabolite_dir, sep = "/"), pattern = "*.txt", full.names = FALSE, recursive = FALSE)
     # put all metabolites into one list
     metab_list_all <- list()
     metab_list_names <- c()
@@ -367,20 +359,20 @@ if (violin == 1) { # make violin plots
     # open the text files and add each to a list of dataframes (metab_list_all)
     for (file_index in seq_along(metabolite_files)) {
       infile <- metabolite_files[file_index]
-      metab_list <- read.table(paste(path_metabolite_groups, metabolite_dir, infile, sep="/"), sep = "\t", header = TRUE, quote="")
+      metab_list <- read.table(paste(path_metabolite_groups, metabolite_dir, infile, sep = "/"), sep = "\t", header = TRUE, quote = "")
       # put into list of all lists
       metab_list_all[[file_index]] <- metab_list
       metab_list_names <- c(metab_list_names, strsplit(infile, ".txt")[[1]][1])
       cat(paste0("\n", infile))
-    } 
+    }
     # include list of classes in metabolite list
     names(metab_list_all) <- metab_list_names
-    
+
     # prepare list of metabolites; max nr_plots_perpage on one page
     metab_interest_sorted <- prepare_data(metab_list_all, zscore_patients)
     metab_interest_controls <- prepare_data(metab_list_all, zscore_controls)
     metab_perpage <- prepare_data_perpage(metab_interest_sorted, metab_interest_controls, nr_plots_perpage, nr_pat, nr_contr)
-    
+
     # make violin plots per patient
     for (pt_nr in 1:length(patient_list)) {
       pt_name <- patient_list[pt_nr]
@@ -388,21 +380,19 @@ if (violin == 1) { # make violin plots
       # for category Other, make list of top highest and lowest Z-scores for this patient
       if (grepl("Diagnost", pdf_dir)) {
         top_metab_pt <- prepare_alarmvalues(pt_name, metab_interest_sorted)
-        # save(top_metab_pt, file=paste0(outdir, "/start_15_prepare_alarmvalues.RData"))
       } else {
         top_metab_pt <- prepare_toplist(pt_name, zscore_patients)
-        # save(top_metab_pt, file=paste0(outdir, "/start_15_prepare_toplist.RData"))
       }
 
       # generate normal violin plots
       create_violin_plots(pdf_dir, pt_name, metab_perpage, top_metab_pt)
-      
-    } # end for pt_nr
 
-  } # end for metabolite_dir
-   
+    }
+
+  }
+
   # Second step: dIEM plots in separate directory
-  dIEM_plot_dir <- paste(output_dir, "dIEM_plots", sep="/")
+  dIEM_plot_dir <- paste(output_dir, "dIEM_plots", sep = "/")
   dir.create(dIEM_plot_dir)
 
   # Select the metabolites that are associated with the top highest scoring IEM, for each patient
@@ -411,7 +401,7 @@ if (violin == 1) { # make violin plots
     pt_name <- patient_list[pt_nr]
     # get top diseases for this patient
     pt_colnr <- which(colnames(disease_rank) == pt_name)
-    pt_top_indices <- which(disease_rank[ , pt_colnr] <= top_nr_IEM)
+    pt_top_indices <- which(disease_rank[, pt_colnr] <= top_nr_IEM)
     pt_IEMs <- disease_rank[pt_top_indices, "Disease"]
     pt_top_IEMs <- pt_prob_score_top_IEMs <- c()
     for (single_IEM in pt_IEMs) {
@@ -441,7 +431,7 @@ if (violin == 1) { # make violin plots
         select_rows <- which(expected_biomarkers_select$Disease == single_IEM)
         metab_list <- expected_biomarkers_select[select_rows, ]
         metab_IEM_names <- c(metab_IEM_names, paste0(single_IEM, ", probability score ", single_prob_score))
-        metab_list <- metab_list[ , -1]
+        metab_list <- metab_list[, -1]
         metab_IEM_all[[single_IEM_index]] <- metab_list
       }
       # put all metabolites into one list
@@ -457,11 +447,10 @@ if (violin == 1) { # make violin plots
       create_violin_plots(dIEM_plot_dir, pt_name, dIEM_metab_perpage, top_metab_pt)
 
     } else {
-      cat(paste0("\n\n**** This patient had no prob_scores higher than ", threshold_IEM,".
-                   Therefore, this pdf was not made:\t ", pt_name ,"_IEM \n"))
+      cat(paste0("\n\n**** This patient had no prob_scores higher than ", threshold_IEM, ".
+                   Therefore, this pdf was not made:\t ", pt_name, "_IEM \n"))
     }
 
-  } # end for pt_nr
-
-} # end if violin = 1
+  }
 
+}
diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R
index 06e0f2c..7441959 100644
--- a/DIMS/HMDBparts.R
+++ b/DIMS/HMDBparts.R
@@ -4,10 +4,9 @@
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-#  Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file $params.hmdb_parts_files $params.standard_run $params.ppm
-db_path <- cmd_args[1] # location of HMDB db file
-breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData
-standard_run  <- cmd_args[4] # "yes"
+db_path <- cmd_args[1] 
+breaks_filepath <- cmd_args[2] 
+standard_run  <- cmd_args[4] 
 
 # Cut up entire HMDB into small parts based on the new binning/breaks 
 load(breaks_filepath)
@@ -19,11 +18,11 @@ max_mz <- round(breaks_fwhm[length(breaks_fwhm)])
 # test if standard mz range is used
 if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < 610) {
   # skip generating HMDB parts
-  hmdb_parts_files <- cmd_args[3] 
-
-  hmdb_parts <- list.files(hmdb_parts_files, pattern="hmdb") # all files containing hmdb in file name
+  hmdb_parts_path <- cmd_args[3] 
+  # find all files containing hmdb in file name
+  hmdb_parts <- list.files(hmdb_parts_path, pattern = "hmdb") 
   for (hmdb_file in hmdb_parts) {
-    file.copy(paste(hmdb_parts_files, hmdb_file, sep="/"), "./", recursive = TRUE)
+    file.copy(paste(hmdb_parts_path, hmdb_file, sep = "/"), "./", recursive = TRUE)
   }
 } else { 
   # generate HMDB parts in case of non-standard mz range
@@ -31,7 +30,6 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz <
   ppm <- as.numeric(cmd_args[5])
 
   scanmodes <- c("positive", "negative")
-
   for (scanmode in scanmodes) {
     if (scanmode == "negative") {
       column_label <- "MNeg"
@@ -42,48 +40,49 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz <
     }
 
     # filter mass range meassured!!!
-    HMDB_add_iso = HMDB_add_iso[which(HMDB_add_iso[ ,column_label] >= breaks_fwhm[1] &
-                                      HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ]
+    HMDB_add_iso = HMDB_add_iso[which(HMDB_add_iso[ , column_label] >= breaks_fwhm[1] &
+                                      HMDB_add_iso[ , column_label] <= breaks_fwhm[length(breaks_fwhm)]), ]
 
     # sort on mass
-    outlist <- HMDB_add_iso[order(as.numeric(HMDB_add_iso[,column_label])),]
+    outlist <- HMDB_add_iso[order(as.numeric(HMDB_add_iso[ , column_label])),]
 
-    n            <- dim(outlist)[1]
-    sub          <- 20000 # max rows per file
+    nr_rows      <- dim(outlist)[1]
+    # maximum number of rows per file
+    sub          <- 20000 
     end          <- 0
-    min_1_last   <- sub
+    last_line    <- sub
     check        <- 0
     outlist_part <- NULL
 
 
-    if (n < sub) {
+    if (nr_rows < sub) {
       outlist_part <- outlist
       save(outlist_part, file = paste0("./", scanmode, "_hmdb.1.RData"))
     } else {
 
-        if (n >= sub & (floor(n/sub) - 1) >= 2){
-          for (i in 2:floor(n/sub) - 1){
-            start <- -(sub - 1) + i*sub
-            end <- i*sub
+        if (nr_rows >= sub & (floor(nr_rows / sub) - 1) >= 2) {
+          for (i in 2:floor(nr_rows / sub) - 1) {
+            start <- -(sub - 1) + i * sub
+            end <- i * sub
 
             if (i > 1){
               outlist_i = outlist[c(start:end),]
 
-              n_moved = 0
+              nr_moved = 0
 
-              # Calculate 3ppm and replace border, avoid cut within peakgroup!
-              while ((as.numeric(outlist_i[1,column_label]) - as.numeric(outlist_part[min_1_last,column_label]))*1e+06/as.numeric(outlist_i[1,column_label]) < ppm) {
-                outlist_part <- rbind(outlist_part, outlist_i[1,])
-                outlist_i <- outlist_i[-1,]
-                n_moved <- n_moved + 1
+              # Use ppm to replace border to avoid cut within peakgroup!
+              while ((as.numeric(outlist_i[1, column_label]) - as.numeric(outlist_part[last_line, column_label])) * 1e+06 / 
+                      as.numeric(outlist_i[1, column_label]) < ppm) {
+                outlist_part <- rbind(outlist_part, outlist_i[1, ])
+                outlist_i <- outlist_i[-1, ]
+                nr_moved <- nr_moved + 1
               }
 
-              # message(paste("Process", i-1,":", dim(outlist_part)[1]))
-              save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i-1, "RData", sep="."), sep=""))
+              save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i-1, "RData", sep = "."), sep = ""))
               check <- check + dim(outlist_part)[1]
 
               outlist_part <- outlist_i
-              min_1_last <- dim(outlist_part)[1]
+              last_line <- dim(outlist_part)[1]
 
             } else {
               outlist_part <- outlist[c(start:end),]
@@ -92,27 +91,26 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz <
         }
 
         start <- end + 1
-        end <- n
+        end <- nr_rows
         outlist_i <- outlist[c(start:end),]
-        n_moved <- 0
+        nr_moved <- 0
 
         if (!is.null(outlist_part)) {
           # Calculate ppm and replace border, avoid cut within peakgroup!
-          while ((as.numeric(outlist_i[1,column_label]) - as.numeric(outlist_part[min_1_last,column_label]))*1e+06/as.numeric(outlist_i[1,column_label]) < ppm) {
+          while ((as.numeric(outlist_i[1, column_label]) - as.numeric(outlist_part[last_line, column_label])) * 1e+06 / 
+                  as.numeric(outlist_i[1, column_label]) < ppm) {
             outlist_part <- rbind(outlist_part, outlist_i[1,])
             outlist_i <- outlist_i[-1,]
-            n_moved <- n_moved + 1
+            nr_moved <- nr_moved + 1
           }
 
-          save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i, "RData", sep = "."), sep = ""))
+          save(outlist_part, file = paste0("./", scanmode, "_hmdb_", i, ".RData"))
           check <- check + dim(outlist_part)[1]
         }
 
         outlist_part <- outlist_i
-        save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i + 1, "RData", sep="."), sep=""))
+        save(outlist_part, file = paste0("./", scanmode, "_hmdb_", i + 1, ".RData"))
         check <- check + dim(outlist_part)[1]
-        # cat("\n", "Check", check == dim(outlist)[1])
-
       }
     }
 }
diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R
index 941d45f..00cf5c9 100644
--- a/DIMS/HMDBparts_main.R
+++ b/DIMS/HMDBparts_main.R
@@ -4,16 +4,14 @@
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-db_path <- cmd_args[1] # location of HMDB db file
-breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData
+db_path <- cmd_args[1]
+breaks_filepath <- cmd_args[2]
 
 load(db_path)
 load(breaks_filepath)
 
 # Cut up HMDB minus adducts minus isotopes into small parts 
-
 scanmodes <- c("positive", "negative")
-
 for (scanmode in scanmodes) {
   if (scanmode == "negative") {
     column_label <- "MNeg"
@@ -23,37 +21,38 @@ for (scanmode in scanmodes) {
     HMDB_add_iso <- HMDB_add_iso.Pos
   }
 
-  # filter mass range meassured NB: remove the last comma?!
-  outlist <- HMDB_add_iso[which(HMDB_add_iso[ ,column_label] >= breaks_fwhm[1] & HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ]
+  # filter mass range measured
+  outlist <- HMDB_add_iso[which(HMDB_add_iso[ , column_label] >= breaks_fwhm[1] & 
+             HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ]
 
   # remove adducts and isotopes, put internal standard at the beginning
-  outlist_IS <- outlist[grep("IS", outlist[ , "CompoundName"], fixed=TRUE), ]
-  outlist <- outlist[grep("HMDB", rownames(outlist), fixed=TRUE), ]
-  outlist <- outlist[-grep("_", rownames(outlist), fixed=TRUE), ]
+  outlist_IS <- outlist[grep("IS", outlist[ , "CompoundName"], fixed = TRUE), ]
+  outlist <- outlist[grep("HMDB", rownames(outlist), fixed = TRUE), ]
+  outlist <- outlist[-grep("_", rownames(outlist), fixed = TRUE), ]
   outlist <- rbind(outlist_IS, outlist)
   # sort on m/z value
-  outlist <- outlist[order(outlist[ ,column_label]), ]
+  outlist <- outlist[order(outlist[ , column_label]), ]
 
-  n <- dim(outlist)[1]
+  nr_rows <- dim(outlist)[1]
   # size of hmdb parts in lines:
   sub <- 1000
   end <- 0
   check <- 0
 
   # generate hmdb parts
-  if (n >= sub & (floor(n/sub)) >= 2) {
-    for (i in 1:floor(n/sub)){
-      start <- -(sub-1) + i*sub
-      end <- i*sub
+  if (nr_rows >= sub & (floor(nr_rows / sub)) >= 2) {
+    for (i in 1:floor(nr_rows / sub)) {
+      start <- -(sub-1) + i * sub
+      end <- i * sub
       outlist_part <- outlist[c(start:end), ]
-      save(outlist_part, file=paste0(scanmode, "_hmdb.", i, ".RData"))
+      save(outlist_part, file=paste0(scanmode, "_hmdb_main_", i, ".RData"))
     }
   }
 }
 
 # finish last hmdb part
 start <- end + 1
-end <- n
+end <- nr_rows
 
 outlist_part <- outlist[c(start:end),]
 save(outlist_part, file = paste0(scanmode, "_hmdb.", i+1, ".RData"))
diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R
index 3b125e3..33794c5 100644
--- a/DIMS/MakeInit.R
+++ b/DIMS/MakeInit.R
@@ -1,29 +1,29 @@
 #!/usr/bin/env Rscript
 ## adapted from makeInit in old pipeline
 
-args          <- commandArgs(trailingOnly=TRUE)
-sample_sheet  <- read.csv(args[1], sep="\t") 
+args          <- commandArgs(trailingOnly = TRUE)
+sample_sheet  <- read.csv(args[1], sep = "\t")
 nr_replicates <- as.numeric(args[2])
 
-sampleNames <- trimws(as.vector(unlist(sample_sheet[1])))
-nr_sampgrps <- length(sampleNames)/nr_replicates
-groupNames  <- trimws(as.vector(unlist(sample_sheet[2])))
-groupNames  <- gsub('[^-.[:alnum:]]', '_', groupNames)
-groupNamesUnique <- unique(groupNames)
+sample_names  <- trimws(as.vector(unlist(sample_sheet[1])))
+nr_sampgrps   <- length(sampleNames) / nr_replicates
+group_names    <- trimws(as.vector(unlist(sample_sheet[2])))
+group_names    <- gsub("[^-.[:alnum:]]", "_", group_names)
+group_names_unique <- unique(group_names)
 
 repl_pattern <- c()
 for (sampgrp in 1:nr_sampgrps) {
   tmp <- c()
   for (repl in nr_replicates:1) {
-    index <- ((sampgrp*nr_replicates) - repl) + 1
+    index <- ((sampgrp * nr_replicates) - repl) + 1
     tmp <- c(tmp, sampleNames[index])
   }
   repl_pattern <- c(repl_pattern, list(tmp))
 }
 
-names(repl_pattern) <- groupNamesUnique
+names(repl_pattern) <- group_names_unique
 
 # preview the replication pattern
-print(head(repl_pattern))
+print(tail(repl_pattern))
 
-save(repl_pattern, file="./init.RData")
+save(repl_pattern, file = "./init.RData")
diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R
index 926cb77..e978d4b 100644
--- a/DIMS/PeakFinding.R
+++ b/DIMS/PeakFinding.R
@@ -1,15 +1,14 @@
 #!/usr/bin/Rscript
 # adapted from 4-peakFinding.R
 
-# define parameters 
+# define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
 filepath        <- cmd_args[1]
 breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData
 resol           <- as.numeric(cmd_args[3])
 scripts_dir     <- cmd_args[4]
-
-thresh <- 2000
+thresh          <- 2000
 
 # load in function scripts
 source(paste0(scripts_dir, "AddOnFunctions/findPeaks.Gauss.HPC.R"))
@@ -38,15 +37,18 @@ load(breaks_filepath)
 
 # Load output of AverageTechReplicates for a sample
 sample_avgtechrepl <- get(load(filepath))
-if (grepl("_pos", filepath)) { scanmode = "positive" } else
-  if (grepl("_neg", filepath)) { scanmode = "negative" }
+if (grepl("_pos", filepath)) {
+  scanmode <- "positive"
+} else if (grepl("_neg", filepath)) {
+  scanmode <- "negative"
+}
 
 # Initialize
 options(digits = 16)
-int.factor <- 1*10^5 # Number of x used to calc area under Gaussian (is not analytic)
+int_factor <- 1 * 10^5 # Number used to calculate area under Gaussian curve
 scale      <- 2 # Initial value used to estimate scaling parameter
 width      <- 1024
 height     <- 768
 
 # run the findPeaks function
-findPeaks.Gauss.HPC(sample_avgtechrepl, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height)
+findPeaks.Gauss.HPC(sample_avgtechrepl, breaks.fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height)
diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R
index 423fb69..d04b6a2 100644
--- a/DIMS/PeakGrouping.R
+++ b/DIMS/PeakGrouping.R
@@ -1,176 +1,186 @@
 #!/usr/bin/Rscript
 # adapted from 6-peakGrouping.R
 
-# define parameters 
+# define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-HMDB_part_file <- cmd_args[1]
+hmdb_part_file <- cmd_args[1]
 ppm <- as.numeric(cmd_args[2])
 
-options(digits=16)
+options(digits = 16)
 
 # load part of the HMDB
-HMDB_add_iso <- get(load(HMDB_part_file)) 
+hmdb_add_iso <- get(load(hmdb_part_file))
 
-# determine appropriate scanmode based on HMDB_part_file 
-if (grepl("negative", basename(HMDB_part_file))) { scanmode <- "negative" } else
-  if (grepl("positive", basename(HMDB_part_file))) { scanmode <- "positive" }
+# determine appropriate scanmode based on hmdb_part_file
+if (grepl("negative", basename(hmdb_part_file))) {
+  scanmode <- "negative"
+} else if (grepl("positive", basename(hmdb_part_file))) {
+  scanmode <- "positive"
+}
 
 # determine batch number of HMDB part file
-batch_number = strsplit(basename(HMDB_part_file), ".",fixed = TRUE)[[1]][2]
+batch_number <- strsplit(basename(hmdb_part_file), ".", fixed = TRUE)[[1]][2]
 
 # load file with spectrum peaks
-SpecPeaks_file <- paste0("SpectrumPeaks_", scanmode, ".RData")
-load(SpecPeaks_file)
-outlist.copy <- outlist.tot
-rm(outlist.tot)
+spec_peaks_file <- paste0("SpectrumPeaks_", scanmode, ".RData")
+load(spec_peaks_file)
+outlist_copy <- outlist_tot
+rm(outlist_tot)
 
 # load replication pattern
 pattern_file <- paste0(scanmode, "_repl_pattern.RData")
 load(pattern_file)
 
-# determine appropriate column name in HMDB part 
-if (scanmode=="negative") { column_label <- "MNeg" } else { column_label <- "Mpos" }
+# determine appropriate column name in HMDB part
+if (scanmode == "negative") {
+  column_label <- "MNeg"
+} else {
+  column_label <- "Mpos"
+}
 
 # Initialize
-outpgrlist.identified <- NULL
+peakgrouplist_identified <- NULL
 list_of_peaks_used_in_peak_groups_identified <- NULL
 
 # First find peak groups identified based on HMDB masses
-while (dim(HMDB_add_iso)[1] > 0) { 
+while (dim(hmdb_add_iso)[1] > 0) {
   index <- 1
- 
+
   # take one m/z value from the HMDB part and calculate mass tolerance
-  reference_mass <- as.numeric(HMDB_add_iso[index, column_label])
+  reference_mass <- as.numeric(hmdb_add_iso[index, column_label])
   mass_tolerance <- (reference_mass * ppm) / 10^6
-  
+
   # find the peaks in the dataset with corresponding m/z
-  mzmed <- as.numeric(outlist.copy[ ,"mzmed.pkt"])
+  mzmed <- as.numeric(outlist_copy[, "mzmed.pkt"])
   selp <- which((mzmed > (reference_mass - mass_tolerance)) & (mzmed < (reference_mass + mass_tolerance)))
-  tmplist <- outlist.copy[selp,,drop=FALSE]
+  tmplist <- outlist_copy[selp, , drop = FALSE]
   list_of_peaks_used_in_peak_groups_identified <- rbind(list_of_peaks_used_in_peak_groups_identified, tmplist)
 
   nrsamples <- length(selp)
   if (nrsamples > 0) {
-    mzmed.pgrp <- mean(as.numeric(outlist.copy[selp, "mzmed.pkt"]))
-    mzmin.pgrp <- reference_mass - mass_tolerance
-    mzmax.pgrp <- reference_mass + mass_tolerance
-    
+    mzmed_pgrp <- mean(as.numeric(outlist_copy[selp, "mzmed.pkt"]))
+    mzmin_pgrp <- reference_mass - mass_tolerance
+    mzmax_pgrp <- reference_mass + mass_tolerance
+
     # determine fit quality fq
-    fq.worst.pgrp <- as.numeric(max(outlist.copy[selp, "fq"]))
-    fq.best.pgrp <- as.numeric(min(outlist.copy[selp, "fq"]))
-    
+    fq_worst_pgrp <- as.numeric(max(outlist_copy[selp, "fq"]))
+    fq_best_pgrp <- as.numeric(min(outlist_copy[selp, "fq"]))
+
     # set up object for intensities for all samples
-    ints.allsamps <- rep(0, length(names(repl_pattern_filtered)))
-    names(ints.allsamps) <- names(repl_pattern_filtered) 
-    
+    ints_allsamps <- rep(0, length(names(repl_pattern_filtered)))
+    names(ints_allsamps) <- names(repl_pattern_filtered)
+
     # Check for each sample if multiple peaks exist, if so take the sum of the intensities
-    labels <- unique(tmplist[ ,"samplenr"])
-    ints.allsamps[labels] <- as.vector(unlist(lapply(labels, function(x) { sum(as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"])) } )))
-    
-    # Initialize 
-    assi_HMDB <- iso_HMDB <- HMDB_code <- NA
-    tmplist.mass.iso <- tmplist.mass.adduct <- NULL
-    
+    labels <- unique(tmplist[, "samplenr"])
+    ints_allsamps[labels] <- as.vector(unlist(lapply(labels, function(x) {
+      sum(as.numeric(tmplist[which(tmplist[, "samplenr"] == x), "height.pkt"]))
+    })))
+
+    # Initialize
+    assi_hmdb <- iso_hmdb <- hmdb_code <- NA
+    tmplist_mass_iso <- tmplist_mass_adduct <- NULL
+
     # Identification: find all entries in HMDB part with mass within ppm range
-    mass.all <- as.numeric(HMDB_add_iso[ , column_label])
-    index <- which((mass.all > (reference_mass - mass_tolerance)) & (mass.all < (reference_mass + mass_tolerance)))
-    tmplist.mass <- HMDB_add_iso[index,,drop=FALSE]
-    
-    if (dim(tmplist.mass)[1]>0) {
-      # find isotope entries    
-      index.iso <- grep(" iso ", tmplist.mass[, "CompoundName"], fixed = TRUE)
-      if (length(index.iso) > 0){
-        tmplist.mass.iso <- tmplist.mass[index.iso,,drop=FALSE]
-        tmplist.mass <- tmplist.mass[-index.iso,,drop=FALSE]
+    mass_all <- as.numeric(hmdb_add_iso[, column_label])
+    index <- which((mass_all > (reference_mass - mass_tolerance)) & (mass_all < (reference_mass + mass_tolerance)))
+    tmplist_mass <- hmdb_add_iso[index, , drop = FALSE]
+
+    if (dim(tmplist_mass)[1] > 0) {
+      # find isotope entries
+      index_iso <- grep(" iso ", tmplist_mass[, "CompoundName"], fixed = TRUE)
+      if (length(index_iso) > 0) {
+        tmplist_mass_iso <- tmplist_mass[index_iso, , drop = FALSE]
+        tmplist_mass <- tmplist_mass[-index_iso, , drop = FALSE]
       }
-      
-      if (dim(tmplist.mass)[1] > 0) {        
-	# find adduct entries
-        index.adduct <- grep(" [M", tmplist.mass[, "CompoundName"], fixed = TRUE)
-        if (length(index.adduct) > 0) {
-          tmplist.mass.adduct <- tmplist.mass[index.adduct,,drop=FALSE]
-          tmplist.mass <- tmplist.mass[-index.adduct,,drop=FALSE]
-        }  
-      }  
-      
+
+      if (dim(tmplist_mass)[1] > 0) {
+        # find adduct entries
+        index_adduct <- grep(" [M", tmplist_mass[, "CompoundName"], fixed = TRUE)
+        if (length(index_adduct) > 0) {
+          tmplist_mass_adduct <- tmplist_mass[index_adduct, , drop = FALSE]
+          tmplist_mass <- tmplist_mass[-index_adduct, , drop = FALSE]
+        }
+      }
+
       # Compose a list compounds, adducts or isotopes with corresponding m/z
-      if (dim(tmplist.mass)[1]>0) {
-        
+      if (dim(tmplist_mass)[1] > 0) {
         # metabolites
-        assi_HMDB <- as.character(paste(as.character(tmplist.mass[, "CompoundName"]), collapse = ";"))
-        HMDB_code <- as.character(paste(as.character(rownames(tmplist.mass)), collapse = ";"))
-        theormz_HMDB <- as.numeric(tmplist.mass[1, column_label])
-        
+        assi_hmdb <- as.character(paste(as.character(tmplist_mass[, "CompoundName"]), collapse = ";"))
+        hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass)), collapse = ";"))
+        theormz_hmdb <- as.numeric(tmplist_mass[1, column_label])
+
         # adducts of metabolites
-        if (!is.null(tmplist.mass.adduct)) {
-          if (dim(tmplist.mass.adduct)[1] > 0) {
-            if (is.na(assi_HMDB)){
-              assi_HMDB <- as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";"))
-              HMDB_code <- as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";"))
+        if (!is.null(tmplist_mass_adduct)) {
+          if (dim(tmplist_mass_adduct)[1] > 0) {
+            if (is.na(assi_hmdb)) {
+              assi_hmdb <- as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), collapse = ";"))
+              hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass_adduct)), collapse = ";"))
             } else {
-              assi_HMDB <- paste(assi_HMDB, as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")), sep = ";")
-              HMDB_code <- paste(HMDB_code, as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")), sep = ";")
+              assi_hmdb <- paste(assi_hmdb,
+                                 as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), collapse = ";")), sep = ";")
+              hmdb_code <- paste(hmdb_code,
+                                 as.character(paste(as.character(rownames(tmplist_mass_adduct)), collapse = ";")), sep = ";")
             }
-	  }
-	}
-        
+          }
+        }
+
         # isotopes of metabolites
-        if (!is.null(tmplist.mass.iso)) {
-          if (dim(tmplist.mass.iso)[1]>0) {
-            iso_HMDB <- as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";"))
+        if (!is.null(tmplist_mass_iso)) {
+          if (dim(tmplist_mass_iso)[1] > 0) {
+            iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), collapse = ";"))
           }
-	}
-        
-      # if no metabolites have the correct m/z, look for adducts and isotopes only
-      } else if (!is.null(tmplist.mass.adduct)) {
-        
-        theormz_HMDB <- as.numeric(tmplist.mass.adduct[1, column_label])
-        
+        }
+
+        # if no metabolites have the correct m/z, look for adducts and isotopes only
+      } else if (!is.null(tmplist_mass_adduct)) {
+        theormz_hmdb <- as.numeric(tmplist_mass_adduct[1, column_label])
+
         # adducts of metabolites
-        if (!is.null(tmplist.mass.adduct)) {
-          if (dim(tmplist.mass.adduct)[1] > 0) {
-            if (is.na(assi_HMDB)) {
-              assi_HMDB <- as.character(paste(as.character(tmplist.mass.adduct[ , "CompoundName"]), collapse = ";"))
-              HMDB_code <- as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";"))
+        if (!is.null(tmplist_mass_adduct)) {
+          if (dim(tmplist_mass_adduct)[1] > 0) {
+            if (is.na(assi_hmdb)) {
+              assi_hmdb <- as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), collapse = ";"))
+              hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass_adduct)), collapse = ";"))
             } else {
-              assi_HMDB <- paste(assi_HMDB, as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")), sep = ";")
-              HMDB_code <- paste(HMDB_code, as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")), sep = ";")
+              assi_hmdb <- paste(assi_hmdb,
+                                 as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), collapse = ";")), sep = ";")
+              hmdb_code <- paste(hmdb_code,
+                                 as.character(paste(as.character(rownames(tmplist_mass_adduct)), collapse = ";")), sep = ";")
             }
-	  }
-	}
-        
+          }
+        }
+
         # isotopes of metabolites
-        if (!is.null(tmplist.mass.iso)) {
-          if (dim(tmplist.mass.iso)[1]>0) {
-            iso_HMDB <- as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";"))
+        if (!is.null(tmplist_mass_iso)) {
+          if (dim(tmplist_mass_iso)[1] > 0) {
+            iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), collapse = ";"))
           }
-	}
-        
-      # if no metabolites or adducts can be found, only look for isotopes  
-      } else if (!is.null(tmplist.mass.iso)) {
-        
-        if (dim(tmplist.mass.iso)[1]>0) {
-          theormz_HMDB <- as.numeric(tmplist.mass.iso[1,column_label])
-          iso_HMDB <- as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";"))
         }
-      }  
-      
+
+        # if no metabolites or adducts can be found, only look for isotopes
+      } else if (!is.null(tmplist_mass_iso)) {
+        if (dim(tmplist_mass_iso)[1] > 0) {
+          theormz_hmdb <- as.numeric(tmplist_mass_iso[1, column_label])
+          iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), collapse = ";"))
+        }
+      }
     }
-   
-    # combine all information 
-    outpgrlist.identified <- rbind(outpgrlist.identified, cbind(data.frame(mzmed.pgrp, "fq.best"=fq.best.pgrp, "fq.worst"=fq.worst.pgrp, nrsamples, mzmin.pgrp, mzmax.pgrp),
-                                         t(as.matrix(ints.allsamps)),
-                                         data.frame(assi_HMDB, iso_HMDB, HMDB_code, theormz_HMDB)))
+
+    # combine all information
+    peakgrouplist_identified <- rbind(peakgrouplist_identified, cbind(
+      data.frame(mzmed_pgrp, "fq.best" = fq_best_pgrp, "fq.worst" = fq_worst_pgrp, nrsamples, mzmin_pgrp, mzmax_pgrp),
+      t(as.matrix(ints_allsamps)),
+      data.frame(assi_hmdb, iso_hmdb, hmdb_code, theormz_hmdb)
+    ))
   }
- 
-  # remove index metabolite from HMDB part and continue while loop 
-  HMDB_add_iso <- HMDB_add_iso[-index,]
-  
+
+  # remove index metabolite from HMDB part and continue while loop
+  hmdb_add_iso <- hmdb_add_iso[-index, ]
 }
 
 
 # save peak list corresponding to masses in HMDB part
 save(list_of_peaks_used_in_peak_groups_identified, file = paste0(batch_number, "_", scanmode, "_peaks_used.RData"))
 # save peak group list, identified part
-save(outpgrlist.identified, file = paste0(batch_number, "_", scanmode, "_identified.RData"))
+save(peakgrouplist_identified, file = paste0(batch_number, "_", scanmode, "_identified.RData"))
diff --git a/DIMS/SpectrumPeakFinding.R b/DIMS/SpectrumPeakFinding.R
index a8f33f9..a44ea06 100644
--- a/DIMS/SpectrumPeakFinding.R
+++ b/DIMS/SpectrumPeakFinding.R
@@ -1,65 +1,62 @@
 #!/usr/bin/Rscript
-# adapted from 5-collectSamples.R 
+# adapted from 5-collectSamples.R
 
-# define parameters 
+# define parameters
 scanmodes <- c("positive", "negative")
 
 # Check whether all jobs terminated correct!
-notRun = NULL
+not_run <- NULL
 
 # collect spectrum peaks for each scanmode
 for (scanmode in scanmodes) {
   # load peak lists of all biological samples
-  input_dir <- getwd() # "./"
-  peaklist_files = list.files(input_dir, full.names=TRUE, pattern=paste("*_", scanmode, ".RData",sep=""))
+  peaklist_files <- list.files("/.", full.names = TRUE, pattern = paste0("*_", scanmode, ".RData"))
 
   # get sample names
   load(paste0("./", scanmode, "_repl_pattern", ".RData"))
-  groupNames = names(repl_pattern_filtered)
-  for (i in 1:length(groupNames)) {
-    group <- paste0(input_dir, "/", paste0(paste(groupNames[i], scanmode, sep = "_"), ".RData"))
+  group_names <- names(repl_pattern_filtered)
+  for (sample_nr in 1:length(group_names)) {
+    group <- paste0(input_dir, "/", paste0(paste(group_names[sample_nr], scanmode, sep = "_"), ".RData"))
     if (!(group %in% peaklist_files)) {
-      notRun = c(notRun, group)
+      not_run <- c(not_run, group)
     }
   }
-  
-  cat("\nCollecting samples")
 
-  outlist.tot=NULL
-  for (i in 1:length(peaklist_files)) {
-    cat("\n", peaklist_files[i])
-    load(peaklist_files[i])
+  # Collecting samples
+  outlist_total <- NULL
+  for (file_nr in 1:length(peaklist_files)) {
+    cat("\n", peaklist_files[file_nr])
+    load(peaklist_files[file_nr])
     if (is.null(outlist.persample) || (dim(outlist.persample)[1] == 0)) {
-      tmp=strsplit(peaklist_files[i], "/")[[1]]
-      fname = tmp[length(tmp)]
-      fname = strsplit(fname, ".RData")[[1]][1]
-      fname = substr(fname, 13, nchar(fname))
-      if (i == 1) { 
-        outlist.tot <- c(fname, rep("-1",5)) 
-      } else { 
-	outlist.tot <- rbind(outlist.tot, c(fname, rep("-1",5)))
+      tmp <- strsplit(peaklist_files[file_nr], "/")[[1]]
+      fname <- tmp[length(tmp)]
+      fname <- strsplit(fname, ".RData")[[1]][1]
+      fname <- substr(fname, 13, nchar(fname))
+      if (file_nr == 1) {
+        outlist_total <- c(fname, rep("-1", 5))
+      } else {
+        outlist_total <- rbind(outlist_total, c(fname, rep("-1", 5)))
       }
     } else {
-      if (i == 1) { 
-	outlist.tot <- outlist.persample 
-      } else { 
-	outlist.tot <- rbind(outlist.tot, outlist.persample) 
+      if (file_nr == 1) {
+        outlist_total <- outlist.persample
+      } else {
+        outlist_total <- rbind(outlist_total, outlist.persample)
       }
     }
   }
-  
+
   # remove negative values
-  index <- which(outlist.tot[ ,"height.pkt"] <= 0)
-  if (length(index) > 0) outlist.tot <- outlist.tot[-index, ]
-  index <- which(outlist.tot[ ,"mzmed.pkt"] <= 0)
-  if (length(index) > 0) outlist.tot <- outlist.tot[-index, ]
+  index <- which(outlist_total[, "height.pkt"] <= 0)
+  if (length(index) > 0) outlist_total <- outlist_total[-index, ]
+  index <- which(outlist_total[, "mzmed.pkt"] <= 0)
+  if (length(index) > 0) outlist_total <- outlist_total[-index, ]
 
-  save(outlist.tot, file = paste0("./SpectrumPeaks_", scanmode, ".RData"))
+  save(outlist_total, file = paste0("./SpectrumPeaks_", scanmode, ".RData"))
 
-  if (!is.null(notRun)){
-    for (i in 1:length(notRun)){
-      message(paste(notRun[i], "was not generated"))
+  if (!is.null(not_run)) {
+    for (i in 1:length(not_run)) {
+      message(paste(not_run[i], "was not generated"))
     }
   }
 }
-
diff --git a/DIMS/SumAdducts.R b/DIMS/SumAdducts.R
index 716b473..b4bd332 100755
--- a/DIMS/SumAdducts.R
+++ b/DIMS/SumAdducts.R
@@ -1,25 +1,22 @@
 #!/usr/bin/Rscript
 ## adapted from 11-runSumAdducts.R
 
-# define parameters 
+# define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-# collect_file <- cmd_args[1]
 hmdbpart_main_file <- cmd_args[1]
 scripts_dir <- cmd_args[2]
 z_score <- as.numeric(cmd_args[3])
 # NB: scripts_dir not used yet, but function SumAdducts needs to be placed in AddOnFunctions folder
 
-if (grepl("positive_hmdb", hmdbpart_main_file)) { 
-  scanmode <- "positive" 
+if (grepl("positive_hmdb", hmdbpart_main_file)) {
+  scanmode <- "positive"
   # for the adduct sum: include adducts M+Na (1) and M+K (2)
-  adducts = c(1,2)
-} else {
-    if (grepl("negative_hmdb", hmdbpart_main_file)) { 
-       scanmode <- "negative" 
-       # for the adduct sum: include adduct M+Cl (1) 
-       adducts <- c(1)
-    }
+  adducts <- c(1, 2)
+} else if (grepl("negative_hmdb", hmdbpart_main_file)) {
+  scanmode <- "negative"
+  # for the adduct sum: include adduct M+Cl (1)
+  adducts <- c(1)
 }
 
 # load input files
@@ -29,66 +26,63 @@ repl_file <- paste0(scanmode, "_repl_pattern.RData")
 load(repl_file)
 outlist_part <- get(load(hmdbpart_main_file))
 
-batch_number <- strsplit(basename(hmdbpart_main_file), ".",fixed = TRUE)[[1]][2]
+batch_number <- strsplit(basename(hmdbpart_main_file), ".", fixed = TRUE)[[1]][2]
 
-outlist.tot <- unique(outlist.ident)
+outlist_total <- unique(outlist_ident)
 
-sumAdducts <- function(peaklist, theor_MZ, grpnames.long, adducts, batch_number, scanmode, outdir, z_score){
-  hmdb_codes <- rownames(theor_MZ)
-  hmdb_names <- theor_MZ[,1, drop=FALSE]
+sum_adducts <- function(peaklist, theor_mz, grpnames_long, adducts, batch_number, scanmode, outdir, z_score) {
+  hmdb_codes <- rownames(theor_mz)
+  hmdb_names <- theor_mz[, 1, drop = FALSE]
   hmdb_names[] <- lapply(hmdb_names, as.character)
-  
+
   # remove isotopes!!!
-  index <- grep("HMDB", hmdb_codes, fixed=TRUE)
+  index <- grep("HMDB", hmdb_codes, fixed = TRUE)
   hmdb_codes <- hmdb_codes[index]
-  hmdb_names <- hmdb_names[index,]
-  index = grep("_", rownames(hmdb_codes), fixed=TRUE)
+  hmdb_names <- hmdb_names[index, ]
+  index <- grep("_", rownames(hmdb_codes), fixed = TRUE)
   if (length(index) > 0) hmdb_codes <- hmdb_codes[-index]
   if (length(index) > 0) hmdb_names <- hmdb_names[-index]
-  
+
   # negative
   names <- NULL
   adductsum <- NULL
   names_long <- NULL
-  
+
   if (length(hmdb_codes) != 0) {
-    
-    for(i in 1:length(hmdb_codes)){
+    for (i in 1:length(hmdb_codes)) {
       compound <- hmdb_codes[i]
-      compound_plus <- c(compound,paste(compound, adducts, sep = "_"))
-      
-      metab <- unlist(lapply(peaklist$HMDB_code, function(x) {(length(intersect(unlist(strsplit(as.vector(x),";")),compound_plus))>0)}))
-      
+      compound_plus <- c(compound, paste(compound, adducts, sep = "_"))
+
+      metab <- unlist(lapply(peaklist$HMDB_code, function(x) {
+        (length(intersect(unlist(strsplit(as.vector(x), ";")), compound_plus)) > 0)
+      }))
+
       total <- c()
-      
+
       # get the intensities for selected metabolite.
-      # NB: column numbers in previous version of code are incorrect 
       if (z_score == 1) {
         int_cols_C <- grep("C", colnames(peaklist)[1:which(colnames(peaklist) == "avg.ctrls")])
         int_cols_P <- grep("P", colnames(peaklist)[1:which(colnames(peaklist) == "avg.ctrls")])
         int_cols <- c(int_cols_C, int_cols_P)
         ints <- peaklist[metab, int_cols]
-        # ints <- peaklist[metab, c(15:(length(grpnames.long)+14))]
       } else {
-        ints <- peaklist[metab, c(7:(length(grpnames.long)+6))]
+        ints <- peaklist[metab, c(7:(length(grpnames_long) + 6))]
       }
       total <- apply(ints, 2, sum)
- 
-      if (sum(total)!=0) {
+
+      if (sum(total) != 0) {
         names <- c(names, compound)
-        adductsum <- rbind(adductsum,total)
+        adductsum <- rbind(adductsum, total)
         names_long <- c(names_long, hmdb_names[i])
       }
     }
-    
-    if (!is.null(adductsum)){ 
+
+    if (!is.null(adductsum)) {
       rownames(adductsum) <- names
-      adductsum <- cbind(adductsum, "HMDB_name"=names_long)
-      save(adductsum, file = paste(scanmode, "_", batch_number, "_SummedAdducts.RData", sep=""))
+      adductsum <- cbind(adductsum, "HMDB_name" = names_long)
+      save(adductsum, file = paste(scanmode, "_", batch_number, "_SummedAdducts.RData", sep = ""))
     }
-  }  
+  }
 }
 
-
-sumAdducts(outlist.tot, outlist_part, names(repl_pattern_filtered), adducts, batch_number, scanmode, outdir, z_score)
-
+sum_adducts(outlist_total, outlist_part, names(repl_pattern_filtered), adducts, batch_number, scanmode, outdir, z_score)
diff --git a/DIMS/UnidentifiedCalcZscores.R b/DIMS/UnidentifiedCalcZscores.R
index 525bfc0..68dbfe6 100755
--- a/DIMS/UnidentifiedCalcZscores.R
+++ b/DIMS/UnidentifiedCalcZscores.R
@@ -1,9 +1,8 @@
 #!/usr/bin/Rscript
 ## adapted from 10-collectSamplesFilled.R
 
-# define parameters 
+# define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
-for (arg in cmd_args) cat("  ", arg, "\n")
 
 scripts_dir        <- cmd_args[1]
 ppm     <- as.numeric(cmd_args[2])
@@ -11,57 +10,50 @@ z_score <- as.numeric(cmd_args[3])
 
 source(paste0(scripts_dir, "AddOnFunctions/mergeDuplicatedRows.R"))
 source(paste0(scripts_dir, "AddOnFunctions/statistics_z.R"))
-# source(paste0(scripts_dir, "AddOnFunctions/normalization_2.1.R"))
 
 # for each scan mode, collect all filled peak group lists
 scanmodes <- c("positive", "negative")
 
 for (scanmode in scanmodes) {
   # get list of files
-  # filled_files <- list.files("./", full.names=TRUE, pattern=scanmode)
   filled_file <- paste0("./PeakGroupList_", scanmode, "_Unidentified_filled.RData")
-  print(filled_file)
-  # load file 
-  outlist.tot <- get(load(filled_file))
+  # load file
+  outlist_total <- get(load(filled_file))
 
   # remove duplicates; peak groups with exactly the same m/z
-  outlist.tot <- mergeDuplicatedRows(outlist.tot)
+  outlist_total <- mergeDuplicatedRows(outlist_total)
 
   # sort on mass
-  outlist.tot <- outlist.tot[order(outlist.tot[ ,"mzmed.pgrp"]),]
+  outlist_total <- outlist_total[order(outlist_total[, "mzmed.pgrp"]), ]
 
   # load replication pattern
   pattern_file <- paste0(scanmode, "_repl_pattern.RData")
   repl_pattern <- get(load(pattern_file))
 
-  # Normalization: not done.
-  # if (normalization != "disabled") {
-  #   outlist.tot = normalization_2.1(outlist.tot, fileName, names(repl.pattern.filtered), on=normalization, assi_label="assi_HMDB")
-  # }
-
   if (z_score == 1) {
-    outlist.stats <- statistics_z(outlist.tot, sortCol=NULL, adducts=FALSE)
-    nr.removed.samples <- length(which(repl_pattern[]=="character(0)"))
-    order.index.int <- order(colnames(outlist.stats)[8:(length(repl_pattern)-nr.removed.samples+7)])
-    outlist.stats.more <- cbind(outlist.stats[,1:7],
-                               outlist.stats[,(length(repl_pattern)-nr.removed.samples+8):(length(repl_pattern)-nr.removed.samples+8+6)],
-                               outlist.stats[,8:(length(repl_pattern)-nr.removed.samples+7)][order.index.int],
-                               outlist.stats[,(length(repl_pattern)-nr.removed.samples+5+10):ncol(outlist.stats)])
-  
-    tmp.index <- grep("_Zscore", colnames(outlist.stats.more), fixed = TRUE)
-    tmp.index.order <- order(colnames(outlist.stats.more[,tmp.index]))
-    tmp <- outlist.stats.more[,tmp.index[tmp.index.order]]
-    outlist.stats.more <- outlist.stats.more[,-tmp.index]
-    outlist.stats.more <- cbind(outlist.stats.more,tmp)
-    outlist.tot <- outlist.stats.more
+    # calculate Z-scores
+    outlist_stats <- statistics_z(outlist_total, sortCol = NULL, adducts = FALSE)
+    nr_removed_samples <- length(which(repl_pattern[] == "character(0)"))
+    order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)])
+    outlist_stats_more <- cbind(outlist_stats[, 1:7],
+                                outlist_stats[, (length(repl_pattern) - nr_removed_samples + 8):(length(repl_pattern) - nr_removed_samples + 8 + 6)],
+                                outlist_stats[, 8:(length(repl_pattern) - nr_removed_samples + 7)][order_index_int],
+                                outlist_stats[, (length(repl_pattern) - nr_removed_samples + 5 + 10):ncol(outlist_stats)])
+
+    tmp_index <- grep("_Zscore", colnames(outlist_stats_more), fixed = TRUE)
+    tmp_index_order <- order(colnames(outlist_stats_more[, tmp_index]))
+    tmp <- outlist_stats_more[, tmp_index[tmp_index_order]]
+    outlist_stats_more <- outlist_stats_more[, -tmp_index]
+    outlist_stats_more <- cbind(outlist_stats_more, tmp)
+    outlist_total <- outlist_stats_more
   }
 
-  outlist.not.ident = outlist.tot
+  outlist_not_ident <- outlist_total
 
   # Extra output in Excel-readable format:
   remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp")
-  remove_colindex <- which(colnames(outlist.not.ident) %in% remove_columns)
-  outlist.not.ident <- outlist.not.ident[ , -remove_colindex]
-  write.table(outlist.not.ident, file=paste0("unidentified_outlist_", scanmode, ".txt"), sep="\t", row.names = FALSE)
+  remove_colindex <- which(colnames(outlist_not_ident) %in% remove_columns)
+  outlist_not_ident <- outlist_not_ident[, -remove_colindex]
+  write.table(outlist_not_ident, file = paste0("unidentified_outlist_", scanmode, ".txt"), sep = "\t", row.names = FALSE)
 
 }
diff --git a/DIMS/UnidentifiedCollectPeaks.R b/DIMS/UnidentifiedCollectPeaks.R
index 322a1c1..5f1d5d7 100755
--- a/DIMS/UnidentifiedCollectPeaks.R
+++ b/DIMS/UnidentifiedCollectPeaks.R
@@ -1,102 +1,32 @@
 #!/usr/bin/Rscript
 ## adapted from 7-collectSamplesGroupedHMDB.R
 
-# load required packages 
-# none 
-
-# define parameters 
+# define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
 ppm <- as.numeric(cmd_args[1])
-outdir <- "./" 
+outdir <- "./"
 
 scanmodes <- c("positive", "negative")
 
 for (scanmode in scanmodes) {
   # get list of all files that contain lists of peaks that were used in identified peak grouping
-  files <- list.files("./", pattern = paste(scanmode, "_peaks_used.RData", sep=""))
+  files <- list.files("./", pattern = paste(scanmode, "_peaks_used.RData", sep = ""))
   # load the list of all peaks
-  load(paste0("SpectrumPeaks_", scanmode, ".RData")) #outlist.tot
+  load(paste0("SpectrumPeaks_", scanmode, ".RData"))
 
   # Make a list of indexes of peaks that have been identified, then remove these from the peaklist.
   remove <- NULL
   for (i in 1:length(files)) {
-    load(files[i]) # outlist.grouped, now called list_of_peaks_used_in_peak_groups_identified
-    remove <- c(remove, which(outlist.tot[ ,"mzmed.pkt"] %in% list_of_peaks_used_in_peak_groups_identified[i ,"mzmed.pkt"]))
+    # load list_of_peaks_used_in_peak_groups_identified
+    load(files[i])
+    remove <- c(remove, which(outlist_total[, "mzmed.pkt"] %in% list_of_peaks_used_in_peak_groups_identified[i, "mzmed.pkt"]))
   }
-  outlist.rest <- outlist.tot[-remove, ]
+  outlist_rest <- outlist_total[-remove, ]
 
   # sort on mass
-  outlist <- outlist.rest[order(as.numeric(outlist.rest[ ,"mzmed.pkt"])),]
-
-  save(outlist, file=paste0(outdir, "/SpectrumPeaks_", scanmode, "_Unidentified.RData"))
-
-  # cut the unidentified peak list in parts of part_len length for parallel processing
-  # NB: the while statement below gives an error message. Skip the cutting into parts.
-  # part_len <- 10000
-  # nr_peaks <- dim(outlist)[1]
-  # end <- 0
-  # min_1_last <- part_len
-  # check <- 0
-  # outlist_i_min_1 <- NULL
-  # part_nr <- 0
-
-  # if (nr_peaks >= part_len & (floor(nr_peaks/part_len) - 1) >= 2) {
-  #   for (part_nr in 2:floor(nr_peaks/part_len) -1 ) {
-  #     start <- -(part_len-1) + part_nr*part_len
-  #     end <- part_nr*part_len
-  #     
-  #     if (part_nr > 1) {
-  #       outlist_i <- outlist[c(start:end),]
-  # 
-  #         n_moved <- 0
-       
-  #      # Calculate ppm and replace border, avoid cut within peakgroup!
-  #      while ((as.numeric(outlist_i[1, "mzmed.pkt"]) - as.numeric(outlist_i_min_1[min_1_last, "mzmed.pkt"])) * 1e+06/as.numeric(outlist_i[1, "mzmed.pkt"]) < ppm) {
-  #        outlist_i_min_1 <- rbind(outlist_i_min_1, outlist_i[1,])
-  #        outlist_i <- outlist_i[-1, ]
-  #        n_moved <- n_moved + 1
-  #      }
-      
-  #      # message(paste("Process", i-1,":", dim(outlist_i_min_1)[1]))
-  #      save(outlist_i_min_1, file = paste(outdir, paste(scanmode, paste("outlist_i_min_1", part_nr-1, "RData", sep="."), sep="_"), sep="/"))
-  #      check <- check + dim(outlist_i_min_1)[1]
-      
-  #      outlist_i_min_1 <- outlist_i
-  #      min_1_last <- dim(outlist_i_min_1)[1]
-      
-  #    } else {
-  #      outlist_i_min_1 = outlist[c(start:end), ]
-  #    }
-  #  }
-  #}
-
-  #start <- end + 1
-  #end <- nr_peaks
-  #outlist_i <- outlist[c(start:end), ]
-  #n_moved <- 0
-
-  #if(!is.null(outlist_i_min_1)){
-  #  # Calculate ppm and replace border, avoid cut within peakgroup!
-  #  while ((as.numeric(outlist_i[1,"mzmed.pkt"]) - as.numeric(outlist_i_min_1[min_1_last,"mzmed.pkt"]))*1e+06/as.numeric(outlist_i[1,"mzmed.pkt"]) < ppm) {
-  #    outlist_i_min_1 = rbind(outlist_i_min_1, outlist_i[1,])
-  #    outlist_i = outlist_i[-1,]
-  #    n_moved = n_moved + 1
-  #  }
-  
-  #  cat(paste("Process", i+1-1,":", dim(outlist_i_min_1)[1]))
-  #  save(outlist_i_min_1, file=paste(outdir, paste(scanmode, paste("outlist_i_min_1",i,"RData", sep="."), sep="_"), sep="/"))
-  #  check=check+dim(outlist_i_min_1)[1]
-  #}
-
-  #outlist_i_min_1=outlist_i
-  #cat("Process", i+2-1,":", dim(outlist_i_min_1)[1], "\n")
-  #save(outlist_i_min_1, file=paste(outdir, paste(scanmode, paste("outlist_i_min_1",i+1,"RData", sep="."), sep="_"), sep="/"))
-  #}
+  outlist <- outlist_rest[order(as.numeric(outlist_rest[, "mzmed.pkt"])), ]
+  # save output
+  save(outlist, file = paste0(outdir, "/SpectrumPeaks_", scanmode, "_Unidentified.RData"))
 
-  #check <- check + dim(outlist_i_min_1)[1]
-  #if (check==dim(outlist)[1]){
-  #  cat("Check is oke!\n")
-  #} else {
-  #  cat("Check is failed!\n")
 }
diff --git a/DIMS/UnidentifiedFillMissing.R b/DIMS/UnidentifiedFillMissing.R
index edfc8a7..5416b15 100755
--- a/DIMS/UnidentifiedFillMissing.R
+++ b/DIMS/UnidentifiedFillMissing.R
@@ -3,18 +3,15 @@
 
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
-for (arg in cmd_args) cat("  ", arg, "\n", sep="")
 
-# define parameters 
+# define parameters
 peakgrouplist_file1 <- cmd_args[1]
 peakgrouplist_file2 <- cmd_args[2]
-scripts_dir        <- cmd_args[3]
-thresh  <- as.numeric(cmd_args[4])
-resol   <- as.numeric(cmd_args[5])
-ppm     <- as.numeric(cmd_args[6])
-outdir <- "./"
-
-print(scripts_dir)
+scripts_dir         <- cmd_args[3]
+thresh   <- as.numeric(cmd_args[4])
+resol    <- as.numeric(cmd_args[5])
+ppm      <- as.numeric(cmd_args[6])
+outdir   <- "./"
 
 # load in function scripts
 source(paste0(scripts_dir, "AddOnFunctions/replaceZeros.R"))
@@ -30,8 +27,11 @@ source(paste0(scripts_dir, "AddOnFunctions/globalAssignments.HPC.R"))
 peakgrouplist_files <- c(peakgrouplist_file1, peakgrouplist_file2)
 for (peakgrouplist_file in peakgrouplist_files) {
 
-  if (grepl("_pos", peakgrouplist_file)) { scanmode = "positive" } else
-      if (grepl("_neg", peakgrouplist_file)) { scanmode = "negative" }
+  if (grepl("_pos", peakgrouplist_file)) {
+    scanmode <- "positive"
+  } else if (grepl("_neg", peakgrouplist_file)) {
+    scanmode <- "negative"
+  }
 
   # get replication pattern for sample names
   pattern_file <- paste0(scanmode, "_repl_pattern.RData")
@@ -45,6 +45,6 @@ for (peakgrouplist_file in peakgrouplist_files) {
   # replace missing values (zeros) with random noise
   peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm)
 
-  # save output 
-  save(peakgrouplist_filled, file=paste0("./", outputfile_name))
+  # save output
+  save(peakgrouplist_filled, file = paste0("./", outputfile_name))
 }
diff --git a/DIMS/UnidentifiedPeakGrouping.R b/DIMS/UnidentifiedPeakGrouping.R
index d7a3dee..73ac4c1 100755
--- a/DIMS/UnidentifiedPeakGrouping.R
+++ b/DIMS/UnidentifiedPeakGrouping.R
@@ -1,93 +1,82 @@
 #!/usr/bin/Rscript
 ## adapted from 8-peakGrouping.rest.R
 
-# load required packages 
-# none 
-
-# define parameters 
+# define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-resol <- as.numeric(cmd_args[1])
-ppm <- as.numeric(cmd_args[2])
+resol  <- as.numeric(cmd_args[1])
+ppm    <- as.numeric(cmd_args[2])
 outdir <- "./"
-options(digits=16)
+options(digits = 16)
 
-print(list.files(outdir, pattern="RData"))
+print(list.files(outdir, pattern = "RData"))
 
 scanmodes <- c("positive", "negative")
 
 # function for grouping unidentified peaks
-groupingRest <- function(outdir, unidentified_peaklist, scanmode, ppm) {
-  outlist.copy <- get(load(unidentified_peaklist))
-  # batch = strsplit(unidentified_peaklist, ".",fixed = TRUE)[[1]][2] 
+grouping_rest <- function(outdir, unidentified_peaklist, scanmode, ppm) {
+  outlist_copy <- get(load(unidentified_peaklist))
   load(paste0("./", scanmode, "_repl_pattern.RData"))
-  
   outpgrlist <- NULL
-  
-  # Then group on highest peaks
-  range <- ppm*1e-06
-  startcol <- 7
-  
-  # while (max(as.numeric(outlist.copy[ , "height.pkt"])) > 0 ) {
-  while (dim(outlist.copy)[1] > 0) {
-    
-    sel <- which(as.numeric(outlist.copy[ , "height.pkt"]) == max(as.numeric(outlist.copy[ , "height.pkt"])))[1]
-    
+
+  # group on highest peaks
+  range <- ppm * 1e-06
+
+  while (dim(outlist_copy)[1] > 0) {
+    sel <- which(as.numeric(outlist_copy[, "height.pkt"]) == max(as.numeric(outlist_copy[, "height.pkt"])))[1]
+
     # ppm range around max
-    mzref <- as.numeric(outlist.copy[sel, "mzmed.pkt"])
-    pkmin <- -(range*mzref - mzref)
-    pkmax <- 2*mzref-pkmin
-    
-    selp <- as.numeric(outlist.copy[ , "mzmed.pkt"]) > pkmin & as.numeric(outlist.copy[ , "mzmed.pkt"]) < pkmax
-    tmplist <- outlist.copy[selp,,drop=FALSE]
-    
-    nrsamples <- length(unique(tmplist[,"samplenr"]))
+    mzref <- as.numeric(outlist_copy[sel, "mzmed.pkt"])
+    pkmin <- -(range * mzref - mzref)
+    pkmax <- 2 * mzref - pkmin
+
+    selp <- as.numeric(outlist_copy[, "mzmed.pkt"]) > pkmin & as.numeric(outlist_copy[, "mzmed.pkt"]) < pkmax
+    tmplist <- outlist_copy[selp, , drop = FALSE]
+
+    nrsamples <- length(unique(tmplist[, "samplenr"]))
     if (nrsamples > 0) {
-      
-      mzmed.pgrp <- mean(as.numeric(outlist.copy[selp, "mzmed.pkt"]))
-      mzmin.pgrp <- -(range*mzmed.pgrp - mzmed.pgrp)
-      mzmax.pgrp <- 2*mzmed.pgrp - mzmin.pgrp
-      
-      selp <- as.numeric(outlist.copy[ , "mzmed.pkt"]) > mzmin.pgrp & as.numeric(outlist.copy[ , "mzmed.pkt"]) < mzmax.pgrp
-      tmplist <- outlist.copy[selp,,drop=FALSE]
-      
-      # remove used peaks!!!
-      tmp <- as.vector(which(tmplist[,"height.pkt"]==-1))
-      if (length(tmp)>0) tmplist<-tmplist[-tmp,,drop=FALSE]
-      
-      nrsamples <- length(unique(tmplist[,"samplenr"]))
-      
-      fq.worst.pgrp <- as.numeric(max(outlist.copy[selp, "fq"]))
-      fq.best.pgrp <- as.numeric(min(outlist.copy[selp, "fq"]))
-      ints.allsamps <- rep(0, length(names(repl_pattern_filtered)))
-      names(ints.allsamps) <- names(repl_pattern_filtered) # same order as sample list!!!
-      
-      # Check for each sample if multiple peaks exists, if so take the sum!
-      labels <- unique(tmplist[,"samplenr"])
-      ints.allsamps[labels] <- as.vector(unlist(lapply(labels, function(x) {sum(as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"]))})))
-      
-      outpgrlist <- rbind(outpgrlist, c(mzmed.pgrp, fq.best.pgrp, fq.worst.pgrp, nrsamples, mzmin.pgrp, mzmax.pgrp, ints.allsamps,NA,NA,NA,NA))
+      mzmed_pgrp <- mean(as.numeric(outlist_copy[selp, "mzmed.pkt"]))
+      mzmin_pgrp <- -(range * mzmed_pgrp - mzmed_pgrp)
+      mzmax_pgrp <- 2 * mzmed_pgrp - mzmin_pgrp
+      # select peaks within mz range
+      selp <- as.numeric(outlist_copy[, "mzmed.pkt"]) > mzmin_pgrp & as.numeric(outlist_copy[, "mzmed.pkt"]) < mzmax_pgrp
+      tmplist <- outlist_copy[selp, , drop = FALSE]
+
+      # remove used peaks
+      tmp <- as.vector(which(tmplist[, "height.pkt"] == -1))
+      if (length(tmp) > 0) tmplist <- tmplist[-tmp, , drop = FALSE]
+
+      nrsamples <- length(unique(tmplist[, "samplenr"]))
+      fq_worst_pgrp <- as.numeric(max(outlist_copy[selp, "fq"]))
+      fq_best_pgrp  <- as.numeric(min(outlist_copy[selp, "fq"]))
+      ints_allsamps <- rep(0, length(names(repl_pattern_filtered)))
+      names(ints_allsamps) <- names(repl_pattern_filtered)
+
+      # Check for each sample if multiple peaks exists, if so take the sum
+      labels <- unique(tmplist[, "samplenr"])
+      ints_allsamps[labels] <- as.vector(unlist(lapply(labels, function(x) {
+        sum(as.numeric(tmplist[which(tmplist[, "samplenr"] == x), "height.pkt"]))
+      })))
+
+      outpgrlist <- rbind(outpgrlist, c(mzmed_pgrp, fq_best_pgrp, fq_worst_pgrp, nrsamples, mzmin_pgrp, mzmax_pgrp, ints_allsamps, NA, NA, NA, NA))
     }
-   
-    outlist.copy <- outlist.copy[-which(selp==TRUE),,drop=FALSE]
+
+    outlist_copy <- outlist_copy[-which(selp == TRUE), , drop = FALSE]
   }
-  
-  outpgrlist <- as.data.frame(outpgrlist)  # ignore warnings of duplicate row names
-  colnames(outpgrlist)[1:6] <- c("mzmed.pgrp", "fq.best", "fq.worst", "nrsamples", "mzmin.pgrp", "mzmax.pgrp")
-  colnames(outpgrlist)[(length(repl_pattern_filtered)+7):ncol(outpgrlist)] <- c("assi_HMDB", "iso_HMDB", "HMDB_code", "theormz_HMDB")
- 
-  return(outpgrlist) 
-  
-}
 
+  outpgrlist <- as.data.frame(outpgrlist)
+  colnames(outpgrlist)[1:6] <- c("mzmed_pgrp", "fq.best", "fq.worst", "nrsamples", "mzmin_pgrp", "mzmax_pgrp")
+  colnames(outpgrlist)[(length(repl_pattern_filtered) + 7):ncol(outpgrlist)] <- c("assi_HMDB", "iso_HMDB", "HMDB_code", "theormz_HMDB")
+
+  return(outpgrlist)
+}
 
 for (scanmode in scanmodes) {
-  unidentified_peaklist <- paste0("./SpectrumPeaks_", scanmode, "_Unidentified.RData") 
   # generate peak group lists of the unidentified peaks
-  outpgrlist <- groupingRest(outdir, unidentified_peaklist, scanmode, ppm=ppm)
+  unidentified_peaklist <- paste0("./SpectrumPeaks_", scanmode, "_Unidentified.RData")
+  outpgrlist <- grouping_rest(outdir, unidentified_peaklist, scanmode, ppm = ppm)
+  write.table(outpgrlist, file = paste0("PeakGroupList_", scanmode, "_Unidentified.txt"))
 
-  # save output in RData format for further processing  
+  # save output in RData format for further processing
   save(outpgrlist, file=paste0("PeakGroupList_", scanmode, "_Unidentified.RData"))
-  write.table(outpgrlist, file=paste0("PeakGroupList_", scanmode, "_Unidentified.txt"))
 }
-

From 2996892753102eed865636d031799c88394ce130 Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Wed, 21 Feb 2024 08:32:15 +0100
Subject: [PATCH 26/73] convert qc_value to float

---
 CheckQC/check_qc.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py
index db94a4b..d9b5609 100644
--- a/CheckQC/check_qc.py
+++ b/CheckQC/check_qc.py
@@ -162,6 +162,12 @@ def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols):
                 .loc[:, qc_metric_out.columns]
             )
         ])
+    # Try to convert column qc_value to float.
+    # If ValueError is raised, probably because column is a string, continue.    
+    try:
+        qc_metric_out["qc_value"] = qc_metric_out["qc_value"].astype("float")
+    except ValueError:
+        pass
     # In case 'multiple sample qc check',
     # output could contain duplicate rows for individual samples used in multiple comparisons.
     return qc_metric_out.sort_values(by=["qc_check", "qc_status"]).drop_duplicates(keep="first")

From 6177ce92b3bc5b7515b639a68812a43f98b5f127 Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Wed, 21 Feb 2024 10:12:46 +0100
Subject: [PATCH 27/73] change input to qc settings metrics

---
 CheckQC/check_qc.py      |  6 +++---
 CheckQC/test_check_qc.py | 22 +++++++++++-----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py
index d9b5609..a0b0e31 100644
--- a/CheckQC/check_qc.py
+++ b/CheckQC/check_qc.py
@@ -64,9 +64,9 @@ def check_allowed_operators(qc_operator):
         raise ValueError(f"Unsupported operator provided: {qc_operator}. Please select from: {operators}")
 
 
-def check_required_keys_metrics(qc_settings):
+def check_required_keys_metrics(qc_metrics):
     for req_key in ["filename", "qc_col", "threshold", "operator", "report_cols"]:
-        if any([req_key not in setting.keys() for setting in qc_settings["metrics"]]):
+        if any([req_key not in setting.keys() for setting in qc_metrics]):
             raise KeyError(f"Required key {req_key} not in all metrics settings.")
 
 
@@ -219,7 +219,7 @@ def read_and_judge_metrics(qc, metrics):
 def check_qc(input_files, settings, output_path, output_prefix):
     # A single qc metric file can be used multiple times, by defining a metric section for each check in the qc settings.
     qc_settings = read_yaml(settings)
-    check_required_keys_metrics(qc_settings)
+    check_required_keys_metrics(qc_settings["metrics"])
     duplicated_sample_file = []
     for qc_metric_settings in qc_settings["metrics"]:
         check_allowed_operators(qc_metric_settings["operator"])
diff --git a/CheckQC/test_check_qc.py b/CheckQC/test_check_qc.py
index d6ef6ae..1a54060 100644
--- a/CheckQC/test_check_qc.py
+++ b/CheckQC/test_check_qc.py
@@ -66,29 +66,29 @@ def test_not_existing_operator(self):
 
 class TestCheckRequiredKeysMetrics():
     def test_required_keys_present(self):
-        qc_settings = {"metrics": [
+        qc_metrics = [
             {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake",  "report_cols": "fake"},
-        ]}
-        check_qc.check_required_keys_metrics(qc_settings)
+        ]
+        check_qc.check_required_keys_metrics(qc_metrics)
         assert True
 
     @pytest.mark.parametrize(
-        "incomplete_qc_settings",
+        "incomplete_qc_metrics",
         [
-            {"metrics": [{"filename": "fakename"}]},
-            {"metrics": [
+            [{"filename": "fakename"}],
+            [
                 {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake",  "report_cols": "fake"},
                 {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"},  # missing report_cols
-            ]},
-            {"metrics": [
+            ],
+            [
                 {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"},  # missing report_cols
                 {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"},  # missing report_cols
-            ]}
+            ]
         ]
     )
-    def test_missing_keys(self, incomplete_qc_settings):
+    def test_missing_keys(self, incomplete_qc_metrics):
         with pytest.raises(KeyError) as required_error:
-            check_qc.check_required_keys_metrics(incomplete_qc_settings)
+            check_qc.check_required_keys_metrics(incomplete_qc_metrics)
         error_val = str(required_error.value)
         assert "not in all metrics settings." in error_val
         assert error_val.split(" ")[2] in ["filename", "qc_col", "threshold", "operator", "report_cols"]

From e162db029d6baa9e99082352639ba223d24c2cee Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Wed, 21 Feb 2024 10:12:57 +0100
Subject: [PATCH 28/73] Add docstrings checkqc

---
 CheckQC/check_qc.py | 155 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 155 insertions(+)

diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py
index a0b0e31..37ea919 100644
--- a/CheckQC/check_qc.py
+++ b/CheckQC/check_qc.py
@@ -14,6 +14,18 @@
 
 
 def non_empty_existing_path(file_or_dir):
+    """This function checks whether the provided file or dir exists and is not empty.
+
+    Args:
+        file_or_dir (string): Input file or directory
+
+    Raises:
+        FileNotFoundError: If input string file_or_dir is neither a file nor a dir.
+        OSError: If input is not a dir and file is empty.
+
+    Returns:
+        string: Provided input file or directory. If dir, suffix '/' might be added.
+    """    
     input_path = Path(file_or_dir)
     if not input_path.is_file() and not input_path.is_dir():
         raise FileNotFoundError(errno_ENOENT, os_strerror(errno_ENOENT), file_or_dir)
@@ -25,6 +37,14 @@ def non_empty_existing_path(file_or_dir):
 
 
 def parse_arguments_and_check(args_in):
+    """Parses arguments and validates / checks format of input.
+
+    Args:
+        args_in (list of strings): Commandline input arguments.
+
+    Returns:
+        Namespace: Convert argument strings to objects and assign them as attributes of the namespace.
+    """    
     parser = argparse.ArgumentParser(
         description="Check and summarize sample quality using qc metrics and their thresholds."
     )
@@ -52,6 +72,17 @@ def parse_arguments_and_check(args_in):
 
 
 def read_yaml(yaml_file):
+    """Read input yaml
+
+    Args:
+        yaml_file (string): String with path to yaml file
+
+    Raises:
+        ValueError: If reading the file returns None (not recognized as YAML format).
+
+    Returns:
+        Object: Content of the YAML file.
+    """    
     yaml_loaded = yaml.safe_load(open(yaml_file))
     if not yaml_loaded:
         raise ValueError("Could not load YAML.")
@@ -59,18 +90,43 @@ def read_yaml(yaml_file):
 
 
 def check_allowed_operators(qc_operator):
+    """Check if provided qc_operator is allowed.
+
+    Args:
+        qc_operator (string): (Custom / math) operator
+
+    Raises:
+        ValueError: If provided qc_operator is invalid / unsupported.
+    """    
     operators = ["<", "<=", ">", ">=", "==", "!=", "match"]
     if qc_operator not in operators:
         raise ValueError(f"Unsupported operator provided: {qc_operator}. Please select from: {operators}")
 
 
 def check_required_keys_metrics(qc_metrics):
+    """Check if all required settings are included in the qc_settings
+
+    Args:
+        qc_metrics (list with dicts): qc settings per qc metric
+
+    Raises:
+        KeyError: Required key is not provided for the qc metric settings
+    """    
     for req_key in ["filename", "qc_col", "threshold", "operator", "report_cols"]:
         if any([req_key not in setting.keys() for setting in qc_metrics]):
             raise KeyError(f"Required key {req_key} not in all metrics settings.")
 
 
 def select_metrics(filename, input_files):
+    """Using regular expression to match the qc metric filename with the input files
+
+    Args:
+        filename (string): Filename of qc metric, could be regex.
+        input_files (list): All qc metrics input files.
+
+    Returns:
+        list: Input files matching the given filename.
+    """    
     metrics = list(filter(re.compile(f".*{filename}").match, input_files))
     if not metrics:
         warnings.warn(UserWarning(f"No input file provided with filename pattern {filename}"))
@@ -79,6 +135,20 @@ def select_metrics(filename, input_files):
 
 
 def get_columns_to_report(qc_report_cols, qc_metric_cols, qc_col):
+    """Get valid columns to include in final output report
+
+    Args:
+        qc_report_cols (list, string): column name(s) to include in report.
+        qc_metric_cols (list, string): column name(s) in qc metric.
+        qc_col (string): column name that contains the qc value/score.
+
+    Raises:
+        TypeError: qc_report_cols is neither a string nor list.
+        ValueError: Provided column names in qc_report_cols does not exist in the qc metric.
+
+    Returns:
+        list of strings: Valid column names to include in report.
+    """    
     not_existing_cols = list(set(qc_report_cols) - set(qc_metric_cols))
     if qc_report_cols == "@all":
         qc_report_cols = qc_metric_cols
@@ -94,6 +164,18 @@ def get_columns_to_report(qc_report_cols, qc_metric_cols, qc_col):
 
 
 def add_and_rename_columns(qc_metric, qc_title, qc_col, qc_operator, qc_threshold):
+    """Add and rename columns in qc_metric.
+
+    Args:
+        qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values
+        qc_title (string): Title of the qc check
+        qc_col (string): qc column name with qc value/score
+        qc_operator (string): (Custom / math) operator
+        qc_threshold (string, int or float): qc threshold
+
+    Returns:
+        Pandas DataFrame: DataFrame with qc metric.
+    """
     qc_metric_assigned = qc_metric.assign(
         qc_title=qc_title.lower(),
         qc_status="PASS",
@@ -105,6 +187,20 @@ def add_and_rename_columns(qc_metric, qc_title, qc_col, qc_operator, qc_threshol
 
 
 def get_failed_rows(qc_metric, qc_col, qc_operator, qc_threshold):
+    """Get rows that fail provided qc threshold 
+
+    Args:
+        qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values
+        qc_col (string): column name that contains the qc value/score.
+        qc_operator (string): (Custom / math) operator
+        qc_threshold (string, int or float): qc threshold
+
+    Raises:
+        TypeError: If qc threshold is neither 'match', str, int or float.
+
+    Returns:
+        object: The indexes of failed rows in qc_metric
+    """    
     # Select failed rows using qc_threshold regex pattern and qc_operator 'match'
     if qc_operator == "match" and isinstance(qc_threshold, str):
         return qc_metric[qc_col].str.match(qc_threshold)
@@ -121,6 +217,18 @@ def get_failed_rows(qc_metric, qc_col, qc_operator, qc_threshold):
 
 
 def add_failed_samples_metric(qc_metric, failed_rows, report_cols, sample_cols):
+    """Failed samples are added to the output metric, and removed from qc_metric.
+
+    Args:
+        qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values
+        failed_rows (object): Object with indexes of failed rows in qc_metric
+        report_cols (list): Valid column names (strings) to include in report.
+        sample_cols (list): Columnames (strings) of sample names.
+
+    Returns:
+        qc_metric (DataFrame): DataFrame of qc metric without failed rows
+        qc_metric_out (DataFrame): DataFrame of qc metric to report with failed rows
+    """    
     qc_metric_out = DataFrame(columns=["sample", "qc_check", "qc_status", "qc_msg", "qc_value"])
     failed_samples = []
     if failed_rows.to_list():
@@ -152,6 +260,16 @@ def add_failed_samples_metric(qc_metric, failed_rows, report_cols, sample_cols):
 
 
 def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols):
+    """Passed samples are added to the output metric
+
+    Args:
+        qc_metric (DataFrame): DataFrame of qc metric without failed rows
+        qc_metric_out (DataFrame): DataFrame of qc metric to report with failed rows
+        sample_cols (list): Columnames (strings) of sample names.
+
+    Returns:
+        pandas DataFrame: Sorted qc metric for both passed and failed samples, without duplicates.
+    """    
     # Add passed samples to output
     for sample_col in sample_cols:
         qc_metric_out = concat([
@@ -174,6 +292,13 @@ def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols):
 
 
 def create_and_write_output(qc_output, output_path, output_prefix):
+    """Joined qc metrics is created and written to output file. 
+
+    Args:
+        qc_output (pandas DataFrame): Sorted judged qc metric for both passed and failed samples, without duplicates.
+        output_path (string): Relative or absolute path where output should be saved. 
+        output_prefix (string): Output prefix for output file.
+    """    
     # Add qc_summary
     qc_output.insert(1, "qc_summary", "PASS")
     qc_output.loc[qc_output.isin(["FAIL"]).any(axis=1), "qc_summary"] = "FAIL"
@@ -182,6 +307,15 @@ def create_and_write_output(qc_output, output_path, output_prefix):
 
 
 def read_and_judge_metrics(qc, metrics):
+    """Read and judge each single qc metric and join results. 
+
+    Args:
+        qc (dict): qc settings of the metric
+        metrics (list): List of input files specific for single qc metric
+
+    Returns:
+        pandas DataFrame: Joined and judged qc metrics.
+    """    
     for qc_file in metrics:
         qc_metric_raw = read_csv(qc_file, comment=qc.get("comment", None), delimiter=qc.get("delim", "\t"), quotechar='"')
         report_cols = get_columns_to_report(qc["report_cols"], qc_metric_raw.columns.to_list(), qc["qc_col"])
@@ -217,6 +351,27 @@ def read_and_judge_metrics(qc, metrics):
 
 
 def check_qc(input_files, settings, output_path, output_prefix):
+    """
+    Main function to judge input files on configured qc settings. 
+    It creates a single results table, each row representing 
+        sample (string): sample name
+        qc_summary: Summarized status of all qcs for sample (pass or fail)
+        qc columns (5 per each qc metric);
+            qc_check (string): QC check consiting of qc title, operator and threshold
+            qc_status (string): Status of performed qc check (pass or fail)
+            qc_msg (string): String with human readable message if sample failed qc check, empty if passed.  
+            qc_value (string, int, float): qc value/score to check.
+
+    Args:
+        input_files (list): All qc metrics input files.
+        settings (string): Path to yaml file
+        output_path (string): Relative or absolute path where output should be saved. 
+        output_prefix (string): Output prefix for output file.
+
+    Raises:
+        ValueError: No input files found to match any qc metric patterns defined in settings.
+        ValueError: Duplicated samples with different values found in some of the input files. 
+    """    
     # A single qc metric file can be used multiple times, by defining a metric section for each check in the qc settings.
     qc_settings = read_yaml(settings)
     check_required_keys_metrics(qc_settings["metrics"])

From e5e3acb535283131496bf5c068fc35797b501fa8 Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Wed, 21 Feb 2024 10:13:38 +0100
Subject: [PATCH 29/73] Split awk and rename output

---
 Utils/EditSummaryFileHappy.nf | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/Utils/EditSummaryFileHappy.nf b/Utils/EditSummaryFileHappy.nf
index 83ef755..8050c46 100644
--- a/Utils/EditSummaryFileHappy.nf
+++ b/Utils/EditSummaryFileHappy.nf
@@ -8,10 +8,10 @@ process EditSummaryFileHappy {
         tuple(val(meta), path(summary_csv))
 
     output:
-        path("INDEL_PASS_summary.csv"), emit: indel_pass_summary_csv
-        path("INDEL_ALL_summary.csv"), emit: indel_all_summary_csv
-        path("SNP_PASS_summary.csv"), emit: snp_pass_summary_csv
-        path("SNP_ALL_summary.csv"), emit: snp_all_summary_csv
+        path("${meta.truth}_${meta.query}_INDEL_PASS.csv"), emit: indel_pass_csv
+        path("${meta.truth}_${meta.query}_INDEL_ALL.csv"), emit: indel_all_csv
+        path("${meta.truth}_${meta.query}_SNP_PASS.csv"), emit: snp_pass_csv
+        path("${meta.truth}_${meta.query}_SNP_ALL.csv"), emit: snp_all_csv
 
     script:
         """
@@ -19,7 +19,10 @@ process EditSummaryFileHappy {
         sed '1s/^/samples,sample_truth,sample_query,/; 2,\$s/^/${meta.truth}_${meta.query},${meta.truth},${meta.query},/' ${summary_csv} > ${summary_csv}.tmp
         
         # Split file including header (first line)
-        awk -F',' 'FNR==1{hdr=\$0;next} {print hdr>\$4"_"\$5"_summary.csv"; print \$0>>\$4"_"\$5"_summary.csv"}' ${summary_csv}.tmp
+        awk -F',' 'FNR==1{hdr=\$0;next} {
+            print hdr>"${meta.truth}_${meta.query}_"\$4"_"\$5".csv";
+            print \$0>>"${meta.truth}_${meta.query}_"\$4"_"\$5".csv"
+        }' ${summary_csv}.tmp
         
         # Remove tmp files
         rm ${summary_csv}.tmp     

From 7d03db3cb48fb0111a7c3e1a81d24f25ee047d6b Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Wed, 21 Feb 2024 11:51:48 +0100
Subject: [PATCH 30/73] Only add .* if string is provided

---
 CheckQC/check_qc.py      |  7 +++++--
 CheckQC/test_check_qc.py | 43 +++++++++++++++++++++++++++++++++++-----
 2 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py
index 37ea919..e6f2cc1 100644
--- a/CheckQC/check_qc.py
+++ b/CheckQC/check_qc.py
@@ -126,8 +126,11 @@ def select_metrics(filename, input_files):
 
     Returns:
         list: Input files matching the given filename.
-    """    
-    metrics = list(filter(re.compile(f".*{filename}").match, input_files))
+    """
+    # If filename is string, change into regex to match absolute and relative paths in input_files.
+    if filename.isalpha():
+        filename=".*" + filename
+    metrics = list(filter(re.compile(f"{filename}").match, input_files))
     if not metrics:
         warnings.warn(UserWarning(f"No input file provided with filename pattern {filename}"))
         return None
diff --git a/CheckQC/test_check_qc.py b/CheckQC/test_check_qc.py
index 1a54060..a757fac 100644
--- a/CheckQC/test_check_qc.py
+++ b/CheckQC/test_check_qc.py
@@ -95,12 +95,45 @@ def test_missing_keys(self, incomplete_qc_metrics):
 
 
 class TestSelectMetrics():
-    @pytest.mark.parametrize("input_files,expected", [
-        (["test1.txt", "test2.txt"], ["test1.txt", "test2.txt"]),  # multi match
-        (["test1.txt", "fake2.txt"],  ["test1.txt"]),  # single match
+    @pytest.mark.parametrize("filename_or_regex,input_files,expected", [
+        # multi match
+        ("test", ["test1.txt", "test2.txt"], ["test1.txt", "test2.txt"]),
+        # single match
+        ("test", ["test1.txt", "fake2.txt"],  ["test1.txt"]),
+        # # match with relative path
+        ("test", ["./random/path/to/test1.txt"],  ["./random/path/to/test1.txt"]),
+        # match with absolute path
+        ("test", ["/random/path/to/test1.txt"],  ["/random/path/to/test1.txt"]),
+        # match regex: kinship file suffix
+        (
+            ".*.kinship_check.out$",
+            ["240101_A00295_0001_AHWCFKDSX7_CREv4_1.kinship_check.out"],
+            ["240101_A00295_0001_AHWCFKDSX7_CREv4_1.kinship_check.out"]
+        ),
+        # match on word truth and SNP
+        (
+            ".*truth.*SNP",
+            ['2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv', '12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'],
+            ['2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv']
+        ),
+        # match on word truth and SNP
+        (
+            ".*truth.*SNP",
+            ['U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv', 'U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'],
+            ['U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv']
+        ),
+        # match if 'truth' is absent and contains 'SNP'
+        # ?: Match expression but do not capture it
+        # ?! Match if 'truth' is absent.
+        (
+            "(?:(?!truth).)*SNP.*$",
+            ['2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv', '12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'],
+            ['12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv']
+        ),
+
     ])
-    def test_select_metric(self, input_files, expected):
-        metrics = check_qc.select_metrics("test", input_files)
+    def test_select_metric(self, filename_or_regex, input_files, expected):
+        metrics = check_qc.select_metrics(filename_or_regex, input_files)
         assert metrics == expected
 
     def test_no_match(self):

From 277d756e733d5ffabd950be60c8b535327f5618a Mon Sep 17 00:00:00 2001
From: mraves2 <M.L.Raves-2@umcutrecht.nl>
Date: Thu, 22 Feb 2024 10:05:29 +0100
Subject: [PATCH 31/73] fixed output for TIC plots

---
 DIMS/AverageTechReplicates.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R
index 4c3d49b..f3117d0 100644
--- a/DIMS/AverageTechReplicates.R
+++ b/DIMS/AverageTechReplicates.R
@@ -169,5 +169,5 @@ tic_plot_pdf <- marrangeGrob(
   ))
 )
 # save to file
-ggsave(filename = paste0("./../../../Bioinformatics/", run_name, "_TICplots.pdf"),
+ggsave(filename = paste0("./", run_name, "_TICplots.pdf"),
        tic_plot_pdf, width = 21, height = 29.7, units = "cm")

From a38c3131c50875b8e245d0efdd0ecb5ea0949c44 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Mon, 26 Feb 2024 11:11:32 +0100
Subject: [PATCH 32/73] fixed code after cleaning with lintr

---
 DIMS/AverageTechReplicates.R  | 19 ++++++++++---------
 DIMS/AverageTechReplicates.nf |  5 +++--
 DIMS/CollectFilled.R          | 12 ++++++------
 DIMS/CollectFilled.nf         |  2 +-
 DIMS/FillMissing.R            |  1 -
 DIMS/FillMissing.nf           |  2 +-
 DIMS/GenerateBreaks.R         | 16 ++++++++--------
 DIMS/GenerateBreaks.nf        |  4 ++--
 DIMS/MakeInit.R               | 14 +++++++-------
 DIMS/PeakGrouping.R           |  9 +++++----
 DIMS/PeakGrouping.nf          |  4 ++--
 DIMS/SpectrumPeakFinding.R    |  4 ++--
 DIMS/SumAdducts.R             |  3 ++-
 DIMS/SumAdducts.nf            |  4 ++--
 14 files changed, 51 insertions(+), 48 deletions(-)

diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R
index 4c3d49b..d470d7d 100644
--- a/DIMS/AverageTechReplicates.R
+++ b/DIMS/AverageTechReplicates.R
@@ -4,13 +4,14 @@
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-init_filepath <- cmd_args[1]
-nr_replicates <- as.numeric(cmd_args[2])
-thresh2remove <- 1000000000
-dims_thresh    <- 100
-run_name      <- cmd_args[3]
-dims_matrix   <- cmd_args[4]
-highest_mz    <- as.numeric(cmd_args[5])
+init_filepath   <- cmd_args[1]
+nr_replicates   <- as.numeric(cmd_args[2])
+thresh2remove   <- 1000000000
+dims_thresh     <- 100
+run_name        <- cmd_args[3]
+dims_matrix     <- cmd_args[4]
+highest_mz_file <- cmd_args[5]
+highest_mz <- get(load(highest_mz_file))
 
 # lower the threshold below which a sample will be removed for DBS and for high m/z
 if (dims_matrix == "DBS") {
@@ -147,7 +148,7 @@ for (sample_nr in c(1:length(repl_pattern))) {
     tic_plot <- ggplot(repl1_nr, aes(retention_time, tic_intensity)) +
       geom_line(linewidth = 0.3) +
       geom_hline(yintercept = highest_tic_max, col = "grey", linetype = 2, linewidth = 0.3) +
-      labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[j], "  ||  ", sample_name)) +
+      labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[file_name], "  ||  ", sample_name)) +
       theme(plot.background = element_rect(fill = plot_color),
             axis.text = element_text(size = 4),
             axis.title = element_text(size = 4),
@@ -169,5 +170,5 @@ tic_plot_pdf <- marrangeGrob(
   ))
 )
 # save to file
-ggsave(filename = paste0("./../../../Bioinformatics/", run_name, "_TICplots.pdf"),
+ggsave(filename = paste0("./", run_name, "_TICplots.pdf"),
        tic_plot_pdf, width = 21, height = 29.7, units = "cm")
diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf
index 8775976..264d2f7 100644
--- a/DIMS/AverageTechReplicates.nf
+++ b/DIMS/AverageTechReplicates.nf
@@ -8,9 +8,10 @@ process AverageTechReplicates {
        path(RData_file)     // input files need to be linked, but called within R script
        path(TIC_txt_files)  // input files need to be linked, but called within R script
        path(init_filepath)
+       val(nr_replicates)
        val(analysis_id)
        val(matrix)
-       val(highest_mz)
+       path(highest_mz)
 
     output:
        path('*_repl_pattern.RData'), emit: patterns
@@ -20,7 +21,7 @@ process AverageTechReplicates {
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_filepath $params.nr_replicates $analysis_id $matrix
+        Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_filepath $params.nr_replicates $analysis_id $matrix $highest_mz
         """
 }
 
diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R
index 9e97db2..ae18ad8 100755
--- a/DIMS/CollectFilled.R
+++ b/DIMS/CollectFilled.R
@@ -16,11 +16,11 @@ scanmodes <- c("positive", "negative")
 
 for (scanmode in scanmodes) {
   # get list of files
-  filled_files <- list.files("./", full.names = TRUE, pattern = scanmode)
+  filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, "_identified_filled"))
   # load files and combine into one object
   outlist_total <- NULL
   for (file_nr in 1:length(filled_files)) {
-    load(filled_files[file_nr])
+    peakgrouplist_filled <- get(load(filled_files[file_nr]))
     outlist_total <- rbind(outlist_total, peakgrouplist_filled)
   }
 
@@ -45,10 +45,10 @@ for (scanmode in scanmodes) {
       outlist_stats[, (length(repl_pattern) - nr_removed_samples + 5 + 10):ncol(outlist_stats)]
     )
 
-    tmp_index <- grep("_Zscore", colnames(outlist_stats.more), fixed = TRUE)
-    tmp_index_order <- order(colnames(outlist_stats.more[, tmp_index]))
-    tmp <- outlist_stats.more[, tmp_index[tmp_index_order]]
-    outlist_stats_more <- outlist_stats.more[, -tmp_index]
+    tmp_index <- grep("_Zscore", colnames(outlist_stats_more), fixed = TRUE)
+    tmp_index_order <- order(colnames(outlist_stats_more[, tmp_index]))
+    tmp <- outlist_stats_more[, tmp_index[tmp_index_order]]
+    outlist_stats_more <- outlist_stats_more[, -tmp_index]
     outlist_stats_more <- cbind(outlist_stats_more, tmp)
     outlist_total <- outlist_stats_more
   }
diff --git a/DIMS/CollectFilled.nf b/DIMS/CollectFilled.nf
index a7819f4..4261410 100644
--- a/DIMS/CollectFilled.nf
+++ b/DIMS/CollectFilled.nf
@@ -6,7 +6,7 @@ process CollectFilled {
 
     input:
        path(filled_files)
-       path(replication_pattern)  // input files need to be linked, but called within R script
+       each path(replication_pattern)  // input files need to be linked, but called within R script
 
     output:
        path('outlist*.txt')
diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R
index 0364ad2..1c1540c 100755
--- a/DIMS/FillMissing.R
+++ b/DIMS/FillMissing.R
@@ -3,7 +3,6 @@
 
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
-for (arg in cmd_args) cat("  ", arg, "\n", sep = "")
 
 # define parameters
 peakgrouplist_file <- cmd_args[1]
diff --git a/DIMS/FillMissing.nf b/DIMS/FillMissing.nf
index 3a1f61b..4f71b77 100644
--- a/DIMS/FillMissing.nf
+++ b/DIMS/FillMissing.nf
@@ -6,7 +6,7 @@ process FillMissing {
 
     input:
        path(GroupedList_file)
-       path(replication_pattern) // input files need to be linked, but called within R script
+       each path(replication_pattern) // input files need to be linked, but called within R script
 
     output:
        path('*_filled.RData')
diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R
index fd60723..0de2426 100644
--- a/DIMS/GenerateBreaks.R
+++ b/DIMS/GenerateBreaks.R
@@ -8,17 +8,17 @@ suppressPackageStartupMessages(library("xcms"))
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-filepath <- cmd_args[1] # 1 of the mzML files
+filepath <- cmd_args[1]
 outdir   <- cmd_args[2]
-trim     <- as.numeric(cmd_args[3]) # 0.1
-resol    <- as.numeric(cmd_args[4]) # 140000
+trim     <- as.numeric(cmd_args[3])
+resol    <- as.numeric(cmd_args[4])
 
 # initialize
 trim_left        <- NULL
 trim_right       <- NULL
-breaks_fwhm     <- NULL
-breaks_fwhm_avg <- NULL
-bins            <- NULL
+breaks_fwhm      <- NULL
+breaks_fwhm_avg  <- NULL
+bins             <- NULL
 
 # read in mzML file
 raw_data <- suppressMessages(xcmsRaw(filepath))
@@ -39,7 +39,7 @@ segment  <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1)
 for (i in 1:nr_segments) {
   start_segment   <- segment[i]
   end_segment     <- segment[i+1]
-  resol_mz        <- resol*(1 / sqrt(2) ^ (log2(start_segment / 200)))
+  resol_mz        <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200)))
   fwhm_segment    <- start_segment / resol_mz
   breaks_fwhm     <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment))
   # average the m/z instead of start value
@@ -50,4 +50,4 @@ for (i in 1:nr_segments) {
 
 # generate output file
 save(breaks_fwhm, breaks_fwhm_avg, trim_left, trim_right, file = "./breaks.fwhm.RData")
-write(high_mz, file = "./highest_mz.txt")
+save(high_mz, file = "./highest_mz.RData")
diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf
index 8f37438..4e855f2 100644
--- a/DIMS/GenerateBreaks.nf
+++ b/DIMS/GenerateBreaks.nf
@@ -9,8 +9,8 @@ process GenerateBreaks {
 
 
     output:
-       path('breaks.fwhm.RData')
-       path('highest_mz.txt'), emit: highest_mz
+       path('breaks.fwhm.RData'), emit: breaks
+       path('highest_mz.RData'), emit: highest_mz
 
     script:
         """
diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R
index 33794c5..7d8dd1d 100644
--- a/DIMS/MakeInit.R
+++ b/DIMS/MakeInit.R
@@ -5,18 +5,18 @@ args          <- commandArgs(trailingOnly = TRUE)
 sample_sheet  <- read.csv(args[1], sep = "\t")
 nr_replicates <- as.numeric(args[2])
 
-sample_names  <- trimws(as.vector(unlist(sample_sheet[1])))
-nr_sampgrps   <- length(sampleNames) / nr_replicates
-group_names    <- trimws(as.vector(unlist(sample_sheet[2])))
-group_names    <- gsub("[^-.[:alnum:]]", "_", group_names)
+sample_names       <- trimws(as.vector(unlist(sample_sheet[1])))
+nr_sample_groups   <- length(sample_names) / nr_replicates
+group_names        <- trimws(as.vector(unlist(sample_sheet[2])))
+group_names        <- gsub("[^-.[:alnum:]]", "_", group_names)
 group_names_unique <- unique(group_names)
 
 repl_pattern <- c()
-for (sampgrp in 1:nr_sampgrps) {
+for (sample_group in 1:nr_sample_groups) {
   tmp <- c()
   for (repl in nr_replicates:1) {
-    index <- ((sampgrp * nr_replicates) - repl) + 1
-    tmp <- c(tmp, sampleNames[index])
+    index <- ((sample_group * nr_replicates) - repl) + 1
+    tmp <- c(tmp, sample_names[index])
   }
   repl_pattern <- c(repl_pattern, list(tmp))
 }
diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R
index d04b6a2..b574206 100644
--- a/DIMS/PeakGrouping.R
+++ b/DIMS/PeakGrouping.R
@@ -25,8 +25,8 @@ batch_number <- strsplit(basename(hmdb_part_file), ".", fixed = TRUE)[[1]][2]
 # load file with spectrum peaks
 spec_peaks_file <- paste0("SpectrumPeaks_", scanmode, ".RData")
 load(spec_peaks_file)
-outlist_copy <- outlist_tot
-rm(outlist_tot)
+outlist_copy <- outlist_total
+rm(outlist_total)
 
 # load replication pattern
 pattern_file <- paste0(scanmode, "_repl_pattern.RData")
@@ -169,9 +169,10 @@ while (dim(hmdb_add_iso)[1] > 0) {
 
     # combine all information
     peakgrouplist_identified <- rbind(peakgrouplist_identified, cbind(
-      data.frame(mzmed_pgrp, "fq.best" = fq_best_pgrp, "fq.worst" = fq_worst_pgrp, nrsamples, mzmin_pgrp, mzmax_pgrp),
+      data.frame("mzmed.pgrp" = mzmed_pgrp, "fq.best" = fq_best_pgrp, "fq.worst" = fq_worst_pgrp, nrsamples, 
+		 "mzmin.pgrp" = mzmin_pgrp, "mzmax.pgrp" = mzmax_pgrp),
       t(as.matrix(ints_allsamps)),
-      data.frame(assi_hmdb, iso_hmdb, hmdb_code, theormz_hmdb)
+      data.frame("assi_HMDB" = assi_hmdb, "iso_HMDB" = iso_hmdb, "HMDB_code" = hmdb_code, "theormz_HMDB" = theormz_hmdb)
     ))
   }
 
diff --git a/DIMS/PeakGrouping.nf b/DIMS/PeakGrouping.nf
index 8da1e76..5c48cde 100644
--- a/DIMS/PeakGrouping.nf
+++ b/DIMS/PeakGrouping.nf
@@ -6,8 +6,8 @@ process PeakGrouping {
 
     input:
        path(HMDBpart_file)
-       path(SpectrumPeak_file) // input files need to be linked, but called within R script
-       path(pattern_file)
+       each path(SpectrumPeak_file) // input files need to be linked, but called within R script
+       each path(pattern_file)      // Execute the process for each element in the input collection (HMDBpart_file)
 
     output:
        path '*_peaks_used.RData', emit: peaks_used
diff --git a/DIMS/SpectrumPeakFinding.R b/DIMS/SpectrumPeakFinding.R
index a44ea06..5c77b12 100644
--- a/DIMS/SpectrumPeakFinding.R
+++ b/DIMS/SpectrumPeakFinding.R
@@ -10,13 +10,13 @@ not_run <- NULL
 # collect spectrum peaks for each scanmode
 for (scanmode in scanmodes) {
   # load peak lists of all biological samples
-  peaklist_files <- list.files("/.", full.names = TRUE, pattern = paste0("*_", scanmode, ".RData"))
+  peaklist_files <- list.files(pattern = paste0("_", scanmode, ".RData"))
 
   # get sample names
   load(paste0("./", scanmode, "_repl_pattern", ".RData"))
   group_names <- names(repl_pattern_filtered)
   for (sample_nr in 1:length(group_names)) {
-    group <- paste0(input_dir, "/", paste0(paste(group_names[sample_nr], scanmode, sep = "_"), ".RData"))
+    group <- paste0("./", group_names[sample_nr], "_", scanmode, ".RData")
     if (!(group %in% peaklist_files)) {
       not_run <- c(not_run, group)
     }
diff --git a/DIMS/SumAdducts.R b/DIMS/SumAdducts.R
index b4bd332..e9fa189 100755
--- a/DIMS/SumAdducts.R
+++ b/DIMS/SumAdducts.R
@@ -26,7 +26,8 @@ repl_file <- paste0(scanmode, "_repl_pattern.RData")
 load(repl_file)
 outlist_part <- get(load(hmdbpart_main_file))
 
-batch_number <- strsplit(basename(hmdbpart_main_file), ".", fixed = TRUE)[[1]][2]
+# get the number from the file name
+batch_number <- strsplit(strsplit(basename(hmdbpart_main_file), "main_", fixed = TRUE)[[1]][2], ".RData")[[1]]
 
 outlist_total <- unique(outlist_ident)
 
diff --git a/DIMS/SumAdducts.nf b/DIMS/SumAdducts.nf
index 531308e..66a404f 100644
--- a/DIMS/SumAdducts.nf
+++ b/DIMS/SumAdducts.nf
@@ -5,8 +5,8 @@ process SumAdducts {
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       path(collect_file) // input files need to be linked, but called within R script
-       path(replication_pattern) // input files need to be linked, but called within R script
+       each path(collect_file) // input files need to be linked, but called within R script
+       each path(replication_pattern) // input files need to be linked, but called within R script
        path(HMDBpart_main_file)
 
     output:

From 5631ab52966f3789917fc271145bb88132199fa4 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Thu, 29 Feb 2024 09:28:00 +0100
Subject: [PATCH 33/73] corrections after code clean-up with lintr

---
 DIMS/GenerateExcel.R             | 32 ++++++++++++++++----------------
 DIMS/GenerateExcel.nf            |  3 ++-
 DIMS/HMDBparts_main.R            |  4 ++--
 DIMS/SumAdducts.R                |  2 +-
 DIMS/UnidentifiedCollectPeaks.nf |  2 +-
 5 files changed, 22 insertions(+), 21 deletions(-)

diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R
index 029498f..3e55f7e 100644
--- a/DIMS/GenerateExcel.R
+++ b/DIMS/GenerateExcel.R
@@ -591,21 +591,22 @@ if (z_score == 1) {
 # check the outlist_identified_(negative/positive).RData files for missing m/z values and mention in the results mail
 print("Nu in missing m/z check")
 # Load the outlist_identified files + remove the loaded files
-load(paste0(outdir, "RData/outlist_identified_negative.RData"))
-outlist.ident.neg <- outlist.ident
-load(paste0(outdir, "RData/outlist_identified_positive.RData"))
-outlist.ident.pos <- outlist.ident
-rm(outlist.ident)
+load(paste0(outdir, "/outlist_identified_negative.RData"))
+outlist_ident_neg <- outlist_ident
+load(paste0(outdir, "/outlist_identified_positive.RData"))
+outlist_ident_pos <- outlist_ident
+rm(outlist_ident)
 # check for missing m/z in negative and positive mode
-mode <- c("Negative", "Positive")
+scanmode <- c("Negative", "Positive")
 index <- 1
-results_ident <- c() # empty results list
-outlist_ident_list <- list(outlist.ident.neg, outlist.ident.pos)
-for (outlist.ident in outlist_ident_list) {
-  current_mode <- mode[index]
+results_ident <- c() 
+outlist_ident_list <- list(outlist_ident_neg, outlist_ident_pos)
+for (outlist_ident in outlist_ident_list) {
+  current_mode <- scanmode[index]
   # retrieve all unique m/z values in whole numbers and check if all are available
-  mz_values <- as.numeric(unique(format(outlist.ident$mzmed.pgrp, digits = 0)))
-  mz_range <- seq(70, 599, by = 1) # change accordingly to the machine m/z range. default = 70-600
+  mz_values <- as.numeric(unique(format(outlist_ident$mzmed.pgrp, digits = 0)))
+  # m/z range for a standard run = 70-600
+  mz_range <- seq(70, 599, by = 1)
   mz_missing <- c()
   for (mz in mz_range) {
     if (!mz %in% mz_values) {
@@ -621,8 +622,7 @@ for (outlist.ident in outlist_ident_list) {
   } else {
     results_ident <- c(results_ident, paste0(current_mode, " mode did not have missing mz values"))
   }
-  index <- index + 1 # change to new mode in for loop
+  # change to other scanmode
+  index <- index + 1
 }
-lapply(results_ident, write, file = paste(outdir, "missing_mz_warning.txt", sep = "/"), append = TRUE, ncolumns = 1000)
-
-cat("Ready excelExport.R")
+lapply(results_ident, write, file = paste0(outdir, "/missing_mz_warning.txt"), append = TRUE, ncolumns = 1000)
diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf
index 9e861e2..f73b881 100644
--- a/DIMS/GenerateExcel.nf
+++ b/DIMS/GenerateExcel.nf
@@ -6,12 +6,13 @@ process GenerateExcel {
 
     input:
        path(collect_file) // input files need to be linked, but called within R script
+       path(identified_file) // input files need to be lined, but called within R script
        path(init_filepath) 
        val(analysis_id) 
        path(relevance_file)
 
     output:
-       path('AdductSums_*.RData')
+       path('AdductSums_*.txt')
        path('*.xlsx'), emit: excel_file 
 
     script:
diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R
index 00cf5c9..ff468c7 100644
--- a/DIMS/HMDBparts_main.R
+++ b/DIMS/HMDBparts_main.R
@@ -45,7 +45,7 @@ for (scanmode in scanmodes) {
       start <- -(sub-1) + i * sub
       end <- i * sub
       outlist_part <- outlist[c(start:end), ]
-      save(outlist_part, file=paste0(scanmode, "_hmdb_main_", i, ".RData"))
+      save(outlist_part, file=paste0(scanmode, "_hmdb_main.", i, ".RData"))
     }
   }
 }
@@ -55,4 +55,4 @@ start <- end + 1
 end <- nr_rows
 
 outlist_part <- outlist[c(start:end),]
-save(outlist_part, file = paste0(scanmode, "_hmdb.", i+1, ".RData"))
+save(outlist_part, file = paste0(scanmode, "_hmdb_main.", i+1, ".RData"))
diff --git a/DIMS/SumAdducts.R b/DIMS/SumAdducts.R
index e9fa189..db74623 100755
--- a/DIMS/SumAdducts.R
+++ b/DIMS/SumAdducts.R
@@ -27,7 +27,7 @@ load(repl_file)
 outlist_part <- get(load(hmdbpart_main_file))
 
 # get the number from the file name
-batch_number <- strsplit(strsplit(basename(hmdbpart_main_file), "main_", fixed = TRUE)[[1]][2], ".RData")[[1]]
+batch_number <- strsplit(basename(hmdbpart_main_file), ".", fixed = TRUE)[[1]][2]
 
 outlist_total <- unique(outlist_ident)
 
diff --git a/DIMS/UnidentifiedCollectPeaks.nf b/DIMS/UnidentifiedCollectPeaks.nf
index 475b0a4..bcd1264 100644
--- a/DIMS/UnidentifiedCollectPeaks.nf
+++ b/DIMS/UnidentifiedCollectPeaks.nf
@@ -5,7 +5,7 @@ process UnidentifiedCollectPeaks {
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       path(SpectrumPeaks_file)
+       path(SpectrumPeaks_file)  // input files need to be linked, but called within R script
        path(PeakList_identified)
 
     output:

From 8f99bb80c8d89f355b0c2b31ff87fd359dec5042 Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Tue, 26 Mar 2024 08:27:59 +0100
Subject: [PATCH 34/73] rename output checkqc

---
 CheckQC/CheckQC.nf | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CheckQC/CheckQC.nf b/CheckQC/CheckQC.nf
index 2fe48db..cb62014 100644
--- a/CheckQC/CheckQC.nf
+++ b/CheckQC/CheckQC.nf
@@ -9,10 +9,10 @@ process CheckQC {
         path(input_files, stageAs: "?/*")
      
     output:
-        path("${identifier}_summary.csv", emit: qc_output)
+        path("${identifier}_checkqc_summary.csv", emit: qc_output)
 
     script:
         """
-        python ${baseDir}/CustomModules/CheckQC/check_qc.py ${params.qc_settings_path} '.' ${identifier} ${input_files} 
+        python ${baseDir}/CustomModules/CheckQC/check_qc.py ${params.qc_settings_path} '.' ${identifier}_checkqc ${input_files} 
         """
 }

From a662b5e54b2bccd24c3a6656c3d1496ef72b6735 Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Tue, 26 Mar 2024 13:33:44 +0100
Subject: [PATCH 35/73] add versions yaml to VersionLog

---
 Utils/VersionLog.nf | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Utils/VersionLog.nf b/Utils/VersionLog.nf
index 4a295b1..bdc5f4d 100644
--- a/Utils/VersionLog.nf
+++ b/Utils/VersionLog.nf
@@ -10,6 +10,7 @@ process VersionLog {
 
     output:
         path('repository_version.log')
+        path("versions.yml", emit: versions)
 
     script:
         """
@@ -17,6 +18,8 @@ process VersionLog {
         do
             echo "\${git_dir}" >> repository_version.log
             git --git-dir=\${git_dir}/.git log --pretty=oneline --decorate -n 2 >> repository_version.log
+            described_tags=$(git describe --tags)
+            echo "\${git_dir}: \"${described_tags}\"" >> versions.yml
         done
         """
 }
\ No newline at end of file

From 00ff0e6495a5b19a3cec1d803ca8b5a2fa952eba Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Tue, 26 Mar 2024 14:01:03 +0100
Subject: [PATCH 36/73] escape $

---
 Utils/VersionLog.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Utils/VersionLog.nf b/Utils/VersionLog.nf
index bdc5f4d..91bf377 100644
--- a/Utils/VersionLog.nf
+++ b/Utils/VersionLog.nf
@@ -18,7 +18,7 @@ process VersionLog {
         do
             echo "\${git_dir}" >> repository_version.log
             git --git-dir=\${git_dir}/.git log --pretty=oneline --decorate -n 2 >> repository_version.log
-            described_tags=$(git describe --tags)
+            described_tags=\$(git describe --tags)
             echo "\${git_dir}: \"${described_tags}\"" >> versions.yml
         done
         """

From 27f7ed550dd5612f1388985a6a72690c5f1c4f78 Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Tue, 26 Mar 2024 14:02:58 +0100
Subject: [PATCH 37/73] escape $

---
 Utils/VersionLog.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Utils/VersionLog.nf b/Utils/VersionLog.nf
index 91bf377..70179c3 100644
--- a/Utils/VersionLog.nf
+++ b/Utils/VersionLog.nf
@@ -19,7 +19,7 @@ process VersionLog {
             echo "\${git_dir}" >> repository_version.log
             git --git-dir=\${git_dir}/.git log --pretty=oneline --decorate -n 2 >> repository_version.log
             described_tags=\$(git describe --tags)
-            echo "\${git_dir}: \"${described_tags}\"" >> versions.yml
+            echo "\${git_dir}: \"\${described_tags}\"" >> versions.yml
         done
         """
 }
\ No newline at end of file

From 2cc7d389de5437d5412125c75d97087aeb6daa60 Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Tue, 26 Mar 2024 14:17:52 +0100
Subject: [PATCH 38/73] add git dir to describe

---
 Utils/VersionLog.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Utils/VersionLog.nf b/Utils/VersionLog.nf
index 70179c3..9d5c654 100644
--- a/Utils/VersionLog.nf
+++ b/Utils/VersionLog.nf
@@ -18,7 +18,7 @@ process VersionLog {
         do
             echo "\${git_dir}" >> repository_version.log
             git --git-dir=\${git_dir}/.git log --pretty=oneline --decorate -n 2 >> repository_version.log
-            described_tags=\$(git describe --tags)
+            described_tags=\$(git --git-dir=\${git_dir}/.git describe --tags)
             echo "\${git_dir}: \"\${described_tags}\"" >> versions.yml
         done
         """

From e7eabb3917df25b099ca1aaf97d050817d7c7695 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Wed, 27 Mar 2024 17:09:52 +0100
Subject: [PATCH 39/73] code review suggestions applied

---
 DIMS/AssignToBins.R              |  53 +++--
 DIMS/AssignToBins.nf             |   8 +-
 DIMS/AverageTechReplicates.R     | 103 ++++----
 DIMS/AverageTechReplicates.nf    |  17 +-
 DIMS/CollectFilled.R             |  22 +-
 DIMS/CollectFilled.nf            |   2 +-
 DIMS/CollectSumAdducts.R         |   4 +-
 DIMS/CollectSumAdducts.nf        |   2 +-
 DIMS/FillMissing.R               |  36 ++-
 DIMS/FillMissing.nf              |   8 +-
 DIMS/GenerateBreaks.R            |  35 ++-
 DIMS/GenerateBreaks.nf           |   2 +-
 DIMS/GenerateExcel.R             | 397 ++++++++++++++++---------------
 DIMS/GenerateExcel.nf            |  10 +-
 DIMS/GenerateViolinPlots.R       | 214 ++++++++++-------
 DIMS/GenerateViolinPlots.nf      |  15 +-
 DIMS/HMDBparts.R                 | 115 +++++----
 DIMS/HMDBparts.nf                |   2 -
 DIMS/HMDBparts_main.R            |  20 +-
 DIMS/HMDBparts_main.nf           |   1 -
 DIMS/MakeInit.R                  |  17 +-
 DIMS/MakeInit.nf                 |   3 +-
 DIMS/PeakFinding.R               |  82 ++++---
 DIMS/PeakFinding.nf              |   6 +-
 DIMS/PeakGrouping.R              |  46 ++--
 DIMS/PeakGrouping.nf             |  10 +-
 DIMS/SpectrumPeakFinding.R       |   6 +-
 DIMS/SpectrumPeakFinding.nf      |   4 +-
 DIMS/SumAdducts.R                |  47 ++--
 DIMS/SumAdducts.nf               |   6 +-
 DIMS/ThermoRawFileParser.nf      |   5 +-
 DIMS/UnidentifiedCalcZscores.R   |  18 +-
 DIMS/UnidentifiedCalcZscores.nf  |   4 +-
 DIMS/UnidentifiedCollectPeaks.R  |  11 +-
 DIMS/UnidentifiedCollectPeaks.nf |   8 +-
 DIMS/UnidentifiedFillMissing.R   |  31 ++-
 DIMS/UnidentifiedFillMissing.nf  |   8 +-
 DIMS/UnidentifiedPeakGrouping.R  |  31 ++-
 DIMS/UnidentifiedPeakGrouping.nf |   6 +-
 39 files changed, 751 insertions(+), 664 deletions(-)

diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R
index ed6984a..7c7a95a 100644
--- a/DIMS/AssignToBins.R
+++ b/DIMS/AssignToBins.R
@@ -7,40 +7,31 @@ suppressPackageStartupMessages(library("xcms"))
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-filepath        <- cmd_args[1]
+mzml_filepath <- cmd_args[1]
 breaks_filepath <- cmd_args[2]
-resol           <- as.numeric(cmd_args[3])
-trim            <- 0.1
-dims_thresh     <- 100
+resol <- as.numeric(cmd_args[3])
+trim <- 0.1
+dims_thresh <- 100
+
+# load breaks_fwhm
+load(breaks_filepath)
 
 # get sample name
-sample_name <- sub("\\..*$", "", basename(filepath))
+sample_name <- sub("\\..*$", "", basename(mzml_filepath))
 
 options(digits = 16)
 
 # Initialize
-int_factor <- 1 * 10^5 # Number used to calculate area under Gaussian curve
-scale      <- 2 # Initial value used to estimate scaling parameter
-width      <- 1024
-height     <- 768
-trim_left       <- NULL
-trim_right      <- NULL
-breaks_fwhm     <- NULL
-breaks_fwhm_avg <- NULL
-bins            <- NULL
-pos_results     <- NULL
-neg_results     <- NULL
+pos_results <- NULL
+neg_results <- NULL
 
 # read in the data for 1 sample
-raw_data <- suppressMessages(xcmsRaw(filepath))
+raw_data <- suppressMessages(xcmsRaw(mzml_filepath))
 
 # for TIC plots: prepare txt files with data for plots
 tic_intensity_persample <- cbind(round(raw_data@scantime, 2), raw_data@tic)
 colnames(tic_intensity_persample) <- c("retention_time", "tic_intensity")
-write.table(tic_intensity_persample, file = paste0("./", sample_name, "_TIC.txt"))
-
-# load breaks_fwhm
-load(breaks_filepath)
+write.table(tic_intensity_persample, file = paste0(sample_name, "_TIC.txt"))
 
 # Create empty placeholders for later use
 bins <- rep(0, length(breaks_fwhm) - 1)
@@ -65,10 +56,20 @@ pos_raw_data_matrix <- raw_data_matrix[pos_index, ]
 neg_raw_data_matrix <- raw_data_matrix[neg_index, ]
 
 # Get index for binning intensity values
-bin_indices_pos <- cut(pos_raw_data_matrix[, "mz"], breaks_fwhm,
-                       include.lowest = TRUE, right = TRUE, labels = FALSE)
-bin_indices_neg <- cut(neg_raw_data_matrix[, "mz"], breaks_fwhm,
-                       include.lowest = TRUE, right = TRUE, labels = FALSE)
+bin_indices_pos <- cut(
+  pos_raw_data_matrix[, "mz"], 
+  breaks_fwhm,
+  include.lowest = TRUE, 
+  right = TRUE, 
+  labels = FALSE
+)
+bin_indices_neg <- cut(
+  neg_raw_data_matrix[, "mz"], 
+  breaks_fwhm, 
+  include.lowest = TRUE, 
+  right = TRUE, 
+  labels = FALSE
+)
 
 # Get the list of intensity values for each bin, and add the
 # intensity values which are in the same bin
@@ -121,4 +122,4 @@ neg_results_final <- t(neg_results_transpose)
 
 peak_list <- list("pos" = pos_results_final, "neg" = neg_results_final, "breaksFwhm" = breaks_fwhm)
 
-save(peak_list, file = paste0("./", sample_name, ".RData"))
+save(peak_list, file = paste0(sample_name, ".RData"))
diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf
index 25b3891..d0a7098 100644
--- a/DIMS/AssignToBins.nf
+++ b/DIMS/AssignToBins.nf
@@ -5,15 +5,15 @@ process AssignToBins {
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       tuple val(file_id), path(mzML_filename) , path(breaks_file)
+       tuple(val(file_id), path(mzML_file), path(breaks_file))
 
     output:
-       path("${file_id}.RData"), emit: RData_files
-       path("${file_id}_TIC.txt"), emit: TIC_txt_files
+       path("${file_id}.RData"), emit: rdata_file
+       path("${file_id}_TIC.txt"), emit: tic_txt_file
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_filename $breaks_file $params.resolution
+        Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_file $breaks_file $params.resolution
         """
 }
 
diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R
index d470d7d..b243025 100644
--- a/DIMS/AverageTechReplicates.R
+++ b/DIMS/AverageTechReplicates.R
@@ -1,38 +1,31 @@
 #!/usr/bin/Rscript
 # adapted from 3-AverageTechReplicates.R
 
+# load packages
+library("ggplot2")
+library("gridExtra")
+
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-init_filepath   <- cmd_args[1]
-nr_replicates   <- as.numeric(cmd_args[2])
-thresh2remove   <- 1000000000
-dims_thresh     <- 100
-run_name        <- cmd_args[3]
-dims_matrix     <- cmd_args[4]
+init_file <- cmd_args[1]
+nr_replicates <- as.numeric(cmd_args[2])
+run_name <- cmd_args[3]
+dims_matrix <- cmd_args[4]
 highest_mz_file <- cmd_args[5]
 highest_mz <- get(load(highest_mz_file))
-
-# lower the threshold below which a sample will be removed for DBS and for high m/z
-if (dims_matrix == "DBS") {
-  thresh2remove <- 50000000
-}
-if (highest_mz > 700) {
-  thresh2remove <- 1000000
-}
-
-library("ggplot2")
-library("gridExtra")
+thresh2remove <- 1000000000
+dims_thresh <- 100
 
 remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) {
-  tmp <- repl_pattern
+  # collect list of samples to remove from replication pattern
   remove_from_group <- NULL
-  for (sample_nr in 1:length(tmp)){
-    tmp2 <- repl_pattern[[sample_nr]]
+  for (sample_nr in 1:length(repl_pattern)){
+    repl_pattern_1sample <- repl_pattern[[sample_nr]]
     remove <- NULL
-    for (file_name in 1:length(tmp2)) {
-      if (tmp2[file_name] %in% bad_samples) {
-        remove <- c(remove, file_name)
+    for (file_nr in 1:length(repl_pattern_1sample)) {
+      if (repl_pattern_1sample[file_nr] %in% bad_samples) {
+        remove <- c(remove, file_nr)
       }
     }
     if (length(remove) == nr_replicates) {
@@ -49,26 +42,35 @@ remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) {
 }
 
 # get repl_pattern
-load("./init.RData")
+load(init_file)
 
+# lower the threshold below which a sample will be removed for DBS and for high m/z
+if (dims_matrix == "DBS") {
+  thresh2remove <- 50000000
+}
+if (highest_mz > 700) {
+  thresh2remove <- 1000000
+}
+
+# remove technical replicates which are below the threshold
 remove_neg <- NULL
 remove_pos <- NULL
 cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n")
 for (sample_nr in 1:length(repl_pattern)) {
+  tech_reps <- as.vector(unlist(repl_pattern[sample_nr]))
   tech_reps_array_pos <- NULL
   tech_reps_array_neg <- NULL
-  tech_reps <- as.vector(unlist(repl_pattern[sample_nr]))
   sum_neg <- 0
   sum_pos <- 0
   nr_pos <- 0
   nr_neg <- 0
-  for (file_name in 1:length(tech_reps)) {
-    load(paste("./", tech_reps[file_name], ".RData", sep = ""))
+  for (file_nr in 1:length(tech_reps)) {
+    load(paste(tech_reps[file_nr], ".RData", sep = ""))
     # negative scanmode
     cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1]))
     if (sum(peak_list$neg[, 1]) < thresh2remove) {
       cat(" ... Removed")
-      remove_neg <- c(remove_neg, tech_reps[file_name])
+      remove_neg <- c(remove_neg, tech_reps[file_nr])
     } else {
       nr_neg <- nr_neg + 1
       sum_neg <- sum_neg + peak_list$neg
@@ -78,7 +80,7 @@ for (sample_nr in 1:length(repl_pattern)) {
     cat("\n\tPositive peak_list sum", sum(peak_list$pos[, 1]))
     if (sum(peak_list$pos[, 1]) < thresh2remove) {
       cat(" ... Removed")
-      remove_pos <- c(remove_pos, tech_reps[file_name])
+      remove_pos <- c(remove_pos, tech_reps[file_nr])
     } else {
       nr_pos <- nr_pos + 1
       sum_pos <- sum_pos + peak_list$pos
@@ -89,28 +91,38 @@ for (sample_nr in 1:length(repl_pattern)) {
   if (nr_neg != 0) {
     sum_neg[, 1] <- sum_neg[, 1] / nr_neg
     colnames(sum_neg) <- names(repl_pattern)[sample_nr]
-    save(sum_neg, file = paste0("./", names(repl_pattern)[sample_nr], "_neg_avg.RData"))
+    save(sum_neg, file = paste0(names(repl_pattern)[sample_nr], "_neg_avg.RData"))
   }
   if (nr_pos != 0) {
     sum_pos[, 1] <- sum_pos[, 1] / nr_pos
     colnames(sum_pos) <- names(repl_pattern)[sample_nr]
-    save(sum_pos, file = paste0("./", names(repl_pattern)[sample_nr], "_pos_avg.RData"))
+    save(sum_pos, file = paste0(names(repl_pattern)[sample_nr], "_pos_avg.RData"))
   }
 }
 
 pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates)
 repl_pattern_filtered <- pattern_list$pattern
-save(repl_pattern_filtered, file = "./negative_repl_pattern.RData")
-write.table(remove_neg, file = "./miss_infusions_negative.txt",
-            row.names = FALSE, col.names = FALSE, sep = "\t")
+save(repl_pattern_filtered, file = "negative_repl_pattern.RData")
+write.table(
+  remove_neg, 
+  file = "miss_infusions_negative.txt", 
+  row.names = FALSE, 
+  col.names = FALSE, 
+  sep = "\t"
+)
 
 pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates)
 repl_pattern_filtered <- pattern_list$pattern
-save(repl_pattern_filtered, file = "./positive_repl_pattern.RData")
-write.table(remove_pos, file = "./miss_infusions_positive.txt",
-            row.names = FALSE, col.names = FALSE, sep = "\t")
+save(repl_pattern_filtered, file = "positive_repl_pattern.RData")
+write.table(
+  remove_pos, 
+  file = "miss_infusions_positive.txt", 
+  row.names = FALSE, 
+  col.names = FALSE, 
+  sep = "\t"
+)
 
-# New: generate TIC plots
+## generate TIC plots
 # get all txt files
 tic_files <- list.files("./", full.names = TRUE, pattern = "*TIC.txt")
 all_samps <- sub("_TIC\\..*$", "", basename(tic_files))
@@ -126,16 +138,17 @@ for (file in tic_files) {
   }
 }
 
+# create a list with information for all TIC plots
 tic_plot_list <- list()
 plot_nr <-  0
 for (sample_nr in c(1:length(repl_pattern))) {
   tech_reps <- as.vector(unlist(repl_pattern[sample_nr]))
   sample_name <- names(repl_pattern)[sample_nr]
-  for (file_name in 1:length(tech_reps)) {
+  for (file_nr in 1:length(tech_reps)) {
     plot_nr <- plot_nr + 1
-    repl1_nr <- read.table(tic_files[file_name])
-    bad_color_pos <- tech_reps[file_name] %in% remove_pos[[1]]
-    bad_color_neg <- tech_reps[file_name] %in% remove_neg[[1]]
+    repl1_nr <- read.table(tic_files[file_nr])
+    bad_color_pos <- tech_reps[file_nr] %in% remove_pos[[1]]
+    bad_color_neg <- tech_reps[file_nr] %in% remove_neg[[1]]
     if (bad_color_neg & bad_color_pos) {
       plot_color <- "#F8766D"
     } else if (bad_color_pos) {
@@ -148,7 +161,7 @@ for (sample_nr in c(1:length(repl_pattern))) {
     tic_plot <- ggplot(repl1_nr, aes(retention_time, tic_intensity)) +
       geom_line(linewidth = 0.3) +
       geom_hline(yintercept = highest_tic_max, col = "grey", linetype = 2, linewidth = 0.3) +
-      labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[file_name], "  ||  ", sample_name)) +
+      labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[file_nr], "  ||  ", sample_name)) +
       theme(plot.background = element_rect(fill = plot_color),
             axis.text = element_text(size = 4),
             axis.title = element_text(size = 4),
@@ -156,6 +169,7 @@ for (sample_nr in c(1:length(repl_pattern))) {
     tic_plot_list[[plot_nr]] <- tic_plot
   }
 }
+
 # create a layout matrix dependent on number of replicates
 layout <- matrix(1:(10 * nr_replicates), 10, nr_replicates, TRUE)
 # put TIC plots in matrix
@@ -169,6 +183,7 @@ tic_plot_pdf <- marrangeGrob(
     g, "/", npages
   ))
 )
+
 # save to file
-ggsave(filename = paste0("./", run_name, "_TICplots.pdf"),
+ggsave(filename = paste0(run_name, "_TICplots.pdf"),
        tic_plot_pdf, width = 21, height = 29.7, units = "cm")
diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf
index 264d2f7..9fe9a98 100644
--- a/DIMS/AverageTechReplicates.nf
+++ b/DIMS/AverageTechReplicates.nf
@@ -5,23 +5,26 @@ process AverageTechReplicates {
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       path(RData_file)     // input files need to be linked, but called within R script
-       path(TIC_txt_files)  // input files need to be linked, but called within R script
-       path(init_filepath)
+       path(rdata_file)
+       path(tic_txt_files)
+       path(init_file)
        val(nr_replicates)
        val(analysis_id)
        val(matrix)
-       path(highest_mz)
+       path(highest_mz_file)
 
     output:
-       path('*_repl_pattern.RData'), emit: patterns
-       path('*_avg.RData'), emit: binned
+       path('*_repl_pattern.RData'), emit: pattern_files
+       path('*_avg.RData'), emit: binned_files
        path('miss_infusions_negative.txt')
        path('miss_infusions_positive.txt')
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_filepath $params.nr_replicates $analysis_id $matrix $highest_mz
+        Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_file \
+                                                                      $params.nr_replicates \
+                                                                      $analysis_id $matrix \
+                                                                      $highest_mz_file
         """
 }
 
diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R
index ae18ad8..f28e1c9 100755
--- a/DIMS/CollectFilled.R
+++ b/DIMS/CollectFilled.R
@@ -4,16 +4,15 @@
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-scripts_dir        <- cmd_args[1]
-ppm     <- as.numeric(cmd_args[2])
+scripts_dir <- cmd_args[1]
+ppm <- as.numeric(cmd_args[2])
 z_score <- as.numeric(cmd_args[3])
 
-source(paste0(scripts_dir, "AddOnFunctions/mergeDuplicatedRows.R"))
-source(paste0(scripts_dir, "AddOnFunctions/statistics_z.R"))
+source(paste0(scripts_dir, "mergeDuplicatedRows.R"))
+source(paste0(scripts_dir, "statistics_z.R"))
 
 # for each scan mode, collect all filled peak group lists
 scanmodes <- c("positive", "negative")
-
 for (scanmode in scanmodes) {
   # get list of files
   filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, "_identified_filled"))
@@ -23,17 +22,14 @@ for (scanmode in scanmodes) {
     peakgrouplist_filled <- get(load(filled_files[file_nr]))
     outlist_total <- rbind(outlist_total, peakgrouplist_filled)
   }
-
   # remove duplicates; peak groups with exactly the same m/z
   outlist_total <- mergeDuplicatedRows(outlist_total)
-
   # sort on mass
   outlist_total <- outlist_total[order(outlist_total[, "mzmed.pgrp"]), ]
-
   # load replication pattern
   pattern_file <- paste0(scanmode, "_repl_pattern.RData")
   repl_pattern <- get(load(pattern_file))
-
+  # calculate Z-scores
   if (z_score == 1) {
     outlist_stats <- statistics_z(outlist_total, sortCol = NULL, adducts = FALSE)
     nr_removed_samples <- length(which(repl_pattern[] == "character(0)"))
@@ -44,7 +40,7 @@ for (scanmode in scanmodes) {
       outlist_stats[, 8:(length(repl_pattern) - nr_removed_samples + 7)][order_index_int],
       outlist_stats[, (length(repl_pattern) - nr_removed_samples + 5 + 10):ncol(outlist_stats)]
     )
-
+    # sort Z-score columns and append to peak group list
     tmp_index <- grep("_Zscore", colnames(outlist_stats_more), fixed = TRUE)
     tmp_index_order <- order(colnames(outlist_stats_more[, tmp_index]))
     tmp <- outlist_stats_more[, tmp_index[tmp_index_order]]
@@ -52,9 +48,10 @@ for (scanmode in scanmodes) {
     outlist_stats_more <- cbind(outlist_stats_more, tmp)
     outlist_total <- outlist_stats_more
   }
-
+  
+  # make a copy of the outlist
   outlist_ident <- outlist_total
-
+  # select identified peak groups if ppm deviation is within limits
   if (z_score == 1) {
     outlist_ident$ppmdev <- as.numeric(outlist_ident$ppmdev)
     outlist_ident <- outlist_ident[which(outlist_ident["ppmdev"] >= -ppm & outlist_ident["ppmdev"] <= ppm), ]
@@ -70,5 +67,6 @@ for (scanmode in scanmodes) {
   remove_colindex <- which(colnames(outlist_ident) %in% remove_columns)
   outlist_ident <- outlist_ident[, -remove_colindex]
   write.table(outlist_ident, file = paste0("outlist_identified_", scanmode, ".txt"), sep = "\t", row.names = FALSE)
+  # output in RData format
   save(outlist_ident, file = paste0("outlist_identified_", scanmode, ".RData"))
 }
diff --git a/DIMS/CollectFilled.nf b/DIMS/CollectFilled.nf
index 4261410..53b1967 100644
--- a/DIMS/CollectFilled.nf
+++ b/DIMS/CollectFilled.nf
@@ -6,7 +6,7 @@ process CollectFilled {
 
     input:
        path(filled_files)
-       each path(replication_pattern)  // input files need to be linked, but called within R script
+       each path(replication_pattern)
 
     output:
        path('outlist*.txt')
diff --git a/DIMS/CollectSumAdducts.R b/DIMS/CollectSumAdducts.R
index e935eb8..dcd05af 100755
--- a/DIMS/CollectSumAdducts.R
+++ b/DIMS/CollectSumAdducts.R
@@ -4,10 +4,10 @@
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
+# collect all AdductSums part files for each scanmode
 scanmodes <- c("positive", "negative")
 
 for (scanmode in scanmodes) {
-  # collect all AdductSums part files for each scanmode
   adductsum_part_files <- list.files("./", pattern = scanmode)
 
   outlist.tot <- NULL
@@ -17,6 +17,6 @@ for (scanmode in scanmodes) {
   }
 
   # save output file
-  save(outlist.tot, file=paste0("AdductSums_", scanmode, ".RData"))
+  save(outlist.tot, file = paste0("AdductSums_", scanmode, ".RData"))
 }
 
diff --git a/DIMS/CollectSumAdducts.nf b/DIMS/CollectSumAdducts.nf
index da7ab0c..2bd43ff 100644
--- a/DIMS/CollectSumAdducts.nf
+++ b/DIMS/CollectSumAdducts.nf
@@ -5,7 +5,7 @@ process CollectSumAdducts {
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       path(collect_file) // input files need to be linked, but called within R script
+       path(collect_files)
 
     output:
        path('AdductSums_*.RData')
diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R
index 1c1540c..f7166d0 100755
--- a/DIMS/FillMissing.R
+++ b/DIMS/FillMissing.R
@@ -4,37 +4,35 @@
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-# define parameters
 peakgrouplist_file <- cmd_args[1]
-scripts_dir        <- cmd_args[2]
-thresh  <- as.numeric(cmd_args[3])
-resol   <- as.numeric(cmd_args[4])
-ppm     <- as.numeric(cmd_args[5])
+scripts_dir <- cmd_args[2]
+thresh <- as.numeric(cmd_args[3])
+resol <- as.numeric(cmd_args[4])
+ppm <- as.numeric(cmd_args[5])
 outdir <- "./"
 
+# load in function scripts
+source(paste0(scripts_dir, "replaceZeros.R"))
+source(paste0(scripts_dir, "generateGaussian.R"))
+source(paste0(scripts_dir, "getFwhm.R"))
+source(paste0(scripts_dir, "getSD.R"))
+source(paste0(scripts_dir, "getArea.R"))
+source(paste0(scripts_dir, "optimizeGauss.R"))
+source(paste0(scripts_dir, "ident.hires.noise.HPC.R"))
+source(paste0(scripts_dir, "elementInfo.R"))
+source(paste0(scripts_dir, "globalAssignments.HPC.R"))
+
+# determine scan mode
 if (grepl("_pos", peakgrouplist_file)) {
   scanmode <- "positive"
 } else if (grepl("_neg", peakgrouplist_file)) {
   scanmode <- "negative"
 }
 
-# load in function scripts
-source(paste0(scripts_dir, "AddOnFunctions/replaceZeros.R"))
-source(paste0(scripts_dir, "AddOnFunctions/generateGaussian.R"))
-source(paste0(scripts_dir, "AddOnFunctions/getFwhm.R"))
-source(paste0(scripts_dir, "AddOnFunctions/getSD.R"))
-source(paste0(scripts_dir, "AddOnFunctions/getArea.R"))
-source(paste0(scripts_dir, "AddOnFunctions/optimizeGauss.R"))
-source(paste0(scripts_dir, "AddOnFunctions/ident.hires.noise.HPC.R"))
-source(paste0(scripts_dir, "AddOnFunctions/elementInfo.R"))
-source(paste0(scripts_dir, "AddOnFunctions/globalAssignments.HPC.R"))
-
 # get replication pattern for sample names
 pattern_file <- paste0(scanmode, "_repl_pattern.RData")
 repl_pattern <- get(load(pattern_file))
 
-print(head(repl_pattern))
-
 # load peak group list and determine output file name
 outpgrlist_identified <- get(load(peakgrouplist_file))
 
@@ -44,4 +42,4 @@ outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file)
 peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm)
 
 # save output
-save(peakgrouplist_filled, file = paste0("./", outputfile_name))
+save(peakgrouplist_filled, file = outputfile_name)
diff --git a/DIMS/FillMissing.nf b/DIMS/FillMissing.nf
index 4f71b77..b0d1850 100644
--- a/DIMS/FillMissing.nf
+++ b/DIMS/FillMissing.nf
@@ -1,18 +1,18 @@
 process FillMissing {
-    tag "DIMS FillMissing ${GroupedList_file}"
+    tag "DIMS FillMissing ${peakgrouplist_file}"
     label 'FillMissing'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       path(GroupedList_file)
-       each path(replication_pattern) // input files need to be linked, but called within R script
+       path(peakgrouplist_file)
+       each path(replication_pattern)
 
     output:
        path('*_filled.RData')
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/FillMissing.R $GroupedList_file $params.scripts_dir $params.thresh $params.resolution $params.ppm
+        Rscript ${baseDir}/CustomModules/DIMS/FillMissing.R $peakgrouplist_file $params.scripts_dir $params.thresh $params.resolution $params.ppm
         """
 }
diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R
index 0de2426..3e38f1c 100644
--- a/DIMS/GenerateBreaks.R
+++ b/DIMS/GenerateBreaks.R
@@ -1,6 +1,5 @@
-#### GenerateBreaks.R ####
-## adapted from 1-generateBreaksFwhm.HPC.R ##
 #!/usr/bin/Rscript
+## adapted from 1-generateBreaksFwhm.HPC.R ##
 
 # load required package
 suppressPackageStartupMessages(library("xcms"))
@@ -9,16 +8,16 @@ suppressPackageStartupMessages(library("xcms"))
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
 filepath <- cmd_args[1]
-outdir   <- cmd_args[2]
-trim     <- as.numeric(cmd_args[3])
-resol    <- as.numeric(cmd_args[4])
+outdir <- cmd_args[2]
+trim <- as.numeric(cmd_args[3])
+resol <- as.numeric(cmd_args[4])
 
 # initialize
-trim_left        <- NULL
-trim_right       <- NULL
-breaks_fwhm      <- NULL
-breaks_fwhm_avg  <- NULL
-bins             <- NULL
+trim_left <- NULL
+trim_right <- NULL
+breaks_fwhm <- NULL
+breaks_fwhm_avg <- NULL
+bins <- NULL
 
 # read in mzML file
 raw_data <- suppressMessages(xcmsRaw(filepath))
@@ -33,15 +32,15 @@ high_mz <- raw_data@mzrange[2]
 
 # determine number of segments (bins)
 nr_segments <- 2 * (high_mz - low_mz)
-segment  <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1)
+segment <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1)
 
 # determine start and end of each bin.
 for (i in 1:nr_segments) {
-  start_segment   <- segment[i]
-  end_segment     <- segment[i+1]
-  resol_mz        <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200)))
-  fwhm_segment    <- start_segment / resol_mz
-  breaks_fwhm     <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment))
+  start_segment <- segment[i]
+  end_segment <- segment[i+1]
+  resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200)))
+  fwhm_segment <- start_segment / resol_mz
+  breaks_fwhm <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment))
   # average the m/z instead of start value
   range <- seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)
   delta_mz <- range[2] - range[1]
@@ -49,5 +48,5 @@ for (i in 1:nr_segments) {
 }
 
 # generate output file
-save(breaks_fwhm, breaks_fwhm_avg, trim_left, trim_right, file = "./breaks.fwhm.RData")
-save(high_mz, file = "./highest_mz.RData")
+save(breaks_fwhm, breaks_fwhm_avg, trim_left, trim_right, file = "breaks.fwhm.RData")
+save(high_mz, file = "highest_mz.RData")
diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf
index 4e855f2..af617a8 100644
--- a/DIMS/GenerateBreaks.nf
+++ b/DIMS/GenerateBreaks.nf
@@ -5,7 +5,7 @@ process GenerateBreaks {
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       tuple val(file_id), path(mzML_file)
+       tuple(val(file_id), path(mzML_file))
 
 
     output:
diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R
index 3e55f7e..420b2e1 100644
--- a/DIMS/GenerateExcel.R
+++ b/DIMS/GenerateExcel.R
@@ -1,15 +1,6 @@
 #!/usr/bin/Rscript
 ## adapted from 13-excelExport.R
 
-# define parameters
-cmd_args <- commandArgs(trailingOnly = TRUE)
-
-init_filepath <- cmd_args[1]
-project       <- cmd_args[2]
-dims_matrix   <- cmd_args[3]
-hmdb          <- cmd_args[4]
-z_score       <- as.numeric(cmd_args[5])
-
 # load required packages
 library("ggplot2")
 library("reshape2")
@@ -18,55 +9,65 @@ library("loder")
 suppressMessages(library("dplyr"))
 suppressMessages(library("stringr"))
 
+# define parameters
+cmd_args <- commandArgs(trailingOnly = TRUE)
+
+init_file <- cmd_args[1]
+project <- cmd_args[2]
+dims_matrix <- cmd_args[3]
+hmdb_file <- cmd_args[4]
+z_score <- as.numeric(cmd_args[5])
+
 round_df <- function(df, digits) {
-  #' function for rounding numbers to x digits for numeric values
+  #' Round numbers to a set number of digits for numeric values
   #'
-  #' @param df dataframe
-  #' @param digits integer number of digits to round off to
+  #' @param df: Dataframe containing numeric values
+  #' @param digits: Number of digits to round off to (integer)
   #'
-  #' @return df
+  #' @return df: Dataframe with rounded numbers
   numeric_columns <- sapply(df, mode) == "numeric"
   df[numeric_columns] <- round(df[numeric_columns], digits)
   return(df)
 }
 
 robust_scaler <- function(control_intensities, control_col_ids, perc = 5) {
-  #' calculate robust scaler: Z-score based on controls without outliers
+  #' Calculate robust scaler: Z-score based on controls without outliers
   #'
-  #' @param control_intensities matrix with intensities for control samples
-  #' @param control_col_ids vector with column names for control samples
-  #' @param perc float percentage of outliers which will be removed from controls
+  #' @param control_intensities: Matrix with intensities for control samples
+  #' @param control_col_ids: Vector with column names for control samples
+  #' @param perc: Percentage of outliers which will be removed from controls (float)
   #'
-  #' @return trimmed_control_intensities
+  #' @return trimmed_control_intensities: Intensities trimmed for outliers
   nr_toremove <- ceiling(length(control_col_ids) * perc / 100)
   sorted_control_intensities <- sort(as.numeric(control_intensities))
-  trimmed_control_intensities <- sorted_control_intensities[(nr_toremove + 1) : (length(sorted_control_intensities) - nr_toremove)]
+  trimmed_control_intensities <- sorted_control_intensities[(nr_toremove + 1) : 
+							    (length(sorted_control_intensities) - nr_toremove)]
   return(trimmed_control_intensities)
 }
 
-# Initialise and load data
-plot   <- TRUE
+# Initialise
+plot <- TRUE
 export <- TRUE
 control_label <- "C"
-case_label    <- "P"
+case_label <- "P"
 imagesize_multiplier <- 2
-# setting outdir to export files to the project directory
+# setting outdir to export files to the working directory
 outdir <- "./"
 # percentage of outliers to remove from calculation of robust scaler
 perc <- 5
 
 # load information on samples
-load(init_filepath)
+load(init_file)
 # load the HMDB file with info on biological relevance of metabolites
-load(hmdb)
+load(hmdb_file)
 
 # get current date
 rundate <- Sys.Date()
 
 # create a directory for plots in project directory
-plotdir <- paste0(outdir, "/plots/adducts")
 dir.create(paste0(outdir, "/plots"), showWarnings = FALSE)
-dir.create(plotdir, showWarnings = FALSE)
+plot_dir <- paste0(outdir, "/plots/adducts")
+dir.create(plot_dir, showWarnings = FALSE)
 
 # set the number of digits for floats
 options(digits = 16)
@@ -128,11 +129,14 @@ if (z_score == 1) {
 
   # Get columns with control intensities
   control_col_ids <- grep(control_label, colnames(outlist), fixed = TRUE)
-  control_columns <- outlist[, control_col_ids]
+  control_columns <- as.data.frame(outlist[, control_col_ids])
+  colnames(control_columns) <- colnames(outlist)[control_col_ids]
 
   # Get columns with patient intensities
   patient_col_ids <- grep(case_label, colnames(outlist), fixed = TRUE)
-  patient_columns <- outlist[, patient_col_ids]
+  patient_columns <- as.data.frame(outlist[, patient_col_ids])
+  colnames(patient_columns) <- colnames(outlist)[patient_col_ids]
+
   intensity_col_ids <- c(control_col_ids, patient_col_ids)
 
   # if there are any intensities of 0 left, set them to NA for stats
@@ -198,7 +202,13 @@ if (z_score == 1) {
     for (i in 1:length(patient_ids)) {
       id <- patient_ids[i]
       # combine all intensities that start with the same string for patients
-      patient_int <- as.numeric(as.vector(unlist(outlist[p, names(patient_columns[1, ])[startsWith(names(patient_columns[1, ]), paste0(id, "."))]])))
+      # exception: if there is only one patient id, skip this step; nothing to combine
+      if (ncol(patient_columns) > 1) {
+        patient_int <- as.numeric(as.vector(unlist(outlist[p, names(patient_columns[1, ])
+  						 [startsWith(names(patient_columns[1, ]), paste0(id, "."))]])))
+      } else {
+	patient_int <- as.numeric(unlist(as.vector(patient_columns)))
+      }
       intensities[[i + 1]] <- patient_int
     }
     intensities <- setNames(intensities, labels)
@@ -207,7 +217,7 @@ if (z_score == 1) {
 
     plot.new()
     if (export) {
-      png(filename = paste0(plotdir, "/", hmdb_name, "_box.png"),
+      png(filename = paste0(plot_dir, "/", hmdb_name, "_box.png"),
           width = plot_width,
           height = 280)
     }
@@ -220,7 +230,7 @@ if (z_score == 1) {
             main = hmdb_name)
     dev.off()
 
-    file_png <- paste0(plotdir, "/", hmdb_name, "_box.png")
+    file_png <- paste0(plot_dir, "/", hmdb_name, "_box.png")
     if (is.null(temp_png)) {
       temp_png <- readPng(file_png)
       img_dim <- dim(temp_png)[c(1, 2)]
@@ -248,23 +258,25 @@ if (z_score == 1) {
   setRowHeights(wb, filelist, rows = c(1:nrow(outlist)), heights = 18)
   setColWidths(wb, filelist, cols = c(1:ncol(outlist)), widths = 20)
 }
+
 # write Excel file
 writeData(wb, sheet = 1, outlist, startCol = 1)
 xlsx_name <- paste0(outdir, "/", project, ".xlsx")
 saveWorkbook(wb, xlsx_name, overwrite = TRUE)
-cat(xlsx_name)
 rm(wb)
 
-#### INTERNE STANDAARDEN ####
-IS <- outlist[grep("Internal standard", outlist[, "relevance"], fixed = TRUE), ]
-IS_codes <- rownames(IS)
-cat(IS_codes, "\n")
+#### INTERNAL STANDARDS ####
+is_list <- outlist[grep("Internal standard", outlist[, "relevance"], fixed = TRUE), ]
+is_codes <- rownames(is_list)
 
-# if all data from one samplename (for example P195.1) is filtered out in 3-averageTechReplicates because of too little data (threshold parameter) the init.RData (repl_pattern) will contain more sample_names then the peak data (IS),
-# thus this data needs to be removed first, before the retrieval of the summed adducts. Write sample_names to a log file, to let user know that this sample_name contained no data.
-sample_names_nodata <- setdiff(names(repl_pattern), names(IS))
+# if all data from one samplename (for example P195.1) is filtered out in 3-averageTechReplicates 
+# because of too little data (threshold parameter)i, the init.RData (repl_pattern) will contain more sample_names 
+# than the peak data (IS), so this data needs to be removed first, before the retrieval of the summed adducts. 
+# Write sample_names to a log file, to let user know that this sample_name contained no data.
+sample_names_nodata <- setdiff(names(repl_pattern), names(is_list))
 if (!is.null(sample_names_nodata)) {
-  write.table(sample_names_nodata, file = paste(outdir, "sample_names_nodata.txt", sep = "/"), row.names = FALSE, col.names = FALSE, quote = FALSE)
+  write.table(sample_names_nodata, file = paste(outdir, "sample_names_nodata.txt", sep = "/"), 
+	      row.names = FALSE, col.names = FALSE, quote = FALSE)
   cat(sample_names_nodata, "\n")
   for (sample_name in sample_names_nodata) {
     repl_pattern[[sample_name]] <- NULL
@@ -272,324 +284,327 @@ if (!is.null(sample_names_nodata)) {
 }
 
 # Retrieve IS summed adducts
-IS_summed <- IS[c(names(repl_pattern), "HMDB_code")]
-IS_summed$HMDB.name <- IS$name
-IS_summed <- reshape2::melt(IS_summed, id.vars = c("HMDB_code", "HMDB.name"))
-colnames(IS_summed) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity")
-IS_summed$Intensity <- as.numeric(IS_summed$Intensity)
-IS_summed$Matrix <- dims_matrix
-IS_summed$Rundate <- rundate
-IS_summed$Project <- project
-IS_summed$Intensity <- as.numeric(as.character(IS_summed$Intensity))
+is_summed <- is_list[c(names(repl_pattern), "HMDB_code")]
+is_summed$HMDB.name <- is_list$name
+is_summed <- reshape2::melt(is_summed, id.vars = c("HMDB_code", "HMDB.name"))
+colnames(is_summed) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity")
+is_summed$Intensity <- as.numeric(is_summed$Intensity)
+is_summed$Matrix <- dims_matrix
+is_summed$Rundate <- rundate
+is_summed$Project <- project
+is_summed$Intensity <- as.numeric(as.character(is_summed$Intensity))
 
 # Retrieve IS positive mode
-IS_pos <- as.data.frame(subset(outlist_pos_adducts_hmdb, rownames(outlist_pos_adducts_hmdb) %in% IS_codes))
-IS_pos$HMDB_name <- IS[match(row.names(IS_pos), IS$HMDB_code, nomatch = NA), "name"]
-IS_pos$HMDB.code <- row.names(IS_pos)
-IS_pos <- reshape2::melt(IS_pos, id.vars = c("HMDB.code", "HMDB_name"))
-colnames(IS_pos) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity")
-IS_pos$Matrix <- dims_matrix
-IS_pos$Rundate <- rundate
-IS_pos$Project <- project
-IS_pos$Intensity <- as.numeric(as.character(IS_pos$Intensity))
+is_pos <- as.data.frame(subset(outlist_pos_adducts_hmdb, rownames(outlist_pos_adducts_hmdb) %in% is_codes))
+is_pos$HMDB_name <- is_list[match(row.names(is_pos), is_list$HMDB_code, nomatch = NA), "name"]
+is_pos$HMDB.code <- row.names(is_pos)
+is_pos <- reshape2::melt(is_pos, id.vars = c("HMDB.code", "HMDB_name"))
+colnames(is_pos) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity")
+is_pos$Matrix <- dims_matrix
+is_pos$Rundate <- rundate
+is_pos$Project <- project
+is_pos$Intensity <- as.numeric(as.character(is_pos$Intensity))
 
 # Retrieve IS negative mode
-IS_neg <- as.data.frame(subset(outlist_neg_adducts_hmdb, rownames(outlist_neg_adducts_hmdb) %in% IS_codes))
-IS_neg$HMDB_name <- IS[match(row.names(IS_neg), IS$HMDB_code, nomatch = NA), "name"]
-IS_neg$HMDB.code <- row.names(IS_neg)
-IS_neg <- reshape2::melt(IS_neg, id.vars = c("HMDB.code", "HMDB_name"))
-colnames(IS_neg) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity")
-IS_neg$Matrix <- dims_matrix
-IS_neg$Rundate <- rundate
-IS_neg$Project <- project
-IS_neg$Intensity <- as.numeric(as.character(IS_neg$Intensity))
+is_neg <- as.data.frame(subset(outlist_neg_adducts_hmdb, rownames(outlist_neg_adducts_hmdb) %in% is_codes))
+is_neg$HMDB_name <- is_list[match(row.names(is_neg), is_list$HMDB_code, nomatch = NA), "name"]
+is_neg$HMDB.code <- row.names(is_neg)
+is_neg <- reshape2::melt(is_neg, id.vars = c("HMDB.code", "HMDB_name"))
+colnames(is_neg) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity")
+is_neg$Matrix <- dims_matrix
+is_neg$Rundate <- rundate
+is_neg$Project <- project
+is_neg$Intensity <- as.numeric(as.character(is_neg$Intensity))
 
 # Save results
-save(IS_pos, IS_neg, IS_summed, file = paste0(outdir, "/", project, "_IS_results.RData"))
+save(is_pos, is_neg, is_summed, file = paste0(outdir, "/", project, "_IS_results.RData"))
 
 # number of samples, for plotting length and width
 sample_count <- length(repl_pattern)
 
 # change the order of the x-axis summed plots to a natural sorted one
-Sample_naturalorder <- unique(as.character(IS_summed$Sample))
-Sample_naturalorder <- str_sort(Sample_naturalorder, numeric = TRUE)
-IS_summed$Sample_level <- factor(IS_summed$Sample, levels = c(Sample_naturalorder))
-IS_pos$Sample_level <- factor(IS_pos$Sample, levels = c(Sample_naturalorder))
-IS_neg$Sample_level <- factor(IS_neg$Sample, levels = c(Sample_naturalorder))
+sample_naturalorder <- unique(as.character(is_summed$Sample))
+sample_naturalorder <- str_sort(sample_naturalorder, numeric = TRUE)
+is_summed$Sample_level <- factor(is_summed$Sample, levels = c(sample_naturalorder))
+is_pos$Sample_level <- factor(is_pos$Sample, levels = c(sample_naturalorder))
+is_neg$Sample_level <- factor(is_neg$Sample, levels = c(sample_naturalorder))
 
 ## bar plots with all IS
-
-# function for ggplot theme
 # theme for all IS bar plots
-theme_IS_bar <- function(myPlot) {
-  myPlot +
+theme_is_bar <- function(my_plot) {
+  my_plot +
     scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
-    theme(
-      legend.position = "none",
-      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6),
-      axis.text.y = element_text(size = 6)
+    theme(legend.position = "none", 
+	  axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6), 
+	  axis.text.y = element_text(size = 6)
     )
 }
 
 # ggplot functions
-IS_neg_bar_plot <- ggplot(IS_neg, aes(Sample_level, Intensity)) +
+is_neg_bar_plot <- ggplot(is_neg, aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Neg)") +
   geom_bar(aes(fill = HMDB.name), stat = "identity") +
   labs(x = "", y = "Intensity") +
   facet_wrap(~HMDB.name, scales = "free_y")
 
-IS_pos_bar_plot <- ggplot(IS_pos, aes(Sample_level, Intensity)) +
+is_pos_bar_plot <- ggplot(is_pos, aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Pos)") +
   geom_bar(aes(fill = HMDB.name), stat = "identity") +
   labs(x = "", y = "Intensity") +
   facet_wrap(~HMDB.name, scales = "free_y")
 
-IS_sum_bar_plot <- ggplot(IS_summed, aes(Sample_level, Intensity)) +
+is_sum_bar_plot <- ggplot(is_summed, aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Summed)") +
   geom_bar(aes(fill = HMDB.name), stat = "identity") +
   labs(x = "", y = "Intensity") +
   facet_wrap(~HMDB.name, scales = "free_y")
 
 # add theme to ggplot functions
-IS_neg_bar_plot <- theme_IS_bar(IS_neg_bar_plot)
-IS_pos_bar_plot <- theme_IS_bar(IS_pos_bar_plot)
-IS_sum_bar_plot <- theme_IS_bar(IS_sum_bar_plot)
+is_neg_bar_plot <- theme_is_bar(is_neg_bar_plot)
+is_pos_bar_plot <- theme_is_bar(is_pos_bar_plot)
+is_sum_bar_plot <- theme_is_bar(is_sum_bar_plot)
 
 # save plots to disk
 plot_width <- 9 + 0.35 * sample_count
-ggsave(paste0(outdir, "/plots/IS_bar_all_neg.png"), plot = IS_neg_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in")
-ggsave(paste0(outdir, "/plots/IS_bar_all_pos.png"), plot = IS_pos_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in")
-ggsave(paste0(outdir, "/plots/IS_bar_all_sum.png"), plot = IS_sum_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_bar_all_neg.png"), 
+       plot = is_neg_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_bar_all_pos.png"), 
+       plot = is_pos_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_bar_all_sum.png"), 
+       plot = is_sum_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in")
 
 ## Line plots with all IS
-
 # function for ggplot theme
 # add smaller legend in the "all IS line plots", otherwise out-of-range when more than 13 IS lines
-theme_IS_line <- function(myPlot) {
-  myPlot +
-    guides(
-      shape = guide_legend(override.aes = list(size = 0.5)),
-      color = guide_legend(override.aes = list(size = 0.5))
+theme_is_line <- function(my_plot) {
+  my_plot +
+    guides(shape = guide_legend(override.aes = list(size = 0.5)), 
+	   color = guide_legend(override.aes = list(size = 0.5))
     ) +
-    theme(
-      legend.title = element_text(size = 8),
-      legend.text = element_text(size = 6),
-      legend.key.size = unit(0.7, "line"),
-      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8)
+    theme(legend.title = element_text(size = 8), 
+	  legend.text = element_text(size = 6), 
+	  legend.key.size = unit(0.7, "line"), 
+	  axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8)
     )
 }
 
 # ggplot functions
-IS_neg_line_plot <- ggplot(IS_neg, aes(Sample_level, Intensity)) +
+is_neg_line_plot <- ggplot(is_neg, aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Neg)") +
   geom_point(aes(col = HMDB.name)) +
   geom_line(aes(col = HMDB.name, group = HMDB.name)) +
   labs(x = "", y = "Intensity")
 
-IS_pos_line_plot <- ggplot(IS_pos, aes(Sample_level, Intensity)) +
+is_pos_line_plot <- ggplot(is_pos, aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Pos)") +
   geom_point(aes(col = HMDB.name)) +
   geom_line(aes(col = HMDB.name, group = HMDB.name)) +
   labs(x = "", y = "Intensity")
 
-IS_sum_line_plot <- ggplot(IS_summed, aes(Sample_level, Intensity)) +
+is_sum_line_plot <- ggplot(is_summed, aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Sum)") +
   geom_point(aes(col = HMDB.name)) +
   geom_line(aes(col = HMDB.name, group = HMDB.name)) +
   labs(x = "", y = "Intensity")
 
 # add theme to ggplot functions
-IS_sum_line_plot <- theme_IS_line(IS_sum_line_plot)
-IS_neg_line_plot <- theme_IS_line(IS_neg_line_plot)
-IS_pos_line_plot <- theme_IS_line(IS_pos_line_plot)
+is_sum_line_plot <- theme_is_line(is_sum_line_plot)
+is_neg_line_plot <- theme_is_line(is_neg_line_plot)
+is_pos_line_plot <- theme_is_line(is_pos_line_plot)
 
 # save plots to disk
 plot_width <- 8 + 0.2 * sample_count
-ggsave(paste0(outdir, "/plots/IS_line_all_neg.png"), plot = IS_neg_line_plot, height = plot_width / 2.5, width = plot_width, units = "in")
-ggsave(paste0(outdir, "/plots/IS_line_all_pos.png"), plot = IS_pos_line_plot, height = plot_width / 2.5, width = plot_width, units = "in")
-ggsave(paste0(outdir, "/plots/IS_line_all_sum.png"), plot = IS_sum_line_plot, height = plot_width / 2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_line_all_neg.png"), 
+       plot = is_neg_line_plot, height = plot_width / 2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_line_all_pos.png"), 
+       plot = is_pos_line_plot, height = plot_width / 2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_line_all_sum.png"), 
+       plot = is_sum_line_plot, height = plot_width / 2.5, width = plot_width, units = "in")
 
 ## bar plots with a selection of IS
-IS_neg_selection <- c("2H2-Ornithine (IS)", "2H3-Glutamate (IS)", "2H2-Citrulline (IS)", "2H4_13C5-Arginine (IS)", "13C6-Tyrosine (IS)")
-IS_pos_selection <- c("2H4-Alanine (IS)", "13C6-Phenylalanine (IS)", "2H4_13C5-Arginine (IS)", "2H3-Propionylcarnitine (IS)", "2H9-Isovalerylcarnitine (IS)")
-IS_sum_selection <- c("2H8-Valine (IS)", "2H3-Leucine (IS)", "2H3-Glutamate (IS)", "2H4_13C5-Arginine (IS)", "13C6-Tyrosine (IS)")
+is_neg_selection <- c("2H2-Ornithine (IS)", "2H3-Glutamate (IS)", "2H2-Citrulline (IS)", "2H4_13C5-Arginine (IS)", 
+		      "13C6-Tyrosine (IS)")
+is_pos_selection <- c("2H4-Alanine (IS)", "13C6-Phenylalanine (IS)", "2H4_13C5-Arginine (IS)", "2H3-Propionylcarnitine (IS)", 
+		      "2H9-Isovalerylcarnitine (IS)")
+is_sum_selection <- c("2H8-Valine (IS)", "2H3-Leucine (IS)", "2H3-Glutamate (IS)", "2H4_13C5-Arginine (IS)", 
+		      "13C6-Tyrosine (IS)")
 
 # add minimal intensity lines based on matrix (DBS or Plasma) and machine mode (neg, pos, sum)
 if (dims_matrix == "DBS") {
-  hline.data.neg <-
+  hline_data_neg <-
     data.frame(
       z = c(15000, 200000, 130000, 18000, 50000),
-      HMDB.name = IS_neg_selection
+      HMDB.name = is_neg_selection
     )
-  hline.data.pos <-
+  hline_data_pos <-
     data.frame(
       z = c(150000, 3300000, 1750000, 150000, 270000),
-      HMDB.name = IS_pos_selection
+      HMDB.name = is_pos_selection
     )
-  hline.data.sum <-
+  hline_data_sum <-
     data.frame(
       z = c(1300000, 2500000, 500000, 1800000, 1400000),
-      HMDB.name = IS_sum_selection
+      HMDB.name = is_sum_selection
     )
 } else if (dims_matrix == "Plasma") {
-  hline.data.neg <-
+  hline_data_neg <-
     data.frame(
       z = c(6500, 100000, 75000, 7500, 25000),
-      HMDB.name = IS_neg_selection
+      HMDB.name = is_neg_selection
     )
-  hline.data.pos <-
+  hline_data_pos <-
     data.frame(
       z = c(85000, 1000000, 425000, 70000, 180000),
-      HMDB.name = IS_pos_selection
+      HMDB.name = is_pos_selection
     )
-  hline.data.sum <-
+  hline_data_sum <-
     data.frame(
       z = c(700000, 1250000, 150000, 425000, 300000),
-      HMDB.name = IS_sum_selection
+      HMDB.name = is_sum_selection
     )
 }
 
-# function for ggplot theme
-# see bar plots with all IS
-
 # ggplot functions
-IS_neg_selection_barplot <- ggplot(subset(IS_neg, HMDB.name %in% IS_neg_selection), aes(Sample_level, Intensity)) +
+is_neg_selection_barplot <- ggplot(subset(is_neg, HMDB.name %in% is_neg_selection), aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Neg)") +
   geom_bar(aes(fill = HMDB.name), stat = "identity") +
   labs(x = "", y = "Intensity") +
   facet_wrap(~HMDB.name, scales = "free", ncol = 2) +
-  if (exists("hline.data.neg")) {
-    geom_hline(aes(yintercept = z), subset(hline.data.neg, HMDB.name %in% IS_neg$HMDB.name))
-  } # subset, if some IS have no data, no empty plots will be generated with a line)
+  if (exists("hline_data_neg")) {
+    geom_hline(aes(yintercept = z), subset(hline_data_neg, HMDB.name %in% is_neg$HMDB.name))
+  }
 
-IS_pos_selection_barplot <- ggplot(subset(IS_pos, HMDB.name %in% IS_pos_selection), aes(Sample_level, Intensity)) +
+is_pos_selection_barplot <- ggplot(subset(is_pos, HMDB.name %in% is_pos_selection), aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Pos)") +
   geom_bar(aes(fill = HMDB.name), stat = "identity") +
   labs(x = "", y = "Intensity") +
   facet_wrap(~HMDB.name, scales = "free", ncol = 2) +
-  if (exists("hline.data.pos")) {
-    geom_hline(aes(yintercept = z), subset(hline.data.pos, HMDB.name %in% IS_pos$HMDB.name))
+  if (exists("hline_data_pos")) {
+    geom_hline(aes(yintercept = z), subset(hline_data_pos, HMDB.name %in% is_pos$HMDB.name))
   }
 
-IS_sum_selection_barplot <- ggplot(subset(IS_summed, HMDB.name %in% IS_sum_selection), aes(Sample_level, Intensity)) +
+is_sum_selection_barplot <- ggplot(subset(is_summed, HMDB.name %in% is_sum_selection), aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Sum)") +
   geom_bar(aes(fill = HMDB.name), stat = "identity") +
   labs(x = "", y = "Intensity") +
   facet_wrap(~HMDB.name, scales = "free", ncol = 2) +
-  if (exists("hline.data.sum")) {
-    geom_hline(aes(yintercept = z), subset(hline.data.sum, HMDB.name %in% IS_summed$HMDB.name))
+  if (exists("hline_data_sum")) {
+    geom_hline(aes(yintercept = z), subset(hline_data_sum, HMDB.name %in% is_summed$HMDB.name))
   }
 
 # add theme to ggplot functions
-IS_neg_selection_barplot <- theme_IS_bar(IS_neg_selection_barplot)
-IS_pos_selection_barplot <- theme_IS_bar(IS_pos_selection_barplot)
-IS_sum_selection_barplot <- theme_IS_bar(IS_sum_selection_barplot)
+is_neg_selection_barplot <- theme_is_bar(is_neg_selection_barplot)
+is_pos_selection_barplot <- theme_is_bar(is_pos_selection_barplot)
+is_sum_selection_barplot <- theme_is_bar(is_sum_selection_barplot)
 
 # save plots to disk
 plot_width <- 9 + 0.35 * sample_count
-ggsave(paste0(outdir, "/plots/IS_bar_select_neg.png"), plot = IS_neg_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in")
-ggsave(paste0(outdir, "/plots/IS_bar_select_pos.png"), plot = IS_pos_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in")
-ggsave(paste0(outdir, "/plots/IS_bar_select_sum.png"), plot = IS_sum_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_bar_select_neg.png"), 
+       plot = is_neg_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_bar_select_pos.png"), 
+       plot = is_pos_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_bar_select_sum.png"), 
+       plot = is_sum_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in")
 
 ## line plots with a selection of IS
-
-# function for ggplot theme
-# see line plots with all IS
-
 # ggplot functions
-IS_neg_selection_lineplot <- ggplot(subset(IS_neg, HMDB.name %in% IS_neg_selection), aes(Sample_level, Intensity)) +
+is_neg_selection_lineplot <- ggplot(subset(is_neg, HMDB.name %in% is_neg_selection), aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Neg)") +
   geom_point(aes(col = HMDB.name)) +
   geom_line(aes(col = HMDB.name, group = HMDB.name)) +
   labs(x = "", y = "Intensity")
 
-IS_pos_selection_lineplot <- ggplot(subset(IS_pos, HMDB.name %in% IS_pos_selection), aes(Sample_level, Intensity)) +
+is_pos_selection_lineplot <- ggplot(subset(is_pos, HMDB.name %in% is_pos_selection), aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Pos)") +
   geom_point(aes(col = HMDB.name)) +
   geom_line(aes(col = HMDB.name, group = HMDB.name)) +
   labs(x = "", y = "Intensity")
 
-IS_sum_selection_lineplot <- ggplot(subset(IS_summed, HMDB.name %in% IS_sum_selection), aes(Sample_level, Intensity)) +
+is_sum_selection_lineplot <- ggplot(subset(is_summed, HMDB.name %in% is_sum_selection), aes(Sample_level, Intensity)) +
   ggtitle("Interne Standaard (Sum)") +
   geom_point(aes(col = HMDB.name)) +
   geom_line(aes(col = HMDB.name, group = HMDB.name)) +
   labs(x = "", y = "Intensity")
 
 # add theme to ggplot functions
-IS_neg_selection_lineplot <- theme_IS_line(IS_neg_selection_lineplot)
-IS_pos_selection_lineplot <- theme_IS_line(IS_pos_selection_lineplot)
-IS_sum_selection_lineplot <- theme_IS_line(IS_sum_selection_lineplot)
+is_neg_selection_lineplot <- theme_is_line(is_neg_selection_lineplot)
+is_pos_selection_lineplot <- theme_is_line(is_pos_selection_lineplot)
+is_sum_selection_lineplot <- theme_is_line(is_sum_selection_lineplot)
 
 # save plots to disk
 plot_width <- 8 + 0.2 * sample_count
-ggsave(paste0(outdir, "/plots/IS_line_select_neg.png"), plot = IS_neg_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in")
-ggsave(paste0(outdir, "/plots/IS_line_select_pos.png"), plot = IS_pos_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in")
-ggsave(paste0(outdir, "/plots/IS_line_select_sum.png"), plot = IS_sum_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_line_select_neg.png"), 
+       plot = is_neg_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_line_select_pos.png"), 
+       plot = is_pos_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in")
+ggsave(paste0(outdir, "/plots/IS_line_select_sum.png"), 
+       plot = is_sum_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in")
 
 
 ### POSITIVE CONTROLS CHECK
-# these positive controls need to be in the samplesheet, in order to make the Pos_Contr.RData file
+# these positive controls need to be in the samplesheet, in order to make the positive_control.RData file
 # Positive control samples all have the format P1002.x, P1003.x and P1005.x (where x is a number)
 
 column_list <- colnames(outlist)
 patterns <- c("^(P1002\\.)[[:digit:]]+_", "^(P1003\\.)[[:digit:]]+_", "^(P1005\\.)[[:digit:]]+_")
 positive_controls_index <- grepl(pattern = paste(patterns, collapse = "|"), column_list)
-positivecontrol_list <- column_list[positive_controls_index]
+positive_control_list <- column_list[positive_controls_index]
 
 if (z_score == 1) {
   # find if one or more positive control samples are missing
   pos_contr_warning <- c()
-  # any() grep because you get a vector of FALSE's and TRUE's. only one grep match is needed for each positive control
-  if (any(grep("^(P1002\\.)[[:digit:]]+_", positivecontrol_list)) &&
-    any(grep("^(P1003\\.)[[:digit:]]+_", positivecontrol_list)) &&
-    any(grep("^(P1005\\.)[[:digit:]]+_", positivecontrol_list))) {
+  if (any(grep("^(P1002\\.)[[:digit:]]+_", positive_control_list)) &&
+    any(grep("^(P1003\\.)[[:digit:]]+_", positive_control_list)) &&
+    any(grep("^(P1005\\.)[[:digit:]]+_", positive_control_list))) {
     cat("All three positive controls are present")
   } else {
-    pos_contr_warning <- paste0(c("positive controls list is not complete. Only ", positivecontrol_list, " is/are present"), collapse = " ")
+    pos_contr_warning <- paste0(c("positive controls list is not complete. Only ", 
+				  positive_control_list, " is/are present"), collapse = " ")
   }
   # you need all positive control samples, thus starting the script only if all are available
   if (length(pos_contr_warning) == 0) {
-    ### POSITIVE CONTROLS
     # make positive control excel with specific HMDB_codes in combination with specific control samples
-    PA_sample_name <- positivecontrol_list[grepl("P1002", positivecontrol_list)] # P1001.x_Zscore
-    PKU_sample_name <- positivecontrol_list[grepl("P1003", positivecontrol_list)] # P1003.x_Zscore
-    LPI_sample_name <- positivecontrol_list[grepl("P1005", positivecontrol_list)] # P1005.x_Zscore
+    pa_sample_name <- positive_control_list[grepl("P1002", positive_control_list)]
+    pku_sample_name <- positive_control_list[grepl("P1003", positive_control_list)]
+    lpi_sample_name <- positive_control_list[grepl("P1005", positive_control_list)]
 
-    PA_codes <- c("HMDB00824", "HMDB00783", "HMDB00123")
-    PKU_codes <- c("HMDB00159")
-    LPI_codes <- c("HMDB00904", "HMDB00641", "HMDB00182")
+    pa_codes <- c("HMDB00824", "HMDB00783", "HMDB00123")
+    pku_codes <- c("HMDB00159")
+    lpi_codes <- c("HMDB00904", "HMDB00641", "HMDB00182")
 
-    PA_data <- outlist[PA_codes, c("HMDB_code", "name", PA_sample_name)]
-    PA_data <- reshape2::melt(PA_data, id.vars = c("HMDB_code", "name"))
-    colnames(PA_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore")
+    pa_data <- outlist[pa_codes, c("HMDB_code", "name", pa_sample_name)]
+    pa_data <- reshape2::melt(pa_data, id.vars = c("HMDB_code", "name"))
+    colnames(pa_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore")
 
-    PKU_data <- outlist[PKU_codes, c("HMDB_code", "name", PKU_sample_name)]
-    PKU_data <- reshape2::melt(PKU_data, id.vars = c("HMDB_code", "name"))
-    colnames(PKU_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore")
+    pku_data <- outlist[pku_codes, c("HMDB_code", "name", pku_sample_name)]
+    pku_data <- reshape2::melt(pku_data, id.vars = c("HMDB_code", "name"))
+    colnames(pku_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore")
 
-    LPI_data <- outlist[LPI_codes, c("HMDB_code", "name", LPI_sample_name)]
-    LPI_data <- reshape2::melt(LPI_data, id.vars = c("HMDB_code", "name"))
-    colnames(LPI_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore")
+    lpi_data <- outlist[lpi_codes, c("HMDB_code", "name", lpi_sample_name)]
+    lpi_data <- reshape2::melt(lpi_data, id.vars = c("HMDB_code", "name"))
+    colnames(lpi_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore")
 
-    Pos_Contr <- rbind(PA_data, PKU_data, LPI_data)
-    Pos_Contr$Zscore <- as.numeric(Pos_Contr$Zscore)
+    positive_control <- rbind(pa_data, pku_data, lpi_data)
+    positive_control$Zscore <- as.numeric(positive_control$Zscore)
     # extra information added to excel for future reference. made in beginning of this script
-    Pos_Contr$Matrix <- dims_matrix
-    Pos_Contr$Rundate <- rundate
-    Pos_Contr$Project <- project
+    positive_control$Matrix <- dims_matrix
+    positive_control$Rundate <- rundate
+    positive_control$Project <- project
 
     # Save results
-    save(Pos_Contr, file = paste0(outdir, "/", project, "_Pos_Contr.RData"))
-    Pos_Contr$Zscore <- round_df(Pos_Contr$Zscore, 2) # asked by Lab to round the number to 2 digits
-    write.xlsx(Pos_Contr, file = paste0(outdir, "/", project, "_Pos_Contr.xlsx"), sheetName = "Sheet1", col.names = TRUE, row.names = TRUE, append = FALSE)
+    save(positive_control, file = paste0(outdir, "/", project, "_positive_control.RData"))
+    # round the Z-scores to 2 digits
+    positive_control$Zscore <- round_df(positive_control$Zscore, 2)
+    write.xlsx(positive_control, file = paste0(outdir, "/", project, "_positive_control.xlsx"), 
+	       sheetName = "Sheet1", col.names = TRUE, row.names = TRUE, append = FALSE)
   } else {
-    write.table(pos_contr_warning, file = paste(outdir, "positive_controls_warning.txt", sep = "/"), row.names = FALSE, col.names = FALSE, quote = FALSE)
+    write.table(pos_contr_warning, file = paste(outdir, "positive_controls_warning.txt", sep = "/"), 
+		row.names = FALSE, col.names = FALSE, quote = FALSE)
   }
 }
 
 ### MISSING M/Z CHECK
 # check the outlist_identified_(negative/positive).RData files for missing m/z values and mention in the results mail
-print("Nu in missing m/z check")
 # Load the outlist_identified files + remove the loaded files
 load(paste0(outdir, "/outlist_identified_negative.RData"))
 outlist_ident_neg <- outlist_ident
diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf
index f73b881..32ec928 100644
--- a/DIMS/GenerateExcel.nf
+++ b/DIMS/GenerateExcel.nf
@@ -5,18 +5,20 @@ process GenerateExcel {
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       path(collect_file) // input files need to be linked, but called within R script
-       path(identified_file) // input files need to be lined, but called within R script
-       path(init_filepath) 
+       path(collect_files)
+       path(identified_files)
+       path(init_file) 
        val(analysis_id) 
        path(relevance_file)
 
     output:
        path('AdductSums_*.txt')
+       path('*IS_results.RData')
        path('*.xlsx'), emit: excel_file 
+       path('plots'), emit: plot_files 
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/GenerateExcel.R $init_filepath $analysis_id $params.matrix $relevance_file $params.zscore 
+        Rscript ${baseDir}/CustomModules/DIMS/GenerateExcel.R $init_file $analysis_id $params.matrix $relevance_file $params.zscore 
         """
 }
diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R
index 896918f..097d030 100644
--- a/DIMS/GenerateViolinPlots.R
+++ b/DIMS/GenerateViolinPlots.R
@@ -7,35 +7,70 @@
 #    corresponding Z-scores.
 # 2. All files from github: https://github.com/UMCUGenetics/DIMS
 
+#!/usr/bin/Rscript
 ## adapted from 15-dIEM_violin.R
 
-library(dplyr) # tidytable is for other_isobaric.R (left_join)
-library(reshape2) # used in prepare_data.R
-library(openxlsx) # for opening Excel file
-library(ggplot2) # for plotting
-library(gridExtra) # for table top highest/lowest
+# load packages
+library(dplyr)
+library(reshape2)
+library(openxlsx)
+library(ggplot2)
+library(gridExtra)
 
-# define parameters - check after addition to run.sh
+# define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-run_name    <- cmd_args[1]
+run_name <- cmd_args[1]
 scripts_dir <- cmd_args[2]
-z_score     <- as.numeric(cmd_args[3])
+z_score <- as.numeric(cmd_args[3])
+path_metabolite_groups <- cmd_args[4]
+file_ratios_metabolites <- cmd_args[5]
+file_expected_biomarkers_iem <- cmd_args[6]
+file_explanation <- cmd_args[7]
+file_isomers <- cmd_args[8]
+
+print("arguments")
+print(run_name)
+print(scripts_dir)
+print(z_score)
+print(path_metabolite_groups)
+print(file_ratios_metabolites)
+print(file_expected_biomarkers_iem)
+print(file_explanation)
+print(file_isomers)
+print("end arguments")
+
+# path: output folder for dIEM and violin plots
+output_dir <- "./"
+
+file.copy(file_isomers, output_dir)
+
+# folder for all metabolite lists (.txt)
+# path_metabolite_groups <- "/hpc/dbg_mz/tools/db/metabolite_groups"
+# file for ratios step 3
+# file_ratios_metabolites <- "/hpc/dbg_mz/tools/db/dIEM/Ratios_between_metabolites.csv"
+# file for algorithm step 4
+# file_expected_biomarkers_iem <- "/hpc/dbg_mz/tools/db/dIEM/Expected_biomarkers_IEM.csv"
+# explanation: file with text to be included in violin plots
+# file_explanation <- "/hpc/dbg_mz/tools/Explanation_violin_plots.txt"
+
+# copy list of isomers to project folder.
+# file.copy("/hpc/dbg_mz/tools/isomers.txt", output_dir)
 
 # load functions
-source(paste0(scripts_dir, "AddOnFunctions/check_same_samplename.R"))
-source(paste0(scripts_dir, "AddOnFunctions/prepare_data.R"))
-source(paste0(scripts_dir, "AddOnFunctions/prepare_data_perpage.R"))
-source(paste0(scripts_dir, "AddOnFunctions/prepare_toplist.R"))
-source(paste0(scripts_dir, "AddOnFunctions/create_violin_plots.R"))
-source(paste0(scripts_dir, "AddOnFunctions/prepare_alarmvalues.R"))
+source(paste0(scripts_dir, "check_same_samplename.R"))
+source(paste0(scripts_dir, "prepare_data.R"))
+source(paste0(scripts_dir, "prepare_data_perpage.R"))
+source(paste0(scripts_dir, "prepare_toplist.R"))
+source(paste0(scripts_dir, "create_violin_plots.R"))
+source(paste0(scripts_dir, "prepare_alarmvalues.R"))
 
 # number of diseases that score highest in algorithm to plot
-top_nr_IEM       <- 5
+top_nr_iem <- 5
 # probability score cut-off for plotting the top diseases
-threshold_IEM    <- 5
+threshold_iem <- 5
 # z-score cutoff of axis on the left for top diseases
-ratios_cutoff    <- -5
+ratios_cutoff <- -5
 # number of violin plots per page in PDF
 nr_plots_perpage <- 20
 
@@ -46,31 +81,16 @@ if (z_score == 1) {
 # are the sample names headers on row 1 or row 2 in the DIMS excel? (default 1)
 header_row <- 1
 # column name where the data starts (default B)
-col_start     <- "B"
+col_start <- "B"
 zscore_cutoff <- 5
-xaxis_cutoff  <- 20
-
-# path to DIMS excel file
-path_dims_file <- paste0("./", run_name, ".xlsx")
-
-# path: output folder for dIEM and violin plots
-output_dir <- "./"
-
-# folder for all metabolite lists (.txt)
-path_metabolite_groups <- "/hpc/dbg_mz/tools/db/metabolite_groups"
-# file for ratios step 3
-file_ratios_metabolites <- "/hpc/dbg_mz/tools/db/dIEM/Ratios_between_metabolites.csv"
-# file for algorithm step 4
-file_expected_biomarkers_IEM <- "/hpc/dbg_mz/tools/db/dIEM/Expected_biomarkers_IEM.csv"
-# explanation: file with text to be included in violin plots
-file_explanation <- "/hpc/dbg_mz/tools/Explanation_violin_plots.txt"
-
-# copy list of isomers to project folder.
-file.copy("/hpc/dbg_mz/tools/isomers.txt", output_dir)
+xaxis_cutoff <- 20
 
 #### STEP 1: Preparation ####
 # in: run_name, path_dims_file, header_row ||| out: output_dir, DIMS
 
+# path to DIMS excel file
+path_dims_file <- paste0(run_name, ".xlsx")
+
 # Load the excel file.
 dims_xls <- readWorkbook(xlsxFile = path_dims_file, sheet = 1, startRow = header_row)
 if (exists("dims_xls")) {
@@ -98,11 +118,11 @@ cat(paste0("\n\n------------\n", nr_contr, " controls \n", nr_pat, " patients\n-
 
 # Move the columns HMDB_code and HMDB_name to the beginning.
 hmdb_info_cols <- c(which(colnames(dims_xls) == "HMDB_code"), which(colnames(dims_xls) == "HMDB_name"))
-other_cols     <- seq_along(1:ncol(dims_xls))[-hmdb_info_cols]
-dims_xls_copy  <- dims_xls[, c(hmdb_info_cols, other_cols)]
+other_cols <- seq_along(1:ncol(dims_xls))[-hmdb_info_cols]
+dims_xls_copy <- dims_xls[, c(hmdb_info_cols, other_cols)]
 # Remove the columns from 'name' to 'pathway'
-from_col      <- which(colnames(dims_xls_copy) == "name")
-to_col        <- which(colnames(dims_xls_copy) == "pathway")
+from_col <- which(colnames(dims_xls_copy) == "name")
+to_col <- which(colnames(dims_xls_copy) == "pathway")
 dims_xls_copy <- dims_xls_copy[, -c(from_col:to_col)]
 # in case the excel had an empty "plots" column, remove it
 if ("plots" %in% colnames(dims_xls_copy)) {
@@ -146,18 +166,19 @@ if (ratios == 1) {
     ncol = ncol(dims_xls_copy),
     nrow = nrow(ratio_input)
   )), colnames(dims_xls_copy))
+  ratio_list <- as.data.frame(ratio_list)
 
   # put HMDB info into first two columns of ratio_list
   ratio_list[, 1:2] <- ratio_input[, 1:2]
 
   # look for intensity columns (exclude Zscore columns)
-  control_cols   <- grep("C", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")])
-  patient_cols   <- grep("P", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")])
+  control_cols <- grep("C", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")])
+  patient_cols <- grep("P", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")])
   intensity_cols <- c(control_cols, patient_cols)
   # calculate each of the ratios of intensities
   for (ratio_index in 1:nrow(ratio_input)) {
-    ratio_numerator   <- ratio_input[ratio_index, "HMDB_numerator"]
-    ratio_numerator   <- strsplit(ratio_numerator, "plus")[[1]]
+    ratio_numerator <- ratio_input[ratio_index, "HMDB_numerator"]
+    ratio_numerator <- strsplit(ratio_numerator, "plus")[[1]]
     ratio_denominator <- ratio_input[ratio_index, "HMDB_denominator"]
     ratio_denominator <- strsplit(ratio_denominator, "plus")[[1]]
     # find these HMDB IDs in dataset. Could be a sum of multiple metabolites
@@ -219,13 +240,13 @@ if (ratios == 1) {
 }
 
 #### STEP 4: Run the IEM algorithm #########
-# in: algorithm, file_expected_biomarkers_IEM, zscore_patients ||| out: prob_score (+file)
+# in: algorithm, file_expected_biomarkers_iem, zscore_patients ||| out: prob_score (+file)
 # algorithm taken from DOI: 10.3390/ijms21030979
 
 if (algorithm == 1) {
   # Load data
-  cat(paste0("\nloading expected file:\n ->  ", file_expected_biomarkers_IEM, "\n"))
-  expected_biomarkers <- read.csv(file_expected_biomarkers_IEM, sep = ";", stringsAsFactors = FALSE)
+  cat(paste0("\nloading expected file:\n ->  ", file_expected_biomarkers_iem, "\n"))
+  expected_biomarkers <- read.csv(file_expected_biomarkers_iem, sep = ";", stringsAsFactors = FALSE)
   # modify column names
   names(expected_biomarkers) <- gsub("HMDB.code", "HMDB_code",  names(expected_biomarkers))
   names(expected_biomarkers) <- gsub("Metabolite", "HMDB_name", names(expected_biomarkers))
@@ -241,7 +262,8 @@ if (algorithm == 1) {
     # Rank all positive zscores highest to lowest
     rank_patients[1:pos, patient_index] <- as.numeric(ordered(-rank_patients[1:pos, patient_index]))
     # Rank all negative zscores lowest to highest
-    rank_patients[(pos + 1):nrow(rank_patients), patient_index] <- as.numeric(ordered(rank_patients[(pos + 1):nrow(rank_patients), patient_index]))
+    rank_patients[(pos + 1):nrow(rank_patients), patient_index] <- as.numeric(ordered(rank_patients[(pos + 1):
+										      nrow(rank_patients), patient_index]))
   }
 
   # Calculate metabolite score, using the dataframes with only values, and later add the cols without values (1&2).
@@ -253,13 +275,16 @@ if (algorithm == 1) {
   select_info_cols <- 1:(min(select_zscore_cols) - 1)
   # set some zscores to zero
   select_incr_indisp <- which(expected_zscores$Change == "Increase" & expected_zscores$Dispensability == "Indispensable")
-  expected_zscores[select_incr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_incr_indisp, select_zscore_cols], function(x) ifelse (x <= 1.6, 0, x))
+  expected_zscores[select_incr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_incr_indisp, 
+								     select_zscore_cols], function(x) ifelse (x <= 1.6, 0, x))
   select_decr_indisp <- which(expected_zscores$Change == "Decrease" & expected_zscores$Dispensability == "Indispensable")
-  expected_zscores[select_decr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_decr_indisp, select_zscore_cols], function(x) ifelse (x >= -1.2, 0, x))
+  expected_zscores[select_decr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_decr_indisp, 
+								     select_zscore_cols], function(x) ifelse (x >= -1.2, 0, x))
 
   # calculate rank score:
   expected_ranks <- merge(x = expected_biomarkers, y = rank_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code"))
-  rank_scores <- expected_zscores[order(expected_zscores$HMDB_code), select_zscore_cols] / (expected_ranks[order(expected_ranks$HMDB_code), select_zscore_cols] * 0.9)
+  rank_scores <- expected_zscores[order(expected_zscores$HMDB_code), select_zscore_cols] / 
+	         (expected_ranks[order(expected_ranks$HMDB_code), select_zscore_cols] * 0.9)
   # combine disease info with rank scores
   expected_metabscore <- cbind(expected_ranks[order(expected_zscores$HMDB_code), select_info_cols], rank_scores)
 
@@ -292,7 +317,8 @@ if (algorithm == 1) {
   # determine disease rank per patient
   disease_rank <- prob_score
   # rank diseases in decreasing order
-  disease_rank[2:ncol(disease_rank)] <- lapply(2:ncol(disease_rank), function(x) as.numeric(ordered(-disease_rank[1:nrow(disease_rank), x])))
+  disease_rank[2:ncol(disease_rank)] <- lapply(2:ncol(disease_rank), function(x) 
+					       as.numeric(ordered(-disease_rank[1:nrow(disease_rank), x])))
   # modify column names, Zscores have now been converted to probability scores
   colnames(prob_score) <- gsub("_Zscore", "_prob_score", colnames(prob_score))
   colnames(disease_rank) <- gsub("_Zscore", "", colnames(disease_rank))
@@ -301,8 +327,9 @@ if (algorithm == 1) {
   wb <- createWorkbook()
   addWorksheet(wb, "Probability Scores")
   writeData(wb, "Probability Scores", prob_score)
-  conditionalFormatting(wb, "Probability Scores", cols = 2:ncol(prob_score), rows = 1:nrow(prob_score), type = "colourScale", style = c("white", "#FFFDA2", "red"), rule = c(1, 10, 100))
-  saveWorkbook(wb, file = paste0(output_dir, "/algoritme_output_", run_name, ".xlsx"), overwrite = TRUE)
+  conditionalFormatting(wb, "Probability Scores", cols = 2:ncol(prob_score), rows = 1:nrow(prob_score), 
+			type = "colourScale", style = c("white", "#FFFDA2", "red"), rule = c(1, 10, 100))
+  saveWorkbook(wb, file = paste0(output_dir, "/dIEM_algoritme_output_", run_name, ".xlsx"), overwrite = TRUE)
   # check whether prob_score df exists and has expected dimensions.
   if (exists("expected_biomarkers") & (length(disease_rank) == length(prob_score))) {
     cat("\n### Step 4 # Running the IEM algorithm is done.\n\n")
@@ -314,9 +341,10 @@ if (algorithm == 1) {
 }
 
 #### STEP 5: Make violin plots #####
-# in: algorithm / zscore_patients, violin, nr_contr, nr_pat, Data, path_textfiles, zscore_cutoff, xaxis_cutoff, top_diseases, top_metab, output_dir ||| out: pdf file
+# in: algorithm / zscore_patients, violin, nr_contr, nr_pat, Data, path_textfiles, zscore_cutoff, xaxis_cutoff, 
+# top_diseases, top_metab, output_dir ||| out: pdf file
 
-if (violin == 1) { # make violin plots
+if (violin == 1) {
 
   # preparation
   zscore_patients_copy <- zscore_patients
@@ -351,7 +379,8 @@ if (violin == 1) { # make violin plots
     cat("making plots in category:", metabolite_dir, "\n")
 
     # get a list of all metabolite files
-    metabolite_files <- list.files(path = paste(path_metabolite_groups, metabolite_dir, sep = "/"), pattern = "*.txt", full.names = FALSE, recursive = FALSE)
+    metabolite_files <- list.files(path = paste(path_metabolite_groups, metabolite_dir, sep = "/"), 
+				   pattern = "*.txt", full.names = FALSE, recursive = FALSE)
     # put all metabolites into one list
     metab_list_all <- list()
     metab_list_names <- c()
@@ -359,7 +388,8 @@ if (violin == 1) { # make violin plots
     # open the text files and add each to a list of dataframes (metab_list_all)
     for (file_index in seq_along(metabolite_files)) {
       infile <- metabolite_files[file_index]
-      metab_list <- read.table(paste(path_metabolite_groups, metabolite_dir, infile, sep = "/"), sep = "\t", header = TRUE, quote = "")
+      metab_list <- read.table(paste(path_metabolite_groups, metabolite_dir, infile, sep = "/"), 
+			       sep = "\t", header = TRUE, quote = "")
       # put into list of all lists
       metab_list_all[[file_index]] <- metab_list
       metab_list_names <- c(metab_list_names, strsplit(infile, ".txt")[[1]][1])
@@ -392,8 +422,8 @@ if (violin == 1) { # make violin plots
   }
 
   # Second step: dIEM plots in separate directory
-  dIEM_plot_dir <- paste(output_dir, "dIEM_plots", sep = "/")
-  dir.create(dIEM_plot_dir)
+  diem_plot_dir <- paste(output_dir, "dIEM_plots", sep = "/")
+  dir.create(diem_plot_dir)
 
   # Select the metabolites that are associated with the top highest scoring IEM, for each patient
   # disease_rank is from step 4: the dIEM algorithm. The lower the value, the more likely.
@@ -401,54 +431,54 @@ if (violin == 1) { # make violin plots
     pt_name <- patient_list[pt_nr]
     # get top diseases for this patient
     pt_colnr <- which(colnames(disease_rank) == pt_name)
-    pt_top_indices <- which(disease_rank[, pt_colnr] <= top_nr_IEM)
-    pt_IEMs <- disease_rank[pt_top_indices, "Disease"]
-    pt_top_IEMs <- pt_prob_score_top_IEMs <- c()
-    for (single_IEM in pt_IEMs) {
+    pt_top_indices <- which(disease_rank[, pt_colnr] <= top_nr_iem)
+    pt_iems <- disease_rank[pt_top_indices, "Disease"]
+    pt_top_iems <- pt_prob_score_top_iems <- c()
+    for (single_iem in pt_iems) {
       # get the probability score
-      prob_score_IEM <- prob_score[which(prob_score$Disease == single_IEM), pt_colnr]
+      prob_score_iem <- prob_score[which(prob_score$Disease == single_iem), pt_colnr]
       # use only diseases for which probability score is above threshold
-      if (prob_score_IEM >= threshold_IEM) {
-        pt_top_IEMs <- c(pt_top_IEMs, single_IEM)
-        pt_prob_score_top_IEMs <- c(pt_prob_score_top_IEMs, prob_score_IEM)
+      if (prob_score_iem >= threshold_iem) {
+        pt_top_iems <- c(pt_top_iems, single_iem)
+        pt_prob_score_top_iems <- c(pt_prob_score_top_iems, prob_score_iem)
       }
     }
 
     # prepare data for plotting dIEM violin plots
-    # If prob_score_top_IEM is an empty list, don't make a plot
-    if (length(pt_top_IEMs) > 0) {
-      # Sorting from high to low, both prob_score_top_IEMs and pt_top_IEMs.
-      pt_prob_score_order <- order(-pt_prob_score_top_IEMs)
-      pt_prob_score_top_IEMs <- round(pt_prob_score_top_IEMs, 1)
-      pt_prob_score_top_IEM_sorted <- pt_prob_score_top_IEMs[pt_prob_score_order]
-      pt_top_IEM_sorted <- pt_top_IEMs[pt_prob_score_order]
-      # getting metabolites for each top_IEM disease exactly like in metab_list_all
-      metab_IEM_all <- list()
-      metab_IEM_names <- c()
-      for (single_IEM_index in 1:length(pt_top_IEM_sorted)) {
-        single_IEM <- pt_top_IEM_sorted[single_IEM_index]
-        single_prob_score <- pt_prob_score_top_IEM_sorted[single_IEM_index]
-        select_rows <- which(expected_biomarkers_select$Disease == single_IEM)
+    # If prob_score_top_iem is an empty list, don't make a plot
+    if (length(pt_top_iems) > 0) {
+      # Sorting from high to low, both prob_score_top_iems and pt_top_iems.
+      pt_prob_score_order <- order(-pt_prob_score_top_iems)
+      pt_prob_score_top_iems <- round(pt_prob_score_top_iems, 1)
+      pt_prob_score_top_iem_sorted <- pt_prob_score_top_iems[pt_prob_score_order]
+      pt_top_iem_sorted <- pt_top_iems[pt_prob_score_order]
+      # getting metabolites for each top_iem disease exactly like in metab_list_all
+      metab_iem_all <- list()
+      metab_iem_names <- c()
+      for (single_iem_index in 1:length(pt_top_iem_sorted)) {
+        single_iem <- pt_top_iem_sorted[single_iem_index]
+        single_prob_score <- pt_prob_score_top_iem_sorted[single_iem_index]
+        select_rows <- which(expected_biomarkers_select$Disease == single_iem)
         metab_list <- expected_biomarkers_select[select_rows, ]
-        metab_IEM_names <- c(metab_IEM_names, paste0(single_IEM, ", probability score ", single_prob_score))
+        metab_iem_names <- c(metab_iem_names, paste0(single_iem, ", probability score ", single_prob_score))
         metab_list <- metab_list[, -1]
-        metab_IEM_all[[single_IEM_index]] <- metab_list
+        metab_iem_all[[single_iem_index]] <- metab_list
       }
       # put all metabolites into one list
-      names(metab_IEM_all) <- metab_IEM_names
+      names(metab_iem_all) <- metab_iem_names
 
       # get Zscore information from zscore_patients_copy, similar to normal violin plots
-      metab_IEM_sorted <- prepare_data(metab_IEM_all, zscore_patients_copy)
-      metab_IEM_controls <- prepare_data(metab_IEM_all, zscore_controls)
+      metab_iem_sorted <- prepare_data(metab_iem_all, zscore_patients_copy)
+      metab_iem_controls <- prepare_data(metab_iem_all, zscore_controls)
       # make sure every page has 20 metabolites
-      dIEM_metab_perpage <- prepare_data_perpage(metab_IEM_sorted, metab_IEM_controls, nr_plots_perpage, nr_pat)
+      diem_metab_perpage <- prepare_data_perpage(metab_iem_sorted, metab_iem_controls, nr_plots_perpage, nr_pat)
 
       # generate dIEM violin plots
-      create_violin_plots(dIEM_plot_dir, pt_name, dIEM_metab_perpage, top_metab_pt)
+      create_violin_plots(diem_plot_dir, pt_name, diem_metab_perpage, top_metab_pt)
 
     } else {
-      cat(paste0("\n\n**** This patient had no prob_scores higher than ", threshold_IEM, ".
-                   Therefore, this pdf was not made:\t ", pt_name, "_IEM \n"))
+      cat(paste0("\n\n**** This patient had no prob_scores higher than ", threshold_iem, ".
+                   Therefore, this pdf was not made:\t ", pt_name, "_iem \n"))
     }
 
   }
diff --git a/DIMS/GenerateViolinPlots.nf b/DIMS/GenerateViolinPlots.nf
index f002341..27bc20b 100644
--- a/DIMS/GenerateViolinPlots.nf
+++ b/DIMS/GenerateViolinPlots.nf
@@ -5,15 +5,22 @@ process GenerateViolinPlots {
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       path(excel_file) // input files need to be linked, but called within R script
+       path(excel_file)
        val(analysis_id)
 
     output:
-       // path('*.pdf') // pdf files are generated, but in different directory
-       path('*.xlsx')
+       path('Diagnostics/*.pdf'), emit: diag_plot_files
+       path('Other/*.pdf'), emit: other_plot_files
+       path('dIEM/*.pdf'), emit: diem_plot_files
+       path('*.xlsx'), emit: excel_file
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/GenerateViolinPlots.R $analysis_id $params.scripts_dir $params.zscore
+        Rscript ${baseDir}/CustomModules/DIMS/GenerateViolinPlots.R $analysis_id $params.scripts_dir $params.zscore \
+                                                                    $params.path_metabolite_groups \
+                                                                    $params.file_ratios_metabolites \
+                                                                    $params.file_expected_biomarkers_IEM \
+                                                                    $params.file_explanation \
+                                                                    $params.file_isomers
         """
 }
diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R
index 7441959..4cb343f 100644
--- a/DIMS/HMDBparts.R
+++ b/DIMS/HMDBparts.R
@@ -4,18 +4,16 @@
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-db_path <- cmd_args[1] 
-breaks_filepath <- cmd_args[2] 
-standard_run  <- cmd_args[4] 
+db_file <- cmd_args[1] 
+breaks_file <- cmd_args[2] 
+standard_run <- cmd_args[4] 
 
-# Cut up entire HMDB into small parts based on the new binning/breaks 
-load(breaks_filepath)
-
-# In case of a standard run (m/z 69-606) use external HMDB parts
+# load file with binning breaks 
+load(breaks_file)
 min_mz <- round(breaks_fwhm[1])
 max_mz <- round(breaks_fwhm[length(breaks_fwhm)])
 
-# test if standard mz range is used
+# In case of a standard run (m/z 69-606) use external HMDB parts
 if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < 610) {
   # skip generating HMDB parts
   hmdb_parts_path <- cmd_args[3] 
@@ -26,7 +24,7 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz <
   }
 } else { 
   # generate HMDB parts in case of non-standard mz range
-  load(db_path)
+  load(db_file)
   ppm <- as.numeric(cmd_args[5])
 
   scanmodes <- c("positive", "negative")
@@ -39,79 +37,74 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz <
       HMDB_add_iso <- HMDB_add_iso.Pos
     }
 
-    # filter mass range meassured!!!
+    # filter mass range meassured
     HMDB_add_iso = HMDB_add_iso[which(HMDB_add_iso[ , column_label] >= breaks_fwhm[1] &
                                       HMDB_add_iso[ , column_label] <= breaks_fwhm[length(breaks_fwhm)]), ]
 
     # sort on mass
     outlist <- HMDB_add_iso[order(as.numeric(HMDB_add_iso[ , column_label])),]
+    nr_rows <- dim(outlist)[1]
 
-    nr_rows      <- dim(outlist)[1]
     # maximum number of rows per file
-    sub          <- 20000 
-    end          <- 0
-    last_line    <- sub
-    check        <- 0
+    sub <- 20000 
+    end <- 0
+    last_line <- sub
+    check <- 0
     outlist_part <- NULL
 
-
+    # create parts and save to file
     if (nr_rows < sub) {
       outlist_part <- outlist
-      save(outlist_part, file = paste0("./", scanmode, "_hmdb.1.RData"))
-    } else {
-
-        if (nr_rows >= sub & (floor(nr_rows / sub) - 1) >= 2) {
-          for (i in 2:floor(nr_rows / sub) - 1) {
-            start <- -(sub - 1) + i * sub
-            end <- i * sub
-
-            if (i > 1){
-              outlist_i = outlist[c(start:end),]
-
-              nr_moved = 0
-
-              # Use ppm to replace border to avoid cut within peakgroup!
-              while ((as.numeric(outlist_i[1, column_label]) - as.numeric(outlist_part[last_line, column_label])) * 1e+06 / 
-                      as.numeric(outlist_i[1, column_label]) < ppm) {
-                outlist_part <- rbind(outlist_part, outlist_i[1, ])
-                outlist_i <- outlist_i[-1, ]
-                nr_moved <- nr_moved + 1
-              }
-
-              save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i-1, "RData", sep = "."), sep = ""))
-              check <- check + dim(outlist_part)[1]
-
-              outlist_part <- outlist_i
-              last_line <- dim(outlist_part)[1]
-
-            } else {
-              outlist_part <- outlist[c(start:end),]
-            }
-          }
-        }
-
-        start <- end + 1
-        end <- nr_rows
-        outlist_i <- outlist[c(start:end),]
-        nr_moved <- 0
-
-        if (!is.null(outlist_part)) {
-          # Calculate ppm and replace border, avoid cut within peakgroup!
+      save(outlist_part, file = paste0(scanmode, "_hmdb.1.RData"))
+    } else if (nr_rows >= sub & (floor(nr_rows / sub) - 1) >= 2) {
+      for (i in 2:floor(nr_rows / sub) - 1) {
+        start <- -(sub - 1) + i * sub
+        end <- i * sub
+
+        if (i > 1){
+          outlist_i = outlist[c(start:end),]
+          nr_moved = 0
+          # Use ppm to replace border to avoid cut within peak group
           while ((as.numeric(outlist_i[1, column_label]) - as.numeric(outlist_part[last_line, column_label])) * 1e+06 / 
                   as.numeric(outlist_i[1, column_label]) < ppm) {
-            outlist_part <- rbind(outlist_part, outlist_i[1,])
-            outlist_i <- outlist_i[-1,]
+            outlist_part <- rbind(outlist_part, outlist_i[1, ])
+            outlist_i <- outlist_i[-1, ]
             nr_moved <- nr_moved + 1
           }
 
-          save(outlist_part, file = paste0("./", scanmode, "_hmdb_", i, ".RData"))
+          save(outlist_part, file = paste(scanmode, "_", paste("hmdb", i-1, "RData", sep = "."), sep = ""))
           check <- check + dim(outlist_part)[1]
+
+          outlist_part <- outlist_i
+          last_line <- dim(outlist_part)[1]
+
+        } else {
+          outlist_part <- outlist[c(start:end),]
+        }
+      }
+
+      start <- end + 1
+      end <- nr_rows
+      outlist_i <- outlist[c(start:end), ]
+      nr_moved <- 0
+
+      if (!is.null(outlist_part)) {
+        # Calculate ppm and replace border, avoid cut within peak group
+        while ((as.numeric(outlist_i[1, column_label]) - as.numeric(outlist_part[last_line, column_label])) * 1e+06 / 
+                as.numeric(outlist_i[1, column_label]) < ppm) {
+          outlist_part <- rbind(outlist_part, outlist_i[1, ])
+          outlist_i <- outlist_i[-1, ]
+          nr_moved <- nr_moved + 1
         }
 
-        outlist_part <- outlist_i
-        save(outlist_part, file = paste0("./", scanmode, "_hmdb_", i + 1, ".RData"))
+        save(outlist_part, file = paste0(scanmode, "_hmdb_", i, ".RData"))
         check <- check + dim(outlist_part)[1]
       }
+
+      outlist_part <- outlist_i
+      save(outlist_part, file = paste0(scanmode, "_hmdb_", i + 1, ".RData"))
+      check <- check + dim(outlist_part)[1]
     }
+  }
 }
 
diff --git a/DIMS/HMDBparts.nf b/DIMS/HMDBparts.nf
index 50786af..5f19f75 100644
--- a/DIMS/HMDBparts.nf
+++ b/DIMS/HMDBparts.nf
@@ -1,6 +1,5 @@
 process HMDBparts {
     tag "DIMS HMDBparts"
-    // Custom process to cut HMDB db into parts
     label 'HMDBparts'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
@@ -13,7 +12,6 @@ process HMDBparts {
         path('*.RData')
 
     script:
-
         """
         Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file $params.hmdb_parts_files $params.standard_run $params.ppm
         """
diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R
index ff468c7..0486b03 100644
--- a/DIMS/HMDBparts_main.R
+++ b/DIMS/HMDBparts_main.R
@@ -4,11 +4,11 @@
 # define parameters 
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-db_path <- cmd_args[1]
-breaks_filepath <- cmd_args[2]
+db_file <- cmd_args[1]
+breaks_file <- cmd_args[2]
 
-load(db_path)
-load(breaks_filepath)
+load(db_file)
+load(breaks_file)
 
 # Cut up HMDB minus adducts minus isotopes into small parts 
 scanmodes <- c("positive", "negative")
@@ -26,14 +26,14 @@ for (scanmode in scanmodes) {
              HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ]
 
   # remove adducts and isotopes, put internal standard at the beginning
-  outlist_IS <- outlist[grep("IS", outlist[ , "CompoundName"], fixed = TRUE), ]
+  outlist_is <- outlist[grep("IS", outlist[ , "CompoundName"], fixed = TRUE), ]
   outlist <- outlist[grep("HMDB", rownames(outlist), fixed = TRUE), ]
   outlist <- outlist[-grep("_", rownames(outlist), fixed = TRUE), ]
-  outlist <- rbind(outlist_IS, outlist)
+  outlist <- rbind(outlist_is, outlist)
   # sort on m/z value
   outlist <- outlist[order(outlist[ , column_label]), ]
-
   nr_rows <- dim(outlist)[1]
+
   # size of hmdb parts in lines:
   sub <- 1000
   end <- 0
@@ -42,7 +42,7 @@ for (scanmode in scanmodes) {
   # generate hmdb parts
   if (nr_rows >= sub & (floor(nr_rows / sub)) >= 2) {
     for (i in 1:floor(nr_rows / sub)) {
-      start <- -(sub-1) + i * sub
+      start <- -(sub - 1) + i * sub
       end <- i * sub
       outlist_part <- outlist[c(start:end), ]
       save(outlist_part, file=paste0(scanmode, "_hmdb_main.", i, ".RData"))
@@ -54,5 +54,5 @@ for (scanmode in scanmodes) {
 start <- end + 1
 end <- nr_rows
 
-outlist_part <- outlist[c(start:end),]
-save(outlist_part, file = paste0(scanmode, "_hmdb_main.", i+1, ".RData"))
+outlist_part <- outlist[c(start:end), ]
+save(outlist_part, file = paste0(scanmode, "_hmdb_main.", i + 1, ".RData"))
diff --git a/DIMS/HMDBparts_main.nf b/DIMS/HMDBparts_main.nf
index 3f9845b..b38bac0 100644
--- a/DIMS/HMDBparts_main.nf
+++ b/DIMS/HMDBparts_main.nf
@@ -1,6 +1,5 @@
 process HMDBparts_main {
     tag "DIMS HMDBparts_main"
-    // Custom process to cut HMDB db into parts for main entry only, no adducts, no isotopes
     label 'HMDBparts_main'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R
index 7d8dd1d..b01a31b 100644
--- a/DIMS/MakeInit.R
+++ b/DIMS/MakeInit.R
@@ -1,16 +1,19 @@
 #!/usr/bin/env Rscript
 ## adapted from makeInit in old pipeline
 
-args          <- commandArgs(trailingOnly = TRUE)
-sample_sheet  <- read.csv(args[1], sep = "\t")
+# define parameters
+args <- commandArgs(trailingOnly = TRUE)
+
+sample_sheet <- read.csv(args[1], sep = "\t")
 nr_replicates <- as.numeric(args[2])
 
-sample_names       <- trimws(as.vector(unlist(sample_sheet[1])))
-nr_sample_groups   <- length(sample_names) / nr_replicates
-group_names        <- trimws(as.vector(unlist(sample_sheet[2])))
-group_names        <- gsub("[^-.[:alnum:]]", "_", group_names)
+sample_names <- trimws(as.vector(unlist(sample_sheet[1])))
+nr_sample_groups <- length(sample_names) / nr_replicates
+group_names <- trimws(as.vector(unlist(sample_sheet[2])))
+group_names <- gsub("[^-.[:alnum:]]", "_", group_names)
 group_names_unique <- unique(group_names)
 
+# generate the replication pattern
 repl_pattern <- c()
 for (sample_group in 1:nr_sample_groups) {
   tmp <- c()
@@ -26,4 +29,4 @@ names(repl_pattern) <- group_names_unique
 # preview the replication pattern
 print(tail(repl_pattern))
 
-save(repl_pattern, file = "./init.RData")
+save(repl_pattern, file = "init.RData")
diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf
index 48b1c9b..7aae0e4 100644
--- a/DIMS/MakeInit.nf
+++ b/DIMS/MakeInit.nf
@@ -5,7 +5,8 @@ process MakeInit {
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       tuple path(samplesheet), val(nr_replicates)
+       path(samplesheet) 
+       val(nr_replicates)
 
     output:
        path('init.RData')
diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R
index e978d4b..e2edc85 100644
--- a/DIMS/PeakFinding.R
+++ b/DIMS/PeakFinding.R
@@ -4,51 +4,63 @@
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-filepath        <- cmd_args[1]
-breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData
-resol           <- as.numeric(cmd_args[3])
-scripts_dir     <- cmd_args[4]
-thresh          <- 2000
+sample_file <- cmd_args[1]
+breaks_file <- cmd_args[2]
+resol <- as.numeric(cmd_args[3])
+scripts_dir <- cmd_args[4]
+thresh <- 2000
+outdir <- "./"
 
 # load in function scripts
-source(paste0(scripts_dir, "AddOnFunctions/findPeaks.Gauss.HPC.R"))
-source(paste0(scripts_dir, "AddOnFunctions/searchMZRange.R"))
-source(paste0(scripts_dir, "AddOnFunctions/generateGaussian.R"))
-source(paste0(scripts_dir, "AddOnFunctions/fitGaussian.R"))
-source(paste0(scripts_dir, "AddOnFunctions/fitGaussianInit.R"))
-source(paste0(scripts_dir, "AddOnFunctions/getFwhm.R"))
-source(paste0(scripts_dir, "AddOnFunctions/getSD.R"))
-source(paste0(scripts_dir, "AddOnFunctions/optimizeGauss.R"))
-source(paste0(scripts_dir, "AddOnFunctions/fit1Peak.R"))
-source(paste0(scripts_dir, "AddOnFunctions/fit2peaks.R"))
-source(paste0(scripts_dir, "AddOnFunctions/fit3peaks.R"))
-source(paste0(scripts_dir, "AddOnFunctions/fit4peaks.R"))
-source(paste0(scripts_dir, "AddOnFunctions/fitG.R"))
-source(paste0(scripts_dir, "AddOnFunctions/fit2G.R"))
-source(paste0(scripts_dir, "AddOnFunctions/fit3G.R"))
-source(paste0(scripts_dir, "AddOnFunctions/fit4G.R"))
-source(paste0(scripts_dir, "AddOnFunctions/getArea.R"))
-source(paste0(scripts_dir, "AddOnFunctions/getFitQuality.R"))
-source(paste0(scripts_dir, "AddOnFunctions/checkOverlap.R"))
-source(paste0(scripts_dir, "AddOnFunctions/sumCurves.R"))
-source(paste0(scripts_dir, "AddOnFunctions/isWithinXppm.R"))
-
-load(breaks_filepath)
+source(paste0(scripts_dir, "findPeaks.Gauss.HPC.R"))
+source(paste0(scripts_dir, "searchMZRange.R"))
+source(paste0(scripts_dir, "generateGaussian.R"))
+source(paste0(scripts_dir, "fitGaussian.R"))
+source(paste0(scripts_dir, "fitGaussianInit.R"))
+source(paste0(scripts_dir, "getFwhm.R"))
+source(paste0(scripts_dir, "getSD.R"))
+source(paste0(scripts_dir, "optimizeGauss.R"))
+source(paste0(scripts_dir, "fit1Peak.R"))
+source(paste0(scripts_dir, "fit2peaks.R"))
+source(paste0(scripts_dir, "fit3peaks.R"))
+source(paste0(scripts_dir, "fit4peaks.R"))
+source(paste0(scripts_dir, "fitG.R"))
+source(paste0(scripts_dir, "fit2G.R"))
+source(paste0(scripts_dir, "fit3G.R"))
+source(paste0(scripts_dir, "fit4G.R"))
+source(paste0(scripts_dir, "getArea.R"))
+source(paste0(scripts_dir, "getFitQuality.R"))
+source(paste0(scripts_dir, "checkOverlap.R"))
+source(paste0(scripts_dir, "sumCurves.R"))
+source(paste0(scripts_dir, "isWithinXppm.R"))
+
+load(breaks_file)
 
 # Load output of AverageTechReplicates for a sample
-sample_avgtechrepl <- get(load(filepath))
-if (grepl("_pos", filepath)) {
+sample_avgtechrepl <- get(load(sample_file))
+if (grepl("_pos", sample_file)) {
   scanmode <- "positive"
-} else if (grepl("_neg", filepath)) {
+} else if (grepl("_neg", sample_file)) {
   scanmode <- "negative"
 }
 
 # Initialize
 options(digits = 16)
 int_factor <- 1 * 10^5 # Number used to calculate area under Gaussian curve
-scale      <- 2 # Initial value used to estimate scaling parameter
-width      <- 1024
-height     <- 768
+scale <- 2 # Initial value used to estimate scaling parameter
+width <- 1024
+height <- 768
 
 # run the findPeaks function
-findPeaks.Gauss.HPC(sample_avgtechrepl, breaks.fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height)
+print(head(sample_avgtechrepl))
+print(head(breaks_fwhm))
+print(int_factor)
+print(scale)
+print(resol)
+print(outdir)
+print(scanmode)
+print(thresh)
+print(width)
+print(height)
+
+findPeaks.Gauss.HPC(sample_avgtechrepl, breaks_fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height)
diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf
index 1752a74..0097d12 100644
--- a/DIMS/PeakFinding.nf
+++ b/DIMS/PeakFinding.nf
@@ -1,17 +1,17 @@
 process PeakFinding {
-    tag "DIMS PeakFinding ${RData_file}"
+    tag "DIMS PeakFinding ${rdata_file}"
     label 'PeakFinding'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       tuple(path(RData_file), path(breaks_file))
+       tuple(path(rdata_file), path(breaks_file))
 
     output:
        path '*tive.RData'
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $RData_file $breaks_file $params.resolution $params.scripts_dir
+        Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $rdata_file $breaks_file $params.resolution $params.scripts_dir
         """
 }
diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R
index b574206..8da74d9 100644
--- a/DIMS/PeakGrouping.R
+++ b/DIMS/PeakGrouping.R
@@ -56,8 +56,8 @@ while (dim(hmdb_add_iso)[1] > 0) {
   selp <- which((mzmed > (reference_mass - mass_tolerance)) & (mzmed < (reference_mass + mass_tolerance)))
   tmplist <- outlist_copy[selp, , drop = FALSE]
   list_of_peaks_used_in_peak_groups_identified <- rbind(list_of_peaks_used_in_peak_groups_identified, tmplist)
-
   nrsamples <- length(selp)
+  # if peaks have been found, create a peak group
   if (nrsamples > 0) {
     mzmed_pgrp <- mean(as.numeric(outlist_copy[selp, "mzmed.pkt"]))
     mzmin_pgrp <- reference_mass - mass_tolerance
@@ -81,7 +81,7 @@ while (dim(hmdb_add_iso)[1] > 0) {
     assi_hmdb <- iso_hmdb <- hmdb_code <- NA
     tmplist_mass_iso <- tmplist_mass_adduct <- NULL
 
-    # Identification: find all entries in HMDB part with mass within ppm range
+    # find all entries in HMDB part with mass within ppm range
     mass_all <- as.numeric(hmdb_add_iso[, column_label])
     index <- which((mass_all > (reference_mass - mass_tolerance)) & (mass_all < (reference_mass + mass_tolerance)))
     tmplist_mass <- hmdb_add_iso[index, , drop = FALSE]
@@ -103,24 +103,30 @@ while (dim(hmdb_add_iso)[1] > 0) {
         }
       }
 
-      # Compose a list compounds, adducts or isotopes with corresponding m/z
+      # Compose a list of compounds, adducts or isotopes with corresponding m/z
       if (dim(tmplist_mass)[1] > 0) {
         # metabolites
-        assi_hmdb <- as.character(paste(as.character(tmplist_mass[, "CompoundName"]), collapse = ";"))
-        hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass)), collapse = ";"))
+        assi_hmdb <- as.character(paste(as.character(tmplist_mass[, "CompoundName"]), 
+					collapse = ";"))
+        hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass)), 
+					collapse = ";"))
         theormz_hmdb <- as.numeric(tmplist_mass[1, column_label])
 
         # adducts of metabolites
         if (!is.null(tmplist_mass_adduct)) {
           if (dim(tmplist_mass_adduct)[1] > 0) {
             if (is.na(assi_hmdb)) {
-              assi_hmdb <- as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), collapse = ";"))
-              hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass_adduct)), collapse = ";"))
+              assi_hmdb <- as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), 
+					      collapse = ";"))
+              hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass_adduct)), 
+					      collapse = ";"))
             } else {
               assi_hmdb <- paste(assi_hmdb,
-                                 as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), collapse = ";")), sep = ";")
+                                 as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), 
+						    collapse = ";")), sep = ";")
               hmdb_code <- paste(hmdb_code,
-                                 as.character(paste(as.character(rownames(tmplist_mass_adduct)), collapse = ";")), sep = ";")
+                                 as.character(paste(as.character(rownames(tmplist_mass_adduct)), 
+						    collapse = ";")), sep = ";")
             }
           }
         }
@@ -128,7 +134,8 @@ while (dim(hmdb_add_iso)[1] > 0) {
         # isotopes of metabolites
         if (!is.null(tmplist_mass_iso)) {
           if (dim(tmplist_mass_iso)[1] > 0) {
-            iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), collapse = ";"))
+            iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), 
+					   collapse = ";"))
           }
         }
 
@@ -140,13 +147,17 @@ while (dim(hmdb_add_iso)[1] > 0) {
         if (!is.null(tmplist_mass_adduct)) {
           if (dim(tmplist_mass_adduct)[1] > 0) {
             if (is.na(assi_hmdb)) {
-              assi_hmdb <- as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), collapse = ";"))
-              hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass_adduct)), collapse = ";"))
+              assi_hmdb <- as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), 
+					      collapse = ";"))
+              hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass_adduct)), 
+					      collapse = ";"))
             } else {
               assi_hmdb <- paste(assi_hmdb,
-                                 as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), collapse = ";")), sep = ";")
+                                 as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), 
+						    collapse = ";")), sep = ";")
               hmdb_code <- paste(hmdb_code,
-                                 as.character(paste(as.character(rownames(tmplist_mass_adduct)), collapse = ";")), sep = ";")
+                                 as.character(paste(as.character(rownames(tmplist_mass_adduct)), 
+						    collapse = ";")), sep = ";")
             }
           }
         }
@@ -154,7 +165,8 @@ while (dim(hmdb_add_iso)[1] > 0) {
         # isotopes of metabolites
         if (!is.null(tmplist_mass_iso)) {
           if (dim(tmplist_mass_iso)[1] > 0) {
-            iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), collapse = ";"))
+            iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), 
+					   collapse = ";"))
           }
         }
 
@@ -162,7 +174,8 @@ while (dim(hmdb_add_iso)[1] > 0) {
       } else if (!is.null(tmplist_mass_iso)) {
         if (dim(tmplist_mass_iso)[1] > 0) {
           theormz_hmdb <- as.numeric(tmplist_mass_iso[1, column_label])
-          iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), collapse = ";"))
+          iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), 
+					 collapse = ";"))
         }
       }
     }
@@ -180,7 +193,6 @@ while (dim(hmdb_add_iso)[1] > 0) {
   hmdb_add_iso <- hmdb_add_iso[-index, ]
 }
 
-
 # save peak list corresponding to masses in HMDB part
 save(list_of_peaks_used_in_peak_groups_identified, file = paste0(batch_number, "_", scanmode, "_peaks_used.RData"))
 # save peak group list, identified part
diff --git a/DIMS/PeakGrouping.nf b/DIMS/PeakGrouping.nf
index 5c48cde..ae4382c 100644
--- a/DIMS/PeakGrouping.nf
+++ b/DIMS/PeakGrouping.nf
@@ -1,13 +1,13 @@
 process PeakGrouping {
-    tag "DIMS PeakGrouping ${HMDBpart_file}"
+    tag "DIMS PeakGrouping ${hmdbpart_file}"
     label 'PeakGrouping'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       path(HMDBpart_file)
-       each path(SpectrumPeak_file) // input files need to be linked, but called within R script
-       each path(pattern_file)      // Execute the process for each element in the input collection (HMDBpart_file)
+       path(hmdbpart_file)
+       each path(spectrumpeak_file)
+       each path(pattern_file)
 
     output:
        path '*_peaks_used.RData', emit: peaks_used
@@ -15,6 +15,6 @@ process PeakGrouping {
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/PeakGrouping.R $HMDBpart_file $params.ppm
+        Rscript ${baseDir}/CustomModules/DIMS/PeakGrouping.R $hmdbpart_file $params.ppm
         """
 }
diff --git a/DIMS/SpectrumPeakFinding.R b/DIMS/SpectrumPeakFinding.R
index 5c77b12..cf6665c 100644
--- a/DIMS/SpectrumPeakFinding.R
+++ b/DIMS/SpectrumPeakFinding.R
@@ -4,7 +4,7 @@
 # define parameters
 scanmodes <- c("positive", "negative")
 
-# Check whether all jobs terminated correct!
+# Check whether all jobs terminated correctly
 not_run <- NULL
 
 # collect spectrum peaks for each scanmode
@@ -13,10 +13,10 @@ for (scanmode in scanmodes) {
   peaklist_files <- list.files(pattern = paste0("_", scanmode, ".RData"))
 
   # get sample names
-  load(paste0("./", scanmode, "_repl_pattern", ".RData"))
+  load(paste0(scanmode, "_repl_pattern.RData"))
   group_names <- names(repl_pattern_filtered)
   for (sample_nr in 1:length(group_names)) {
-    group <- paste0("./", group_names[sample_nr], "_", scanmode, ".RData")
+    group <- paste0(group_names[sample_nr], "_", scanmode, ".RData")
     if (!(group %in% peaklist_files)) {
       not_run <- c(not_run, group)
     }
diff --git a/DIMS/SpectrumPeakFinding.nf b/DIMS/SpectrumPeakFinding.nf
index 775cd2c..3590265 100644
--- a/DIMS/SpectrumPeakFinding.nf
+++ b/DIMS/SpectrumPeakFinding.nf
@@ -5,8 +5,8 @@ process SpectrumPeakFinding {
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       path(RData_file) // input files need to be linked, but called within R script
-       path(replication_pattern) // input files need to be linked, but called within R script
+       path(rdata_files)
+       path(replication_pattern)
 
     output:
        path 'SpectrumPeaks_*.RData'
diff --git a/DIMS/SumAdducts.R b/DIMS/SumAdducts.R
index db74623..870a914 100755
--- a/DIMS/SumAdducts.R
+++ b/DIMS/SumAdducts.R
@@ -7,36 +7,13 @@ cmd_args <- commandArgs(trailingOnly = TRUE)
 hmdbpart_main_file <- cmd_args[1]
 scripts_dir <- cmd_args[2]
 z_score <- as.numeric(cmd_args[3])
-# NB: scripts_dir not used yet, but function SumAdducts needs to be placed in AddOnFunctions folder
-
-if (grepl("positive_hmdb", hmdbpart_main_file)) {
-  scanmode <- "positive"
-  # for the adduct sum: include adducts M+Na (1) and M+K (2)
-  adducts <- c(1, 2)
-} else if (grepl("negative_hmdb", hmdbpart_main_file)) {
-  scanmode <- "negative"
-  # for the adduct sum: include adduct M+Cl (1)
-  adducts <- c(1)
-}
-
-# load input files
-collect_file <- paste0("outlist_identified_", scanmode, ".RData")
-load(collect_file)
-repl_file <- paste0(scanmode, "_repl_pattern.RData")
-load(repl_file)
-outlist_part <- get(load(hmdbpart_main_file))
-
-# get the number from the file name
-batch_number <- strsplit(basename(hmdbpart_main_file), ".", fixed = TRUE)[[1]][2]
-
-outlist_total <- unique(outlist_ident)
 
 sum_adducts <- function(peaklist, theor_mz, grpnames_long, adducts, batch_number, scanmode, outdir, z_score) {
   hmdb_codes <- rownames(theor_mz)
   hmdb_names <- theor_mz[, 1, drop = FALSE]
   hmdb_names[] <- lapply(hmdb_names, as.character)
 
-  # remove isotopes!!!
+  # remove isotopes
   index <- grep("HMDB", hmdb_codes, fixed = TRUE)
   hmdb_codes <- hmdb_codes[index]
   hmdb_names <- hmdb_names[index, ]
@@ -86,4 +63,26 @@ sum_adducts <- function(peaklist, theor_mz, grpnames_long, adducts, batch_number
   }
 }
 
+if (grepl("positive_hmdb", hmdbpart_main_file)) {
+  scanmode <- "positive"
+  # for the adduct sum: include adducts M+Na (1) and M+K (2)
+  adducts <- c(1, 2)
+} else if (grepl("negative_hmdb", hmdbpart_main_file)) {
+  scanmode <- "negative"
+  # for the adduct sum: include adduct M+Cl (1)
+  adducts <- c(1)
+}
+
+# load input files
+collect_file <- paste0("outlist_identified_", scanmode, ".RData")
+load(collect_file)
+repl_file <- paste0(scanmode, "_repl_pattern.RData")
+load(repl_file)
+outlist_part <- get(load(hmdbpart_main_file))
+
+# get the number from the file name
+batch_number <- strsplit(basename(hmdbpart_main_file), ".", fixed = TRUE)[[1]][2]
+
+outlist_total <- unique(outlist_ident)
+
 sum_adducts(outlist_total, outlist_part, names(repl_pattern_filtered), adducts, batch_number, scanmode, outdir, z_score)
diff --git a/DIMS/SumAdducts.nf b/DIMS/SumAdducts.nf
index 66a404f..bbd6420 100644
--- a/DIMS/SumAdducts.nf
+++ b/DIMS/SumAdducts.nf
@@ -1,12 +1,12 @@
 process SumAdducts {
-    tag "DIMS SumAdducts ${collect_file}"
+    tag "DIMS SumAdducts"
     label 'SumAdducts'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       each path(collect_file) // input files need to be linked, but called within R script
-       each path(replication_pattern) // input files need to be linked, but called within R script
+       each path(collect_files)
+       each path(replication_pattern)
        path(HMDBpart_main_file)
 
     output:
diff --git a/DIMS/ThermoRawFileParser.nf b/DIMS/ThermoRawFileParser.nf
index 17cee27..6e51984 100644
--- a/DIMS/ThermoRawFileParser.nf
+++ b/DIMS/ThermoRawFileParser.nf
@@ -1,14 +1,13 @@
 process ConvertRawFile {
     tag "DIMS ConvertRawFile ${file_id}"
-    // Custom process to convert raw file to mzML format
     label 'ThermoRawFileParser_1_1_11'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-        tuple val(file_id), path(raw_file)
+        tuple(val(file_id), path(raw_file))
 
     output:
-        tuple val(file_id), path("${file_id}.mzML")
+        tuple(val(file_id), path("${file_id}.mzML"))
 
     script:
 
diff --git a/DIMS/UnidentifiedCalcZscores.R b/DIMS/UnidentifiedCalcZscores.R
index 68dbfe6..2bce990 100755
--- a/DIMS/UnidentifiedCalcZscores.R
+++ b/DIMS/UnidentifiedCalcZscores.R
@@ -4,19 +4,19 @@
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-scripts_dir        <- cmd_args[1]
-ppm     <- as.numeric(cmd_args[2])
+scripts_dir <- cmd_args[1]
+ppm <- as.numeric(cmd_args[2])
 z_score <- as.numeric(cmd_args[3])
 
-source(paste0(scripts_dir, "AddOnFunctions/mergeDuplicatedRows.R"))
-source(paste0(scripts_dir, "AddOnFunctions/statistics_z.R"))
+source(paste0(scripts_dir, "mergeDuplicatedRows.R"))
+source(paste0(scripts_dir, "statistics_z.R"))
 
 # for each scan mode, collect all filled peak group lists
 scanmodes <- c("positive", "negative")
 
 for (scanmode in scanmodes) {
   # get list of files
-  filled_file <- paste0("./PeakGroupList_", scanmode, "_Unidentified_filled.RData")
+  filled_file <- paste0("PeakGroupList_", scanmode, "_Unidentified_filled.RData")
   # load file
   outlist_total <- get(load(filled_file))
 
@@ -36,7 +36,8 @@ for (scanmode in scanmodes) {
     nr_removed_samples <- length(which(repl_pattern[] == "character(0)"))
     order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)])
     outlist_stats_more <- cbind(outlist_stats[, 1:7],
-                                outlist_stats[, (length(repl_pattern) - nr_removed_samples + 8):(length(repl_pattern) - nr_removed_samples + 8 + 6)],
+                                outlist_stats[, (length(repl_pattern) - nr_removed_samples + 8):
+					      (length(repl_pattern) - nr_removed_samples + 8 + 6)],
                                 outlist_stats[, 8:(length(repl_pattern) - nr_removed_samples + 7)][order_index_int],
                                 outlist_stats[, (length(repl_pattern) - nr_removed_samples + 5 + 10):ncol(outlist_stats)])
 
@@ -50,10 +51,9 @@ for (scanmode in scanmodes) {
 
   outlist_not_ident <- outlist_total
 
-  # Extra output in Excel-readable format:
+  # Save output
   remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp")
   remove_colindex <- which(colnames(outlist_not_ident) %in% remove_columns)
   outlist_not_ident <- outlist_not_ident[, -remove_colindex]
-  write.table(outlist_not_ident, file = paste0("unidentified_outlist_", scanmode, ".txt"), sep = "\t", row.names = FALSE)
-
+  save(outlist_not_ident, file = paste0("unidentified_outlist_", scanmode, ".RData"))
 }
diff --git a/DIMS/UnidentifiedCalcZscores.nf b/DIMS/UnidentifiedCalcZscores.nf
index 1687b76..675d9cd 100644
--- a/DIMS/UnidentifiedCalcZscores.nf
+++ b/DIMS/UnidentifiedCalcZscores.nf
@@ -6,10 +6,10 @@ process UnidentifiedCalcZscores {
 
     input:
        path(unidentified_filled_files)
-       path(replication_pattern)  // input files need to be linked, but called within R script
+       each path(replication_pattern)
 
     output:
-       path('unidentified_outlist*.txt')
+       path('unidentified_outlist*.RData')
 
     script:
         """
diff --git a/DIMS/UnidentifiedCollectPeaks.R b/DIMS/UnidentifiedCollectPeaks.R
index 5f1d5d7..a5674f1 100755
--- a/DIMS/UnidentifiedCollectPeaks.R
+++ b/DIMS/UnidentifiedCollectPeaks.R
@@ -5,7 +5,6 @@
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
 ppm <- as.numeric(cmd_args[1])
-outdir <- "./"
 
 scanmodes <- c("positive", "negative")
 
@@ -17,16 +16,16 @@ for (scanmode in scanmodes) {
 
   # Make a list of indexes of peaks that have been identified, then remove these from the peaklist.
   remove <- NULL
-  for (i in 1:length(files)) {
+  for (file_index in 1:length(files)) {
     # load list_of_peaks_used_in_peak_groups_identified
-    load(files[i])
-    remove <- c(remove, which(outlist_total[, "mzmed.pkt"] %in% list_of_peaks_used_in_peak_groups_identified[i, "mzmed.pkt"]))
+    load(files[file_index])
+    remove <- c(remove, 
+		which(outlist_total[, "mzmed.pkt"] %in% list_of_peaks_used_in_peak_groups_identified[file_index, "mzmed.pkt"]))
   }
   outlist_rest <- outlist_total[-remove, ]
 
   # sort on mass
   outlist <- outlist_rest[order(as.numeric(outlist_rest[, "mzmed.pkt"])), ]
   # save output
-  save(outlist, file = paste0(outdir, "/SpectrumPeaks_", scanmode, "_Unidentified.RData"))
-
+  save(outlist, file = paste0("SpectrumPeaks_", scanmode, "_Unidentified.RData"))
 }
diff --git a/DIMS/UnidentifiedCollectPeaks.nf b/DIMS/UnidentifiedCollectPeaks.nf
index bcd1264..6b2656e 100644
--- a/DIMS/UnidentifiedCollectPeaks.nf
+++ b/DIMS/UnidentifiedCollectPeaks.nf
@@ -5,16 +5,14 @@ process UnidentifiedCollectPeaks {
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       path(SpectrumPeaks_file)  // input files need to be linked, but called within R script
-       path(PeakList_identified)
+       path(spectrumpeaks_file)
+       path(peaklist_identified)
 
     output:
        path('SpectrumPeaks_*_Unidentified.RData')
-       // path('SpectrumPeaks_negative_Unidentified.RData')
-       // path('SpectrumPeaks_positive_Unidentified.RData')
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedCollectPeaks.R $SpectrumPeaks_file $params.ppm
+        Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedCollectPeaks.R $spectrumpeaks_file $params.ppm
         """
 }
diff --git a/DIMS/UnidentifiedFillMissing.R b/DIMS/UnidentifiedFillMissing.R
index 5416b15..2ee085f 100755
--- a/DIMS/UnidentifiedFillMissing.R
+++ b/DIMS/UnidentifiedFillMissing.R
@@ -4,25 +4,24 @@
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-# define parameters
 peakgrouplist_file1 <- cmd_args[1]
 peakgrouplist_file2 <- cmd_args[2]
-scripts_dir         <- cmd_args[3]
-thresh   <- as.numeric(cmd_args[4])
-resol    <- as.numeric(cmd_args[5])
-ppm      <- as.numeric(cmd_args[6])
-outdir   <- "./"
+scripts_dir <- cmd_args[3]
+thresh <- as.numeric(cmd_args[4])
+resol <- as.numeric(cmd_args[5])
+ppm <- as.numeric(cmd_args[6])
+outdir  <- "./"
 
 # load in function scripts
-source(paste0(scripts_dir, "AddOnFunctions/replaceZeros.R"))
-source(paste0(scripts_dir, "AddOnFunctions/generateGaussian.R"))
-source(paste0(scripts_dir, "AddOnFunctions/getFwhm.R"))
-source(paste0(scripts_dir, "AddOnFunctions/getSD.R"))
-source(paste0(scripts_dir, "AddOnFunctions/getArea.R"))
-source(paste0(scripts_dir, "AddOnFunctions/optimizeGauss.R"))
-source(paste0(scripts_dir, "AddOnFunctions/ident.hires.noise.HPC.R"))
-source(paste0(scripts_dir, "AddOnFunctions/elementInfo.R"))
-source(paste0(scripts_dir, "AddOnFunctions/globalAssignments.HPC.R"))
+source(paste0(scripts_dir, "replaceZeros.R"))
+source(paste0(scripts_dir, "generateGaussian.R"))
+source(paste0(scripts_dir, "getFwhm.R"))
+source(paste0(scripts_dir, "getSD.R"))
+source(paste0(scripts_dir, "getArea.R"))
+source(paste0(scripts_dir, "optimizeGauss.R"))
+source(paste0(scripts_dir, "ident.hires.noise.HPC.R"))
+source(paste0(scripts_dir, "elementInfo.R"))
+source(paste0(scripts_dir, "globalAssignments.HPC.R"))
 
 peakgrouplist_files <- c(peakgrouplist_file1, peakgrouplist_file2)
 for (peakgrouplist_file in peakgrouplist_files) {
@@ -46,5 +45,5 @@ for (peakgrouplist_file in peakgrouplist_files) {
   peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm)
 
   # save output
-  save(peakgrouplist_filled, file = paste0("./", outputfile_name))
+  save(peakgrouplist_filled, file = outputfile_name)
 }
diff --git a/DIMS/UnidentifiedFillMissing.nf b/DIMS/UnidentifiedFillMissing.nf
index 50c9783..3f9098e 100644
--- a/DIMS/UnidentifiedFillMissing.nf
+++ b/DIMS/UnidentifiedFillMissing.nf
@@ -1,18 +1,18 @@
 process UnidentifiedFillMissing {
-    tag "DIMS UnidentifiedFillMissing ${GroupedList_file}"
+    tag "DIMS UnidentifiedFillMissing ${groupedlist_file}"
     label 'UnidentifiedFillMissing'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       path(GroupedList_file)
-       path(replication_pattern) // input files need to be linked, but called within R script
+       path(groupedlist_file)
+       each path(replication_pattern)
 
     output:
        path('*_filled.RData')
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedFillMissing.R $GroupedList_file $params.scripts_dir $params.thresh $params.resolution $params.ppm
+        Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedFillMissing.R $groupedlist_file $params.scripts_dir $params.thresh $params.resolution $params.ppm
         """
 }
diff --git a/DIMS/UnidentifiedPeakGrouping.R b/DIMS/UnidentifiedPeakGrouping.R
index 73ac4c1..16dd8c1 100755
--- a/DIMS/UnidentifiedPeakGrouping.R
+++ b/DIMS/UnidentifiedPeakGrouping.R
@@ -4,25 +4,27 @@
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-resol  <- as.numeric(cmd_args[1])
-ppm    <- as.numeric(cmd_args[2])
+resol <- as.numeric(cmd_args[1])
+ppm <- as.numeric(cmd_args[2])
 outdir <- "./"
-options(digits = 16)
-
-print(list.files(outdir, pattern = "RData"))
 
-scanmodes <- c("positive", "negative")
+options(digits = 16)
 
 # function for grouping unidentified peaks
 grouping_rest <- function(outdir, unidentified_peaklist, scanmode, ppm) {
   outlist_copy <- get(load(unidentified_peaklist))
-  load(paste0("./", scanmode, "_repl_pattern.RData"))
+  load(paste0(scanmode, "_repl_pattern.RData"))
   outpgrlist <- NULL
 
   # group on highest peaks
   range <- ppm * 1e-06
 
-  while (dim(outlist_copy)[1] > 0) {
+  # temporary: speed up this step by limiting the number of rows used in while loop
+  # this script needs to be parallellized
+  nrow_div <- nrow(outlist_copy) / 1.1
+
+  # while (dim(outlist_copy)[1] > 0) {
+  while (dim(outlist_copy)[1] > nrow_div) {
     sel <- which(as.numeric(outlist_copy[, "height.pkt"]) == max(as.numeric(outlist_copy[, "height.pkt"])))[1]
 
     # ppm range around max
@@ -58,22 +60,27 @@ grouping_rest <- function(outdir, unidentified_peaklist, scanmode, ppm) {
         sum(as.numeric(tmplist[which(tmplist[, "samplenr"] == x), "height.pkt"]))
       })))
 
-      outpgrlist <- rbind(outpgrlist, c(mzmed_pgrp, fq_best_pgrp, fq_worst_pgrp, nrsamples, mzmin_pgrp, mzmax_pgrp, ints_allsamps, NA, NA, NA, NA))
+      # combine all information
+      outpgrlist <- rbind(outpgrlist, c(mzmed_pgrp, fq_best_pgrp, fq_worst_pgrp, nrsamples, 
+					mzmin_pgrp, mzmax_pgrp, ints_allsamps, NA, NA, NA, NA))
     }
 
     outlist_copy <- outlist_copy[-which(selp == TRUE), , drop = FALSE]
   }
 
   outpgrlist <- as.data.frame(outpgrlist)
-  colnames(outpgrlist)[1:6] <- c("mzmed_pgrp", "fq.best", "fq.worst", "nrsamples", "mzmin_pgrp", "mzmax_pgrp")
-  colnames(outpgrlist)[(length(repl_pattern_filtered) + 7):ncol(outpgrlist)] <- c("assi_HMDB", "iso_HMDB", "HMDB_code", "theormz_HMDB")
+  colnames(outpgrlist)[1:6] <- c("mzmed.pgrp", "fq.best", "fq.worst", "nrsamples", "mzmin.pgrp", "mzmax.pgrp")
+  colnames(outpgrlist)[(length(repl_pattern_filtered) + 7):ncol(outpgrlist)] <- c("assi_HMDB", "iso_HMDB", 
+										  "HMDB_code", "theormz_HMDB")
 
   return(outpgrlist)
 }
 
+scanmodes <- c("positive", "negative")
+
 for (scanmode in scanmodes) {
   # generate peak group lists of the unidentified peaks
-  unidentified_peaklist <- paste0("./SpectrumPeaks_", scanmode, "_Unidentified.RData")
+  unidentified_peaklist <- paste0("SpectrumPeaks_", scanmode, "_Unidentified.RData")
   outpgrlist <- grouping_rest(outdir, unidentified_peaklist, scanmode, ppm = ppm)
   write.table(outpgrlist, file = paste0("PeakGroupList_", scanmode, "_Unidentified.txt"))
 
diff --git a/DIMS/UnidentifiedPeakGrouping.nf b/DIMS/UnidentifiedPeakGrouping.nf
index 00fc576..2988195 100644
--- a/DIMS/UnidentifiedPeakGrouping.nf
+++ b/DIMS/UnidentifiedPeakGrouping.nf
@@ -1,12 +1,12 @@
 process UnidentifiedPeakGrouping {
-    tag "DIMS UnidentifiedPeakGrouping ${UnidentifiedSpectrumPeaks_file}"
+    tag "DIMS UnidentifiedPeakGrouping ${unidentified_spectrumpeaks_files}"
     label 'UnidentifiedPeakGrouping'
     container = 'docker://umcugenbioinf/dims:1.3'
     shell = ['/bin/bash', '-euo', 'pipefail']
 
     input:
-       path(UnidentifiedSpectrumPeaks_file) // input files need to be linked, but called within R script
-       path(replication_pattern)  // input files need to be linked, but called within R script
+       path(unidentified_spectrumpeaks_files)
+       each path(replication_pattern)
 
     output:
        path('*_Unidentified.txt')

From 95df3e945ffc6a8e31097d5171ba9ecde4802bd7 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Wed, 27 Mar 2024 17:10:56 +0100
Subject: [PATCH 40/73] cleaned up AddOnFunctions

---
 DIMS/Utils/do_peakfinding.R |  58 +++++++++++
 DIMS/Utils/fit_gaussians.R  | 198 ++++++++++++++++++++++++++++++++++++
 DIMS/Utils/fit_init.R       |  49 +++++++++
 DIMS/Utils/get_fwhm.R       |  21 ++++
 DIMS/Utils/get_stdev.R      |  23 +++++
 DIMS/Utils/search_mzrange.R | 188 ++++++++++++++++++++++++++++++++++
 6 files changed, 537 insertions(+)
 create mode 100644 DIMS/Utils/do_peakfinding.R
 create mode 100644 DIMS/Utils/fit_gaussians.R
 create mode 100644 DIMS/Utils/fit_init.R
 create mode 100644 DIMS/Utils/get_fwhm.R
 create mode 100644 DIMS/Utils/get_stdev.R
 create mode 100644 DIMS/Utils/search_mzrange.R

diff --git a/DIMS/Utils/do_peakfinding.R b/DIMS/Utils/do_peakfinding.R
new file mode 100644
index 0000000..004d96f
--- /dev/null
+++ b/DIMS/Utils/do_peakfinding.R
@@ -0,0 +1,58 @@
+## adapted from findpeaks.Gauss.HPC.R
+# NB: this function will be taken up into PeakFinding.R
+# variables with fixed values will be removed from function parameters
+# int_factor, scale, outdir, plot, thresh, width, height
+do_peakfinding <- function(sample_avgtechrepl, int_factor, scale, resol, outdir, scanmode, plot, thresh, width, height) {
+  #' start peak finding
+  #'
+  #' @param sample_avgtechrepl: Dataframe with binned intensities averaged over technical replicates for a sample
+  #' @param int_factor: Value used to calculate area under Gaussian curve (integer)
+  #' @param scale: Initial value used to estimate scaling parameter (integer)
+  #' @param resol: Value for resolution (integer)
+  #' @param outdir: Path for output directory (string)
+  #' @param scanmode: Scan mode, positive or negative (string)
+  #' @param plot: Parameter indicating whether plots should be made (boolean)
+  #' @param thresh: Value for noise level threshold (integer)
+  #' @param width: Value for width of plot (integer)
+  #' @param height: Value for height of plot (integer)
+  #'
+  #' @return save output to file
+
+  sample_name <- colnames(sample_avgtechrepl)[1]
+
+  # turn dataframe with intensities into a named list
+  ints_fullrange <- as.vector(sample_avgtechrepl)
+  names(ints_fullrange) <- rownames(sample_avgtechrepl)
+
+  # initialise list to store results for all peaks
+  allpeaks_values <- list("mean" = NULL, "area" = NULL, "nr" = NULL,
+                          "min" = NULL, "max" = NULL, "qual" = NULL, "spikes" = 0)
+
+  # look for m/z range for all peaks
+  allpeaks_values <- search_mzrange(ints_fullrange, allpeaks_values, int_factor, scale, resol,
+                                    outdir, sample_name, scanmode,
+                                    plot, width, height, thresh)
+
+  # turn the list into a dataframe
+  outlist_persample <- NULL
+  outlist_persample <- cbind("samplenr" = allpeaks_values$nr,
+                             "mzmed.pkt" = allpeaks_values$mean,
+                             "fq" = allpeaks_values$qual,
+                             "mzmin.pkt" = allpeaks_values$min,
+                             "mzmax.pkt" = allpeaks_values$max,
+                             "height.pkt" = allpeaks_values$area)
+
+  # remove peaks with height = 0
+  index <- which(outlist_persample[, "height.pkt"] == 0)
+  if (length(index) > 0) {
+    outlist_persample <- outlist_persample[-index, ]
+  }
+
+  # save output to file
+  save(outlist_persample, file = paste0(sample_name, "_", scanmode, ".RData"))
+
+  # generate text output to log file on number of spikes for this sample
+  # spikes are peaks that are too narrow, e.g. 1 data point
+  cat(paste("There were", allpeaks_values$spikes, "spikes"))
+}
+
diff --git a/DIMS/Utils/fit_gaussians.R b/DIMS/Utils/fit_gaussians.R
new file mode 100644
index 0000000..007b8ed
--- /dev/null
+++ b/DIMS/Utils/fit_gaussians.R
@@ -0,0 +1,198 @@
+# Gaussian fit functions
+## adapted from fitG.R, fit2G.R, fit3G.R and fit4G.R (combined)
+fit_1gaussian <- function(mass_vector, int_vector, sigma, query_mass, scale, use_bounds) {
+  #' Fit a Gaussian curve for a peak with given parameters
+  #'
+  #' @param mass_vector: Vector of masses (float)
+  #' @param int_vector: Vector of intensities (float)
+  #' @param sigma: Value for width of the peak (float)
+  #' @param query_mass: Value for mass at center of peak (float)
+  #' @param scale: Value for scaling intensities (float)
+  #' @param use_bounds: Boolean to indicate whether boundaries are to be used
+  #'
+  #' @return opt_fit: list of parameters and values describing the optimal fit
+
+  # define optimization function for optim based on normal distribution
+  opt_f <- function(params) {
+    d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma)
+    sum((d - int_vector) ^ 2)
+  }
+
+  if (use_bounds) {
+    # determine lower and upper boundaries
+    lower <- c(mass_vector[1], 0, mass_vector[1], 0)
+    upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf)
+    # get optimal value for fitted Gaussian curve
+    opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)),
+                     opt_f, control = list(maxit = 10000), method = "L-BFGS-B",
+                     lower = lower, upper = upper)
+  } else {
+    opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)),
+                     opt_f, control = list(maxit = 10000))
+  }
+  return(opt_fit)
+}
+
+
+fit_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2,
+                           query_mass1, scale1,
+                           query_mass2, scale2, use_bounds) {
+  #' Fit two Gaussian curves for a peak with given parameters
+  #'
+  #' @param mass_vector: Vector of masses (float)
+  #' @param int_vector: Vector of intensities (float)
+  #' @param sigma1: Value for width of the first peak (float)
+  #' @param sigma2: Value for width of the second peak (float)
+  #' @param query_mass1: Value for mass at center of first peak (float)
+  #' @param scale1: Value for scaling intensities for first peak (float)
+  #' @param query_mass2: Value for mass at center of second peak (float)
+  #' @param scale2: Value for scaling intensities for second peak (float)
+  #' @param use_bounds: Boolean to indicate whether boundaries are to be used
+  #'
+  #' @return opt_fit: list of parameters and values describing the optimal fit
+
+  # define optimization function for optim based on normal distribution
+  opt_f <- function(params) {
+    d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma1) +
+         params[4] * dnorm(mass_vector, mean = params[3], sd = sigma2)
+    sum((d - int_vector) ^ 2)
+  }
+
+  if (use_bounds) {
+    # determine lower and upper boundaries
+    lower <- c(mass_vector[1], 0, mass_vector[1], 0)
+    upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf)
+    # get optimal value for 2 fitted Gaussian curves
+    if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) {
+      sigma2 <- sigma1
+      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
+                         as.numeric(query_mass1), as.numeric(scale1)),
+                       opt_f, control = list(maxit = 10000),
+                       method = "L-BFGS-B", lower = lower, upper = upper)
+    } else {
+      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
+                         as.numeric(query_mass2), as.numeric(scale2)),
+                       opt_f, control = list(maxit = 10000),
+                       method = "L-BFGS-B", lower = lower, upper = upper)
+    }
+  } else {
+    if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) {
+      sigma2 <- sigma1
+      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
+                         as.numeric(query_mass1), as.numeric(scale1)),
+                       opt_f, control = list(maxit = 10000))
+    } else {
+      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
+                         as.numeric(query_mass2), as.numeric(scale2)),
+                       opt_f, control = list(maxit = 10000))
+    }
+  }
+  return(opt_fit)
+}
+
+
+fit_3gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sigma3,
+                           query_mass1, scale1,
+                           query_mass2, scale2,
+                           query_mass3, scale3, use_bounds) {
+  #' Fit three Gaussian curves for a peak with given parameters
+  #'
+  #' @param mass_vector: Vector of masses (float)
+  #' @param int_vector: Vector of intensities (float)
+  #' @param sigma1: Value for width of the first peak (float)
+  #' @param sigma2: Value for width of the second peak (float)
+  #' @param sigma3: Value for width of the third peak (float)
+  #' @param query_mass1: Value for mass at center of first peak (float)
+  #' @param scale1: Value for scaling intensities for first peak (float)
+  #' @param query_mass2: Value for mass at center of second peak (float)
+  #' @param scale2: Value for scaling intensities for second peak (float)
+  #' @param query_mass3: Value for mass at center of third peak (float)
+  #' @param scale3: Value for scaling intensities for third peak (float)
+  #' @param use_bounds: Boolean to indicate whether boundaries are to be used
+  #'
+  #' @return opt_fit: list of parameters and values describing the optimal fit
+
+  # define optimization function for optim based on normal distribution
+  opt_f <- function(params) {
+    d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma1) +
+         params[4] * dnorm(mass_vector, mean = params[3], sd = sigma2) +
+         params[6] * dnorm(mass_vector, mean = params[5], sd = sigma3)
+    sum((d - int_vector) ^ 2)
+  }
+
+  if (use_bounds) {
+    # determine lower and upper boundaries
+    lower <- c(mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0)
+    upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf,
+               mass_vector[length(mass_vector)], Inf)
+    # get optimal value for 3 fitted Gaussian curves
+    opt_fit <- optim(c(query_mass1, scale1,
+                       query_mass2, scale2,
+                       query_mass3, scale3),
+                     opt_f, control = list(maxit = 10000),
+                     method = "L-BFGS-B", lower = lower, upper = upper)
+  } else {
+    opt_fit <- optim(c(query_mass1, scale1,
+                       query_mass2, scale2,
+                       query_mass3, scale3),
+                     opt_f, control = list(maxit = 10000))
+  }
+  return(opt_fit)
+}
+
+
+fit_4gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sigma3, sigma4,
+                           query_mass1, scale1,
+                           query_mass2, scale2,
+                           query_mass3, scale3,
+                           query_mass4, scale4, use_bounds) {
+  #' Fit four Gaussian curves for a peak with given parameters
+  #'
+  #' @param mass_vector: Vector of masses (float)
+  #' @param int_vector: Vector of intensities (float)
+  #' @param sigma1: Value for width of the first peak (float)
+  #' @param sigma2: Value for width of the second peak (float)
+  #' @param sigma3: Value for width of the third peak (float)
+  #' @param sigma4: Value for width of the fourth peak (float)
+  #' @param query_mass1: Value for mass at center of first peak (float)
+  #' @param scale1: Value for scaling intensities for first peak (float)
+  #' @param query_mass2: Value for mass at center of second peak (float)
+  #' @param scale2: Value for scaling intensities for second peak (float)
+  #' @param query_mass3: Value for mass at center of third peak (float)
+  #' @param scale3: Value for scaling intensities for third peak (float)
+  #' @param query_mass4: Value for mass at center of fourth peak (float)
+  #' @param scale4: Value for scaling intensities for fourth peak (float)
+  #' @param use_bounds: Boolean to indicate whether boundaries are to be used
+  #'
+  #' @return opt_fit: list of parameters and values describing the optimal fit
+
+  # define optimization function for optim based on normal distribution
+  opt_f <- function(params) {
+    d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma1) +
+         params[4] * dnorm(mass_vector, mean = params[3], sd = sigma2) +
+         params[6] * dnorm(mass_vector, mean = params[5], sd = sigma3) +
+         params[8] * dnorm(mass_vector, mean = params[7], sd = sigma4)
+    sum((d - int_vector) ^ 2)
+  }
+
+  if (use_bounds) {
+    # determine lower and upper boundaries
+    lower <- c(mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0)
+    upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf,
+               mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf)
+    # get optimal value for 3 fitted Gaussian curves
+    opt_fit <- optim(c(query_mass1, scale1,
+                       query_mass2, scale2,
+                       query_mass3, scale3,
+                       query_mass4, scale4),
+                     opt_f, control = list(maxit = 10000),
+                     method = "L-BFGS-B", lower = lower, upper = upper)
+  } else {
+    opt_fit <- optim(c(query_mass1, scale1,
+                       query_mass2, scale2,
+                       query_mass3, scale3,
+                       query_mass4, scale4),
+                     opt_f, control = list(maxit = 10000))
+  }
+  return(opt_fit)
+}
diff --git a/DIMS/Utils/fit_init.R b/DIMS/Utils/fit_init.R
new file mode 100644
index 0000000..5e0b823
--- /dev/null
+++ b/DIMS/Utils/fit_init.R
@@ -0,0 +1,49 @@
+## adapted from fitGaussianInit.R
+# variables with fixed values will be removed from function parameters
+# int_factor, scale, outdir, plot, thresh, width, height
+# mz_index, start_index, end_index, sample_name not used.
+# fit_gaussian should be defined before this function is called.
+fit_init <- function(mass_vector, int_vector, int_factor, scale, resol,
+                     outdir, sample_name, scanmode, plot, width, height,
+                     mz_index, start_index, end_index) {
+  #' Initial fit of Gaussian curve to small region of m/z
+  #'
+  #' @param mass_vector: Vector of m/z values for a region of interest (float)
+  #' @param int_vector: Vector of intensities for a region of interest (float)
+  #' @param int_factor: Value used to calculate area under Gaussian curve (integer)
+  #' @param scale: Initial value used to estimate scaling parameter (integer)
+  #' @param resol: Value for resolution (integer)
+  #' @param outdir: Path for output directory (string)
+  #' @param sample_name: Sample name (string)
+  #' @param scanmode: Scan mode, positive or negative (string)
+  #' @param plot: Parameter indicating whether plots should be made (boolean)
+  #' @param width: Value for width of plot (integer)
+  #' @param height: Value for height of plot (integer)
+  #' @param mz_index: Index of m/z value with non-zero intensity (integer)
+  #' @param start_index: Index of start of m/z range in mass_vector (integer)
+  #' @param end_index: Index of end of m/z range in mass_vector (integer)
+  #'
+  #' @return roi_value_list: list of fit values for region of interest (list)
+
+  # define mass_diff as difference between last and first value of mass_vector
+  mass_diff <- mass_vector[length(mass_vector)] - mass_vector[1]
+  # generate a second mass_vector with equally spaced m/z values
+  mass_vector2 <- seq(mass_vector[1], mass_vector[length(mass_vector)],
+                      length = mass_diff * int_factor)
+
+  # Find the index in int_vector with the highest intensity
+  max_index <- which(int_vector == max(int_vector))
+
+  roi_values <- fit_gaussian(mass_vector2, mass_vector, int_vector, max_index,
+                             scale, resol, outdir, force = length(max_index),
+                             useBounds = FALSE, plot, scanmode, int_factor, width, height)
+
+  roi_value_list <- list("mean" = roi_values$mean,
+                         "area" = roi_values$area,
+                         "qual" = roi_values$qual,
+                         "min" = roi_values$min,
+                         "max" = roi_values$max)
+
+  return(roi_value_list)
+}
+
diff --git a/DIMS/Utils/get_fwhm.R b/DIMS/Utils/get_fwhm.R
new file mode 100644
index 0000000..7543921
--- /dev/null
+++ b/DIMS/Utils/get_fwhm.R
@@ -0,0 +1,21 @@
+## adapted from getFwhm.R
+get_fwhm <- function(query_mass, resol) {
+  #' Calculate fwhm (full width at half maximum intensity) for a peak
+  #'
+  #' @param query_mass: Value for mass (float)
+  #' @param resol: Value for resolution (integer)
+  #'
+  #' @return fwhm: Value for full width at half maximum (float)
+
+  # set aberrant values of query_mass to zero
+  if (is.nan(query_mass)) query_mass <- 0
+  if (is.na(query_mass)) query_mass <- 0
+  if (is.null(query_mass)) query_mass <- 0
+  if (query_mass < 0) query_mass <- 0
+  # calculate resolution at given m/z value
+  resol_mz <- resol * (1 / sqrt(2) ^ (log2(query_mass / 200)))
+  # calculate full width at half maximum
+  fwhm <- query_mass / resol_mz
+  return(fwhm)
+}
+
diff --git a/DIMS/Utils/get_stdev.R b/DIMS/Utils/get_stdev.R
new file mode 100644
index 0000000..cfe0aa4
--- /dev/null
+++ b/DIMS/Utils/get_stdev.R
@@ -0,0 +1,23 @@
+## adapted from getSD.R
+get_stdev <- function(mass_vector, int_vector, resol = 140000) {
+  #' Calculate standard deviation to determine width of a peak
+  #'
+  #' @param mass_vector: Vector of 3 mass values (float)
+  #' @param int_vector: Vector of 3 intensities (float)
+  #' @param resol: Value for resolution (integer)
+  #'
+  #' @return stdev: Value for standard deviation
+
+  # find maximum intensity in vector
+  max_index <- which(int_vector == max(int_vector))
+  # find corresponding mass at maximum intensity
+  max_mass <- mass_vector[max_index]
+  # calculate resolution at given m/z value
+  resol_mz <- resol * (1 / sqrt(2) ^ (log2(max_mass / 200)))
+  # calculate full width at half maximum
+  fwhm <- mean / resol_mz
+  # calculate standard deviation
+  stdev <- (fwhm / 2) * 0.85
+  return(stdev)
+}
+
diff --git a/DIMS/Utils/search_mzrange.R b/DIMS/Utils/search_mzrange.R
new file mode 100644
index 0000000..aa0abbd
--- /dev/null
+++ b/DIMS/Utils/search_mzrange.R
@@ -0,0 +1,188 @@
+## adapted from searchMZRange.R
+# variables with fixed values will be removed from function parameters
+# int_factor, scale, outdir, plot, thresh, width, height
+# allpeaks_values should be generated here, not passed on from do_peakfinding
+search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, resol,
+                           outdir, sample_name, scanmode, plot, width, height, thresh) {
+  #' Divide the full m/z range into regions of interest with min, max and mean m/z
+  #'
+  #' @param ints_fullrange: Named list of intensities (float)
+  #' @param allpeaks_values: Empty list to store results for all peaks
+  #' @param int_factor: Value used to calculate area under Gaussian curve (integer)
+  #' @param scale: Initial value used to estimate scaling parameter (integer)
+  #' @param resol: Value for resolution (integer)
+  #' @param outdir: Path for output directory (string)
+  #' @param sample_name: Sample name (string)
+  #' @param scanmode: Scan mode, positive or negative (string)
+  #' @param plot: Parameter indicating whether plots should be made (boolean)
+  #' @param width: Value for width of plot (integer)
+  #' @param height: Value for height of plot (integer)
+  #' @param thresh: Value for noise level threshold (integer)
+  #'
+  #' @return allpeaks_values: list of m/z regions of interest
+
+  # initialise list to store results for all peaks. Currently in do_peakfinding
+  # allpeaks_values <- list("mean" = NULL, "area" = NULL, "nr" = NULL,
+  #                         "min" = NULL, "max" = NULL, "qual" = NULL, "spikes" = 0)
+
+  # find indices where intensity is not equal to zero
+  nonzero_indices <- as.vector(which(ints_fullrange != 0))
+
+  # bad infusion. These should have been taken out in AverageTechReplicates
+  if (length(nonzero_indices) == 0) return(allpeaks_values)
+
+  # initialize
+  end_index <- NULL
+  start_index <- nonzero_indices[1]
+  # maximum length of region of interest
+  max_roi_length <- 15
+
+  # find regions of interest
+  for (mz_index in 1:length(nonzero_indices)) {
+    # check whether mz_index is smaller than length(nonzero_indices).
+    # only false if mz_index == length(nonzero_indixes). 
+    # second check is true at the end of a peak.
+    if (mz_index < length(nonzero_indices) && (nonzero_indices[mz_index + 1] - nonzero_indices[mz_index]) > 1) {
+      end_index <- nonzero_indices[mz_index]
+      # get m/z values and intensities for this region of interest
+      mass_vector <- as.numeric(names(ints_fullrange)[c(start_index:end_index)])
+      int_vector <- as.vector(ints_fullrange[c(start_index:end_index)])
+      # check whether the vector of intensities is not empty. 
+      if (length(int_vector) != 0) {
+        # check if intensity is above threshold or the maximum intensity is NaN
+        if (max(int_vector) < thresh || is.nan(max(int_vector))) {
+          # go to next region of interest
+          start_index <- nonzero_indices[mz_index + 1]
+          next
+        }
+        # check if there are more intensities than maximum for region of interest
+        if (length(int_vector) > max_roi_length) {
+          # trim lowest intensities to zero
+          int_vector[which(int_vector < min(int_vector) * 1.1)] <- 0
+          # split the range into multiple sub ranges
+          sub_range <- int_vector
+          names(sub_range) <- mass_vector
+          allpeaks_values <- search_mzrange(sub_range, allpeaks_values, int_factor,
+                                            scale, resol, outdir, sample_name, scanmode,
+                                            plot, width, height, thresh)
+        # A proper peak needs to have at least 3 intensities above threshold  
+        } else if (length(int_vector) > 3) {
+          # check if the sum of intensities is above zero. Why is this necessary?
+          if (sum(int_vector) == 0) next
+          # get initial fit values
+          roi_values <- fit_init(mass_vector, int_vector, int_factor, scale, resol,
+                                 outdir, sample_name, scanmode, plot, width, height,
+                                 mz_index, start_index, end_index)
+
+          if (roi_values$qual[1] == 1) {
+            # get optimized fit values
+            roi_values <- generateGaussian(mass_vector, int_vector, resol, plot,
+                                           scanmode, int_factor, width, height)
+            # add region of interest to list of all peaks
+            allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean)
+            allpeaks_values$area <- c(allpeaks_values$area, roi_values$area)
+            allpeaks_values$nr <- c(allpeaks_values$nr, sample_name)
+            allpeaks_values$min <- c(allpeaks_values$min, roi_values$min)
+            allpeaks_values$max <- c(allpeaks_values$max, roi_values$max)
+            allpeaks_values$qual <- c(allpeaks_values$qual, 0)
+            allpeaks_values$spikes <- allpeaks_values$spikes + 1
+
+          } else {
+            for (j in 1:length(roi_values$mean)){
+              allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean[j])
+              allpeaks_values$area <- c(allpeaks_values$area, roi_values$area[j])
+              allpeaks_values$nr <- c(allpeaks_values$nr, sample_name)
+              allpeaks_values$min <- c(allpeaks_values$min, roi_values$min[1])
+              allpeaks_values$max <- c(allpeaks_values$max, roi_values$max[1])
+              allpeaks_values$qual <- c(allpeaks_values$qual, roi_values$qual[1])
+            }
+          }
+
+        } else {
+
+          roi_values <- generateGaussian(mass_vector, int_vector, resol,
+                                         plot, scanmode, int_factor, width, height)
+
+          allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean)
+          allpeaks_values$area <- c(allpeaks_values$area, roi_values$area)
+          allpeaks_values$nr <- c(allpeaks_values$nr, sample_name)
+          allpeaks_values$min <- c(allpeaks_values$min, roi_values$min)
+          allpeaks_values$max <- c(allpeaks_values$max, roi_values$max)
+          allpeaks_values$qual <- c(allpeaks_values$qual, 0)
+          allpeaks_values$spikes <- allpeaks_values$spikes + 1
+        }
+      }
+      start_index <- nonzero_indices[mz_index + 1]
+    }
+  }
+
+  # last little range
+  end_index <- nonzero_indices[length(nonzero_indices)]
+  mass_vector <- as.numeric(names(ints_fullrange)[c(start_index:end_index)])
+  int_vector <- as.vector(ints_fullrange[c(start_index:end_index)])
+
+  if (length(int_vector) != 0) {
+    # check if intensity above threshold
+    if (max(int_vector) < thresh || is.nan(max(int_vector))) {
+      # do nothing
+    } else {
+      # check if there are more intensities than maximum for region of interest
+      if (length(int_vector) > max_roi_length) {
+        # trim lowest intensities to zero
+        int_vector[which(int_vector < min(int_vector) * 1.1)] <- 0
+        # split the range into multiple sub ranges
+        sub_range <- int_vector
+        names(sub_range) <- mass_vector
+
+        allpeaks_values <- search_mzrange(sub_range, allpeaks_values, int_factor, scale, resol,
+                                          outdir, sample_name, scanmode,
+                                          plot, width, height, thresh)
+
+      } else if (length(int_vector) > 3) {
+        # Check only zeros
+        if (sum(int_vector) == 0) next
+
+        roi_values <- fit_init(mass_vector, int_vector, int_factor, scale,
+                               resol, outdir, sample_name, scanmode,
+                               plot, width, height,
+                               mz_index, start_index, end_index)
+
+        if (roi_values$qual[1] == 1) {
+          roi_values <- generateGaussian(mass_vector, int_vector, resol,
+                                         plot, scanmode, int_factor, width, height)
+
+          allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean)
+          allpeaks_values$area <- c(allpeaks_values$area, roi_values$area)
+          allpeaks_values$nr <- c(allpeaks_values$nr, sample_name)
+          allpeaks_values$min <- c(allpeaks_values$min, roi_values$min)
+          allpeaks_values$max <- c(allpeaks_values$max, roi_values$max)
+          allpeaks_values$qual <- c(allpeaks_values$qual, 0)
+          allpeaks_values$spikes <- allpeaks_values$spikes + 1
+
+        } else {
+          for (j in 1:length(roi_values$mean)){
+            allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean[j])
+            allpeaks_values$area <- c(allpeaks_values$area, roi_values$area[j])
+            allpeaks_values$nr <- c(allpeaks_values$nr, sample_name)
+            allpeaks_values$min <- c(allpeaks_values$min, roi_values$min[1])
+            allpeaks_values$max <- c(allpeaks_values$max, roi_values$max[1])
+            allpeaks_values$qual <- c(allpeaks_values$qual, roi_values$qual[1])
+          }
+        }
+      } else {
+        roi_values <- generateGaussian(mass_vector, int_vector, resol,
+                                       plot, scanmode, int_factor, width, height)
+
+        allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean)
+        allpeaks_values$area <- c(allpeaks_values$area, roi_values$area)
+        allpeaks_values$nr <- c(allpeaks_values$nr, sample_name)
+        allpeaks_values$min <- c(allpeaks_values$min, roi_values$min)
+        allpeaks_values$max <- c(allpeaks_values$max, roi_values$max)
+        allpeaks_values$qual <- c(allpeaks_values$qual, 0)
+        allpeaks_values$spikes <- allpeaks_values$spikes + 1
+      }
+    }
+  }
+  return(allpeaks_values)
+}
+

From 77a53c18cdf61cba24265adffca2de3bc1a01cae Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Wed, 27 Mar 2024 17:41:26 +0100
Subject: [PATCH 41/73] cleaned up AddOnFunctions

---
 DIMS/Utils/fit_init.R  |  6 +++---
 DIMS/Utils/fit_optim.R | 48 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 3 deletions(-)
 create mode 100644 DIMS/Utils/fit_optim.R

diff --git a/DIMS/Utils/fit_init.R b/DIMS/Utils/fit_init.R
index 5e0b823..f9e0c27 100644
--- a/DIMS/Utils/fit_init.R
+++ b/DIMS/Utils/fit_init.R
@@ -1,12 +1,12 @@
 ## adapted from fitGaussianInit.R
 # variables with fixed values will be removed from function parameters
-# int_factor, scale, outdir, plot, thresh, width, height
+# scale, outdir, plot, width, height
 # mz_index, start_index, end_index, sample_name not used.
 # fit_gaussian should be defined before this function is called.
 fit_init <- function(mass_vector, int_vector, int_factor, scale, resol,
                      outdir, sample_name, scanmode, plot, width, height,
                      mz_index, start_index, end_index) {
-  #' Initial fit of Gaussian curve to small region of m/z
+  #' Determine initial fit of Gaussian curve to small region of m/z
   #'
   #' @param mass_vector: Vector of m/z values for a region of interest (float)
   #' @param int_vector: Vector of intensities for a region of interest (float)
@@ -37,7 +37,7 @@ fit_init <- function(mass_vector, int_vector, int_factor, scale, resol,
   roi_values <- fit_gaussian(mass_vector2, mass_vector, int_vector, max_index,
                              scale, resol, outdir, force = length(max_index),
                              useBounds = FALSE, plot, scanmode, int_factor, width, height)
-
+  # put all values for this region of interest into a list
   roi_value_list <- list("mean" = roi_values$mean,
                          "area" = roi_values$area,
                          "qual" = roi_values$qual,
diff --git a/DIMS/Utils/fit_optim.R b/DIMS/Utils/fit_optim.R
new file mode 100644
index 0000000..ac3a9ee
--- /dev/null
+++ b/DIMS/Utils/fit_optim.R
@@ -0,0 +1,48 @@
+## adapted from generateGaussian.R
+# variables with fixed values will be removed from function parameters
+# plot, width, height
+# fit_gaussian should be defined before this function is called.
+fit_op4tim <- function(mass_vector, int_vector, resol,
+                      plot, scanmode, int_factor, width, height) {
+  #' Determine optimized fit of Gaussian curve to small region of m/z
+  #'
+  #' @param mass_vector: Vector of m/z values for a region of interest (float)
+  #' @param int_vector: Vector of intensities for a region of interest (float)
+  #' @param resol: Value for resolution (integer)
+  #' @param plot: Parameter indicating whether plots should be made (boolean)
+  #' @param scanmode: Scan mode, positive or negative (string)
+  #' @param int_factor: Value used to calculate area under Gaussian curve (integer)
+  #' @param width: Value for width of plot (integer)
+  #' @param height: Value for height of plot (integer)
+  #'
+  #' @return roi_value_list: list of fit values for region of interest (list)
+  
+  factor <- 1.5
+  # Find the index in int_vector with the highest intensity
+  max_index <- which(int_vector == max(int_vector))[1]
+  mass_max <- mass_vector[max_index]
+  int_max <- int_vector[max_index]
+  # get peak width
+  fwhm <- get_fwhm(mass_max, resol)
+  # simplify the peak shape: represent it by a triangle
+  mass_max_simple <- c(mass_max - factor * fwhm, mass_max, mass_max + factor * fwhm)
+  int_max_simple <- c(0, int_max, 0)
+  
+  # define mass_diff as difference between last and first value of mass_max_simple
+  mass_diff <- mass_max_simple[length(mass_max_simple)] - mass_max_simple[1]
+  # generate a second mass_vector with equally spaced m/z values
+  mass_vector2 <- seq(mass_max_simple[1], mass_max_simple[length(mass_max_simple)], 
+                      length = mz_diff * int_factor)
+  sigma <- get_stdev(mass_vector2, int_max_simple)
+  scale <- optimizeGauss(mass_vector2, int_max_simple, sigma, mass_max)
+
+  # get an estimate of the area under the peak
+  area <- getArea(mass_max, resol, scale, sigma, int_factor)
+  
+  # put all values for this region of interest into a list
+  roi_value_list <- list("mean" = mass_max,
+                         "area" = area,
+                         "min" = mass_vector2[1],
+                         "max" = mass_vector2[length(mass_vector2)])
+  return(roi_value_list)
+}

From 777db937bf637a19713ba2919a6ac146b20dc0a0 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 5 Apr 2024 09:11:31 +0200
Subject: [PATCH 42/73] cleaned up version of fitGaussian.R

---
 DIMS/Utils/fit_gaussian.R | 333 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 333 insertions(+)
 create mode 100644 DIMS/Utils/fit_gaussian.R

diff --git a/DIMS/Utils/fit_gaussian.R b/DIMS/Utils/fit_gaussian.R
new file mode 100644
index 0000000..9dc7e52
--- /dev/null
+++ b/DIMS/Utils/fit_gaussian.R
@@ -0,0 +1,333 @@
+## adapted from fitGaussian.R
+# variables with fixed values will be removed from function parameters
+# scale, outdir, plot, width, height
+# max_index doesn't need to be passed to this function, can be determined here.
+# remove plot sections (commented out)
+# several functions need to be loaded before this function can run
+fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
+                        outdir, force, use_bounds, plot, scanmode,
+                        int_factor, width, height) {
+  #' Fit 1, 2, 3 or 4 Gaussian peaks in small region of m/z
+  #'
+  #' @param mass_vector2: Vector of equally spaced m/z values (float)
+  #' @param mass_vector: Vector of m/z values for a region of interest (float)
+  #' @param int_vector: Value used to calculate area under Gaussian curve (integer)
+  #' @param max_index: Index in int_vector with the highest intensity (integer)
+  #' @param scale: Initial value used to estimate scaling parameter (integer)
+  #' @param resol: Value for resolution (integer)
+  #' @param outdir: Path for output directory (string)
+  #' @param force: Number of local maxima in int_vector (integer)
+  #' @param use_bounds: Boolean to indicate whether boundaries are to be used
+  #' @param plot: Parameter indicating whether plots should be made (boolean)
+  #' @param scanmode: Scan mode, positive or negative (string)
+  #' @param int_factor: Value used to calculate area under Gaussian curve (integer)
+  #' @param width: Value for width of plot (integer)
+  #' @param height: Value for height of plot (integer)
+  #'
+  #' @return roi_value_list: list of fit values for region of interest (list)
+
+  # Initialise
+  peak_mean <- NULL
+  peak_area <- NULL
+  peak_qual <- NULL
+  peak_min <- NULL
+  peak_max <- NULL
+  fit_quality1 <- 0.15
+  fit_quality <- 0.2
+
+  # One local maximum:
+  if (force == 1) {
+    # determine fit values for 1 Gaussian peak (mean, scale, sigma, qual)
+    fit_values <- fit1Peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol, plot, fit_quality1, use_bounds)
+    # set initial value for scale factor
+    scale <- 2
+    # test if the mean is outside the m/z range
+    if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)]) {
+      # run this function again with fixed boundaries
+      return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
+                          outdir, force = 1, use_bounds = TRUE, plot, scanmode, int_factor, width, height))
+
+    } else {
+      # test if the fit is bad
+      if (fit_values$qual > fit_quality1) {
+        # Try to fit two curves; find two local maxima
+        new_index <- which(diff(sign(diff(int_vector))) == -2) + 1
+        # test if there are two indices in new_index
+        if (length(new_index) != 2) {
+          new_index <- round(length(mass_vector) / 3)
+          new_index <- c(new_index, 2 * new_index)
+        }
+        # run this function again with two local maxima
+        return(fit_gaussian(mass_vector2, mass_vector, int_vector, new_index,
+                            scale, resol, outdir, force = 2, use_bounds = FALSE,
+                            plot, scanmode, int_factor, width, height))
+      # good fit
+      } else {
+        peak_mean <- c(peak_mean, fit_values$mean)
+        peak_area <- c(peak_area, getArea(fit_values$mean, resol, fit_values$scale,
+                                          fit_values$sigma, int_factor))
+        peak_qual <- fit_values$qual
+        peak_min <- mass_vector[1]
+        peak_max <- mass_vector[length(mass_vector)]
+      }
+    }
+
+  #### Two local maxima; need at least 6 data points for this ####
+  } else if (force == 2 && (length(mass_vector) > 6)) {
+    # determine fit values for 2 Gaussian peaks (mean, scale, sigma, qual)
+    fit_values <- fit2peaks(mass_vector2, mass_vector, int_vector, new_index, scale, resol,
+                            use_bounds, plot, fit_quality, int_factor)
+    # test if one of the means is outside the m/z range
+    if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] ||
+        fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)]) {
+      # check if fit quality is bad
+      if (fit_values$qual > fit_quality) {
+        # run this function again with fixed boundaries
+        return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
+                            outdir, force = 2, use_bounds = TRUE,
+                            plot, scanmode, int_factor, width, height))
+      } else {
+        # check which mean is outside range and remove it from the list of means
+        # NB: peak_mean and other variables have not been given values from 2-peak fit yet!
+        for (i in 1:length(fit_values$mean)){
+          if (fit_values$mean[i] < mass_vector[1] || fit_values$mean[i] > mass_vector[length(mass_vector)]) {
+            peak_mean <- c(peak_mean, -i)
+            peak_area <- c(peak_area, -i)
+          } else {
+            peak_mean <- c(peak_mean, fit_values$mean[i])
+            peak_area <- c(peak_area, fit_values$area[i])
+          }
+        }
+        peak_qual <- fit_values$qual
+        peak_min <- mass_vector[1]
+        peak_max <- mass_vector[length(mass_vector)]
+      }
+    # if all means are within range
+    } else {
+      # check for bad fit
+      if (fit_values$qual > fit_quality) {
+        # Try to fit three curves; find three local maxima
+        new_index <- which(diff(sign(diff(int_vector))) == -2) + 1
+        # test if there are three indices in new_index
+        if (length(new_index) != 3) {
+          new_index <- round(length(mass_vector) / 4)
+          new_index <- c(new_index, 2 * new_index, 3 * new_index)
+        }
+        # run this function again with three local maxima
+        return(fitGaussian(mass_vector2, mass_vector, int_vector, new_index,
+                           scale, resol, outdir, force = 3, use_bounds = FALSE,
+                           plot, scanmode, int_factor, width, height))
+      # good fit, all means are within m/z range
+      } else {
+        # check if means are within 3 ppm and sum if so
+        tmp <- fit_values$qual
+        nr_means_new <- -1
+        nr_means <- length(fit_values$mean)
+        while (nr_means != nr_means_new) {
+          nr_means <- length(fit_values$mean)
+          fit_values <- isWithinXppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, 
+                                     mass_vector2, mass_vector, ppm = 4, resol, plot)
+          nr_means_new <- length(fit_values$mean)
+        }
+        # restore original quality score
+        fit_values$qual <- tmp
+        # section for plot
+        # plot_header <- NULL
+        # for (i in 1:length(fit_values$mean)) {
+        #   plot_header <- c(plot_header, paste("mean =", fit_values$mean[i], sep = " "))
+        #   peak_mean <- c(peak_mean, fit_values$mean[i])
+        #   peak_area <- c(peak_area, fit_values$area[i])
+        # }
+        # peak_qual <- fit_values$qual
+        # peak_min <- mass_vector[1]
+        # peak_max <- mass_vector[length(mass_vector)]
+        # plot_header <- c(plot_header, paste("fq =", fit_values$qual, sep = " "))
+        # if (plot) legend("topright", legend=plot_header)
+      }  
+    }
+
+  #### Three local maxima; need at least 6 data points for this ####
+  } else if (force == 3 && (length(mass_vector) > 6)) {
+    # determine fit values for 3 Gaussian peaks (mean, scale, sigma, qual)
+    fit_values <- fit3peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
+                            use_bounds, plot, fit_quality, int_factor)
+    # test if one of the means is outside the m/z range
+    if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] ||
+        fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] ||
+        fit_values$mean[3] < mass_vector[1] || fit_values$mean[3] > mass_vector[length(mass_vector)]) {
+
+      # check if fit quality is bad
+      if (fit_values$qual > fit_quality) {
+        # run this function again with fixed boundaries
+        return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
+                            outdir, force, use_bounds = TRUE,
+                            plot, scanmode, int_factor, width, height))
+      } else {
+        # check which mean is outside range and remove it from the list of means
+        # NB: peak_mean and other variables have not been given values from 2-peak fit yet!
+        for (i in 1:length(fit_values$mean)) {
+          if (fit_values$mean[i] < mass_vector[1] || fit_values$mean[i] > mass_vector[length(mass_vector)]) {
+            peak_mean <- c(peak_mean, -i)
+            peak_area <- c(peak_area, -i)
+          } else {
+            peak_mean <- c(peak_mean, fit_values$mean[i])
+            peak_area <- c(peak_area, fit_values$area[i])
+          }
+        }
+        peak_qual <- fit_values$qual
+        peak_min <- mass_vector[1]
+        peak_max <- mass_vector[length(mass_vector)]
+      }
+    # if all means are within range
+    } else {
+      # check for bad fit
+      if (fit_values$qual > fit_quality) {
+        # Try to fit four curves; find four local maxima
+        new_index <- which(diff(sign(diff(int_vector))) == -2) + 1
+        # test if there are four indices in new_index
+        if (length(new_index) != 4) {
+          new_index <- round(length(mass_vector) / 5)
+          new_index <- c(new_index, 2 * new_index, 3 * new_index, 4 * new_index)
+        }
+        # run this function again with four local maxima
+        return(fit_gaussian(mass_vector2, mass_vector, int_vector, new_index, scale, resol,
+                            outdir, force = 4, use_bounds = FALSE, plot, scanmode,
+                            int_factor, width, height))
+      # good fit, all means are within m/z range
+      } else {
+        # check if means are within 4 ppm and sum if so  
+        tmp <- fit_values$qual
+        nr_means_new <- -1
+        nr_means <- length(fit_values$mean)
+        while (nr_means!=nr_means_new) {
+          nr_means <- length(fit_values$mean)
+          fit_values <- isWithinXppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, 
+                                     mass_vector2, mass_vector, ppm = 4, resol, plot)
+          nr_means_new <- length(fit_values$mean)
+        }
+        # restore original quality score
+        fit_values$qual <- tmp
+
+        # plot_header <- NULL
+        # for (i in 1:length(fit_values$mean)){
+        #   plot_header <- c(plot_header, paste("mean =", fit_values$mean[i], sep = " "))
+        #   
+        #   peak_mean <- c(peak_mean, fit_values$mean[i])
+        #   peak_area <- c(peak_area, fit_values$area[i])
+        # }
+        # peak_qual <- fit_values$qual
+        # peak_min <- mass_vector[1]
+        # peak_max <- mass_vector[length(mass_vector)]
+        # plot_header <- c(plot_header, paste("fq =", fit_values$qual, sep = " "))
+        # if (plot) legend("topright", legend=plot_header)
+      }
+    }
+    
+  #### Four local maxima; need at least 6 data points for this ####  
+  } else if (force == 4 && (length(mass_vector) > 6)) {
+    # determine fit values for 4 Gaussian peaks (mean, scale, sigma, qual)
+    fit_values <- fit4peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
+                            use_bounds, plot, fit_quality, int_factor)
+    # test if one of the means is outside the m/z range
+    if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] ||
+        fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] ||
+        fit_values$mean[3] < mass_vector[1] || fit_values$mean[3] > mass_vector[length(mass_vector)] ||
+        fit_values$mean[4] < mass_vector[1] || fit_values$mean[4] > mass_vector[length(mass_vector)]) {
+
+      # check if quality of fit is bad
+      if (fit_values$qual > fit_quality) {
+        # run this function again with fixed boundaries
+        return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
+                            outdir, force, use_bounds = TRUE,
+                            plot, scanmode, int_factor, width, height))
+
+      } else {
+        # check which mean is outside range and remove it from the list of means
+        # NB: peak_mean and other variables have not been given values from 2-peak fit yet!
+        for (i in 1:length(fit_values$mean)) {
+          if (fit_values$mean[i] < mass_vector[1] | fit_values$mean[i] > mass_vector[length(mass_vector)]) {
+            peak_mean <- c(peak_mean, -i)
+            peak_area <- c(peak_area, -i)
+          } else {
+            peak_mean <- c(peak_mean, fit_values$mean[i])
+            peak_area <- c(peak_area, fit_values$area[i])
+          }
+        }
+        peak_qual <- fit_values$qual
+        peak_min <- mass_vector[1]
+        peak_max <- mass_vector[length(mass_vector)]
+      }
+    # if all means are within range
+    } else {
+      # check for bad fit
+      if (fit_values$qual > fit_quality) {
+        # Try to fit 1 curve, force = 5
+        return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
+                            outdir, force = 5, use_bounds = FALSE,
+                            plot, scanmode, int_factor, width, height))
+      # good fit, all means are within m/z range
+      } else {
+        # check if means are within 4 ppm and sum if so
+        tmp <- fit_values$qual 
+        nr_means_new <- -1
+        nr_means <- length(fit_values$mean)
+        while (nr_means != nr_means_new) {
+          nr_means <- length(fit_values$mean)
+          fit_values <- isWithinXppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, 
+                                     mass_vector2, mass_vector, ppm = 4, resol, plot)
+          nr_means_new <- length(fit_values$mean)
+        }
+        # restore original quality score
+        fit_values$qual <- tmp
+
+        # plot_header<-NULL
+        # for (i in 1:length(fit_values$mean)){
+        #   plot_header <- c(plot_header, paste("mean =", fit_values$mean[i], sep = " "))
+        #   peak_mean <- c(peak_mean, fit_values$mean[i])
+        #   peak_area <- c(peak_area, fit_values$area[i])
+        # }
+        # peak_qual <- fit_values$qual
+        # peak_min <- mass_vector[1]
+        # peak_max <- mass_vector[length(mass_vector)]
+        # plot_header <- c(plot_header, paste("fq =", fit_values$qual, sep = " "))
+        # if (plot) legend("topright", legend=plot_header)
+      }  
+    }
+    
+  # More than four local maxima; fit 1 peak.
+  } else {
+    scale <- 2
+    fit_quality1 <- 0.40
+    use_bounds <- TRUE
+    max_index <- which(int_vector == max(int_vector))
+    fit_values <- fit1Peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
+                           plot, fit_quality1, use_bounds)
+    # check for bad fit
+    if (fit_values$qual > fit_quality1) {
+      # remove
+      if (plot) dev.off()    
+      # get fit values from fit_optim
+      fit_values <- fit_optim(mass_vector, int_vector, resol, plot, scanmode, int_factor, width, height)
+      peak_mean <- c(peak_mean, fit_values$mean)
+      peak_area <- c(peak_area, fit_values$area)
+      peak_min <- fit_values$min
+      peak_max <- fit_values$max
+      peak_qual <- 0
+    } else {
+      peak_mean <- c(peak_mean, fit_values$mean)
+      peak_area <- c(peak_area, getArea(fit_values$mean, resol, fit_values$scale, fit_values$sigma, int_factor))
+      peak_qual <- fit_values$qual
+      peak_min <- mass_vector[1]
+      peak_max <- mass_vector[length(mass_vector)]
+    }
+  }
+
+  # put all values for this region of interest into a list
+  roi_value_list <- list("mean" = peak_mean,
+                         "area" = peak_area,
+                         "qual" = peak_qual,
+                         "min" = peak_min,
+                         "max" = peak_max)
+  return(roi_value_list)
+}
+

From 5ff20a6bd62e498656fc7eb76cb5a0ca7c688675 Mon Sep 17 00:00:00 2001
From: Anne Luesink <anneluesink@live.nl>
Date: Mon, 8 Apr 2024 12:54:38 +0200
Subject: [PATCH 43/73] Add changes v2.7

---
 .../add_lab_id_and_onderzoeksnummer.R         | 10 ++++
 DIMS/AddOnFunctions/create_violin_plots.R     | 46 ++++++++++-----
 .../get_patient_data_to_helix.R               | 31 ++++++++++
 DIMS/AddOnFunctions/is_diagnostic_patient.R   |  6 ++
 DIMS/AddOnFunctions/prepare_alarmvalues.R     | 56 +++++++++++++------
 DIMS/GenerateViolinPlots.R                    | 36 +++++++++---
 6 files changed, 146 insertions(+), 39 deletions(-)
 create mode 100644 DIMS/AddOnFunctions/add_lab_id_and_onderzoeksnummer.R
 create mode 100644 DIMS/AddOnFunctions/get_patient_data_to_helix.R
 create mode 100644 DIMS/AddOnFunctions/is_diagnostic_patient.R

diff --git a/DIMS/AddOnFunctions/add_lab_id_and_onderzoeksnummer.R b/DIMS/AddOnFunctions/add_lab_id_and_onderzoeksnummer.R
new file mode 100644
index 0000000..9d91a55
--- /dev/null
+++ b/DIMS/AddOnFunctions/add_lab_id_and_onderzoeksnummer.R
@@ -0,0 +1,10 @@
+add_lab_id_and_onderzoeksnummer <- function(df_metabs_helix) {
+  # Split patient number into labnummer and Onderzoeksnummer
+  for (row in 1:nrow(df_metabs_helix)) {
+    df_metabs_helix[row,"labnummer"] <- gsub("^P|\\.[0-9]*", "", df_metabs_helix[row,"Patient"])
+    labnummer_split <- strsplit(as.character(df_metabs_helix[row, "labnummer"]), "M")[[1]]
+    df_metabs_helix[row, "Onderzoeksnummer"] <- paste0("MB", labnummer_split[1], "/", labnummer_split[2])
+  }
+  
+  return(df_metabs_helix)
+}
diff --git a/DIMS/AddOnFunctions/create_violin_plots.R b/DIMS/AddOnFunctions/create_violin_plots.R
index 35e6afb..df398be 100644
--- a/DIMS/AddOnFunctions/create_violin_plots.R
+++ b/DIMS/AddOnFunctions/create_violin_plots.R
@@ -1,5 +1,5 @@
 create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt=NULL) {
-
+  
   # set parameters for plots
   plot_height <- 9.6 
   plot_width <- 6
@@ -10,18 +10,26 @@ create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt=NU
   #                   green     blue      blue/purple purple    orange    red
   
   # patient plots, create the PDF device
-  if (grepl("Diagnostics", pdf_dir)) { 
-    prefix <- "Dx_" 
-  } else if (grepl("IEM", pdf_dir)) { 
-    prefix <- "IEM_" 
-  } else { 
-    prefix <- "R_" 
+  pt_name_sub <- pt_name
+  suffix <- ""
+  if (grepl("Diagnostics", pdf_dir) & is_diagnostic_patient(pt_name)) {
+    prefix <- "MB"
+    suffix <- "_DIMS_PL_DIAG"
+    # substitute P and M in P2020M00001 into right format for Helix
+    pt_name_sub <- gsub("[PM]", "", pt_name)
+    pt_name_sub <- gsub("\\..*", "", pt_name_sub)
+  } else if (grepl("Diagnostics", pdf_dir)) {
+    prefix <- "Dx_"
+  } else if (grepl("IEM", pdf_dir)) {
+    prefix <- "IEM_"
+  } else {
+    prefix <- "R_"
   }
-
-  pdf(paste0(pdf_dir, "/", prefix, pt_name, ".pdf"), 
+  
+  pdf(paste0(pdf_dir, "/", prefix, pt_name_sub, suffix, ".pdf"),
       onefile = TRUE,
-      width = plot_width, 
-      height = plot_height) 
+      width = plot_width,
+      height = plot_height)
   
   # page headers:
   page_headers <- names(metab_perpage)
@@ -42,17 +50,25 @@ create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt=NU
   for (page_index in 1:length(metab_perpage)) {
     # extract list of metabolites to plot on a page
     metab_list_2plot <- metab_perpage[[page_index]]
+    # extract original data for patient of interest (pt_name) before cut-offs
+    pt_list_2plot_orig <- metab_list_2plot[which(metab_list_2plot$variable == pt_name), ]
     # cut off Z-scores higher than 20 or lower than -5 (for nicer plots)
     metab_list_2plot$value[metab_list_2plot$value >  20] <-  20
     metab_list_2plot$value[metab_list_2plot$value <  -5] <-  -5
     # extract data for patient of interest (pt_name)
     pt_list_2plot <- metab_list_2plot[which(metab_list_2plot$variable == pt_name), ]
+    # restore original Z-score before cut-off, for showing Z-scores in PDF
+    pt_list_2plot$value_orig <- pt_list_2plot_orig$value
     # remove patient of interest (pt_name) from list; violins will be made up of controls and other patients
     metab_list_2plot <- metab_list_2plot[-which(metab_list_2plot$variable == pt_name), ]
     # subtitle per page
     sub_perpage <- gsub("_", " ", page_headers[page_index])
     # for IEM plots, put subtitle on two lines
     sub_perpage <- gsub("probability", "\nprobability", sub_perpage)
+    # add size parameter for showing Z-score of patient per metabolite
+    Z_size <- rep(3, nrow(pt_list_2plot))
+    # set size to 0 if row is empty
+    Z_size[is.na(pt_list_2plot$value)] <- 0
     
     # draw violin plot. shape=22 gives square for patient of interest
     ggplot_object <- ggplot(metab_list_2plot, aes(x=value, y=HMDB_name)) +
@@ -61,6 +77,8 @@ create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt=NU
       geom_violin(scale="width") +
       geom_point(data = pt_list_2plot, aes(color=value), size = 3.5*circlesize, shape=22, fill="white") +
       scale_fill_gradientn(colors = colors_4plot, values = NULL, space = "Lab", na.value = "grey50", guide = "colourbar", aesthetics = "colour") +
+      # add Z-score value for patient of interest at x=16
+      geom_text(data = pt_list_2plot, aes(16, label = paste0("Z=", round(value_orig, 2))), hjust = "left", vjust = +0.2, size = Z_size) +
       # add labels. Use font Courier to get all the plots in the same location.
       labs(x = "Z-scores", y = "Metabolites", subtitle = sub_perpage, color = "z-score") + 
       theme(axis.text.y = element_text(family = "Courier", size=6)) +
@@ -72,11 +90,11 @@ create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt=NU
       # add vertical lines
       geom_vline(xintercept = 2, col = "grey", lwd = 0.5, lty=2) +
       geom_vline(xintercept = -2, col = "grey", lwd = 0.5, lty=2)
-      
+    
     suppressWarnings(print(ggplot_object))
     
   }
-
+  
   # add explanation of violin plots, version number etc.
   # plot.new()
   plot(NA, xlim=c(0,5), ylim=c(0,5), bty='n', xaxt='n', yaxt='n', xlab='', ylab='')
@@ -91,7 +109,7 @@ create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt=NU
     #explanation_grob=textGrob(apply(full_explanation, 2, paste, collapse="\n"))
     #grid.arrange(explanation_grob)
   }
-
+  
   # close the PDF file
   dev.off()
   
diff --git a/DIMS/AddOnFunctions/get_patient_data_to_helix.R b/DIMS/AddOnFunctions/get_patient_data_to_helix.R
new file mode 100644
index 0000000..c582177
--- /dev/null
+++ b/DIMS/AddOnFunctions/get_patient_data_to_helix.R
@@ -0,0 +1,31 @@
+get_patient_data_to_helix <- function(metab_interest_sorted, metab_list_all){
+  # Combine Z-scores of metab groups together
+  df_all_metabs_zscores <- bind_rows(metab_interest_sorted)
+  # Change columnnames 
+  colnames(df_all_metabs_zscores) <- c("HMDB_name", "Patient", "Z_score")
+  # Change Patient column to character instead of factor
+  df_all_metabs_zscores$Patient <- as.character(df_all_metabs_zscores$Patient)
+  
+  # Delete whitespaces HMDB_name 
+  df_all_metabs_zscores$HMDB_name <- str_trim(df_all_metabs_zscores$HMDB_name, "right")
+  
+  # Split HMDB_name column on "nitine;" for match dims_helix_table
+  df_all_metabs_zscores$HMDB_name_split <- str_split_fixed(df_all_metabs_zscores$HMDB_name, "nitine;", 2)[,1]
+  
+  # Combine stofgroepen
+  dims_helix_table <- bind_rows(metab_list_all)
+  # Filter table for metabolites for Helix
+  dims_helix_table <- dims_helix_table %>% filter(Helix == "ja")
+  # Split HMDB_name column on "nitine;" for match df_all_metabs_zscores
+  dims_helix_table$HMDB_name_split <- str_split_fixed(dims_helix_table$HMDB_name, "nitine;", 2)[,1]
+  dims_helix_table <- dims_helix_table %>% select(HMDB_name_split, Helix_naam, high_zscore, low_zscore)
+  
+  # Filter DIMS results for metabolites for Helix
+  df_metabs_helix <- df_all_metabs_zscores %>% filter(HMDB_name_split %in% dims_helix_table$HMDB_name_split)
+  # Combine dims_helix_table and df_metabs_helix, adding Helix codes etc.
+  df_metabs_helix <- df_metabs_helix %>% left_join(dims_helix_table, by = join_by(HMDB_name_split))
+  
+  df_metabs_helix <- df_metabs_helix %>% select(HMDB_name, Patient, Z_score, Helix_naam, high_zscore, low_zscore)
+  
+  return(df_metabs_helix)
+}
\ No newline at end of file
diff --git a/DIMS/AddOnFunctions/is_diagnostic_patient.R b/DIMS/AddOnFunctions/is_diagnostic_patient.R
new file mode 100644
index 0000000..839f41d
--- /dev/null
+++ b/DIMS/AddOnFunctions/is_diagnostic_patient.R
@@ -0,0 +1,6 @@
+is_diagnostic_patient <- function(patient_column){
+  # Check for Diagnostics patients with correct patientnumber (e.g. starting with "P2024M")
+  diagnostic_patients <- grepl("^P[0-9]{4}M",patient_column)
+  
+  return(diagnostic_patients)
+}
\ No newline at end of file
diff --git a/DIMS/AddOnFunctions/prepare_alarmvalues.R b/DIMS/AddOnFunctions/prepare_alarmvalues.R
index ff94fce..c09c536 100644
--- a/DIMS/AddOnFunctions/prepare_alarmvalues.R
+++ b/DIMS/AddOnFunctions/prepare_alarmvalues.R
@@ -1,23 +1,43 @@
-prepare_alarmvalues <- function(pt_name, metab_interest_sorted) {
-  # set parameters for table
-  high_zscore_cutoff <- 5
-  low_zscore_cutoff <- -3
+prepare_alarmvalues <- function(pt_name, dims_helix_table) {
+  # extract data for patient of interest (pt_name)
+  pt_metabs_helix <- dims_helix_table %>% filter(Patient == pt_name)
+  pt_metabs_helix$Z_score <- round(pt_metabs_helix$Z_score, 2)
+  
+  # Make empty dataframes for metabolites above or below alarmvalues
+  pt_list_high <- data.frame(HMDB_name = character(), Z_score = numeric())
+  pt_list_low <- data.frame(HMDB_name = character(), Z_score = numeric())
   
-  # make table of all metabolites
-  all_metab <- c()
-  for (page_nr in 1:length(metab_interest_sorted)) {
-    all_metab <- rbind(all_metab, metab_interest_sorted[[page_nr]])
+  # Loop over individual metabolites
+  for (metab in unique(pt_metabs_helix$HMDB_name)){
+    # Get data for individual metabolite
+    pt_metab <- pt_metabs_helix %>% filter(HMDB_name == metab)
+    # print(pt_metab)
+    
+    # Check if zscore is positive of negative
+    if(pt_metab$Z_score > 0) {
+      # Get specific alarmvalue for metabolite
+      high_zscore_cutoff_metab <- pt_metabs_helix %>% filter(HMDB_name == metab) %>% pull(high_zscore)
+      
+      # If zscore is above the alarmvalue, add to pt_list_high table
+      if(pt_metab$Z_score > high_zscore_cutoff_metab) {
+        pt_metab_high <- pt_metab %>% select(HMDB_name, Z_score)
+        pt_list_high <- rbind(pt_list_high, pt_metab_high)
+      }
+    } else {
+      # Get specific alarmvalue for metabolite
+      low_zscore_cutoff_metab <- pt_metabs_helix %>% filter(HMDB_name == metab) %>% pull(low_zscore)
+      
+      # If zscore is below the alarmvalue, add to pt_list_low table
+      if(pt_metab$Z_score < low_zscore_cutoff_metab) {
+        pt_metab_low <- pt_metab %>% select(HMDB_name, Z_score)
+        pt_list_low <- rbind(pt_list_low, pt_metab_low)
+      }
+    }
   }
-  # extract data for patient of interest (pt_name)
-  pt_list <- all_metab[which(all_metab$variable==pt_name), ]
-  # remove column with patient name
-  pt_list <- pt_list[ , -2]
-  # round off Z-scores
-  pt_list$value <- round(as.numeric(pt_list$value), 2)
   
-  # determine alarms for this patient: Z-score above 5 or below -3
-  pt_list_high <- pt_list[pt_list$value > high_zscore_cutoff, ]
-  pt_list_low <- pt_list[pt_list$value < low_zscore_cutoff, ]
+  # sort tables on zscore
+  pt_list_high <- pt_list_high %>% arrange(desc(Z_score))
+  pt_list_low <- pt_list_low %>% arrange(Z_score)
   # add lines for increased, decreased
   extra_line1 <- c("Increased", "") 
   extra_line2 <- c("Decreased", "")
@@ -27,6 +47,6 @@ prepare_alarmvalues <- function(pt_name, metab_interest_sorted) {
   rownames(top_metab_pt) <- NULL
   # change column names for display
   colnames(top_metab_pt) <- c("Metabolite", "Z-score")
-
+  
   return(top_metab_pt)
 }
diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R
index 097d030..2a79d40 100644
--- a/DIMS/GenerateViolinPlots.R
+++ b/DIMS/GenerateViolinPlots.R
@@ -11,11 +11,11 @@
 ## adapted from 15-dIEM_violin.R
 
 # load packages
-library(dplyr)
+suppressPackageStartupMessages(library("dplyr"))
 library(reshape2)
 library(openxlsx)
 library(ggplot2)
-library(gridExtra)
+suppressPackageStartupMessages(library("gridExtra"))
 
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
@@ -64,6 +64,10 @@ source(paste0(scripts_dir, "prepare_data_perpage.R"))
 source(paste0(scripts_dir, "prepare_toplist.R"))
 source(paste0(scripts_dir, "create_violin_plots.R"))
 source(paste0(scripts_dir, "prepare_alarmvalues.R"))
+source(paste0(scripts_dir, "output_helix.R"))
+source(paste0(scripts_dir, "get_patient_data_to_helix.R"))
+source(paste0(scripts_dir, "add_lab_id_and_onderzoeksnummer.R"))
+source(paste0(scripts_dir, "is_diagnostic_patient.R"))
 
 # number of diseases that score highest in algorithm to plot
 top_nr_iem <- 5
@@ -75,15 +79,18 @@ ratios_cutoff <- -5
 nr_plots_perpage <- 20
 
 # binary variable: run function, yes(1) or no(0)
-if (z_score == 1) {
-  algorithm <- ratios <- violin <- 1
-}
+if (z_score == 1) { 
+  algorithm <- ratios <- violin <- 1 
+} else {
+  algorithm <- ratios <- violin <- 0 
+}	
 # are the sample names headers on row 1 or row 2 in the DIMS excel? (default 1)
 header_row <- 1
 # column name where the data starts (default B)
 col_start <- "B"
 zscore_cutoff <- 5
 xaxis_cutoff <- 20
+protocol_name <- "DIMS_PL_DIAG"
 
 #### STEP 1: Preparation ####
 # in: run_name, path_dims_file, header_row ||| out: output_dir, DIMS
@@ -342,7 +349,7 @@ if (algorithm == 1) {
 
 #### STEP 5: Make violin plots #####
 # in: algorithm / zscore_patients, violin, nr_contr, nr_pat, Data, path_textfiles, zscore_cutoff, xaxis_cutoff, 
-# top_diseases, top_metab, output_dir ||| out: pdf file
+# top_diseases, top_metab, output_dir ||| out: pdf file, Helix csv file
 
 if (violin == 1) {
 
@@ -403,13 +410,28 @@ if (violin == 1) {
     metab_interest_controls <- prepare_data(metab_list_all, zscore_controls)
     metab_perpage <- prepare_data_perpage(metab_interest_sorted, metab_interest_controls, nr_plots_perpage, nr_pat, nr_contr)
 
+    # for Diagnostics metabolites to be saved in Helix
+    if(grepl("Diagnost", pdf_dir)) {
+      # get table that combines DIMS results with stofgroepen/Helix table
+      dims_helix_table <- get_patient_data_to_helix(metab_interest_sorted, metab_list_all)
+      
+      # check if run contains Diagnostics patients (e.g. "P2024M"), not for research runs
+      if(any(is_diagnostic_patient(dims_helix_table$Patient))){
+        # get output file for Helix
+        output_helix <- output_for_helix(protocol_name, dims_helix_table)
+        # write output to file
+        path_helixfile <- paste0(outdir, "/output_Helix_", run_name,".csv")
+        write.csv(output_helix, path_helixfile, quote = F, row.names = F)
+      }
+    }
+    
     # make violin plots per patient
     for (pt_nr in 1:length(patient_list)) {
       pt_name <- patient_list[pt_nr]
       # for category Diagnostics, make list of metabolites that exceed alarm values for this patient
       # for category Other, make list of top highest and lowest Z-scores for this patient
       if (grepl("Diagnost", pdf_dir)) {
-        top_metab_pt <- prepare_alarmvalues(pt_name, metab_interest_sorted)
+        top_metab_pt <- prepare_alarmvalues(pt_name, dims_helix_table)
       } else {
         top_metab_pt <- prepare_toplist(pt_name, zscore_patients)
       }

From c7a2cfd2f21ea8768bf8fcf1bd4046a9dde16f29 Mon Sep 17 00:00:00 2001
From: Anne Luesink <anneluesink@live.nl>
Date: Mon, 8 Apr 2024 13:58:28 +0200
Subject: [PATCH 44/73] Added helix output file

---
 DIMS/GenerateViolinPlots.nf | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DIMS/GenerateViolinPlots.nf b/DIMS/GenerateViolinPlots.nf
index 27bc20b..a1f5259 100644
--- a/DIMS/GenerateViolinPlots.nf
+++ b/DIMS/GenerateViolinPlots.nf
@@ -13,6 +13,7 @@ process GenerateViolinPlots {
        path('Other/*.pdf'), emit: other_plot_files
        path('dIEM/*.pdf'), emit: diem_plot_files
        path('*.xlsx'), emit: excel_file
+       path('*.csv'), emit: helix_file
 
     script:
         """

From c2c3eb494e5ccc849bb6d18634a2737f314329c1 Mon Sep 17 00:00:00 2001
From: Anne Luesink <anneluesink@live.nl>
Date: Thu, 11 Apr 2024 15:00:43 +0200
Subject: [PATCH 45/73] Added output_helix.R

---
 DIMS/AddOnFunctions/output_helix.R | 31 ++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 DIMS/AddOnFunctions/output_helix.R

diff --git a/DIMS/AddOnFunctions/output_helix.R b/DIMS/AddOnFunctions/output_helix.R
new file mode 100644
index 0000000..c098675
--- /dev/null
+++ b/DIMS/AddOnFunctions/output_helix.R
@@ -0,0 +1,31 @@
+output_for_helix <- function(protocol_name, df_metabs_helix){
+  
+  # Remove positive controls
+  df_metabs_helix <- df_metabs_helix %>% filter(is_diagnostic_patient(Patient))
+  
+  # Add 'Vial' column, each patient has unique ID
+  df_metabs_helix <- df_metabs_helix %>% 
+    group_by(Patient) %>% 
+    mutate(Vial = cur_group_id()) %>% 
+    ungroup()
+  
+  # Split patient number into labnummer and Onderzoeksnummer
+  df_metabs_helix <- add_lab_id_and_onderzoeksnummer(df_metabs_helix)
+  
+  # Add column with protocol name
+  df_metabs_helix$Protocol <- protocol_name
+  
+  # Change name Z_score and Helix_naam columns to Amount and Name
+  change_columns <- c(Amount = "Z_score", Name = "Helix_naam")
+  df_metabs_helix <- df_metabs_helix %>% rename(all_of(change_columns))
+  
+  # Select only necessary columns and set them in correct order
+  df_metabs_helix <- df_metabs_helix %>% 
+    select(c(Vial, labnummer, Onderzoeksnummer, Protocol, Name, Amount))
+  
+  # Remove duplicate patient-metabolite combinations ("leucine + isoleucine + allo-isoleucin_Z-score" is added 3 times)
+  df_metabs_helix <- df_metabs_helix %>% 
+    group_by(Onderzoeksnummer, Name) %>% distinct() %>% ungroup()
+  
+  return(df_metabs_helix)
+}
\ No newline at end of file

From 3fe61b0d2a10103cc627e45a7d07efd5e62bb721 Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Fri, 12 Apr 2024 09:12:13 +0200
Subject: [PATCH 46/73] flake8 changes

---
 CheckQC/check_qc.py               | 46 +++++++++++++++----------------
 CheckQC/test_check_qc.py          | 15 ++++++++--
 Utils/create_hsmetrics_summary.py |  2 +-
 Utils/get_stats_from_flagstat.py  | 24 ++++++++++++----
 4 files changed, 54 insertions(+), 33 deletions(-)

diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py
index e6f2cc1..dd236d4 100644
--- a/CheckQC/check_qc.py
+++ b/CheckQC/check_qc.py
@@ -25,7 +25,7 @@ def non_empty_existing_path(file_or_dir):
 
     Returns:
         string: Provided input file or directory. If dir, suffix '/' might be added.
-    """    
+    """
     input_path = Path(file_or_dir)
     if not input_path.is_file() and not input_path.is_dir():
         raise FileNotFoundError(errno_ENOENT, os_strerror(errno_ENOENT), file_or_dir)
@@ -44,7 +44,7 @@ def parse_arguments_and_check(args_in):
 
     Returns:
         Namespace: Convert argument strings to objects and assign them as attributes of the namespace.
-    """    
+    """
     parser = argparse.ArgumentParser(
         description="Check and summarize sample quality using qc metrics and their thresholds."
     )
@@ -82,7 +82,7 @@ def read_yaml(yaml_file):
 
     Returns:
         Object: Content of the YAML file.
-    """    
+    """
     yaml_loaded = yaml.safe_load(open(yaml_file))
     if not yaml_loaded:
         raise ValueError("Could not load YAML.")
@@ -97,7 +97,7 @@ def check_allowed_operators(qc_operator):
 
     Raises:
         ValueError: If provided qc_operator is invalid / unsupported.
-    """    
+    """
     operators = ["<", "<=", ">", ">=", "==", "!=", "match"]
     if qc_operator not in operators:
         raise ValueError(f"Unsupported operator provided: {qc_operator}. Please select from: {operators}")
@@ -111,7 +111,7 @@ def check_required_keys_metrics(qc_metrics):
 
     Raises:
         KeyError: Required key is not provided for the qc metric settings
-    """    
+    """
     for req_key in ["filename", "qc_col", "threshold", "operator", "report_cols"]:
         if any([req_key not in setting.keys() for setting in qc_metrics]):
             raise KeyError(f"Required key {req_key} not in all metrics settings.")
@@ -129,7 +129,7 @@ def select_metrics(filename, input_files):
     """
     # If filename is string, change into regex to match absolute and relative paths in input_files.
     if filename.isalpha():
-        filename=".*" + filename
+        filename = ".*" + filename
     metrics = list(filter(re.compile(f"{filename}").match, input_files))
     if not metrics:
         warnings.warn(UserWarning(f"No input file provided with filename pattern {filename}"))
@@ -151,7 +151,7 @@ def get_columns_to_report(qc_report_cols, qc_metric_cols, qc_col):
 
     Returns:
         list of strings: Valid column names to include in report.
-    """    
+    """
     not_existing_cols = list(set(qc_report_cols) - set(qc_metric_cols))
     if qc_report_cols == "@all":
         qc_report_cols = qc_metric_cols
@@ -190,7 +190,7 @@ def add_and_rename_columns(qc_metric, qc_title, qc_col, qc_operator, qc_threshol
 
 
 def get_failed_rows(qc_metric, qc_col, qc_operator, qc_threshold):
-    """Get rows that fail provided qc threshold 
+    """Get rows that fail provided qc threshold
 
     Args:
         qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values
@@ -203,7 +203,7 @@ def get_failed_rows(qc_metric, qc_col, qc_operator, qc_threshold):
 
     Returns:
         object: The indexes of failed rows in qc_metric
-    """    
+    """
     # Select failed rows using qc_threshold regex pattern and qc_operator 'match'
     if qc_operator == "match" and isinstance(qc_threshold, str):
         return qc_metric[qc_col].str.match(qc_threshold)
@@ -231,7 +231,7 @@ def add_failed_samples_metric(qc_metric, failed_rows, report_cols, sample_cols):
     Returns:
         qc_metric (DataFrame): DataFrame of qc metric without failed rows
         qc_metric_out (DataFrame): DataFrame of qc metric to report with failed rows
-    """    
+    """
     qc_metric_out = DataFrame(columns=["sample", "qc_check", "qc_status", "qc_msg", "qc_value"])
     failed_samples = []
     if failed_rows.to_list():
@@ -272,7 +272,7 @@ def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols):
 
     Returns:
         pandas DataFrame: Sorted qc metric for both passed and failed samples, without duplicates.
-    """    
+    """
     # Add passed samples to output
     for sample_col in sample_cols:
         qc_metric_out = concat([
@@ -284,7 +284,7 @@ def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols):
             )
         ])
     # Try to convert column qc_value to float.
-    # If ValueError is raised, probably because column is a string, continue.    
+    # If ValueError is raised, probably because column is a string, continue.
     try:
         qc_metric_out["qc_value"] = qc_metric_out["qc_value"].astype("float")
     except ValueError:
@@ -295,13 +295,13 @@ def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols):
 
 
 def create_and_write_output(qc_output, output_path, output_prefix):
-    """Joined qc metrics is created and written to output file. 
+    """Joined qc metrics is created and written to output file.
 
     Args:
         qc_output (pandas DataFrame): Sorted judged qc metric for both passed and failed samples, without duplicates.
-        output_path (string): Relative or absolute path where output should be saved. 
+        output_path (string): Relative or absolute path where output should be saved.
         output_prefix (string): Output prefix for output file.
-    """    
+    """
     # Add qc_summary
     qc_output.insert(1, "qc_summary", "PASS")
     qc_output.loc[qc_output.isin(["FAIL"]).any(axis=1), "qc_summary"] = "FAIL"
@@ -310,7 +310,7 @@ def create_and_write_output(qc_output, output_path, output_prefix):
 
 
 def read_and_judge_metrics(qc, metrics):
-    """Read and judge each single qc metric and join results. 
+    """Read and judge each single qc metric and join results.
 
     Args:
         qc (dict): qc settings of the metric
@@ -318,7 +318,7 @@ def read_and_judge_metrics(qc, metrics):
 
     Returns:
         pandas DataFrame: Joined and judged qc metrics.
-    """    
+    """
     for qc_file in metrics:
         qc_metric_raw = read_csv(qc_file, comment=qc.get("comment", None), delimiter=qc.get("delim", "\t"), quotechar='"')
         report_cols = get_columns_to_report(qc["report_cols"], qc_metric_raw.columns.to_list(), qc["qc_col"])
@@ -355,26 +355,26 @@ def read_and_judge_metrics(qc, metrics):
 
 def check_qc(input_files, settings, output_path, output_prefix):
     """
-    Main function to judge input files on configured qc settings. 
-    It creates a single results table, each row representing 
+    Main function to judge input files on configured qc settings.
+    It creates a single results table, each row representing
         sample (string): sample name
         qc_summary: Summarized status of all qcs for sample (pass or fail)
         qc columns (5 per each qc metric);
             qc_check (string): QC check consiting of qc title, operator and threshold
             qc_status (string): Status of performed qc check (pass or fail)
-            qc_msg (string): String with human readable message if sample failed qc check, empty if passed.  
+            qc_msg (string): String with human readable message if sample failed qc check, empty if passed.
             qc_value (string, int, float): qc value/score to check.
 
     Args:
         input_files (list): All qc metrics input files.
         settings (string): Path to yaml file
-        output_path (string): Relative or absolute path where output should be saved. 
+        output_path (string): Relative or absolute path where output should be saved.
         output_prefix (string): Output prefix for output file.
 
     Raises:
         ValueError: No input files found to match any qc metric patterns defined in settings.
-        ValueError: Duplicated samples with different values found in some of the input files. 
-    """    
+        ValueError: Duplicated samples with different values found in some of the input files.
+    """
     # A single qc metric file can be used multiple times, by defining a metric section for each check in the qc settings.
     qc_settings = read_yaml(settings)
     check_required_keys_metrics(qc_settings["metrics"])
diff --git a/CheckQC/test_check_qc.py b/CheckQC/test_check_qc.py
index a757fac..629d439 100644
--- a/CheckQC/test_check_qc.py
+++ b/CheckQC/test_check_qc.py
@@ -113,13 +113,19 @@ class TestSelectMetrics():
         # match on word truth and SNP
         (
             ".*truth.*SNP",
-            ['2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv', '12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'],
+            [
+                '2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv',
+                '12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'
+            ],
             ['2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv']
         ),
         # match on word truth and SNP
         (
             ".*truth.*SNP",
-            ['U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv', 'U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'],
+            [
+                'U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv',
+                'U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'
+            ],
             ['U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv']
         ),
         # match if 'truth' is absent and contains 'SNP'
@@ -127,7 +133,10 @@ class TestSelectMetrics():
         # ?! Match if 'truth' is absent.
         (
             "(?:(?!truth).)*SNP.*$",
-            ['2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv', '12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'],
+            [
+                '2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv',
+                '12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'
+            ],
             ['12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv']
         ),
 
diff --git a/Utils/create_hsmetrics_summary.py b/Utils/create_hsmetrics_summary.py
index 405e1ce..4256fc3 100644
--- a/Utils/create_hsmetrics_summary.py
+++ b/Utils/create_hsmetrics_summary.py
@@ -8,7 +8,7 @@
     parser.add_argument('hsmetrics_files', type=argparse.FileType('r'), nargs='*', help='HSMetric file')
     arguments = parser.parse_args()
 
-    interval_files_pattern = re.compile("BAIT_INTERVALS=\[(\S*)\].TARGET_INTERVALS=\[(\S*)\]")
+    interval_files_pattern = re.compile("BAIT_INTERVALS=\[(\S*)\].TARGET_INTERVALS=\[(\S*)\]")  # noqa: W605
     summary_header = []
     summary_data = {}
     for hsmetrics_file in arguments.hsmetrics_files:
diff --git a/Utils/get_stats_from_flagstat.py b/Utils/get_stats_from_flagstat.py
index 2f46abb..52560bc 100644
--- a/Utils/get_stats_from_flagstat.py
+++ b/Utils/get_stats_from_flagstat.py
@@ -43,12 +43,24 @@
 
         print("\n\t{0} %duplication\n".format(100*sample_dups/sample_mapped))
 
-    print("Total raw reads: {total:,} reads (Total throughput, 75bp={total_75bp:,} bp, 100bp={total_100bp:,} bp, 150bp={total_150bp:,} bp)".format(
-        total=counts['total'], total_75bp=counts['total']*75, total_100bp=counts['total']*100, total_150bp=counts['total']*150
-    ))
-    print("Total mapped reads: {total:,} reads (Total throughput, 75bp={total_75bp:,} bp, 100bp={total_100bp:,} bp, 150bp={total_150bp:,} bp)".format(
-        total=counts['mapped'], total_75bp=counts['mapped']*75, total_100bp=counts['mapped']*100, total_150bp=counts['mapped']*150
-    ))
+    print(
+        "Total raw reads: {total:,} reads (Total throughput, 75bp={total_75bp:,} bp, 100bp={total_100bp:,} bp, "
+        "150bp={total_150bp:,} bp)".format(
+            total=counts['total'],
+            total_75bp=counts['total']*75,
+            total_100bp=counts['total']*100,
+            total_150bp=counts['total']*150
+        )
+    )
+    print(
+        "Total mapped reads: {total:,} reads (Total throughput, 75bp={total_75bp:,} bp, 100bp={total_100bp:,} bp, "
+        "150bp={total_150bp:,} bp)".format(
+            total=counts['mapped'],
+            total_75bp=counts['mapped']*75,
+            total_100bp=counts['mapped']*100,
+            total_150bp=counts['mapped']*150
+        )
+    )
     print("Average mapped per lib: {:,} reads".format(int(round(float(counts['mapped'])/float(counts['files'])))))
     print("Average dups per lib: {:,} reads".format(int(round(float(counts['dups'])/float(counts['files'])))))
     print("Average dups % per lib: {:.2f} %".format(100*float(counts['dups'])/float(counts['mapped'])))

From 24399fccf714f75d668b1be28526e7a53a37a367 Mon Sep 17 00:00:00 2001
From: Anne Luesink <anneluesink@live.nl>
Date: Fri, 12 Apr 2024 14:22:06 +0200
Subject: [PATCH 47/73] Added package for Helix output

---
 DIMS/GenerateViolinPlots.R | 1 +
 1 file changed, 1 insertion(+)

diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R
index 2a79d40..38a8c61 100644
--- a/DIMS/GenerateViolinPlots.R
+++ b/DIMS/GenerateViolinPlots.R
@@ -16,6 +16,7 @@ library(reshape2)
 library(openxlsx)
 library(ggplot2)
 suppressPackageStartupMessages(library("gridExtra"))
+library(stringr)
 
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)

From c22e1f0baa76261bdebaae3414c60e9d31357bee Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Mon, 15 Apr 2024 08:12:00 +0200
Subject: [PATCH 48/73] Docstrings summary to newline and comments with
 uppercase

---
 CheckQC/check_qc.py      |  39 ++++++++----
 CheckQC/test_check_qc.py | 128 ++++++++++++++++++++++-----------------
 2 files changed, 100 insertions(+), 67 deletions(-)

diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py
index dd236d4..3e65352 100644
--- a/CheckQC/check_qc.py
+++ b/CheckQC/check_qc.py
@@ -14,7 +14,8 @@
 
 
 def non_empty_existing_path(file_or_dir):
-    """This function checks whether the provided file or dir exists and is not empty.
+    """
+    This function checks whether the provided file or dir exists and is not empty.
 
     Args:
         file_or_dir (string): Input file or directory
@@ -37,7 +38,8 @@ def non_empty_existing_path(file_or_dir):
 
 
 def parse_arguments_and_check(args_in):
-    """Parses arguments and validates / checks format of input.
+    """
+    Parses arguments and validates / checks format of input.
 
     Args:
         args_in (list of strings): Commandline input arguments.
@@ -72,7 +74,8 @@ def parse_arguments_and_check(args_in):
 
 
 def read_yaml(yaml_file):
-    """Read input yaml
+    """
+    Read input yaml
 
     Args:
         yaml_file (string): String with path to yaml file
@@ -90,7 +93,8 @@ def read_yaml(yaml_file):
 
 
 def check_allowed_operators(qc_operator):
-    """Check if provided qc_operator is allowed.
+    """
+    Check if provided qc_operator is allowed.
 
     Args:
         qc_operator (string): (Custom / math) operator
@@ -104,7 +108,8 @@ def check_allowed_operators(qc_operator):
 
 
 def check_required_keys_metrics(qc_metrics):
-    """Check if all required settings are included in the qc_settings
+    """
+    Check if all required settings are included in the qc_settings
 
     Args:
         qc_metrics (list with dicts): qc settings per qc metric
@@ -118,7 +123,8 @@ def check_required_keys_metrics(qc_metrics):
 
 
 def select_metrics(filename, input_files):
-    """Using regular expression to match the qc metric filename with the input files
+    """
+    Using regular expression to match the qc metric filename with the input files
 
     Args:
         filename (string): Filename of qc metric, could be regex.
@@ -138,7 +144,8 @@ def select_metrics(filename, input_files):
 
 
 def get_columns_to_report(qc_report_cols, qc_metric_cols, qc_col):
-    """Get valid columns to include in final output report
+    """
+    Get valid columns to include in final output report
 
     Args:
         qc_report_cols (list, string): column name(s) to include in report.
@@ -167,7 +174,8 @@ def get_columns_to_report(qc_report_cols, qc_metric_cols, qc_col):
 
 
 def add_and_rename_columns(qc_metric, qc_title, qc_col, qc_operator, qc_threshold):
-    """Add and rename columns in qc_metric.
+    """
+    Add and rename columns in qc_metric.
 
     Args:
         qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values
@@ -190,7 +198,8 @@ def add_and_rename_columns(qc_metric, qc_title, qc_col, qc_operator, qc_threshol
 
 
 def get_failed_rows(qc_metric, qc_col, qc_operator, qc_threshold):
-    """Get rows that fail provided qc threshold
+    """
+    Get rows that fail provided qc threshold
 
     Args:
         qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values
@@ -220,7 +229,8 @@ def get_failed_rows(qc_metric, qc_col, qc_operator, qc_threshold):
 
 
 def add_failed_samples_metric(qc_metric, failed_rows, report_cols, sample_cols):
-    """Failed samples are added to the output metric, and removed from qc_metric.
+    """
+    Failed samples are added to the output metric, and removed from qc_metric.
 
     Args:
         qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values
@@ -263,7 +273,8 @@ def add_failed_samples_metric(qc_metric, failed_rows, report_cols, sample_cols):
 
 
 def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols):
-    """Passed samples are added to the output metric
+    """
+    Passed samples are added to the output metric
 
     Args:
         qc_metric (DataFrame): DataFrame of qc metric without failed rows
@@ -295,7 +306,8 @@ def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols):
 
 
 def create_and_write_output(qc_output, output_path, output_prefix):
-    """Joined qc metrics is created and written to output file.
+    """
+    Joined qc metrics is created and written to output file.
 
     Args:
         qc_output (pandas DataFrame): Sorted judged qc metric for both passed and failed samples, without duplicates.
@@ -310,7 +322,8 @@ def create_and_write_output(qc_output, output_path, output_prefix):
 
 
 def read_and_judge_metrics(qc, metrics):
-    """Read and judge each single qc metric and join results.
+    """
+    Read and judge each single qc metric and join results.
 
     Args:
         qc (dict): qc settings of the metric
diff --git a/CheckQC/test_check_qc.py b/CheckQC/test_check_qc.py
index 629d439..9d572ed 100644
--- a/CheckQC/test_check_qc.py
+++ b/CheckQC/test_check_qc.py
@@ -15,7 +15,7 @@
 @pytest.fixture(scope="module", autouse=True)
 def setup_test_path(tmp_path_factory):
     test_tmp_path = str(tmp_path_factory.mktemp("test")) + "/"
-    # create empty files
+    # Create empty files
     open(str(test_tmp_path) + "/empty.txt", "a").close()
     open(str(test_tmp_path) + "/empty.yaml", "a").close()
     return test_tmp_path
@@ -78,11 +78,14 @@ def test_required_keys_present(self):
             [{"filename": "fakename"}],
             [
                 {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake",  "report_cols": "fake"},
-                {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"},  # missing report_cols
+                # Missing report_cols
+                {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"},
             ],
             [
-                {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"},  # missing report_cols
-                {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"},  # missing report_cols
+                # Missing report_cols
+                {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"},
+                # Missing report_cols
+                {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"},
             ]
         ]
     )
@@ -96,21 +99,21 @@ def test_missing_keys(self, incomplete_qc_metrics):
 
 class TestSelectMetrics():
     @pytest.mark.parametrize("filename_or_regex,input_files,expected", [
-        # multi match
+        # Multi match
         ("test", ["test1.txt", "test2.txt"], ["test1.txt", "test2.txt"]),
-        # single match
+        # Single match
         ("test", ["test1.txt", "fake2.txt"],  ["test1.txt"]),
-        # # match with relative path
+        # Match with relative path
         ("test", ["./random/path/to/test1.txt"],  ["./random/path/to/test1.txt"]),
-        # match with absolute path
+        # Match with absolute path
         ("test", ["/random/path/to/test1.txt"],  ["/random/path/to/test1.txt"]),
-        # match regex: kinship file suffix
+        # Match regex: kinship file suffix
         (
             ".*.kinship_check.out$",
             ["240101_A00295_0001_AHWCFKDSX7_CREv4_1.kinship_check.out"],
             ["240101_A00295_0001_AHWCFKDSX7_CREv4_1.kinship_check.out"]
         ),
-        # match on word truth and SNP
+        # Match on word truth and SNP
         (
             ".*truth.*SNP",
             [
@@ -119,7 +122,7 @@ class TestSelectMetrics():
             ],
             ['2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv']
         ),
-        # match on word truth and SNP
+        # Match on word truth and SNP
         (
             ".*truth.*SNP",
             [
@@ -128,7 +131,7 @@ class TestSelectMetrics():
             ],
             ['U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv']
         ),
-        # match if 'truth' is absent and contains 'SNP'
+        # Match if 'truth' is absent and contains 'SNP'
         # ?: Match expression but do not capture it
         # ?! Match if 'truth' is absent.
         (
@@ -157,10 +160,14 @@ def test_no_match(self):
 class TestGetColumnsToReport():
     @pytest.mark.parametrize("report_cols,metric_cols,qc_col,expected", [
         (["col1"], ["col1"], "col1", ["qc_title", "qc_value"]),
-        (["col1", "col2"], ["col1", "col2"], "col1", ["qc_title", "qc_value", "col2"]),  # additional report col
-        (["col1", "col2"], ["col1", "col2"], "col2", ["qc_title", "qc_value", "col1"]),  # different order output
-        (["col1"], ["col1", "col3"], "col1", ["qc_title", "qc_value"]),  # additional metric col
-        ("@all", ["col1", "col2"], "col1", ["qc_title", "qc_value", "col2"]),  # special @all option
+        # Additional report col
+        (["col1", "col2"], ["col1", "col2"], "col1", ["qc_title", "qc_value", "col2"]),
+        # Different order output
+        (["col1", "col2"], ["col1", "col2"], "col2", ["qc_title", "qc_value", "col1"]),
+        # Additional metric col
+        (["col1"], ["col1", "col3"], "col1", ["qc_title", "qc_value"]),
+        # Special @all option
+        ("@all", ["col1", "col2"], "col1", ["qc_title", "qc_value", "col2"]),
     ])
     def test_get_columns_to_report(self, report_cols, metric_cols, qc_col, expected):
         qc_report_cols = check_qc.get_columns_to_report(report_cols, metric_cols, qc_col)
@@ -182,11 +189,11 @@ class TestAddAndRenameColumns():
     def test_add_and_rename_columns(self):
         fake_qc_metric = DataFrame({"sample": ["sample1"], "fake_qc_col": ["0.01"]})
         qc_metric_out = check_qc.add_and_rename_columns(fake_qc_metric, "FAKE_title", "fake_qc_col", "fake_op", "fake_thres")
-        # assert expected column values
+        # Assert expected column values
         assert qc_metric_out["qc_title"].values == "fake_title"
         assert qc_metric_out["qc_status"].values == "PASS"
         assert qc_metric_out["qc_check"].values == "fake_thres fake_op fake_qc_col"
-        # assert all expected columns exist
+        # Assert all expected columns exist
         assert not list(
             set(['sample', 'qc_value', 'qc_title', 'qc_status', 'qc_check', 'qc_msg']) - set(qc_metric_out.columns)
         )
@@ -195,10 +202,14 @@ def test_add_and_rename_columns(self):
 
 class TestGetFailedRows():
     @pytest.mark.parametrize("qc_op,qc_thres", [
-        ("match", "fake_thres"),  # test match
-        ("==", "FAIL"),  # test string
-        ("==", 1),  # test int
-        ("==", 0.1),  # test float
+        # Test match
+        ("match", "fake_thres"),
+        # Test string
+        ("==", "FAIL"),
+        # Test int
+        ("==", 1),
+        # Test float
+        ("==", 0.1),
     ])
     def test_correct(self, qc_op, qc_thres):
         fake_qc_metric = DataFrame({"sample": ["sample1"], "fake_qc_col": [qc_thres]})
@@ -223,9 +234,12 @@ def test_add_failed_samples_single_sample_col(self):
         failed_rows = fake_qc_metric.loc[fake_qc_metric["sample_col"] == "sample2"].index
         qc_metric, qc_metric_out = check_qc.add_failed_samples_metric(
             fake_qc_metric, failed_rows, fake_qc_metric.columns.to_list(), ["sample_col"])
-        assert "sample" in qc_metric_out.columns.to_list()  # test rename column
-        assert "sample2" not in qc_metric["sample_col"].to_list()  # test removal failed sample
-        assert "sample2" in qc_metric_out["sample"].to_list()  # test added failed sample
+        # Test rename column
+        assert "sample" in qc_metric_out.columns.to_list()
+        # Test removal failed sample
+        assert "sample2" not in qc_metric["sample_col"].to_list()
+        # Test added failed sample
+        assert "sample2" in qc_metric_out["sample"].to_list()
         assert len(qc_metric) == 1 and len(qc_metric_out) == 1
         assert qc_metric_out["qc_status"].values == "FAIL"
 
@@ -235,18 +249,21 @@ def test_add_failed_samples_multi_sample_col(self):
             columns=["sample_col1", "sample_col2"]
         )
         fake_kinship_metric = fake_kinship_metric.assign(qc_check="checks", qc_value="wrong")
-        failed_rows = fake_kinship_metric.iloc[0:2].index  # define sample1 vs sample2 and sample1 vs sample3 as failed
+        # Define sample1 vs sample2 and sample1 vs sample3 as failed
+        failed_rows = fake_kinship_metric.iloc[0:2].index
         qc_metric, qc_metric_out = check_qc.add_failed_samples_metric(
             fake_kinship_metric, failed_rows, fake_kinship_metric.columns.to_list(), ["sample_col1", "sample_col2"])
         for failed_sample in ["sample1", "sample2", "sample3"]:
-            # test removal failed sample
+            # Test removal failed sample
             assert failed_sample not in list(qc_metric[["sample_col1", "sample_col2"]].values.ravel())
-            assert failed_sample in qc_metric_out["sample"].to_list()  # test added failed sample
+            # Test added failed sample
+            assert failed_sample in qc_metric_out["sample"].to_list()
         assert qc_metric_out["qc_status"].values.all() == "FAIL"
         assert len(qc_metric) == 1
         twice_failed = qc_metric_out.loc[qc_metric_out["sample"] == "sample1"]
-        assert "wrong;wrong" == twice_failed["qc_value"].item()  # assert join with ';' on column qc_value
-        # assert join with ';' on column qc_msg
+        # Assert join with ';' on column qc_value
+        assert "wrong;wrong" == twice_failed["qc_value"].item()
+        # Assert join with ';' on column qc_msg
         assert "sample1 sample2 checks wrong;sample1 sample3 checks wrong" == twice_failed["qc_msg"].item()
         for passed_sample in ["sample4", "sample5"]:
             assert passed_sample in list(qc_metric[["sample_col1", "sample_col2"]].values.ravel())
@@ -276,20 +293,23 @@ def test_add_passed_samples_multi_sample_col(self):
         )
         qc_metric_out = check_qc.add_passed_samples_metric(
             fake_qc_metric, fake_sample_qc, ["sample_col1", "sample_col2"])
-        assert "sample" in qc_metric_out.columns.to_list()  # test rename column
-        assert qc_metric_out["sample"].to_list().count("s4") == 1  # test removal duplicates
-        assert "new_col" not in qc_metric_out.columns.to_list()  # test additional columns ignored
+        # Test rename column
+        assert "sample" in qc_metric_out.columns.to_list()
+        # Test removal duplicates
+        assert qc_metric_out["sample"].to_list().count("s4") == 1
+        # Test additional columns ignored
+        assert "new_col" not in qc_metric_out.columns.to_list()
 
 
 class TestCreateAndWriteOutput():
     @pytest.mark.parametrize("exp_summary,qc_output", [
-        # all qc checks passed
+        # All qc checks passed
         ("PASS", DataFrame({"sample": ["s1"], "qc_status_cov": ["PASS"], "qc_status_kinship": ["PASS"]})),
-        # single qc check failed
+        # Single qc check failed
         ("FAIL", DataFrame({"sample": ["s1"], "qc_status_cov": ["PASS"], "qc_status_kinship": ["FAIL"]})),
-        # all qc check failed
+        # All qc check failed
         ("FAIL", DataFrame({"sample": ["s1"], "qc_status_cov": ["FAIL"], "qc_status_kinship": ["FAIL"]})),
-        # not restricted to qc_status_<check> column name
+        # Not restricted to qc_status_<check> column name
         ("PASS", DataFrame({"sample": ["s1"], "random_col1": ["PASS"], "random_col2": ["PASS"]})),
     ])
     def test_create_and_write_output(self, setup_test_path, exp_summary, qc_output):
@@ -304,61 +324,61 @@ def test_create_and_write_output(self, setup_test_path, exp_summary, qc_output):
 
 class TestGetOutputMetrics():
     @pytest.mark.parametrize("data_in,nr_rows", [
-        # single sample
+        # Single sample
         (["sample1_fake_check.txt"], 1),
-        # multiple single samples
+        # Multiple single samples
         (["sample1_fake_check.txt", "sample2_fake_check.txt"], 2),
-        # single multi samples
+        # Single multi samples
         (["240101_fake_check.txt"], 2),
-        # multiple multi samples
+        # Multiple multi samples
         (["240101_fake_check.txt", "240102_fake_check.txt"], 4),
-        # multi and single sample
+        # Multi and single sample
         (["sample1_fake_check.txt", "240101_fake_check.txt"], 3),
     ])
     def test_input_ok(self, data_in, nr_rows, dataset, datadir):
         datadir_files = [f"{datadir}/{filename}" for filename in data_in]
-        # input1 = datadir / "sample1_fake_check.txt"
         df_output = check_qc.read_and_judge_metrics(dataset["settings_single_metric"]["metrics"][0], datadir_files)
         assert not df_output.empty
         observed_cols = df_output.columns.to_list()
-        assert df_output.shape[0] == nr_rows  # shape results in tuple with no. rows and no. cols
+        # Shape results in tuple with no. rows and no. cols
+        assert df_output.shape[0] == nr_rows
         assert len(observed_cols) == 5
         assert observed_cols == ['sample', 'qc_check_fc', 'qc_status_fc', 'qc_msg_fc', 'qc_value_fc']
 
     @pytest.mark.parametrize("data_in,nr_rows,exp_warn_msg", [
-        # single sample duplicate
+        # Single sample duplicate
         (["sample1_fake_check.txt"]*2, 1, "Sample IDs occur multiple times in input:"),
-        # single multi samples duplicate
+        # Single multi samples duplicate
         (["240101_fake_check.txt"]*2, 2, "Sample IDs occur multiple times in input:"),
-        # multiple multi samples, duplicate samples
+        # Multiple multi samples, duplicate samples
         (["240101_fake_check.txt", "240101_v2_fake_check.txt"], 4, "Different qc values for duplicated sample IDs in input:"),
     ])
     def test_input_warn(self, data_in, nr_rows, exp_warn_msg, dataset, datadir):
         datadir_files = [f"{datadir}/{filename}" for filename in data_in]
-        # input1 = datadir / "sample1_fake_check.txt"
         with pytest.warns(UserWarning) as match_warning:
             df_output = check_qc.read_and_judge_metrics(dataset["settings_single_metric"]["metrics"][0], datadir_files)
         warn_msg = match_warning[0].message.args[0]
         assert exp_warn_msg in warn_msg
         assert not df_output.empty
         observed_cols = df_output.columns.to_list()
-        assert df_output.shape[0] == nr_rows  # Shape: tuple with no. rows and no. cols
+        # Shape: tuple with no. rows and no. cols
+        assert df_output.shape[0] == nr_rows
         assert len(observed_cols) == 5
         assert observed_cols == ['sample', 'qc_check_fc', 'qc_status_fc', 'qc_msg_fc', 'qc_value_fc']
 
 
 class TestCheckQc():
     @pytest.mark.parametrize("settings,data_in,exp_shape", [
-        # single metric, single sample input
+        # Single metric, single sample input
         ("settings_single_metric", ["sample1_fake_check.txt"], (1, 5)),
-        # two metrics, single sample input
+        # Two metrics, single sample input
         ("settings_two_metrics", ["sample1_fake_check.txt"], (1, 9)),
-        # single metric, multiple samples input
+        # Single metric, multiple samples input
         ("settings_single_metric", ["240101_fake_check.txt"], (2, 5)),
         ("settings_single_metric", ["240101_fake_check.txt", "240102_fake_check.txt"], (4, 5)),
-        # two metrics, multiple sample input
+        # Two metrics, multiple sample input
         ("settings_two_metrics", ["240101_fake_check.txt", "240102_fake_check.txt"], (4, 9)),
-        # two metric, multi and single sample input
+        # Two metricS, multi and single sample input
         ("settings_two_metrics", ["sample1_fake_check.txt", "240101_fake_check.txt"], (3, 9)),
     ])
     def test_ok(self, settings, data_in, exp_shape, datadir, dataset, mocker, ):

From f66b1f87a01eea7efb42725b7c771fc668eba0b6 Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Mon, 15 Apr 2024 08:15:28 +0200
Subject: [PATCH 49/73] Change isalpha to punctuation check to determine if
 filename string or regex

---
 CheckQC/check_qc.py      | 9 ++++++---
 CheckQC/test_check_qc.py | 8 +++++++-
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py
index 3e65352..472bf89 100644
--- a/CheckQC/check_qc.py
+++ b/CheckQC/check_qc.py
@@ -5,6 +5,7 @@
 from os import strerror as os_strerror
 from pathlib import Path
 import re
+from string import punctuation
 import sys
 import warnings
 
@@ -127,14 +128,16 @@ def select_metrics(filename, input_files):
     Using regular expression to match the qc metric filename with the input files
 
     Args:
-        filename (string): Filename of qc metric, could be regex.
+        filename (string): Filename of qc metric, could be a regex.
         input_files (list): All qc metrics input files.
 
     Returns:
         list: Input files matching the given filename.
     """
-    # If filename is string, change into regex to match absolute and relative paths in input_files.
-    if filename.isalpha():
+    # Change filename without special characters (except for '_', '-' and '.') into
+    # a regex pattern to match absolute and relative paths in input_files.
+    special_symbols = set(punctuation) - set(".", "_", "-")
+    if not any([char in special_symbols for char in set(filename)]):
         filename = ".*" + filename
     metrics = list(filter(re.compile(f"{filename}").match, input_files))
     if not metrics:
diff --git a/CheckQC/test_check_qc.py b/CheckQC/test_check_qc.py
index 9d572ed..baedf3d 100644
--- a/CheckQC/test_check_qc.py
+++ b/CheckQC/test_check_qc.py
@@ -148,7 +148,13 @@ def test_select_metric(self, filename_or_regex, input_files, expected):
         metrics = check_qc.select_metrics(filename_or_regex, input_files)
         assert metrics == expected
 
-    def test_no_match(self):
+    @pytest.mark.parametrize("filename_or_regex,input_files", [
+        # No match
+        ("test", ["fake1.txt", "fake2.txt"]),
+        # No match, filename with specialchar assumed to be a regex
+        ("specialchar_@", ["12/specialchar_@.txt"]),
+    ])
+    def test_no_match(self, filename_or_regex, input_files):
         with pytest.warns(UserWarning) as match_warning:
             return_val = check_qc.select_metrics("test", ["fake1.txt", "fake2.txt"])
         warn_msg = match_warning[0].message.args[0]

From d83ca8d58eb4fcd42afcb5cb215ee27f8ec1c7ab Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Mon, 15 Apr 2024 08:51:22 +0200
Subject: [PATCH 50/73] Moved comments to new lines and resolved flake8 errors

---
 GenderCheck/test_calculate_gender.py              | 15 ++++++++-------
 .../1.0.0/test_get_gender_from_bam_chrx.py        |  4 +++-
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/GenderCheck/test_calculate_gender.py b/GenderCheck/test_calculate_gender.py
index 0e84e0b..334c158 100644
--- a/GenderCheck/test_calculate_gender.py
+++ b/GenderCheck/test_calculate_gender.py
@@ -4,15 +4,12 @@
 
 
 class TestIsValidRead():
-
-
     class MyObject:
         def __init__(self, qual, start, end):
             self.mapping_quality = qual
             self.reference_start = start
             self.reference_end = end
 
-
     @pytest.mark.parametrize("read,mapping_qual,expected", [
         (MyObject(19, True, True), 20, False),  # mapping quality is below the threshold
         (MyObject(20, True, True), 20, True),  # mapping quality is equal to the threshold
@@ -35,10 +32,14 @@ def test_get_gender_from_bam(self, bam, mapping_qual, locus_y, ratio_y, expected
 
 class TestCompareGender():
     @pytest.mark.parametrize("sample_id,analysis_id,test_gender,true_gender,expected", [
-        ("test_sample", "test_analyse", "male", "male", "test_sample\ttest_analyse\tmale\tmale\tPASS\n"),  # test_gender and true_gender identical, PASS
-        ("test_sample", "test_analyse", "male", "female", "test_sample\ttest_analyse\tmale\tfemale\tFAIL\n"),  # test_gender and true_gender not identical , FAIL
-        ("test_sample", "test_analyse", "male", "unknown", "test_sample\ttest_analyse\tmale\tunknown\tPASS\n"),  # true_gender unknown, PASS
-        ("test_sample", "test_analyse", "male", "not_detected", "test_sample\ttest_analyse\tmale\tnot_detected\tFAIL\n"),  # true_gender not_detected, FAIL
+        # test_gender and true_gender identical, PASS
+        ("test_sample", "test_analyse", "male", "male", "test_sample\ttest_analyse\tmale\tmale\tPASS\n"),
+        # test_gender and true_gender not identical , FAIL
+        ("test_sample", "test_analyse", "male", "female", "test_sample\ttest_analyse\tmale\tfemale\tFAIL\n"),
+        # true_gender unknown, PASS
+        ("test_sample", "test_analyse", "male", "unknown", "test_sample\ttest_analyse\tmale\tunknown\tPASS\n"),
+        # true_gender not_detected, FAIL
+        ("test_sample", "test_analyse", "male", "not_detected", "test_sample\ttest_analyse\tmale\tnot_detected\tFAIL\n"),
     ])
     def test_compare_gender(self, sample_id, analysis_id, test_gender, true_gender, expected):
         assert expected == calculate_gender.compare_gender(sample_id, analysis_id, test_gender, true_gender)
diff --git a/MosaicHunter/1.0.0/test_get_gender_from_bam_chrx.py b/MosaicHunter/1.0.0/test_get_gender_from_bam_chrx.py
index 0a494ea..7674f45 100644
--- a/MosaicHunter/1.0.0/test_get_gender_from_bam_chrx.py
+++ b/MosaicHunter/1.0.0/test_get_gender_from_bam_chrx.py
@@ -26,7 +26,9 @@ class TestGetGenderFromBam:
         ("./test_bam.bam", 20, "X:2699520-154931044", 5.5, 7.5, ("M", False)),
         ("./test_bam.bam", 20, "X:2699520-154931044", 4.5, 6.5, ("F", True)),
     ])
-    def test_get_gender_from_bam(self, bam, mapping_qual, locus_x, ratio_x_threshold_male, ratio_x_threshold_female, expected_outcome):
+    def test_get_gender_from_bam(
+            self, bam, mapping_qual, locus_x, ratio_x_threshold_male, ratio_x_threshold_female, expected_outcome
+    ):
         assert expected_outcome == get_gender_from_bam_chrx.get_gender_from_bam_chrx(
             bam, mapping_qual, locus_x, ratio_x_threshold_male, ratio_x_threshold_female)
 

From b78741dce7fe90b7c4a8e70dd5a8fd6dcfbaaad9 Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Mon, 15 Apr 2024 09:02:05 +0200
Subject: [PATCH 51/73] Single newline at end of file (.nf)

---
 ExomeDepth/CallCNV.nf           | 2 +-
 ExomeDepth/GetRefset.nf         | 2 +-
 ExomeDepth/Summary.nf           | 2 +-
 ExonCov/ImportBam.nf            | 2 +-
 ExonCov/SampleQC.nf             | 2 +-
 Kinship/Kinship.nf              | 2 +-
 TrendAnalysis/TrendAnalysis.nf  | 2 +-
 UPD/IGV.nf                      | 2 +-
 Utils/CreateHSmetricsSummary.nf | 2 +-
 Utils/EditSummaryFileHappy.nf   | 2 +-
 Utils/GetStatsFromFlagstat.nf   | 2 +-
 Utils/ParseChildFromFullTrio.nf | 2 +-
 Utils/SavePedFile.nf            | 2 +-
 Utils/VersionLog.nf             | 2 +-
 14 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/ExomeDepth/CallCNV.nf b/ExomeDepth/CallCNV.nf
index 9a3197e..fda8ef1 100644
--- a/ExomeDepth/CallCNV.nf
+++ b/ExomeDepth/CallCNV.nf
@@ -22,4 +22,4 @@ process CallCNV {
         source ${params.dx_resources_path}/${params.exomedepth_path}/venv/bin/activate
         python ${params.dx_resources_path}/${params.exomedepth_path}/run_ExomeDepth.py callcnv ./ ${bam_file} ${analysis_id} ${sample_id}
         """
-}
\ No newline at end of file
+}
diff --git a/ExomeDepth/GetRefset.nf b/ExomeDepth/GetRefset.nf
index efff299..fbe9d7b 100644
--- a/ExomeDepth/GetRefset.nf
+++ b/ExomeDepth/GetRefset.nf
@@ -18,4 +18,4 @@ process GetRefset {
         python ${params.dx_resources_path}/${params.exomedepth_path}/exomedepth_db.py add_sample_return_refset_bam ${bam_file} --print_refset_stdout | \
         tr -d '\n'
         """
-}
\ No newline at end of file
+}
diff --git a/ExomeDepth/Summary.nf b/ExomeDepth/Summary.nf
index 3244d8c..a883376 100644
--- a/ExomeDepth/Summary.nf
+++ b/ExomeDepth/Summary.nf
@@ -17,4 +17,4 @@ process Summary {
         source ${params.dx_resources_path}/${params.exomedepth_path}/venv/bin/activate
         python ${params.dx_resources_path}/${params.exomedepth_path}/run_ExomeDepth.py summary ${exomedepth_logs} > ${analysis_id}_exomedepth_summary.txt
         """
-}
\ No newline at end of file
+}
diff --git a/ExonCov/ImportBam.nf b/ExonCov/ImportBam.nf
index 474ae35..ff193da 100644
--- a/ExonCov/ImportBam.nf
+++ b/ExonCov/ImportBam.nf
@@ -21,4 +21,4 @@ process ImportBam {
         --exon_bed ${params.dxtracks_path}/${params.exoncov_bed} \
         ${analysis_id} WES ${bam_file} | tr -d '\n'
         """
-}
\ No newline at end of file
+}
diff --git a/ExonCov/SampleQC.nf b/ExonCov/SampleQC.nf
index 49ecf2d..0c5bd78 100644
--- a/ExonCov/SampleQC.nf
+++ b/ExonCov/SampleQC.nf
@@ -19,4 +19,4 @@ process SampleQC {
         python ${params.exoncov_path}/ExonCov.py sample_qc \
         -s ${samples} -p ${panels} > ${analysis_id}.ExonCovQC_check.out
         """
-}
\ No newline at end of file
+}
diff --git a/Kinship/Kinship.nf b/Kinship/Kinship.nf
index 9241dbf..fd051fe 100644
--- a/Kinship/Kinship.nf
+++ b/Kinship/Kinship.nf
@@ -19,4 +19,4 @@ process Kinship {
         cp king.kin0 ${analysis_id}.kinship
         python ${projectDir}/CustomModules/Kinship/check_kinship.py ${analysis_id}.kinship ${ped_file} > ${analysis_id}.kinship_check.out
         """
-}
\ No newline at end of file
+}
diff --git a/TrendAnalysis/TrendAnalysis.nf b/TrendAnalysis/TrendAnalysis.nf
index c8968ab..5009861 100644
--- a/TrendAnalysis/TrendAnalysis.nf
+++ b/TrendAnalysis/TrendAnalysis.nf
@@ -12,4 +12,4 @@ process TrendAnalysis {
         source ${params.trend_analysis_path}/venv/bin/activate
         python ${params.trend_analysis_path}/trend_analysis.py upload processed_data ${analysis_id} .
         """
-}
\ No newline at end of file
+}
diff --git a/UPD/IGV.nf b/UPD/IGV.nf
index 2ea72af..f2974c7 100644
--- a/UPD/IGV.nf
+++ b/UPD/IGV.nf
@@ -19,4 +19,4 @@ process IGV {
         source ${params.upd_path}/venv/bin/activate
         python ${params.upd_path}/make_UPD_igv.py ${ped_file} ${analysis_id} $trio_sample ${vcf_files} -c
         """
-}
\ No newline at end of file
+}
diff --git a/Utils/CreateHSmetricsSummary.nf b/Utils/CreateHSmetricsSummary.nf
index 2ca443f..45553cb 100644
--- a/Utils/CreateHSmetricsSummary.nf
+++ b/Utils/CreateHSmetricsSummary.nf
@@ -14,4 +14,4 @@ process CreateHSmetricsSummary {
         """
         python2 ${projectDir}/CustomModules/Utils/create_hsmetrics_summary.py ${hsmetrics_files} > HSMetrics_summary.txt
         """
-}
\ No newline at end of file
+}
diff --git a/Utils/EditSummaryFileHappy.nf b/Utils/EditSummaryFileHappy.nf
index 8050c46..a4aba58 100644
--- a/Utils/EditSummaryFileHappy.nf
+++ b/Utils/EditSummaryFileHappy.nf
@@ -27,4 +27,4 @@ process EditSummaryFileHappy {
         # Remove tmp files
         rm ${summary_csv}.tmp     
         """
-}
\ No newline at end of file
+}
diff --git a/Utils/GetStatsFromFlagstat.nf b/Utils/GetStatsFromFlagstat.nf
index 99018ed..27e6bbe 100644
--- a/Utils/GetStatsFromFlagstat.nf
+++ b/Utils/GetStatsFromFlagstat.nf
@@ -14,4 +14,4 @@ process GetStatsFromFlagstat {
         """
         python2 ${projectDir}/CustomModules/Utils/get_stats_from_flagstat.py ${flagstat_files} > run_stats.txt
         """
-}
\ No newline at end of file
+}
diff --git a/Utils/ParseChildFromFullTrio.nf b/Utils/ParseChildFromFullTrio.nf
index 9963277..b9377bd 100644
--- a/Utils/ParseChildFromFullTrio.nf
+++ b/Utils/ParseChildFromFullTrio.nf
@@ -17,4 +17,4 @@ process ParseChildFromFullTrio {
         """
         python2 ${projectDir}/CustomModules/Utils/parse_child_from_fulltrio.py ${ped_file} ${sample_ids} | tr -d '\n'
         """
-}
\ No newline at end of file
+}
diff --git a/Utils/SavePedFile.nf b/Utils/SavePedFile.nf
index 1c49c07..00f45a3 100644
--- a/Utils/SavePedFile.nf
+++ b/Utils/SavePedFile.nf
@@ -14,4 +14,4 @@ process SavePedFile {
         """
         cp --remove-destination "\$(readlink ${ped_file})" ./
         """
-}
\ No newline at end of file
+}
diff --git a/Utils/VersionLog.nf b/Utils/VersionLog.nf
index 9d5c654..cff4f49 100644
--- a/Utils/VersionLog.nf
+++ b/Utils/VersionLog.nf
@@ -22,4 +22,4 @@ process VersionLog {
             echo "\${git_dir}: \"\${described_tags}\"" >> versions.yml
         done
         """
-}
\ No newline at end of file
+}

From 37c67e605ddbc32bd84bdc903a4f8f4cf43990e2 Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Mon, 15 Apr 2024 09:04:45 +0200
Subject: [PATCH 52/73] replace noqa W605 with a regex string (r"")

---
 Utils/create_hsmetrics_summary.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Utils/create_hsmetrics_summary.py b/Utils/create_hsmetrics_summary.py
index 4256fc3..582b61e 100644
--- a/Utils/create_hsmetrics_summary.py
+++ b/Utils/create_hsmetrics_summary.py
@@ -8,7 +8,7 @@
     parser.add_argument('hsmetrics_files', type=argparse.FileType('r'), nargs='*', help='HSMetric file')
     arguments = parser.parse_args()
 
-    interval_files_pattern = re.compile("BAIT_INTERVALS=\[(\S*)\].TARGET_INTERVALS=\[(\S*)\]")  # noqa: W605
+    interval_files_pattern = re.compile(r"BAIT_INTERVALS=\[(\S*)\].TARGET_INTERVALS=\[(\S*)\]")
     summary_header = []
     summary_data = {}
     for hsmetrics_file in arguments.hsmetrics_files:

From 54670e7153830761fc07e98fc60b539fbd4eb3c1 Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Mon, 15 Apr 2024 09:11:04 +0200
Subject: [PATCH 53/73] get quotechar from qc settings or use default.

---
 CheckQC/check_qc.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py
index 472bf89..791094f 100644
--- a/CheckQC/check_qc.py
+++ b/CheckQC/check_qc.py
@@ -336,7 +336,9 @@ def read_and_judge_metrics(qc, metrics):
         pandas DataFrame: Joined and judged qc metrics.
     """
     for qc_file in metrics:
-        qc_metric_raw = read_csv(qc_file, comment=qc.get("comment", None), delimiter=qc.get("delim", "\t"), quotechar='"')
+        qc_metric_raw = read_csv(
+            qc_file, comment=qc.get("comment", None), delimiter=qc.get("delim", "\t"), quotechar=qc.get("quotechar", '"')
+        )
         report_cols = get_columns_to_report(qc["report_cols"], qc_metric_raw.columns.to_list(), qc["qc_col"])
         qc_metric_edit = add_and_rename_columns(qc_metric_raw, qc["title"], qc["qc_col"], qc["operator"], qc["threshold"])
         failed_rows = get_failed_rows(qc_metric_edit, "qc_value", qc["operator"], qc["threshold"])

From 5202e47ccb93540a5d75bbf036112f13cb0502b7 Mon Sep 17 00:00:00 2001
From: ellendejong <de.jong.ellen@hotmail.com>
Date: Mon, 15 Apr 2024 10:47:04 +0200
Subject: [PATCH 54/73] add comment floats to string conversion

---
 CheckQC/check_qc.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py
index 791094f..d0b86f7 100644
--- a/CheckQC/check_qc.py
+++ b/CheckQC/check_qc.py
@@ -297,6 +297,8 @@ def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols):
                 .loc[:, qc_metric_out.columns]
             )
         ])
+    # Some qc metrics did result in errors when merging the tables.
+    # The merge failed when the column qc_value has floats stored as strings.
     # Try to convert column qc_value to float.
     # If ValueError is raised, probably because column is a string, continue.
     try:

From 038521dda6441cff824293be807a95d67d7e363f Mon Sep 17 00:00:00 2001
From: Anne Luesink <anneluesink@live.nl>
Date: Tue, 16 Apr 2024 09:40:52 +0200
Subject: [PATCH 55/73] Cleaned up GenerateViolinPlots AddOnFunctions

---
 DIMS/Utils/add_lab_id_and_onderzoeksnummer.R |  16 +++
 DIMS/Utils/check_same_samplename.R           |  10 ++
 DIMS/Utils/create_violin_plots.R             | 122 +++++++++++++++++++
 DIMS/Utils/get_patient_data_to_helix.R       |  39 ++++++
 DIMS/Utils/is_diagnostic_patient.R           |  11 ++
 DIMS/Utils/output_helix.R                    |  39 ++++++
 DIMS/Utils/prepare_alarmvalues.R             |  59 +++++++++
 DIMS/Utils/prepare_data.R                    |  51 ++++++++
 DIMS/Utils/prepare_data_perpage.R            |  58 +++++++++
 DIMS/Utils/prepare_toplist.R                 |  35 ++++++
 10 files changed, 440 insertions(+)
 create mode 100644 DIMS/Utils/add_lab_id_and_onderzoeksnummer.R
 create mode 100644 DIMS/Utils/check_same_samplename.R
 create mode 100644 DIMS/Utils/create_violin_plots.R
 create mode 100644 DIMS/Utils/get_patient_data_to_helix.R
 create mode 100644 DIMS/Utils/is_diagnostic_patient.R
 create mode 100644 DIMS/Utils/output_helix.R
 create mode 100644 DIMS/Utils/prepare_alarmvalues.R
 create mode 100644 DIMS/Utils/prepare_data.R
 create mode 100644 DIMS/Utils/prepare_data_perpage.R
 create mode 100644 DIMS/Utils/prepare_toplist.R

diff --git a/DIMS/Utils/add_lab_id_and_onderzoeksnummer.R b/DIMS/Utils/add_lab_id_and_onderzoeksnummer.R
new file mode 100644
index 0000000..f2ff13e
--- /dev/null
+++ b/DIMS/Utils/add_lab_id_and_onderzoeksnummer.R
@@ -0,0 +1,16 @@
+add_lab_id_and_onderzoeksnummer <- function(df_metabs_helix) {
+  #' Adding labnummer and Onderzoeksnummer to a dataframe
+  #'
+  #' @param df_metabs_helix: dataframe with patient data to be uploaded to Helix
+  #'
+  #' @return: dataframe with added labnummer and Onderzoeksnummer columns
+
+  # Split patient number into labnummer and Onderzoeksnummer
+  for (row in 1:nrow(df_metabs_helix)) {
+    df_metabs_helix[row, "labnummer"] <- gsub("^P|\\.[0-9]*", "", df_metabs_helix[row, "Patient"])
+    labnummer_split <- strsplit(as.character(df_metabs_helix[row, "labnummer"]), "M")[[1]]
+    df_metabs_helix[row, "Onderzoeksnummer"] <- paste0("MB", labnummer_split[1], "/", labnummer_split[2])
+  }
+  
+  return(df_metabs_helix)
+}
diff --git a/DIMS/Utils/check_same_samplename.R b/DIMS/Utils/check_same_samplename.R
new file mode 100644
index 0000000..6e80841
--- /dev/null
+++ b/DIMS/Utils/check_same_samplename.R
@@ -0,0 +1,10 @@
+check_same_samplename <- function(int_col_name, zscore_col_name) {
+  #' A check to see if intensity and Z-score columns match
+  #'
+  #' @param int_col_name: name of an intensity column (string)
+  #' @param zscore_col_name: name of a Z-score column (string)
+  #'
+  #' @return: match or mismatch of the columns (boolean)
+
+  paste0(int_col_name, "_Zscore") == zscore_col_name
+}
diff --git a/DIMS/Utils/create_violin_plots.R b/DIMS/Utils/create_violin_plots.R
new file mode 100644
index 0000000..be4f1f6
--- /dev/null
+++ b/DIMS/Utils/create_violin_plots.R
@@ -0,0 +1,122 @@
+# remove parameter default value
+# add explanation variable to parameters
+create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt = NULL) {
+  #' Create violin plots for each patient
+  #'
+  #' @param pdf_dir: location where to save the pdf file (string)
+  #' @param pt_name: patient code (string)
+  #' @param metab_perpage: list of dataframe with a dataframe for each page in the violinplot pdf (list)
+  #' @param top_metab_pt: dataframe with increased and decrease metabolites for this patient (dataframe)
+
+  # set parameters for plots
+  plot_height <- 9.6
+  plot_width <- 6
+  fontsize <- 1
+  circlesize <- 0.8
+  colors_4plot <- c("#22E4AC", "#00B0F0", "#504FFF", "#A704FD", "#F36265", "#DA0641")
+  #                   green     blue      blue/purple purple    orange     red
+
+  # patient plots, create the PDF device
+  pt_name_sub <- pt_name
+  suffix <- ""
+  if (grepl("Diagnostics", pdf_dir) & is_diagnostic_patient(pt_name)) {
+    prefix <- "MB"
+    suffix <- "_DIMS_PL_DIAG"
+    # substitute P and M in P2020M00001 into right format for Helix
+    pt_name_sub <- gsub("[PM]", "", pt_name)
+    pt_name_sub <- gsub("\\..*", "", pt_name_sub)
+  } else if (grepl("Diagnostics", pdf_dir)) {
+    prefix <- "Dx_"
+  } else if (grepl("IEM", pdf_dir)) {
+    prefix <- "IEM_"
+  } else {
+    prefix <- "R_"
+  }
+
+  pdf(paste0(pdf_dir, "/", prefix, pt_name_sub, suffix, ".pdf"),
+      onefile = TRUE,
+      width = plot_width,
+      height = plot_height)
+
+  # page headers:
+  page_headers <- names(metab_perpage)
+
+  # put table into PDF file, if not empty
+  if (!is.null(dim(top_metab_pt))) {
+    plot.new()
+    # get the names and numbers in the table aligned
+    table_theme <- ttheme_default(core = list(fg_params = list(hjust = 0, x = 0.05, fontsize = 6)),
+                                  colhead = list(fg_params = list(fontsize = 8, fontface = "bold")))
+    grid.table(top_metab_pt, theme = table_theme, rows = NULL)
+    # g <- tableGrob(top_metab_pt)
+    # grid.draw(g)
+    text(x = 0.45, y = 1.02, paste0("Top deviating metabolites for patient: ", pt_name), font = 1, cex = 1)
+  }
+
+  # violin plots
+  for (page_index in 1:length(metab_perpage)) {
+    # extract list of metabolites to plot on a page
+    metab_list_2plot <- metab_perpage[[page_index]]
+    # extract original data for patient of interest (pt_name) before cut-offs
+    pt_list_2plot_orig <- metab_list_2plot[which(metab_list_2plot$variable == pt_name), ]
+    # cut off Z-scores higher than 20 or lower than -5 (for nicer plots)
+    metab_list_2plot$value[metab_list_2plot$value >  20] <-  20
+    metab_list_2plot$value[metab_list_2plot$value <  -5] <-  -5
+    # extract data for patient of interest (pt_name)
+    pt_list_2plot <- metab_list_2plot[which(metab_list_2plot$variable == pt_name), ]
+    # restore original Z-score before cut-off, for showing Z-scores in PDF
+    pt_list_2plot$value_orig <- pt_list_2plot_orig$value
+    # remove patient of interest (pt_name) from list; violins will be made up of controls and other patients
+    metab_list_2plot <- metab_list_2plot[-which(metab_list_2plot$variable == pt_name), ]
+    # subtitle per page
+    sub_perpage <- gsub("_", " ", page_headers[page_index])
+    # for IEM plots, put subtitle on two lines
+    sub_perpage <- gsub("probability", "\nprobability", sub_perpage)
+    # add size parameter for showing Z-score of patient per metabolite
+    z_size <- rep(3, nrow(pt_list_2plot))
+    # set size to 0 if row is empty
+    z_size[is.na(pt_list_2plot$value)] <- 0
+
+    # draw violin plot. shape=22 gives square for patient of interest
+    ggplot_object <- ggplot(metab_list_2plot, aes(x = value, y = HMDB_name)) +
+      theme(axis.text.y = element_text(size = rel(fontsize)), plot.caption = element_text(size = rel(fontsize))) +
+      xlim(-5, 20) +
+      geom_violin(scale = "width") +
+      geom_point(data = pt_list_2plot, aes(color = value), size = 3.5 * circlesize, shape = 22, fill = "white") +
+      scale_fill_gradientn(
+        colors = colors_4plot, values = NULL, space = "Lab", na.value = "grey50", guide = "colourbar",
+        aesthetics = "colour"
+      ) +
+      # add Z-score value for patient of interest at x=16
+      geom_text(
+        data = pt_list_2plot, aes(16, label = paste0("Z=", round(value_orig, 2))), hjust = "left", vjust = +0.2,
+        size = z_size
+      ) +
+      # add labels. Use font Courier to get all the plots in the same location.
+      labs(x = "Z-scores", y = "Metabolites", subtitle = sub_perpage, color = "z-score") +
+      theme(axis.text.y = element_text(family = "Courier", size = 6)) +
+      # do not show legend
+      theme(legend.position = "none") +
+      # add title
+      ggtitle(label = paste0("Results for patient ", pt_name)) +
+      # add vertical lines
+      geom_vline(xintercept = 2, col = "grey", lwd = 0.5, lty = 2) +
+      geom_vline(xintercept = -2, col = "grey", lwd = 0.5, lty = 2)
+
+    suppressWarnings(print(ggplot_object))
+
+  }
+
+  # add explanation of violin plots, version number etc.
+  plot(NA, xlim = c(0, 5), ylim = c(0, 5), bty = "n", xaxt = "n", yaxt = "n", xlab = "", ylab = "")
+  if (length(explanation) > 0) {
+    text(0.2, 5, explanation[1], pos = 4, cex = 0.8)
+    for (line_index in 2:length(explanation)) {
+      text_y_position <- 5 - (line_index * 0.2)
+      text(-0.2, text_y_position, explanation[line_index], pos = 4, cex = 0.5)
+    }
+  }
+
+  # close the PDF file
+  dev.off()
+}
diff --git a/DIMS/Utils/get_patient_data_to_helix.R b/DIMS/Utils/get_patient_data_to_helix.R
new file mode 100644
index 0000000..1eeaf7a
--- /dev/null
+++ b/DIMS/Utils/get_patient_data_to_helix.R
@@ -0,0 +1,39 @@
+get_patient_data_to_helix <- function(metab_interest_sorted, metab_list_all) {
+  #' Get patient data to be uploaded to Helix
+  #'
+  #' @param metab_interest_sorted: list of dataframes with metabolite Z-scores for each sample/patient (list)
+  #' @param metab_list_all: list of tables with metabolites for Helix and violin plots (list)
+  #'
+  #' @return: dataframe with patient data with only metabolites for Helix and violin plots 
+  #'          with Helix name, high/low Z-score cutoffs
+  
+  # Combine Z-scores of metab groups together
+  df_all_metabs_zscores <- bind_rows(metab_interest_sorted)
+  # Change columnnames 
+  colnames(df_all_metabs_zscores) <- c("HMDB_name", "Patient", "Z_score")
+  # Change Patient column to character instead of factor
+  df_all_metabs_zscores$Patient <- as.character(df_all_metabs_zscores$Patient)
+  
+  # Delete whitespaces HMDB_name 
+  df_all_metabs_zscores$HMDB_name <- str_trim(df_all_metabs_zscores$HMDB_name, "right")
+  
+  # Split HMDB_name column on "nitine;" for match dims_helix_table
+  df_all_metabs_zscores$HMDB_name_split <- str_split_fixed(df_all_metabs_zscores$HMDB_name, "nitine;", 2)[, 1]
+  
+  # Combine stofgroepen
+  dims_helix_table <- bind_rows(metab_list_all)
+  # Filter table for metabolites for Helix
+  dims_helix_table <- dims_helix_table %>% filter(Helix == "ja")
+  # Split HMDB_name column on "nitine;" for match df_all_metabs_zscores
+  dims_helix_table$HMDB_name_split <- str_split_fixed(dims_helix_table$HMDB_name, "nitine;", 2)[, 1]
+  dims_helix_table <- dims_helix_table %>% select(HMDB_name_split, Helix_naam, high_zscore, low_zscore)
+  
+  # Filter DIMS results for metabolites for Helix
+  df_metabs_helix <- df_all_metabs_zscores %>% filter(HMDB_name_split %in% dims_helix_table$HMDB_name_split)
+  # Combine dims_helix_table and df_metabs_helix, adding Helix codes etc.
+  df_metabs_helix <- df_metabs_helix %>% left_join(dims_helix_table, by = join_by(HMDB_name_split))
+  
+  df_metabs_helix <- df_metabs_helix %>% select(HMDB_name, Patient, Z_score, Helix_naam, high_zscore, low_zscore)
+  
+  return(df_metabs_helix)
+}
diff --git a/DIMS/Utils/is_diagnostic_patient.R b/DIMS/Utils/is_diagnostic_patient.R
new file mode 100644
index 0000000..5820838
--- /dev/null
+++ b/DIMS/Utils/is_diagnostic_patient.R
@@ -0,0 +1,11 @@
+is_diagnostic_patient <- function(patient_column) {
+  #' Check for Diagnostics patients with correct patient number (e.g. starting with "P2024M")
+  #'
+  #' @param patient_column: a column from dataframe with IDs (character vector)
+  #'
+  #' @return: a logical vector with TRUE or FALSE for each element (vector)
+  
+  diagnostic_patients <- grepl("^P[0-9]{4}M", patient_column)
+  
+  return(diagnostic_patients)
+}
diff --git a/DIMS/Utils/output_helix.R b/DIMS/Utils/output_helix.R
new file mode 100644
index 0000000..09fa1a9
--- /dev/null
+++ b/DIMS/Utils/output_helix.R
@@ -0,0 +1,39 @@
+output_for_helix <- function(protocol_name, df_metabs_helix) {
+  #' Get the output dataframe for Helix
+  #'
+  #' @param protocol_name: protocol name (string)
+  #' @param df_metabs_helix: dataframe with metabolite Z-scores for patients (dataframe)
+  #'
+  #' @return: dataframe with patient metabolite Z-scores in correct format for Helix
+
+  # Remove positive controls
+  df_metabs_helix <- df_metabs_helix %>% filter(is_diagnostic_patient(Patient))
+
+  # Add 'Vial' column, each patient has unique ID
+  df_metabs_helix <- df_metabs_helix %>%
+    group_by(Patient) %>%
+    mutate(Vial = cur_group_id()) %>%
+    ungroup()
+
+  # Split patient number into labnummer and Onderzoeksnummer
+  df_metabs_helix <- add_lab_id_and_onderzoeksnummer(df_metabs_helix)
+
+  # Add column with protocol name
+  df_metabs_helix$Protocol <- protocol_name
+
+  # Change name Z_score and Helix_naam columns to Amount and Name
+  change_columns <- c(Amount = "Z_score", Name = "Helix_naam")
+  df_metabs_helix <- df_metabs_helix %>% rename(all_of(change_columns))
+
+  # Select only necessary columns and set them in correct order
+  df_metabs_helix <- df_metabs_helix %>%
+    select(c(Vial, labnummer, Onderzoeksnummer, Protocol, Name, Amount))
+
+  # Remove duplicate patient-metabolite combinations ("leucine + isoleucine + allo-isoleucin_Z-score" is added 3 times)
+  df_metabs_helix <- df_metabs_helix %>%
+    group_by(Onderzoeksnummer, Name) %>%
+    distinct() %>%
+    ungroup()
+
+  return(df_metabs_helix)
+}
diff --git a/DIMS/Utils/prepare_alarmvalues.R b/DIMS/Utils/prepare_alarmvalues.R
new file mode 100644
index 0000000..94ffbfd
--- /dev/null
+++ b/DIMS/Utils/prepare_alarmvalues.R
@@ -0,0 +1,59 @@
+prepare_alarmvalues <- function(pt_name, dims_helix_table) {
+  #' Create a dataframe with all metabolites that exceed the min and max Z-score cutoffs
+  #'
+  #' @param pt_name: patient code (string)
+  #' @param dims_helix_table: dataframe with metabolite Z-scores for each patient and Helix info (dataframe)
+  #'
+  #' @return: dataframe with metabolites that exceed the min and max Z-score cutoffs for the selected patient
+
+  # extract data for patient of interest (pt_name)
+  pt_metabs_helix <- dims_helix_table %>% filter(Patient == pt_name)
+  pt_metabs_helix$Z_score <- round(pt_metabs_helix$Z_score, 2)
+
+  # Make empty dataframes for metabolites above or below alarmvalues
+  pt_list_high <- data.frame(HMDB_name = character(), Z_score = numeric())
+  pt_list_low <- data.frame(HMDB_name = character(), Z_score = numeric())
+
+  # Loop over individual metabolites
+  for (metab in unique(pt_metabs_helix$HMDB_name)){
+    # Get data for individual metabolite
+    pt_metab <- pt_metabs_helix %>% filter(HMDB_name == metab)
+    # print(pt_metab)
+
+    # Check if zscore is positive of negative
+    if (pt_metab$Z_score > 0) {
+      # Get specific alarmvalue for metabolite
+      high_zscore_cutoff_metab <- pt_metabs_helix %>% filter(HMDB_name == metab) %>% pull(high_zscore)
+
+      # If zscore is above the alarmvalue, add to pt_list_high table
+      if (pt_metab$Z_score > high_zscore_cutoff_metab) {
+        pt_metab_high <- pt_metab %>% select(HMDB_name, Z_score)
+        pt_list_high <- rbind(pt_list_high, pt_metab_high)
+      }
+    } else {
+      # Get specific alarmvalue for metabolite
+      low_zscore_cutoff_metab <- pt_metabs_helix %>% filter(HMDB_name == metab) %>% pull(low_zscore)
+
+      # If zscore is below the alarmvalue, add to pt_list_low table
+      if (pt_metab$Z_score < low_zscore_cutoff_metab) {
+        pt_metab_low <- pt_metab %>% select(HMDB_name, Z_score)
+        pt_list_low <- rbind(pt_list_low, pt_metab_low)
+      }
+    }
+  }
+
+  # sort tables on zscore
+  pt_list_high <- pt_list_high %>% arrange(desc(Z_score))
+  pt_list_low <- pt_list_low %>% arrange(Z_score)
+  # add lines for increased, decreased
+  extra_line1 <- c("Increased", "")
+  extra_line2 <- c("Decreased", "")
+  # combine the two lists
+  top_metab_pt <- rbind(extra_line1, pt_list_high, extra_line2, pt_list_low)
+  # remove row names
+  rownames(top_metab_pt) <- NULL
+  # change column names for display
+  colnames(top_metab_pt) <- c("Metabolite", "Z-score")
+
+  return(top_metab_pt)
+}
diff --git a/DIMS/Utils/prepare_data.R b/DIMS/Utils/prepare_data.R
new file mode 100644
index 0000000..ea3efe5
--- /dev/null
+++ b/DIMS/Utils/prepare_data.R
@@ -0,0 +1,51 @@
+# unused variables will be removed: metab_list_alarm
+prepare_data <- function(metab_list_all, zscore_patients_local) {
+  #' Combine patient Z-scores with metabolite info
+  #'
+  #' @param metab_list_all: list of dataframes with metabolite information for different stofgroepen (list)
+  #' @param zscore_patients_local: dataframe with metabolite Z-scores for all patient
+  #'
+  #' @return: list of dataframes for each stofgroep with data for each metabolite and patient/control per row
+
+  # remove "_Zscore" from column (patient) names
+  colnames(zscore_patients_local) <- gsub("_Zscore", "", colnames(zscore_patients_local))
+  # put data into pages, max 20 violin plots per page in PDF
+  metab_interest_sorted <- list()
+  metab_category <- c()
+  for (metab_class_index in 1:length(metab_list_all)) {
+    metab_class <- names(metab_list_all)[metab_class_index]
+    metab_list <- metab_list_all[[metab_class_index]]
+    if (ncol(metab_list) > 2) {
+      # third column are the alarm values, so reduce the data frame to 2 columns and save list
+      metab_list_alarm <- metab_list
+      metab_list <- metab_list[, c(1, 2)]
+    }
+    # make sure that all HMDB_names have 45 characters
+    for (metab_index in 1:length(metab_list$HMDB_name)) {
+      if (is.character(metab_list$HMDB_name[metab_index])) {
+        hmdb_name_separated <- strsplit(metab_list$HMDB_name[metab_index], "")[[1]]
+      } else {
+        hmdb_name_separated <- "strspliterror"
+      }
+      if (length(hmdb_name_separated) <= 45) {
+        hmdb_name_separated <- c(hmdb_name_separated, rep(" ", 45 - length(hmdb_name_separated)))
+      } else {
+        hmdb_name_separated <- c(hmdb_name_separated[1:42], "...")
+      }
+      metab_list$HMDB_name[metab_index] <- paste0(hmdb_name_separated, collapse = "")
+    }
+    # find metabolites and ratios in data frame zscore_patients_local
+    metab_interest <- inner_join(metab_list, zscore_patients_local[-2], by = "HMDB_code")
+    # remove column "HMDB_code"
+    metab_interest <- metab_interest[, -which(colnames(metab_interest) == "HMDB_code")]
+    # put the data frame in long format
+    metab_interest_melt <- reshape2::melt(metab_interest, id.vars = "HMDB_name")
+    # sort on metabolite names (HMDB_name)
+    sort_order <- order(metab_interest_melt$HMDB_name)
+    metab_interest_sorted[[metab_class_index]] <- metab_interest_melt[sort_order, ]
+    metab_category <- c(metab_category, metab_class)
+  }
+  names(metab_interest_sorted) <- metab_category
+
+  return(metab_interest_sorted)
+}
diff --git a/DIMS/Utils/prepare_data_perpage.R b/DIMS/Utils/prepare_data_perpage.R
new file mode 100644
index 0000000..61d2e15
--- /dev/null
+++ b/DIMS/Utils/prepare_data_perpage.R
@@ -0,0 +1,58 @@
+# remove default variable values
+prepare_data_perpage <- function(metab_interest_sorted, metab_interest_contr, nr_plots_perpage, nr_pat = 20, nr_contr = 30) {
+  #' Combine patient and control data for each page of the violinplot pdf
+  #'
+  #' @param metab_interest_sorted: list of dataframes with data for each metabolite and patient (list)
+  #' @param metab_interest_contr:  list of dataframes with data for each metabolite and control (list)
+  #' @param nr_plots_perpage: number of plots per page in the violinplot pdf (integer)
+  #' @param nr_pat: number of patients (integer)
+  #' @param nr_contr: number of controls (integer)
+  #'
+  #' @return: list of dataframes with metabolite Z-scores for each patient and control,
+  #'          the length of list is the number of pages for the violinplot pdf (list)
+
+  total_nr_pages <- 0
+  metab_perpage <- list()
+  metab_category <- c()
+  for (metab_class_index in 1:length(metab_interest_sorted)) {
+    # split list into pages, each page containing max nr_plots_perpage (20) compounds
+    metab_interest_perclass <- metab_interest_sorted[[metab_class_index]]
+    metab_class <- names(metab_interest_sorted)[metab_class_index]
+    # add controls
+    metab_interest_contr_perclass <- metab_interest_contr[[metab_class_index]]
+    # number of pages for this class
+    nr_pages <- ceiling(length(unique(metab_interest_perclass$HMDB_name)) / nr_plots_perpage)
+    for (page_nr in 1:nr_pages) {
+      total_nr_pages <- total_nr_pages + 1
+      select_rows_start <- (nr_pat * nr_plots_perpage * (page_nr - 1)) + 1
+      select_rows_end <- nr_pat * nr_plots_perpage * page_nr
+      metab_onepage_pat <- metab_interest_perclass[select_rows_start:select_rows_end, ]
+      # same for controls
+      select_rows_start_contr <- (nr_contr * nr_plots_perpage * (page_nr - 1)) + 1
+      select_rows_end_contr <- nr_contr * nr_plots_perpage * page_nr
+      metab_onepage_pcontr <- metab_interest_contr_perclass[select_rows_start_contr:select_rows_end_contr, ]
+      # add controls
+      metab_onepage <- rbind(metab_onepage_pat, metab_onepage_pcontr)
+      # if a page has fewer than nr_plots_perpage plots, fill page with empty plots
+      na_rows <- which(is.na(metab_onepage$HMDB_name))
+      if (length(na_rows) > 0) {
+        # repeat the patient and control variables
+        metab_onepage$variable[na_rows] <- metab_onepage$variable[1:(nr_pat + nr_contr)]
+        # for HMDB name, substitute a number of spaces
+        for (row_nr in na_rows) {
+          metab_onepage$HMDB_name[row_nr] <- paste0(rep("_", ceiling(row_nr / (nr_pat + nr_contr))), collapse = "")
+        }
+        metab_onepage$HMDB_name <- gsub("_", " ", metab_onepage$HMDB_name)
+        # leave the values at NA
+      }
+      # put data for one page into object with data for all pages
+      metab_perpage[[total_nr_pages]] <- metab_onepage
+      # create list of page headers
+      metab_category <- c(metab_category, paste(metab_class, page_nr, sep = "_"))
+    }
+  }
+  # add page headers to list
+  names(metab_perpage) <- metab_category
+
+  return(metab_perpage)
+}
diff --git a/DIMS/Utils/prepare_toplist.R b/DIMS/Utils/prepare_toplist.R
new file mode 100644
index 0000000..cc616c4
--- /dev/null
+++ b/DIMS/Utils/prepare_toplist.R
@@ -0,0 +1,35 @@
+prepare_toplist <- function(pt_name, zscore_patients_copy) {
+  #' Create a dataframe with the top 20 highest and top 10 lowest metabolites per patient
+  #'
+  #' @param pt_name: patient code (string)
+  #' @param zscore_patients_copy: dataframe with metabolite Z-scores per patient (dataframe)
+  #'
+  #' @return: dataframe with 30 metabolites and Z-scores (dataframe)
+
+  # set parameters for table
+  top_highest <- 20
+  top_lowest <- 10
+
+  # extract data for patient of interest (pt_name)
+  pt_list <- zscore_patients_copy[, c(1, 2, which(colnames(zscore_patients_copy) == pt_name))]
+  # sort metabolites on Z-scores for this patient
+  pt_list_sort <- sort(pt_list[, 3], index.return = TRUE)
+  # determine top highest and lowest Z-scores for this patient
+  pt_list_sort <- sort(pt_list[, 3], index.return = TRUE)
+  pt_list_low <- pt_list[pt_list_sort$ix[1:top_lowest], ]
+  pt_list_high <- pt_list[pt_list_sort$ix[length(pt_list_sort$ix):(length(pt_list_sort$ix) - top_highest + 1)], ]
+  # round off Z-scores
+  pt_list_low[, 3] <- round(as.numeric(pt_list_low[, 3]), 2)
+  pt_list_high[, 3] <- round(as.numeric(pt_list_high[, 3]), 2)
+  # add lines for increased, decreased
+  extra_line1 <- c("Increased", "", "")
+  extra_line2 <- c("Decreased", "", "")
+  top_metab_pt <- rbind(extra_line1, pt_list_high, extra_line2, pt_list_low)
+  # remove row names
+  rownames(top_metab_pt) <- NULL
+
+  # change column names for display
+  colnames(top_metab_pt) <- c("HMDB_ID", "Metabolite", "Z-score")
+
+  return(top_metab_pt)
+}

From b75fabadc79387c0ee0e7e9ece15003b8324163a Mon Sep 17 00:00:00 2001
From: Anne Luesink <anneluesink@live.nl>
Date: Thu, 18 Apr 2024 17:13:45 +0200
Subject: [PATCH 56/73] Output paths added

---
 DIMS/AverageTechReplicates.nf | 1 +
 DIMS/GenerateExcel.nf         | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf
index 9fe9a98..9e47af3 100644
--- a/DIMS/AverageTechReplicates.nf
+++ b/DIMS/AverageTechReplicates.nf
@@ -18,6 +18,7 @@ process AverageTechReplicates {
        path('*_avg.RData'), emit: binned_files
        path('miss_infusions_negative.txt')
        path('miss_infusions_positive.txt')
+       path('*_TICplots.pdf')
 
     script:
         """
diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf
index 32ec928..86055bd 100644
--- a/DIMS/GenerateExcel.nf
+++ b/DIMS/GenerateExcel.nf
@@ -14,8 +14,9 @@ process GenerateExcel {
     output:
        path('AdductSums_*.txt')
        path('*IS_results.RData')
-       path('*.xlsx'), emit: excel_file 
-       path('plots'), emit: plot_files 
+       path('*_positive_control.RData')
+       path('*.xlsx'), emit: excel_files 
+       path('plots/*.png'), emit: plot_files 
 
     script:
         """

From 3687f14b880d75437645edfc037b3597ccccd3ea Mon Sep 17 00:00:00 2001
From: Anne Luesink <anneluesink@live.nl>
Date: Thu, 18 Apr 2024 17:14:34 +0200
Subject: [PATCH 57/73] Changed to Unidentified steps to parallel

---
 DIMS/UnidentifiedCalcZscores.R   | 11 +++++---
 DIMS/UnidentifiedCollectPeaks.R  | 48 ++++++++++++++++++++++++++++++--
 DIMS/UnidentifiedCollectPeaks.nf |  3 +-
 DIMS/UnidentifiedFillMissing.R   | 48 ++++++++++++++++----------------
 DIMS/UnidentifiedPeakGrouping.R  | 40 +++++++++++++++++++-------
 DIMS/UnidentifiedPeakGrouping.nf |  2 +-
 6 files changed, 110 insertions(+), 42 deletions(-)

diff --git a/DIMS/UnidentifiedCalcZscores.R b/DIMS/UnidentifiedCalcZscores.R
index 2bce990..e58063a 100755
--- a/DIMS/UnidentifiedCalcZscores.R
+++ b/DIMS/UnidentifiedCalcZscores.R
@@ -15,10 +15,13 @@ source(paste0(scripts_dir, "statistics_z.R"))
 scanmodes <- c("positive", "negative")
 
 for (scanmode in scanmodes) {
-  # get list of files
-  filled_file <- paste0("PeakGroupList_", scanmode, "_Unidentified_filled.RData")
-  # load file
-  outlist_total <- get(load(filled_file))
+  filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, ".{1,}_Unidentified_filled"))
+  # load files and combine into one object
+  outlist_total <- NULL
+  for (file_nr in 1:length(filled_files)) {
+    peakgrouplist_filled <- get(load(filled_files[file_nr]))
+    outlist_total <- rbind(outlist_total, peakgrouplist_filled)
+  }
 
   # remove duplicates; peak groups with exactly the same m/z
   outlist_total <- mergeDuplicatedRows(outlist_total)
diff --git a/DIMS/UnidentifiedCollectPeaks.R b/DIMS/UnidentifiedCollectPeaks.R
index a5674f1..3de66d5 100755
--- a/DIMS/UnidentifiedCollectPeaks.R
+++ b/DIMS/UnidentifiedCollectPeaks.R
@@ -1,6 +1,8 @@
 #!/usr/bin/Rscript
 ## adapted from 7-collectSamplesGroupedHMDB.R
 
+options(digits = 16)
+
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
@@ -20,12 +22,54 @@ for (scanmode in scanmodes) {
     # load list_of_peaks_used_in_peak_groups_identified
     load(files[file_index])
     remove <- c(remove, 
-		which(outlist_total[, "mzmed.pkt"] %in% list_of_peaks_used_in_peak_groups_identified[file_index, "mzmed.pkt"]))
+		which(outlist_total[, "mzmed.pkt"] %in% list_of_peaks_used_in_peak_groups_identified[, "mzmed.pkt"])) # nolint
   }
   outlist_rest <- outlist_total[-remove, ]
 
   # sort on mass
   outlist <- outlist_rest[order(as.numeric(outlist_rest[, "mzmed.pkt"])), ]
   # save output
-  save(outlist, file = paste0("SpectrumPeaks_", scanmode, "_Unidentified.RData"))
+  # save(outlist, file = paste0("SpectrumPeaks_", scanmode, "_Unidentified.RData"))
+
+  size_parts <- 10000
+  # start <- 1
+
+  num_parts <- ceiling(nrow(outlist) / size_parts)
+
+  for (part in 1:num_parts){
+    if (part == 1) {
+      start_part <- 1
+      end_part <- size_parts
+    } else {
+      start_part <- (part - 1) * size_parts + 1
+    }
+    if (part == num_parts) {
+      end_part <- nrow(outlist)
+    } else if (part != 1) {
+      end_part <- part * size_parts
+    }
+
+    outlist_part <- outlist[c(start_part:end_part), ]
+    # # add ppm extra before start
+    # if (part != 1) {
+    #   mz_start <- outlist_part[1, "mzmed.pkt"]
+    #   mz_ppm_range <- ppm * as.numeric(mz_start) / 1e+06
+    #   mz_start_min_ppm <- mz_start - mz_ppm_range
+    #   outlist_before_part <- outlist %>% filter(mzmed.pkt >= mz_start_min_ppm & mzmed.pkt < mz_start)
+
+    #   outlist_part <- rbind(outlist_before_part, outlist_part)
+    # }
+
+    # add ppm extra after end
+    if (part != num_parts) {
+      mz_end <- outlist_part[nrow(outlist_part), "mzmed.pkt"]
+      mz_ppm_range <- ppm * as.numeric(mz_end) / 1e+06
+      mz_end_plus_ppm <- mz_end + mz_ppm_range
+      outlist_after_part <- outlist %>% filter(mzmed.pkt > mz_end & mzmed.pkt <= mz_end_plus_ppm)
+
+      outlist_part <- rbind(outlist_part, outlist_after_part)
+    }
+
+    save(outlist_part, file = paste0(scanmode, "_", paste0("unidentified_part_", part, ".RData")))
+  }
 }
diff --git a/DIMS/UnidentifiedCollectPeaks.nf b/DIMS/UnidentifiedCollectPeaks.nf
index 6b2656e..4f419a7 100644
--- a/DIMS/UnidentifiedCollectPeaks.nf
+++ b/DIMS/UnidentifiedCollectPeaks.nf
@@ -9,7 +9,8 @@ process UnidentifiedCollectPeaks {
        path(peaklist_identified)
 
     output:
-       path('SpectrumPeaks_*_Unidentified.RData')
+       // path('SpectrumPeaks_*_Unidentified.RData')
+       path('*.RData')
 
     script:
         """
diff --git a/DIMS/UnidentifiedFillMissing.R b/DIMS/UnidentifiedFillMissing.R
index 2ee085f..0fbdea4 100755
--- a/DIMS/UnidentifiedFillMissing.R
+++ b/DIMS/UnidentifiedFillMissing.R
@@ -4,12 +4,12 @@
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-peakgrouplist_file1 <- cmd_args[1]
-peakgrouplist_file2 <- cmd_args[2]
-scripts_dir <- cmd_args[3]
-thresh <- as.numeric(cmd_args[4])
-resol <- as.numeric(cmd_args[5])
-ppm <- as.numeric(cmd_args[6])
+peakgrouplist_file <- cmd_args[1]
+# peakgrouplist_file2 <- cmd_args[2]
+scripts_dir <- cmd_args[2]
+thresh <- as.numeric(cmd_args[3])
+resol <- as.numeric(cmd_args[4])
+ppm <- as.numeric(cmd_args[5])
 outdir  <- "./"
 
 # load in function scripts
@@ -23,27 +23,27 @@ source(paste0(scripts_dir, "ident.hires.noise.HPC.R"))
 source(paste0(scripts_dir, "elementInfo.R"))
 source(paste0(scripts_dir, "globalAssignments.HPC.R"))
 
-peakgrouplist_files <- c(peakgrouplist_file1, peakgrouplist_file2)
-for (peakgrouplist_file in peakgrouplist_files) {
+# peakgrouplist_files <- c(peakgrouplist_file1, peakgrouplist_file2)
+# for (peakgrouplist_file in peakgrouplist_files) {
 
-  if (grepl("_pos", peakgrouplist_file)) {
-    scanmode <- "positive"
-  } else if (grepl("_neg", peakgrouplist_file)) {
-    scanmode <- "negative"
-  }
+if (grepl("_pos", basename(peakgrouplist_file))) {
+  scanmode <- "positive"
+} else if (grepl("_neg", basename(peakgrouplist_file))) {
+  scanmode <- "negative"
+}
 
-  # get replication pattern for sample names
-  pattern_file <- paste0(scanmode, "_repl_pattern.RData")
-  repl_pattern <- get(load(pattern_file))
+# get replication pattern for sample names
+pattern_file <- paste0(scanmode, "_repl_pattern.RData")
+repl_pattern <- get(load(pattern_file))
 
-  # load peak group list and determine output file name
-  outpgrlist_identified <- get(load(peakgrouplist_file))
+# load peak group list and determine output file name
+outpgrlist_identified <- get(load(peakgrouplist_file))
 
-  outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file)
+outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file)
 
-  # replace missing values (zeros) with random noise
-  peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm)
+# replace missing values (zeros) with random noise
+peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm)
 
-  # save output
-  save(peakgrouplist_filled, file = outputfile_name)
-}
+# save output
+save(peakgrouplist_filled, file = outputfile_name)
+# }
diff --git a/DIMS/UnidentifiedPeakGrouping.R b/DIMS/UnidentifiedPeakGrouping.R
index 16dd8c1..4600820 100755
--- a/DIMS/UnidentifiedPeakGrouping.R
+++ b/DIMS/UnidentifiedPeakGrouping.R
@@ -4,8 +4,9 @@
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
 
-resol <- as.numeric(cmd_args[1])
-ppm <- as.numeric(cmd_args[2])
+unidentified_peaklist <- cmd_args[1]
+resol <- as.numeric(cmd_args[2])
+ppm <- as.numeric(cmd_args[3])
 outdir <- "./"
 
 options(digits = 16)
@@ -76,14 +77,33 @@ grouping_rest <- function(outdir, unidentified_peaklist, scanmode, ppm) {
   return(outpgrlist)
 }
 
-scanmodes <- c("positive", "negative")
+# scanmodes <- c("positive", "negative")
 
-for (scanmode in scanmodes) {
-  # generate peak group lists of the unidentified peaks
-  unidentified_peaklist <- paste0("SpectrumPeaks_", scanmode, "_Unidentified.RData")
-  outpgrlist <- grouping_rest(outdir, unidentified_peaklist, scanmode, ppm = ppm)
-  write.table(outpgrlist, file = paste0("PeakGroupList_", scanmode, "_Unidentified.txt"))
+# for (scanmode in scanmodes) {
+#   # generate peak group lists of the unidentified peaks
+#   unidentified_peaklist <- paste0("SpectrumPeaks_", scanmode, "_Unidentified.RData")
+#   outpgrlist <- grouping_rest(outdir, unidentified_peaklist, scanmode, ppm = ppm)
+#   write.table(outpgrlist, file = paste0("PeakGroupList_", scanmode, "_Unidentified.txt"))
 
-  # save output in RData format for further processing
-  save(outpgrlist, file=paste0("PeakGroupList_", scanmode, "_Unidentified.RData"))
+#   # save output in RData format for further processing
+#   save(outpgrlist, file=paste0("PeakGroupList_", scanmode, "_Unidentified.RData"))
+# }
+
+# determine appropriate scanmode based on unidentified_peaklist file
+if (grepl("negative", basename(unidentified_peaklist))) {
+  scanmode <- "negative"
+} else if (grepl("positive", basename(unidentified_peaklist))) {
+  scanmode <- "positive"
 }
+
+# generate peak group lists of the unidentified peaks
+outpgrlist <- grouping_rest(outdir, unidentified_peaklist, scanmode, ppm = ppm)
+
+# determine part number of unidentified_peaklist file
+part_number <- gsub("\\D", "", basename(unidentified_peaklist))
+
+# save output in txt format
+write.table(outpgrlist, file = paste0("PeakGroupList_", scanmode, "_part_", part_number, "_Unidentified.txt"))
+
+# save output in RData format for further processing
+save(outpgrlist, file=paste0("PeakGroupList_", scanmode, "_part_", part_number, "_Unidentified.RData"))
diff --git a/DIMS/UnidentifiedPeakGrouping.nf b/DIMS/UnidentifiedPeakGrouping.nf
index 2988195..ce140ec 100644
--- a/DIMS/UnidentifiedPeakGrouping.nf
+++ b/DIMS/UnidentifiedPeakGrouping.nf
@@ -14,6 +14,6 @@ process UnidentifiedPeakGrouping {
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedPeakGrouping.R $params.resolution $params.ppm
+        Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedPeakGrouping.R $unidentified_spectrumpeaks_files $params.resolution $params.ppm
         """
 }

From 7187c2f478a4093e34b1fb24269a88c70a9a8e2b Mon Sep 17 00:00:00 2001
From: Anne Luesink <anneluesink@live.nl>
Date: Fri, 19 Apr 2024 13:48:23 +0200
Subject: [PATCH 58/73] Added changes HPC and parallelisation Unidentified

---
 DIMS/GenerateExcel.R            | 4 +++-
 DIMS/GenerateExcel.nf           | 7 +++----
 DIMS/GenerateViolinPlots.R      | 2 +-
 DIMS/GenerateViolinPlots.nf     | 2 +-
 DIMS/UnidentifiedCollectPeaks.R | 4 ++--
 DIMS/Utils/fit_optim.R          | 2 +-
 6 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R
index 420b2e1..b2edf5c 100644
--- a/DIMS/GenerateExcel.R
+++ b/DIMS/GenerateExcel.R
@@ -181,9 +181,11 @@ if (z_score == 1) {
   }
   colnames(outlist_noZ)[startcol:ncol(outlist_noZ)] <- cnames_robust
 
-  # output metabolites filtered on relevance into tab-separated file
+  # output metabolites filtered on relevance
+  save(outlist, file = paste0("AdductSums_filtered_Zscores.RData"))
   write.table(outlist, file = paste0("AdductSums_filtered_Zscores.txt"), sep = "\t", row.names = FALSE)
   # output filtered metabolites with robust scaled Zscores
+  save(outlist_noZ, file = paste0("AdductSums_filtered_robustZ.RData"))
   write.table(outlist_noZ, file = paste0("AdductSums_filtered_robustZ.txt"), sep = "\t", row.names = FALSE)
 
   # get the IDs of the patients and sort
diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf
index 86055bd..689d3ef 100644
--- a/DIMS/GenerateExcel.nf
+++ b/DIMS/GenerateExcel.nf
@@ -12,11 +12,10 @@ process GenerateExcel {
        path(relevance_file)
 
     output:
-       path('AdductSums_*.txt')
-       path('*IS_results.RData')
-       path('*_positive_control.RData')
+       path('*.RData')
+       path('*.txt')
        path('*.xlsx'), emit: excel_files 
-       path('plots/*.png'), emit: plot_files 
+       path('*.png'), emit: plot_files 
 
     script:
         """
diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R
index 38a8c61..562c794 100644
--- a/DIMS/GenerateViolinPlots.R
+++ b/DIMS/GenerateViolinPlots.R
@@ -421,7 +421,7 @@ if (violin == 1) {
         # get output file for Helix
         output_helix <- output_for_helix(protocol_name, dims_helix_table)
         # write output to file
-        path_helixfile <- paste0(outdir, "/output_Helix_", run_name,".csv")
+        path_helixfile <- paste0(output_dir, "/output_Helix_", run_name,".csv")
         write.csv(output_helix, path_helixfile, quote = F, row.names = F)
       }
     }
diff --git a/DIMS/GenerateViolinPlots.nf b/DIMS/GenerateViolinPlots.nf
index a1f5259..641d27a 100644
--- a/DIMS/GenerateViolinPlots.nf
+++ b/DIMS/GenerateViolinPlots.nf
@@ -11,7 +11,7 @@ process GenerateViolinPlots {
     output:
        path('Diagnostics/*.pdf'), emit: diag_plot_files
        path('Other/*.pdf'), emit: other_plot_files
-       path('dIEM/*.pdf'), emit: diem_plot_files
+       path('dIEM_plots/*.pdf'), emit: diem_plot_files
        path('*.xlsx'), emit: excel_file
        path('*.csv'), emit: helix_file
 
diff --git a/DIMS/UnidentifiedCollectPeaks.R b/DIMS/UnidentifiedCollectPeaks.R
index 3de66d5..61e00f3 100755
--- a/DIMS/UnidentifiedCollectPeaks.R
+++ b/DIMS/UnidentifiedCollectPeaks.R
@@ -64,8 +64,8 @@ for (scanmode in scanmodes) {
     if (part != num_parts) {
       mz_end <- outlist_part[nrow(outlist_part), "mzmed.pkt"]
       mz_ppm_range <- ppm * as.numeric(mz_end) / 1e+06
-      mz_end_plus_ppm <- mz_end + mz_ppm_range
-      outlist_after_part <- outlist %>% filter(mzmed.pkt > mz_end & mzmed.pkt <= mz_end_plus_ppm)
+      mz_end_plus_ppm <- as.numeric(mz_end) + mz_ppm_range
+      outlist_after_part <- as.data.frame(outlist) %>% filter(mzmed.pkt > mz_end & mzmed.pkt <= mz_end_plus_ppm)
 
       outlist_part <- rbind(outlist_part, outlist_after_part)
     }
diff --git a/DIMS/Utils/fit_optim.R b/DIMS/Utils/fit_optim.R
index ac3a9ee..bc5f7d3 100644
--- a/DIMS/Utils/fit_optim.R
+++ b/DIMS/Utils/fit_optim.R
@@ -2,7 +2,7 @@
 # variables with fixed values will be removed from function parameters
 # plot, width, height
 # fit_gaussian should be defined before this function is called.
-fit_op4tim <- function(mass_vector, int_vector, resol,
+fit_optim <- function(mass_vector, int_vector, resol,
                       plot, scanmode, int_factor, width, height) {
   #' Determine optimized fit of Gaussian curve to small region of m/z
   #'

From f9335619d9fd7bc05e3c564f5bc2f3f7eb2a4d36 Mon Sep 17 00:00:00 2001
From: Anne Luesink <anneluesink@live.nl>
Date: Fri, 19 Apr 2024 14:22:25 +0200
Subject: [PATCH 59/73] Added loading package

---
 DIMS/UnidentifiedCollectPeaks.R | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/DIMS/UnidentifiedCollectPeaks.R b/DIMS/UnidentifiedCollectPeaks.R
index 61e00f3..be1ac0e 100755
--- a/DIMS/UnidentifiedCollectPeaks.R
+++ b/DIMS/UnidentifiedCollectPeaks.R
@@ -1,6 +1,9 @@
 #!/usr/bin/Rscript
 ## adapted from 7-collectSamplesGroupedHMDB.R
 
+# load required packages
+suppressMessages(library("dplyr"))
+
 options(digits = 16)
 
 # define parameters

From 8397c12319f93ed5876ae7374e65fbf01f66e65a Mon Sep 17 00:00:00 2001
From: Anne Luesink <anneluesink@live.nl>
Date: Fri, 19 Apr 2024 15:31:17 +0200
Subject: [PATCH 60/73] Changed output

---
 DIMS/GenerateExcel.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf
index 689d3ef..6134766 100644
--- a/DIMS/GenerateExcel.nf
+++ b/DIMS/GenerateExcel.nf
@@ -15,7 +15,7 @@ process GenerateExcel {
        path('*.RData')
        path('*.txt')
        path('*.xlsx'), emit: excel_files 
-       path('*.png'), emit: plot_files 
+       path('plots/*.png'), emit: plot_files 
 
     script:
         """

From 507c88dd405edbbcd217383a388697e15871525d Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Mon, 22 Apr 2024 14:05:10 +0200
Subject: [PATCH 61/73] cleaned up peak finding, fill missing functions

---
 DIMS/FillMissing.R                |   2 +-
 DIMS/PeakFinding.R                |   2 +-
 DIMS/Utils/atomic_info.R          |  92 ++++++++
 DIMS/Utils/calculate_zscores.R    |  63 +++++
 DIMS/Utils/check_overlap.R        |  24 ++
 DIMS/Utils/estimate_area.R        |  30 +++
 DIMS/Utils/fit_gaussian.R         |  78 ++----
 DIMS/Utils/fit_gaussians.R        |   1 -
 DIMS/Utils/fit_init.R             |   1 +
 DIMS/Utils/fit_optim.R            |   4 +-
 DIMS/Utils/fit_peaks.R            | 381 ++++++++++++++++++++++++++++++
 DIMS/Utils/get_element_info.R     |  24 ++
 DIMS/Utils/get_fit_quality.R      |  34 +++
 DIMS/Utils/identify_noisepeaks.R  | 103 ++++++++
 DIMS/Utils/merge_duplicate_rows.R |  59 +++++
 DIMS/Utils/optimize_gaussfit.R    |  23 ++
 DIMS/Utils/replace_zeros.R        |  64 +++++
 DIMS/Utils/search_mzrange.R       |  20 +-
 DIMS/Utils/sum_curves.R           |  35 +++
 DIMS/Utils/within_ppm.R           |  64 +++++
 20 files changed, 1030 insertions(+), 74 deletions(-)
 create mode 100644 DIMS/Utils/atomic_info.R
 create mode 100644 DIMS/Utils/calculate_zscores.R
 create mode 100644 DIMS/Utils/check_overlap.R
 create mode 100644 DIMS/Utils/estimate_area.R
 create mode 100644 DIMS/Utils/fit_peaks.R
 create mode 100644 DIMS/Utils/get_element_info.R
 create mode 100644 DIMS/Utils/get_fit_quality.R
 create mode 100644 DIMS/Utils/identify_noisepeaks.R
 create mode 100644 DIMS/Utils/merge_duplicate_rows.R
 create mode 100644 DIMS/Utils/optimize_gaussfit.R
 create mode 100644 DIMS/Utils/replace_zeros.R
 create mode 100644 DIMS/Utils/sum_curves.R
 create mode 100644 DIMS/Utils/within_ppm.R

diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R
index f7166d0..f96e257 100755
--- a/DIMS/FillMissing.R
+++ b/DIMS/FillMissing.R
@@ -39,7 +39,7 @@ outpgrlist_identified <- get(load(peakgrouplist_file))
 outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file)
 
 # replace missing values (zeros) with random noise
-peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm)
+peakgrouplist_filled <- replace_zeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm)
 
 # save output
 save(peakgrouplist_filled, file = outputfile_name)
diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R
index e2edc85..08eb546 100644
--- a/DIMS/PeakFinding.R
+++ b/DIMS/PeakFinding.R
@@ -63,4 +63,4 @@ print(thresh)
 print(width)
 print(height)
 
-findPeaks.Gauss.HPC(sample_avgtechrepl, breaks_fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height)
+do_peakfinding(sample_avgtechrepl, breaks_fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height)
diff --git a/DIMS/Utils/atomic_info.R b/DIMS/Utils/atomic_info.R
new file mode 100644
index 0000000..94d0e83
--- /dev/null
+++ b/DIMS/Utils/atomic_info.R
@@ -0,0 +1,92 @@
+## adapted from globalAssignments.HPC.R
+# relative abundancies from theoretical mass and composition
+# not a function, but a large amount of objects that will become available in memory
+# refactor: include in get_element_info
+# check if all these elements are necessary. Only use snake_make for Hmass -> hydrogen_mass.
+# move source library to higher level
+options(digits = 16)
+
+suppressPackageStartupMessages(library(lattice))
+
+# The following list was copied from Rdisop elements.R and corrected for C, H, O, Cl, S according to NIST
+Ba <- list(name = "Ba", mass = 130, isotope = list(mass = c(-0.093718, 0, -0.094958, 0, -0.095514, -0.094335, -0.095447, -0.094188, -0.094768), abundance = c(0.00106, 0, 0.00101, 0, 0.02417, 0.06592, 0.07854, 0.1123, 0.717)))
+Br <- list(name = "Br", mass = 79, isotope = list(mass = c(-0.0816639, 0, -0.083711), abundance = c(0.5069, 0, 0.4931)))
+C <- list(name = "C", mass = 12, isotope = list(mass = c(0, 0.003354838, 0.003241989), abundance = c(0.9893, 0.0107, 0)))
+Ca <- list(name = "Ca", mass = 40, isotope = list(mass = c(-0.0374094, 0, -0.0413824, -0.0412338, -0.0445194, 0, -0.046311, 0, -0.047467), abundance = c(0.96941, 0, 0.00647, 0.00135, 0.02086, 0, 4e-05, 0, 0.00187)))
+Cl <- list(name = "Cl", mass = 35, isotope = list(mass = c(-0.03114732, 0, -0.03409741), abundance = c(0.7576, 0, 0.2424)))
+Cr <- list(name = "Cr", mass = 50, isotope = list(mass = c(-0.0539536, 0, -0.0594902, -0.0593487, -0.0611175), abundance = c(0.04345, 0, 0.83789, 0.09501, 0.02365)))
+Cu <- list(name = "Cu", mass = 63, isotope = list(mass = c(-0.0704011, 0, -0.0722071), abundance = c(0.6917, 0, 0.3083)))
+F <- list(name = "F", mass = 19, isotope = list(mass = c(-0.00159678), abundance = c(1)))
+Fe <- list(name = "Fe", mass = 54, isotope = list(mass = c(-0.0603873, 0, -0.0650607, -0.0646042, -0.0667227), abundance = c(0.058, 0, 0.9172, 0.022, 0.0028)))
+H <- list(name = "H", mass = 1, isotope = list(mass = c(0.00782503207, 0.014101778, 0.01604928), abundance = c(0.999885, 0.000115, 0)))
+Hg <- list(name = "Hg", mass = 196, isotope = list(mass = c(-0.034193, 0, -0.033257, -0.031746, -0.0317, -0.029723, -0.029383, 0, -0.026533), abundance = c(0.0015, 0, 0.0997, 0.1687, 0.231, 0.1318, 0.2986, 0, 0.0687)))
+I <- list(name = "I", mass = 127, isotope = list(mass = c(-0.095527), abundance = c(1)))
+K <- list(name = "K", mass = 39, isotope = list(mass = c(-0.0362926, -0.0360008, -0.0381746), abundance = c(0.932581, 0.000117, 0.067302)))
+Li <- list(name = "Li", mass = 6, isotope = list(mass = c(0.0151214, 0.016003), abundance = c(0.075, 0.925)))
+Mg <- list(name = "Mg", mass = 24, isotope = list(mass = c(-0.0149577, -0.0141626, -0.0174063), abundance = c(0.7899, 0.1, 0.1101)))
+Mn <- list(name = "Mn", mass = 55, isotope = list(mass = c(-0.0619529), abundance = c(1)))
+N <- list(name = "N", mass = 14, isotope = list(mass = c(0.003074002, 0.00010897), abundance = c(0.99634, 0.00366)))
+Na <- list(name = "Na", mass = 23, isotope = list(mass = c(-0.0102323), abundance = c(1)))
+Ni <- list(name = "Ni", mass = 58, isotope = list(mass = c(-0.0646538, 0, -0.0692116, -0.0689421, -0.0716539, 0, -0.0720321), abundance = c(0.68077, 0, 0.26223, 0.0114, 0.03634, 0, 0.00926)))
+O <- list(name = "O", mass = 16, isotope = list(mass = c(-0.00508538044, -0.0008683, -0.0008397), abundance = c(0.99757, 0.000381, 0.00205)))
+P <- list(name = "P", mass = 31, isotope = list(mass = c(-0.026238), abundance = c(1)))
+S <- list(name = "S", mass = 32, isotope = list(mass = c(-0.027929, -0.02854124, -0.0321331, 0, -0.03291924), abundance = c(0.9499, 0.0075, 0.0425, 0, 1e-04)))
+Se <- list(name = "Se", mass = 74, isotope = list(mass = c(-0.0775254, 0, -0.080788, -0.0800875, -0.0826924, 0, -0.0834804, 0, -0.0833022), abundance = c(0.0089, 0, 0.0936, 0.0763, 0.2378, 0, 0.4961, 0, 0.0873)))
+Si <- list(name = "Si", mass = 28, isotope = list(mass = c(-0.0230729, -0.0235051, -0.0262293), abundance = c(0.9223, 0.0467, 0.031)))
+Sn <- list(name = "Sn", mass = 112, isotope = list(mass = c(-0.095174, 0, -0.097216, -0.096652, -0.098253, -0.097044, -0.098391, -0.09669, -0.0978009, 0, -0.0965596, 0, -0.0947257), abundance = c(0.0097, 0, 0.0065, 0.0034, 0.1453, 0.0768, 0.2423, 0.0859, 0.3259, 0, 0.0463, 0, 0.0579)))
+Zn <- list(name = "Zn", mass = 64, isotope = list(mass = c(-0.0708552, 0, -0.0739653, -0.0728709, -0.0751541, 0, -0.074675), abundance = c(0.486, 0, 0.279, 0.041, 0.188, 0, 0.006)))
+
+# The following list is for our own use
+NH4 <- list(name = "NH4", mass = 18, isotope = list(mass = c(0.03437, 0.03141, -0.95935)), abundance = c(0.995, 0.004, 0.001)) # SISweb: 18.03437 100 19.03141 0.4 19.04065 0.1
+Ac <- list(name = "Ac", mass = 60, isotope = list(mass = c(0.02114, 0.02450, 0.02538)), abundance = c(0.975, 0.021, 0.004)) # SISweb: 60.02114 100 61.02450 2.2 62.02538 0.4
+NaCl <- list(name = "NaCl", mass = 58, isotope = list(mass = c(-0.04137, 0, -0.04433)), abundance = c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4
+NaCl2 <- list(name = "NaCl2", mass = 116, isotope = list(mass = c(-0.08274, 0, -0.08866)), abundance = c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4
+NaCl3 <- list(name = "NaCl3", mass = 174, isotope = list(mass = c(-0.12411, 0, -0.13299)), abundance = c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4
+NaCl4 <- list(name = "NaCl4", mass = 232, isotope = list(mass = c(-0.16548, 0, -0.17732)), abundance = c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4
+NaCl5 <- list(name = "NaCl5", mass = 290, isotope = list(mass = c(-0.20685, 0, -0.22165)), abundance = c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4
+For <- list(name = "For", mass = 45, isotope = list(mass = c(-0.00233, 0.00103)), abundance = c(0.989, 0.011)) # SISweb: 46.00549 100 47.00885 1.1 (47.0097 0.1) 48.00973 0.4
+Na2 <- list(name = "2Na-H", mass = 46, isotope = list(mass = c(-1.0282896)), abundance = c(1)) # SISweb for Na2: 45.97954 100 # minus 1 H !
+Met <- list(name = "CH3OH", mass = 32, isotope = list(mass = c(1.034045, 1.037405)), abundance = c(0.989, 0.011)) # SISweb: 32.02622 100 33.02958 1.1 33.0325 0.1 34.03046 0.2
+CH3OH <- list(name = "CH3OH", mass = 32, isotope = list(mass = c(1.034045, 1.037405)), abundance = c(0.989, 0.011)) # SISweb: 32.02622 100 33.02958 1.1 33.0325 0.1 34.03046 0.2
+Na3 <- list(name = "3Na-2H", mass = 69, isotope = list(mass = c(-2.0463469)), abundance = c(1)) # SISweb for Na2: 45.97954 100 # minus 1 H !
+KCl <- list(name = "KCl", mass = 74, isotope = list(mass = c(-0.06744, 0.92961, -0.06744, 0.92772)), abundance = c(0.7047, 0.2283, 0.0507, 0.0162)) # SISweb: 73.93256 100 75.92961 32.4 75.93067 7.2 77.92772 2.3
+H2PO4 <- list(name = "H2PO4", mass = 97, isotope = list(mass = c(-0.03091)), abundance = c(1))
+HSO4 <- list(name = "HSO4", mass = 97, isotope = list(mass = c(-0.04042, 0, -0.04462)), abundance = c(0.96, 0, 0.04))
+Met2 <- list(name = "Met2", mass = 64, isotope = list(mass = c(1.060265, 1.013405)), abundance = c(0.978, 0.022))
+Met3 <- list(name = "Met3", mass = 96, isotope = list(mass = c(1.086485, 1.089845)), abundance = c(0.969, 0.031))
+Met4 <- list(name = "Met4", mass = 128, isotope = list(mass = c(1.112705, 1.116065)), abundance = c(0.959, 0.041))
+Met5 <- list(name = "Met5", mass = 160, isotope = list(mass = c(1.20935, 1.142285)), abundance = c(0.949, 0.051))
+NaminH <- list(name = "Na-H", mass = 21, isotope = list(mass = c(-0.02571416)), abundance = c(1))
+KminH <- list(name = "K-H", mass = 37, isotope = list(mass = c(-0.05194, 0.94617)), abundance = c(0.9328, 0.0672))
+H2O <- list(name = "H2O", mass = -19, isotope = list(mass = c(-0.01894358)), abundance = c(1))
+NaK <- list(name = "NaK-H", mass = 61, isotope = list(mass = c(-0.054345, 0.943765)), abundance = c(0.9328, 0.0672))
+min2H <- list(name = "min2H", mass = -2, isotope = list(mass = c(-0.0151014)), abundance = c(1))
+plus2H <- list(name = "plus2H", mass = 2, isotope = list(mass = c(0.0151014)), abundance = c(1))
+plus2Na <- list(name = "plus2Na", mass = 46, isotope = list(mass = c(-0.02046), abundance = c(1)))
+plusNaH <- list(name = "plusNaH", mass = 24, isotope = list(mass = c(-0.00295588), abundance = c(1)))
+plusKH <- list(name = "plusKH", mass = 40, isotope = list(mass = c(-0.029008, 0.969101)), abundance = c(0.9328, 0.0672))
+plusHNH <- list(name = "plusHNH", mass = 19, isotope = list(mass = c(0.04164642)), abundance = c(1))
+min3H <- list(name = "min3H", mass = -3, isotope = list(mass = c(-0.02182926)), abundance = c(1))
+plus3H <- list(name = "plus3H", mass = 3, isotope = list(mass = c(0.02182926)), abundance = c(1))
+plus3Na <- list(name = "plus3Na", mass = 68, isotope = list(mass = c(0.96931)), abundance = c(1))
+plus2NaH <- list(name = "plus2NaH", mass = 47, isotope = list(mass = c(0.2985712)), abundance = c(1))
+plusNa2H <- list(name = "plusNa2H", mass = 25, isotope = list(mass = c(0.00432284)), abundance = c(1))
+
+# add first isotope of Cl because of high abundance
+Cl37 <- list(name = "Cl37", mass = 37, isotope = list(mass = c(-0.03409741)), abundance = c(1))
+
+all_elements <- list(Ba, Br, C, Ca, Cl, Cr, Cu, F, Fe, H, Hg, I, K, Li, Mg, Mn, N, Na, Ni, O, P, S, Se, Si, Sn, Zn)
+all_adducts <- list(Ba, Br, Ca, Cl, Cl37, Cr, Cu, Fe, Hg, I, K, Li, Mg, Mn, Na, Ni, Se, Si, Sn, Zn, 
+                    NH4, Ac, NaCl, For, Na2, CH3OH, NaCl2, NaCl3, NaCl4, NaCl5, Na3, KCl, H2PO4, HSO4,
+                    Met2, Met3, Met4, Met5, NaminH, KminH, H2O, NaK, min2H, plus2H, plus2Na, plusNaH,
+                    plusKH, min3H, plus3H, plusHNH, plus3Na, plus2NaH, plusNa2H)
+
+atoms_inuse <- c("P", "O", "N", "C", "H", "S", "Cl", "D", "13C", "34S", "18O", "37Cl")
+atomic_weights <- c(30.97376163, 15.99491463, 14.0030740052, 12.0000, 1.0078250321, 31.9720707, 34.968852721, 2.0141017778, 13.0033548378, 33.96786690, 17.9991610, 36.96590259)
+electron <- 0.00054858
+
+hydrogen_mass <- Hmass <- H$mass + H$isotope$mass[1]
+Dmass <- H$mass + 1 + H$isotope$mass[2]
+Tmass <- H$mass + 2 + H$isotope$mass[3]
+C13mass <- C$mass + 1 + C$isotope$mass[2]
+N15mass <- N$mass + 1 + N$isotope$mass[2]
diff --git a/DIMS/Utils/calculate_zscores.R b/DIMS/Utils/calculate_zscores.R
new file mode 100644
index 0000000..9791e45
--- /dev/null
+++ b/DIMS/Utils/calculate_zscores.R
@@ -0,0 +1,63 @@
+## adapted from statistics_z.R
+# refactor: change column names from avg.ctrls to avg_ctrls, sd.ctrls to sd_ctrls
+# remove parameter sort_col
+# check logic of parameter adducts
+calculate_zscores <- function(peakgroup_list, sort_col, adducts) {
+  #' Calculate Z-scores for peak groups based on average and standard deviation of controls
+  #'
+  #' @param peakgroup_list: Peak group list (matrix)
+  #' @param sort_col: Column to sort on (string)
+  #' @param adducts: Parameter indicating whether there are adducts in the list (boolean)
+  #'
+  #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix)
+
+  case_label <- "P"
+  control_label <- "C"
+
+  # calculate mean and standard deviation for Control group
+  ctrl_cols <- grep(control_label, colnames(peakgroup_list), fixed = TRUE)
+  case_cols <- grep(case_label, colnames(peakgroup_list), fixed = TRUE)
+  int_cols <- c(ctrl_cols, case_cols)
+  # set al zeros to NA
+  peakgroup_list[, int_cols][peakgroup_list[, int_cols] == 0] <- NA
+  ctrl_ints <- peakgroup_list[, ctrl_cols, drop = FALSE]
+  peakgroup_list$avg.ctrls <- apply(ctrl_ints, 1, function(x) mean(as.numeric(x), na.rm = TRUE))
+  peakgroup_list$sd.ctrls <- apply(ctrl_ints, 1, function(x) sd(as.numeric(x), na.rm = TRUE))
+
+  # set new column names and calculate Z-scores
+  colnames_zscores <- NULL
+  for (col_index in int_cols) {
+    col_name <- colnames(peakgroup_list)[col_index]
+    colnames_zscores <- c(colnames_zscores, paste0(col_name, "_Zscore"))
+    zscores_1col <- (as.numeric(as.vector(unlist(peakgroup_list[, col_index]))) -
+                     peakgroup_list$avg.ctrls) / peakgroup_list$sd.ctrls
+    peakgroup_list <- cbind(peakgroup_list, zscores_1col)
+  }
+
+  # apply new column names to columns at end plus avg and sd columns
+  startcol <- ncol(peakgroup_list) + 3
+  colnames(peakgroup_list)[startcol:ncol(peakgroup_list)] <- colnames_zscores
+
+  # add ppm deviation column
+  zscore_cols <- grep("Zscore", colnames(peakgroup_list), fixed = TRUE)
+  if (!adducts) {
+    if ((dim(peakgroup_list[, zscore_cols])[2] + 6) != (startcol - 1)) {
+      ppmdev <- array(1:nrow(peakgroup_list), dim = c(nrow(peakgroup_list)))
+      # calculate ppm deviation
+      for (i in 1:nrow(peakgroup_list)) {
+        if (!is.na(peakgroup_list$theormz_HMDB[i]) && 
+            !is.null(peakgroup_list$theormz_HMDB[i]) &&
+            (peakgroup_list$theormz_HMDB[i] != "")) {
+          ppmdev[i] <- 10^6 * (as.numeric(as.vector(peakgroup_list$mzmed.pgrp[i])) -
+                               as.numeric(as.vector(peakgroup_list$theormz_HMDB[i]))) /
+                               as.numeric(as.vector(peakgroup_list$theormz_HMDB[i]))
+        } else {
+          ppmdev[i] <- NA
+        }
+      }
+      peakgroup_list <- cbind(peakgroup_list[, 1:6], ppmdev = ppmdev, peakgroup_list[, 7:ncol(peakgroup_list)])
+    }
+  }
+
+  return(peakgroup_list)
+}
diff --git a/DIMS/Utils/check_overlap.R b/DIMS/Utils/check_overlap.R
new file mode 100644
index 0000000..66492e8
--- /dev/null
+++ b/DIMS/Utils/check_overlap.R
@@ -0,0 +1,24 @@
+## adapted from checkOverlap.R
+check_overlap <- function(range1, range2) {
+  #' Modify range1 and range2 in case of overlap
+  #'
+  #' @param range1: Vector of m/z values for first peak (float)
+  #' @param range2: Vector of m/z values for second peak (float)
+  #'
+  #' @return new_ranges: list of two ranges (list)
+  
+  # Check for overlap
+  if (length(intersect(range1, range2)) == 2) {
+    if (length(range1) >= length(range2)) {
+      range1 <- range1[-length(range1)]  
+    } else {
+      range2 <- range2[-1]
+    }
+  } else if (length(intersect(range1, range2)) == 3) {
+    range1 <- range1[-length(range1)]  
+    range2 <- range2[-1]
+  }
+  new_ranges <- list("range1" = range1, "range2" = range2)
+  return(new_ranges)
+}
+
diff --git a/DIMS/Utils/estimate_area.R b/DIMS/Utils/estimate_area.R
new file mode 100644
index 0000000..1d24de2
--- /dev/null
+++ b/DIMS/Utils/estimate_area.R
@@ -0,0 +1,30 @@
+## adapted from getArea.R
+# variables with fixed values will be removed from function parameters
+# int_factor
+estimate_area <- function(mass_max, resol, scale, sigma, int_factor) {
+  #' Estimate area of Gaussian curve
+  #'
+  #' @param mass_max: Value for m/z at maximum intensity of a peak (float)
+  #' @param resol: Value for resolution (integer)
+  #' @param scale: Value for peak width (float)
+  #' @param sigma: Value for standard deviation (float)
+  #' @param int_factor: Value used to calculate area under Gaussian curve (integer)
+  #'
+  #' @return area_curve: Value for area under the Gaussian curve (float)
+
+  # avoid vectors that are too big (cannot allocate vector of size ...)
+  if (mass_max > 1200) return(0)
+
+  # generate a mass_vector with equally spaced m/z values
+  fwhm <- get_fwhm(mass_max, resol)
+  mz_min <- mass_max - 2 * fwhm
+  mz_max <- mass_max + 2 * fwhm
+  mz_range <- mz_max - mz_min 
+  mass_vector2 <- seq(mz_min, mz_max, length = mz_range * int.factor)
+
+  # estimate area under the curve
+  area_curve <- sum(scale * dnorm(mass_vector2, mass_max, sigma)) / 100
+
+  return(area_curve)  
+}
+
diff --git a/DIMS/Utils/fit_gaussian.R b/DIMS/Utils/fit_gaussian.R
index 9dc7e52..35275cd 100644
--- a/DIMS/Utils/fit_gaussian.R
+++ b/DIMS/Utils/fit_gaussian.R
@@ -38,7 +38,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
   # One local maximum:
   if (force == 1) {
     # determine fit values for 1 Gaussian peak (mean, scale, sigma, qual)
-    fit_values <- fit1Peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol, plot, fit_quality1, use_bounds)
+    fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
+                            plot, fit_quality1, use_bounds)
     # set initial value for scale factor
     scale <- 2
     # test if the mean is outside the m/z range
@@ -75,8 +76,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
   #### Two local maxima; need at least 6 data points for this ####
   } else if (force == 2 && (length(mass_vector) > 6)) {
     # determine fit values for 2 Gaussian peaks (mean, scale, sigma, qual)
-    fit_values <- fit2peaks(mass_vector2, mass_vector, int_vector, new_index, scale, resol,
-                            use_bounds, plot, fit_quality, int_factor)
+    fit_values <- fit_2peaks(mass_vector2, mass_vector, int_vector, new_index, scale, resol,
+                             use_bounds, plot, fit_quality, int_factor)
     # test if one of the means is outside the m/z range
     if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] ||
         fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)]) {
@@ -114,9 +115,9 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
           new_index <- c(new_index, 2 * new_index, 3 * new_index)
         }
         # run this function again with three local maxima
-        return(fitGaussian(mass_vector2, mass_vector, int_vector, new_index,
-                           scale, resol, outdir, force = 3, use_bounds = FALSE,
-                           plot, scanmode, int_factor, width, height))
+        return(fit_gaussian(mass_vector2, mass_vector, int_vector, new_index,
+                            scale, resol, outdir, force = 3, use_bounds = FALSE,
+                            plot, scanmode, int_factor, width, height))
       # good fit, all means are within m/z range
       } else {
         # check if means are within 3 ppm and sum if so
@@ -125,32 +126,20 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
         nr_means <- length(fit_values$mean)
         while (nr_means != nr_means_new) {
           nr_means <- length(fit_values$mean)
-          fit_values <- isWithinXppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, 
+          fit_values <- within_ppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, 
                                      mass_vector2, mass_vector, ppm = 4, resol, plot)
           nr_means_new <- length(fit_values$mean)
         }
         # restore original quality score
         fit_values$qual <- tmp
-        # section for plot
-        # plot_header <- NULL
-        # for (i in 1:length(fit_values$mean)) {
-        #   plot_header <- c(plot_header, paste("mean =", fit_values$mean[i], sep = " "))
-        #   peak_mean <- c(peak_mean, fit_values$mean[i])
-        #   peak_area <- c(peak_area, fit_values$area[i])
-        # }
-        # peak_qual <- fit_values$qual
-        # peak_min <- mass_vector[1]
-        # peak_max <- mass_vector[length(mass_vector)]
-        # plot_header <- c(plot_header, paste("fq =", fit_values$qual, sep = " "))
-        # if (plot) legend("topright", legend=plot_header)
       }  
     }
 
-  #### Three local maxima; need at least 6 data points for this ####
+  # Three local maxima; need at least 6 data points for this 
   } else if (force == 3 && (length(mass_vector) > 6)) {
     # determine fit values for 3 Gaussian peaks (mean, scale, sigma, qual)
-    fit_values <- fit3peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
-                            use_bounds, plot, fit_quality, int_factor)
+    fit_values <- fit_3peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
+                             use_bounds, plot, fit_quality, int_factor)
     # test if one of the means is outside the m/z range
     if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] ||
         fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] ||
@@ -199,35 +188,22 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
         tmp <- fit_values$qual
         nr_means_new <- -1
         nr_means <- length(fit_values$mean)
-        while (nr_means!=nr_means_new) {
+        while (nr_means != nr_means_new) {
           nr_means <- length(fit_values$mean)
-          fit_values <- isWithinXppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, 
-                                     mass_vector2, mass_vector, ppm = 4, resol, plot)
+          fit_values <- within_ppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, 
+                                   mass_vector2, mass_vector, ppm = 4, resol, plot)
           nr_means_new <- length(fit_values$mean)
         }
         # restore original quality score
         fit_values$qual <- tmp
-
-        # plot_header <- NULL
-        # for (i in 1:length(fit_values$mean)){
-        #   plot_header <- c(plot_header, paste("mean =", fit_values$mean[i], sep = " "))
-        #   
-        #   peak_mean <- c(peak_mean, fit_values$mean[i])
-        #   peak_area <- c(peak_area, fit_values$area[i])
-        # }
-        # peak_qual <- fit_values$qual
-        # peak_min <- mass_vector[1]
-        # peak_max <- mass_vector[length(mass_vector)]
-        # plot_header <- c(plot_header, paste("fq =", fit_values$qual, sep = " "))
-        # if (plot) legend("topright", legend=plot_header)
       }
     }
     
   #### Four local maxima; need at least 6 data points for this ####  
   } else if (force == 4 && (length(mass_vector) > 6)) {
     # determine fit values for 4 Gaussian peaks (mean, scale, sigma, qual)
-    fit_values <- fit4peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
-                            use_bounds, plot, fit_quality, int_factor)
+    fit_values <- fit_4peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
+                             use_bounds, plot, fit_quality, int_factor)
     # test if one of the means is outside the m/z range
     if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] ||
         fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] ||
@@ -273,24 +249,12 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
         nr_means <- length(fit_values$mean)
         while (nr_means != nr_means_new) {
           nr_means <- length(fit_values$mean)
-          fit_values <- isWithinXppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, 
-                                     mass_vector2, mass_vector, ppm = 4, resol, plot)
+          fit_values <- within_ppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, 
+                                   mass_vector2, mass_vector, ppm = 4, resol, plot)
           nr_means_new <- length(fit_values$mean)
         }
         # restore original quality score
         fit_values$qual <- tmp
-
-        # plot_header<-NULL
-        # for (i in 1:length(fit_values$mean)){
-        #   plot_header <- c(plot_header, paste("mean =", fit_values$mean[i], sep = " "))
-        #   peak_mean <- c(peak_mean, fit_values$mean[i])
-        #   peak_area <- c(peak_area, fit_values$area[i])
-        # }
-        # peak_qual <- fit_values$qual
-        # peak_min <- mass_vector[1]
-        # peak_max <- mass_vector[length(mass_vector)]
-        # plot_header <- c(plot_header, paste("fq =", fit_values$qual, sep = " "))
-        # if (plot) legend("topright", legend=plot_header)
       }  
     }
     
@@ -300,8 +264,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
     fit_quality1 <- 0.40
     use_bounds <- TRUE
     max_index <- which(int_vector == max(int_vector))
-    fit_values <- fit1Peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
-                           plot, fit_quality1, use_bounds)
+    fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
+                            plot, fit_quality1, use_bounds)
     # check for bad fit
     if (fit_values$qual > fit_quality1) {
       # remove
@@ -315,7 +279,7 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
       peak_qual <- 0
     } else {
       peak_mean <- c(peak_mean, fit_values$mean)
-      peak_area <- c(peak_area, getArea(fit_values$mean, resol, fit_values$scale, fit_values$sigma, int_factor))
+      peak_area <- c(peak_area, get_area(fit_values$mean, resol, fit_values$scale, fit_values$sigma, int_factor))
       peak_qual <- fit_values$qual
       peak_min <- mass_vector[1]
       peak_max <- mass_vector[length(mass_vector)]
diff --git a/DIMS/Utils/fit_gaussians.R b/DIMS/Utils/fit_gaussians.R
index 007b8ed..c4924e1 100644
--- a/DIMS/Utils/fit_gaussians.R
+++ b/DIMS/Utils/fit_gaussians.R
@@ -140,7 +140,6 @@ fit_3gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sigma3,
   return(opt_fit)
 }
 
-
 fit_4gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sigma3, sigma4,
                            query_mass1, scale1,
                            query_mass2, scale2,
diff --git a/DIMS/Utils/fit_init.R b/DIMS/Utils/fit_init.R
index f9e0c27..c761194 100644
--- a/DIMS/Utils/fit_init.R
+++ b/DIMS/Utils/fit_init.R
@@ -2,6 +2,7 @@
 # variables with fixed values will be removed from function parameters
 # scale, outdir, plot, width, height
 # mz_index, start_index, end_index, sample_name not used.
+# use_bounds is used, but not defined.
 # fit_gaussian should be defined before this function is called.
 fit_init <- function(mass_vector, int_vector, int_factor, scale, resol,
                      outdir, sample_name, scanmode, plot, width, height,
diff --git a/DIMS/Utils/fit_optim.R b/DIMS/Utils/fit_optim.R
index bc5f7d3..e280b57 100644
--- a/DIMS/Utils/fit_optim.R
+++ b/DIMS/Utils/fit_optim.R
@@ -34,10 +34,10 @@ fit_optim <- function(mass_vector, int_vector, resol,
   mass_vector2 <- seq(mass_max_simple[1], mass_max_simple[length(mass_max_simple)], 
                       length = mz_diff * int_factor)
   sigma <- get_stdev(mass_vector2, int_max_simple)
-  scale <- optimizeGauss(mass_vector2, int_max_simple, sigma, mass_max)
+  scale <- optimize_gaussfit(mass_vector2, int_max_simple, sigma, mass_max)
 
   # get an estimate of the area under the peak
-  area <- getArea(mass_max, resol, scale, sigma, int_factor)
+  area <- get_area(mass_max, resol, scale, sigma, int_factor)
   
   # put all values for this region of interest into a list
   roi_value_list <- list("mean" = mass_max,
diff --git a/DIMS/Utils/fit_peaks.R b/DIMS/Utils/fit_peaks.R
new file mode 100644
index 0000000..92e08d6
--- /dev/null
+++ b/DIMS/Utils/fit_peaks.R
@@ -0,0 +1,381 @@
+## adapted from fit1Peak.R, fit2peaks.R, fit3peaks.R and fit4peaks.R (combined)
+# variables with fixed values will be removed from function parameters
+# plot, int_factor
+fit_1peak <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol, plot, fit_quality, use_bounds) {
+  #' Fit 1 Gaussian peak in small region of m/z
+  #'
+  #' @param mass_vector2: Vector of equally spaced m/z values (float)
+  #' @param mass_vector: Vector of m/z values for a region of interest (float)
+  #' @param int_vector: Value used to calculate area under Gaussian curve (integer)
+  #' @param max_index: Index in int_vector with the highest intensity (integer)
+  #' @param scale: Initial value used to estimate scaling parameter (integer)
+  #' @param resol: Value for resolution (integer)
+  #' @param plot: Parameter indicating whether plots should be made (boolean)
+  #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float)
+  #' @param use_bounds: Boolean to indicate whether boundaries are to be used
+  #'
+  #' @return roi_value_list: list of fit values for region of interest (list)
+
+  if (length(int_vector) < 3) {
+    message("Range too small, no fit possible!")
+  } else {
+    if ((length(int_vector) == 4)) {
+      # fit 1 peak
+      mu <- weighted.mean(mass_vector, int_vector)
+      sigma <- get_stdev(mass_vector, int_vector)
+      fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, scale, use_bounds)
+    } else {
+      # set range vector
+      if ((length(mass_vector) - length(max_index)) < 2) {
+        range1 <- c((length(mass_vector) - 4) : length(mass_vector))
+      } else if (length(max_index) < 2) {
+        range1 <- c(1:5)
+      } else {
+        range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2)
+      }
+      if (range1[1] == 0) range1 <- range1[-1]
+      # remove NA
+      if (length(which(is.na(int_vector[range1]))) != 0) {
+        range1 <- range1[-which(is.na(int_vector[range1]))]
+      }
+      # fit 1 peak
+      mu <- weighted.mean(mass_vector[range1], int_vector[range1])
+      sigma <- get_stdev(mass_vector[range1], int_vector[range1])
+      fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, scale, use_bounds)
+    }
+
+    p1 <- fitted_peak$par
+
+    # get new value for fit quality and scale
+    fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new
+    scale_new <- 1.2 * scale
+
+    # bad fit
+    if (fq_new > fit_quality) {
+      # optimize scaling factor
+      fq <- 0
+      scale <- 0
+      if (sum(int_vector) > sum(p1[2] * dnorm(mass_vector, p1[1], sigma))) {
+        while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_new < 10000)) {
+          fq <- fq_new
+          scale <- scale_new
+          # fit 1 peak
+          fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, scale, use_bounds)
+          p1 <- fitted_peak$par
+          # get new value for fit quality and scale
+          fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new
+          scale_new <- 1.2 * scale
+        }
+      } else {
+        while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_new < 10000)) {
+          fq <- fq_new
+          scale <- scale_new
+          # fit 1 peak
+          fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, scale, use_bounds)
+          p1 <- fitted_peak$par
+          # get new value for fit quality and scale
+          fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new
+          scale_new <- 0.8 * scale
+        }
+      }
+      # use optimized scale factor to fit 1 peak
+      if (fq < fq_new) {
+        fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, scale, use_bounds)
+        p1 <- fitted_peak$par
+        fq_new <- fq
+      }
+    }
+  }
+
+  roi_value_list <- list("mean" = p1[1], "scale" = p1[2], "sigma" = sigma, "qual" = fq_new)
+  return(roi_value_list)
+}
+
+fit_2peaks <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol, use_bounds = FALSE,
+                       plot = FALSE, fit_quality, int_factor) {
+  #' Fit 2 Gaussian peaks in small region of m/z
+  #'
+  #' @param mass_vector2: Vector of equally spaced m/z values (float)
+  #' @param mass_vector: Vector of m/z values for a region of interest (float)
+  #' @param int_vector: Value used to calculate area under Gaussian curve (integer)
+  #' @param max_index: Index in int_vector with the highest intensity (integer)
+  #' @param scale: Initial value used to estimate scaling parameter (integer)
+  #' @param resol: Value for resolution (integer)
+  #' @param plot: Parameter indicating whether plots should be made (boolean)
+  #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float)
+  #' @param use_bounds: Boolean to indicate whether boundaries are to be used
+  #' @param int_factor: Value used to calculate area under Gaussian curve (integer)
+  #'
+  #' @return roi_value_list: list of fit values for region of interest (list)
+
+  peak_mean <- NULL
+  peak_area <- NULL
+  peak_scale <- NULL
+  peak_sigma <- NULL
+
+  # set range vectors for 2 peaks
+  range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2)
+  if (range1[1] == 0) range1 <- range1[-1]
+  range2 <- c(max_index[2] - 2, max_index[2] - 1, max_index[2], max_index[2] + 1, max_index[2] + 2)
+  if (length(mass_vector) < range2[length(range2)]) range2 <- range2[-length(range2)]
+  range1 <- check_overlap(range1, range2)[[1]]
+  range2 <- check_overlap(range1, range2)[[2]]
+  # check for negative or 0
+  remove <- which(range1 < 1)
+  if (length(remove) > 0) range1 <- range1[-remove]
+  remove <- which(range2 < 1)
+  if (length(remove) > 0) range2 <- range2[-remove]
+  # remove NA
+  if (length(which(is.na(int_vector[range1]))) != 0) range1 <- range1[-which(is.na(int_vector[range1]))]
+  if (length(which(is.na(int_vector[range2]))) != 0) range2 <- range2[-which(is.na(int_vector[range2]))]
+
+  # fit 2 peaks, first separately, then together
+  mu1 <- weighted.mean(mass_vector[range1], int_vector[range1])
+  sigma1 <- get_stdev(mass_vector[range1], int_vector[range1])
+  fitted_peak <- fit_1gaussian(mass_vector[range1], int_vector[range1], sigma1, mu1, scale, use_bounds)
+  p1 <- fitted_peak$par
+  # second peak
+  mu2 <- weighted.mean(mass_vector[range2], int_vector[range2])
+  sigma2 <- get_stdev(mass_vector[range2], int_vector[range2])
+  fitted_peak <- fit_1gaussian(mass_vector[range2], int_vector[range2], sigma2, mu2, scale, use_bounds)
+  p2 <- fitted_peak$par
+  # combined
+  fitted_2peaks <- fit_2gaussians(mass_vector, int_vector, sigma1, sigma2, p1[1], p1[2], p2[1], p2[2], use_bounds)
+  pc <- fitted_2peaks$par
+
+  # get fit quality
+  if (is.null(sigma2)) sigma2 <- sigma1
+  sum_fit <- (pc[2] * dnorm(mass_vector, pc[1], sigma1)) +
+             (pc[4] * dnorm(mass_vector, pc[3], sigma2))
+  fq <- get_fit_quality(mass_vector, int_vector, sort(c(pc[1], pc[3]))[1], sort(c(pc[1], pc[3]))[2],
+                        resol, sum_fit = sum_fit)$fq_new
+
+  # get parameter values
+  area1 <- get_area(pc[1], resol, pc[2], sigma1, int_factor)
+  area2 <- get_area(pc[3], resol, pc[4], sigma2, int_factor)
+  peak_area <- c(peak_area, area1)
+  peak_area <- c(peak_area, area2)
+  peak_mean <- c(peak_mean, pc[1])
+  peak_mean <- c(peak_mean, pc[3])
+  peak_scale <- c(peak_scale, pc[2])
+  peak_scale <- c(peak_scale, pc[4])
+  peak_sigma <- c(peak_sigma, sigma1)
+  peak_sigma <- c(peak_sigma, sigma2)
+
+  roi_value_list <- list("mean" = peak_mean, "scale" = peak_scale, "sigma" = peak_sigma, "area" = peak_area, "qual" = fq)
+  return(roi_value_list)
+}
+
+fit_3peaks <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol, use_bounds = FALSE, 
+                       plot = FALSE, fit_quality, int_factor) {
+  #' Fit 3 Gaussian peaks in small region of m/z
+  #'
+  #' @param mass_vector2: Vector of equally spaced m/z values (float)
+  #' @param mass_vector: Vector of m/z values for a region of interest (float)
+  #' @param int_vector: Value used to calculate area under Gaussian curve (integer)
+  #' @param max_index: Index in int_vector with the highest intensity (integer)
+  #' @param scale: Initial value used to estimate scaling parameter (integer)
+  #' @param resol: Value for resolution (integer)
+  #' @param plot: Parameter indicating whether plots should be made (boolean)
+  #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float)
+  #' @param use_bounds: Boolean to indicate whether boundaries are to be used
+  #' @param int_factor: Value used to calculate area under Gaussian curve (integer)
+  #'
+  #' @return roi_value_list: list of fit values for region of interest (list)
+
+  peak_mean <- NULL
+  peak_area <- NULL
+  peak_scale <- NULL
+  peak_sigma <- NULL
+
+  # set range vectors for 3 peaks
+  range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2)
+  range2 <- c(max_index[2] - 2, max_index[2] - 1, max_index[2], max_index[2] + 1, max_index[2] + 2)
+  range3 <- c(max_index[3] - 2, max_index[3] - 1, max_index[3], max_index[3] + 1, max_index[3] + 2)
+  remove <- which(range1 < 1)
+  if (length(remove) > 0) {
+    range1 <- range1[-remove]
+  }
+  remove <- which(range2 < 1)
+  if (length(remove) > 0) {
+    range2 <- range2[-remove]
+  }
+  if (length(mass_vector) < range3[length(range3)]) range3 <- range3[-length(range3)]
+  range1 <- check_overlap(range1, range2)[[1]]
+  range2 <- check_overlap(range1, range2)[[2]]
+  range2 <- check_overlap(range2, range3)[[1]]
+  range3 <- check_overlap(range2, range3)[[2]]
+  # check for negative or 0
+  remove <- which(range1 < 1)
+  if (length(remove) > 0) range1 <- range1[-remove]
+  remove <- which(range2 < 1)
+  if (length(remove) > 0) range2 <- range2[-remove]
+  remove <- which(range3 < 1)
+  if (length(remove) > 0) range3 <- range3[-remove]
+  # remove NA
+  if (length(which(is.na(int_vector[range1]))) != 0) range1 <- range1[-which(is.na(int_vector[range1]))]
+  if (length(which(is.na(int_vector[range2]))) != 0) range2 <- range2[-which(is.na(int_vector[range2]))]
+  if (length(which(is.na(int_vector[range3]))) != 0) range3 <- range3[-which(is.na(int_vector[range3]))]
+
+  # fit 3 peaks, first separately, then together
+  mu1 <- weighted.mean(mass_vector[range1], int_vector[range1])
+  sigma1 <- get_stdev(mass_vector[range1], int_vector[range1])
+  fitted_peak <- fit_1gaussian(mass_vector[range1], int_vector[range1], sigma1, mu1, scale, use_bounds)
+  p1 <- fitted_peak$par
+  # second peak
+  mu2 <- weighted.mean(mass_vector[range2], int_vector[range2])
+  sigma2 <- get_stdev(mass_vector[range2], int_vector[range2])
+  fitted_peak <- fit_1gaussian(mass_vector[range2], int_vector[range2], sigma2, mu2, scale, use_bounds)
+  p2 <- fitted_peak$par
+  # third peak
+  mu3 <- weighted.mean(mass_vector[range3], int_vector[range3])
+  sigma3 <- get_stdev(mass_vector[range3], int_vector[range3])
+  fitted_peak <- fit_1gaussian(mass_vector[range3], int_vector[range3], sigma3, mu3, scale, use_bounds)
+  p3 <- fitted_peak$par
+  # combined
+  fitted_3peaks <- fit_3gaussians(mass_vector, int_vector, sigma1, sigma2, sigma3,
+                                  p1[1], p1[2], p2[1], p2[2], p3[1], p3[2], use_bounds)
+  pc <- fitted_3peaks$par
+
+  # get fit quality
+  sum_fit = (pc[2] * dnorm(mass_vector, pc[1], sigma1)) +
+            (pc[4] * dnorm(mass_vector, pc[3], sigma2)) +
+            (pc[6] * dnorm(mass_vector, pc[5], sigma3))
+  fq <- get_fit_quality(mass_vector, int_vector, sort(c(pc[1], pc[3], pc[5]))[1], sort(c(pc[1], pc[3], pc[5]))[3],
+                        resol, sum_fit = sum_fit)$fq_new
+
+  # get parameter values
+  area1 <- get_area(pc[1], resol, pc[2], sigma1, int_factor)
+  area2 <- get_area(pc[3], resol, pc[4], sigma2, int_factor)
+  area3 <- get_area(pc[5], resol, pc[6], sigma3, int_factor)
+  peak_area <- c(peak_area, area1)
+  peak_area <- c(peak_area, area2)
+  peak_area <- c(peak_area, area3)
+  peak_mean <- c(peak_mean, pc[1])
+  peak_mean <- c(peak_mean, pc[3])
+  peak_mean <- c(peak_mean, pc[5])
+  peak_scale <- c(peak_scale, pc[2])
+  peak_scale <- c(peak_scale, pc[4])
+  peak_scale <- c(peak_scale, pc[6])
+  peak_sigma <- c(peak_sigma, sigma1)
+  peak_sigma <- c(peak_sigma, sigma2)
+  peak_sigma <- c(peak_sigma, sigma3)
+
+  roi_value_list <- list("mean" = peak_mean, "scale" = peak_scale, "sigma" = peak_sigma, "area" = peak_area, "qual" = fq)
+  return(roi_value_list)
+}
+
+fit_4peaks <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol, use_bounds = FALSE,
+                       plot = FALSE, fit_quality, int_factor) {
+  #' Fit 4 Gaussian peaks in small region of m/z
+  #'
+  #' @param mass_vector2: Vector of equally spaced m/z values (float)
+  #' @param mass_vector: Vector of m/z values for a region of interest (float)
+  #' @param int_vector: Value used to calculate area under Gaussian curve (integer)
+  #' @param max_index: Index in int_vector with the highest intensity (integer)
+  #' @param scale: Initial value used to estimate scaling parameter (integer)
+  #' @param resol: Value for resolution (integer)
+  #' @param plot: Parameter indicating whether plots should be made (boolean)
+  #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float)
+  #' @param use_bounds: Boolean to indicate whether boundaries are to be used
+  #' @param int_factor: Value used to calculate area under Gaussian curve (integer)
+  #'
+  #' @return roi_value_list: list of fit values for region of interest (list)
+
+  peak_mean <- NULL
+  peak_area <- NULL
+  peak_scale <- NULL
+  peak_sigma <- NULL
+
+  # set range vectors for 4 peaks
+  range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2)
+  range2 <- c(max_index[2] - 2, max_index[2] - 1, max_index[2], max_index[2] + 1, max_index[2] + 2)
+  range3 <- c(max_index[3] - 2, max_index[3] - 1, max_index[3], max_index[3] + 1, max_index[3] + 2)
+  range4 <- c(max_index[4] - 2, max_index[4] - 1, max_index[4], max_index[4] + 1, max_index[4] + 2)
+  if (range1[1] == 0) range1 <- range1[-1]
+  if (length(mass_vector) < range4[length(range4)]) range4 <- range4[-length(range4)]
+  range1 <- check_overlap(range1, range2)[[1]]
+  range2 <- check_overlap(range1, range2)[[2]]
+  range2 <- check_overlap(range2, range3)[[1]]
+  range3 <- check_overlap(range2, range3)[[2]]
+  range3 <- check_overlap(range3, range4)[[1]]
+  range4 <- check_overlap(range3, range4)[[2]]
+  remove <- which(range4 > length(mass_vector))
+  if (length(remove) > 0) {
+    range4 <- range4[-remove]
+  }
+  # check for negative or 0
+  remove <- which(range1 < 1)
+  if (length(remove) > 0) range1 <- range1[-remove]
+  remove <- which(range2 < 1)
+  if (length(remove) > 0) range2 <- range2[-remove]
+  remove <- which(range3 < 1)
+  if (length(remove) > 0) range3 <- range3[-remove]
+  remove <- which(range4 < 1)
+  if (length(remove) > 0) range4 <- range4[-remove]
+  # remove NA
+  if (length(which(is.na(int_vector[range1]))) != 0) range1 <- range1[-which(is.na(int_vector[range1]))]
+  if (length(which(is.na(int_vector[range2]))) != 0) range2 <- range2[-which(is.na(int_vector[range2]))]
+  if (length(which(is.na(int_vector[range3]))) != 0) range3 <- range3[-which(is.na(int_vector[range3]))]
+  if (length(which(is.na(int_vector[range4]))) != 0) range4 <- range4[-which(is.na(int_vector[range4]))]
+
+  # fit 4 peaks, first separately, then together
+  mu1 <- weighted.mean(mass_vector[range1], int_vector[range1])
+  sigma1 <- get_stdev(mass_vector[range1], int_vector[range1])
+  fitted_peak <- fit_1gaussian(mass_vector[range1], int_vector[range1], sigma1, mu1, scale, use_bounds)
+  p1 <- fitted_peak$par
+  # second peak
+  mu2 <- weighted.mean(mass_vector[range2], int_vector[range2])
+  sigma2 <- get_stdev(mass_vector[range2], int_vector[range2])
+  fitted_peak <- fit_1gaussian(mass_vector[range2], int_vector[range2], sigma2, mu2, scale, use_bounds)
+  p2 <- fitted_peak$par
+  # third peak
+  mu3 <- weighted.mean(mass_vector[range3], int_vector[range3])
+  sigma3 <- get_stdev(mass_vector[range3], int_vector[range3])
+  fitted_peak <- fit_1gaussian(mass_vector[range3], int_vector[range3], sigma3, mu3, scale, use_bounds)
+  p3 <- fitted_peak$par
+  # fourth peak
+  mu4 <- weighted.mean(mass_vector[range4], int_vector[range4])
+  sigma4 <- get_stdev(mass_vector[range4], int_vector[range4])
+  fitted_peak <- fit_1gaussian(mass_vector[range4], int_vector[range4], sigma4, mu4, scale, use_bounds)
+  p4 <- fitted_peak$par
+  # combined
+  fitted_4peaks <- fit_4gaussians(mass_vector, int_vector, sigma1, sigma2, sigma3, sigma3, 
+                                  p1[1], p1[2], p2[1], p2[2], p3[1], p3[2],  p4[1], p4[2], use_bounds)
+  pc <- fitted_4peaks$par
+
+  # get fit quality
+  sum_fit <-  (pc[2] * dnorm(mass_vector, pc[1], sigma1)) + 
+              (pc[4] * dnorm(mass_vector, pc[3], sigma2)) + 
+              (pc[6] * dnorm(mass_vector, pc[5], sigma3)) + 
+              (pc[8] * dnorm(mass_vector, pc[7], sigma3))
+  fq <- get_fit_quality(mass_vector, int_vector, 
+                        sort(c(pc[1], pc[3], pc[5], pc[7]))[1], sort(c(pc[1], pc[3], pc[5], pc[7]))[4], 
+                        resol, sum_fit = sum_fit)$fq_new
+
+  # get parameter values
+  area1 <- get_area(pc[1], resol, pc[2], sigma1, int_factor)
+  area2 <- get_area(pc[3], resol, pc[4], sigma2, int_factor)
+  area3 <- get_area(pc[5], resol, pc[6], sigma3, int_factor)
+  area4 <- get_area(pc[7], resol, pc[8], sigma4, int_factor)
+  peak_area <- c(peak_area, area1)
+  peak_area <- c(peak_area, area2)
+  peak_area <- c(peak_area, area3)
+  peak_area <- c(peak_area, area4)
+  peak_mean <- c(peak_mean, pc[1])
+  peak_mean <- c(peak_mean, pc[3])
+  peak_mean <- c(peak_mean, pc[5])
+  peak_mean <- c(peak_mean, pc[7])
+  peak_scale <- c(peak_scale, pc[2])
+  peak_scale <- c(peak_scale, pc[4])
+  peak_scale <- c(peak_scale, pc[6])
+  peak_scale <- c(peak_scale, pc[8])
+  peak_sigma <- c(peak_sigma, sigma1)
+  peak_sigma <- c(peak_sigma, sigma2)
+  peak_sigma <- c(peak_sigma, sigma3)
+  peak_sigma <- c(peak_sigma, sigma4)
+
+  roi_value_list <- list("mean" = peak_mean, "scale" = peak_scale, "sigma" = peak_sigma, "area" = peak_area, "qual" = fq)
+  return(roi_value_list)
+}
+
diff --git a/DIMS/Utils/get_element_info.R b/DIMS/Utils/get_element_info.R
new file mode 100644
index 0000000..c4bdd92
--- /dev/null
+++ b/DIMS/Utils/get_element_info.R
@@ -0,0 +1,24 @@
+## adapted from elementInfo.R, which is adapted from Rdisop function .getElement
+# refactor: check where library is initialised: library(Rdisop)
+get_element_info <- function(name, elements = NULL) {
+  #' Get info on m/z and isotopes for all chemical elements
+  #'
+  #' @param name: Name of adduct, e.g. Na (string)
+  #' @param elements: List of all adducts to take into account (list of strings)
+  #'
+  #' @return element_info: peak group list with filled-in intensities (matrix)
+
+  # get information on all elements
+  if (!is.list(elements) || length(elements) == 0 ) {
+    all_elements <- initializePSE() 
+  }
+  # extract information for a particular adduct
+  if (name == "CH3OH+H") {
+    # regular_expr should be exact match for name, except for methanol
+    regular_expr <- "^CH3OH\\+H$"
+  } else {
+    regular_expr <- paste0("^", name, "$")
+  }
+  element_info <- all_elements[[grep(regular_expr, sapply(all_elements, function(x) { x$name }))]]
+  return(element_info)
+}
\ No newline at end of file
diff --git a/DIMS/Utils/get_fit_quality.R b/DIMS/Utils/get_fit_quality.R
new file mode 100644
index 0000000..23b1947
--- /dev/null
+++ b/DIMS/Utils/get_fit_quality.R
@@ -0,0 +1,34 @@
+## adapted from getFitQuality.R
+# parameter not used: mu_last
+get_fit_quality <- function(mass_vector, int_vector, mu_first, mu_last, resol, scale = NULL, sigma = NULL, sum_fit = NULL) {
+  #' Fit 1 Gaussian peak in small region of m/z
+  #'
+  #' @param mass_vector: Vector of m/z values for a region of interest (float)
+  #' @param int_vector: Value used to calculate area under Gaussian curve (integer)
+  #' @param mu_first: Value for first peak (float)
+  #' @param scale: Initial value used to estimate scaling parameter (integer)
+  #' @param resol: Value for resolution (integer)
+  #' @param sum_fit: Value indicating quality of fit of Gaussian curve (float)
+  #'
+  #' @return list_params: list of parameters indicating quality of fit (list)
+  if (is.null(sum_fit)) {
+    mass_vector_int <- mass_vector
+    int_vector_int <- int_vector
+    # get new fit quality
+    fq_new <- mean(abs((scale * dnorm(mass_vector_int, mu_first, sigma)) - int_vector_int) / 
+               rep((max(scale * dnorm(mass_vector_int, mu_first, sigma)) / 2), length(mass_vector_int)))
+  } else {
+    sum_fit_int <- sum_fit
+    int_vector_int <- int_vector
+    mass_vector_int <- mass_vector
+    # get new fit quality
+    fq_new <- mean(abs(sum_fit_int - int_vector_int) / rep(max(sum_fit_int) /2, length(sum_fit_int)))
+  }
+  
+  # Prevent division by 0
+  if (is.nan(fq_new)) fq_new <- 1 
+  
+  list_params <- list("fq_new" = fq_new, "x_int" = mass_vector_int, "y_int" = int_vector_int)
+  return(list_params)
+}
+
diff --git a/DIMS/Utils/identify_noisepeaks.R b/DIMS/Utils/identify_noisepeaks.R
new file mode 100644
index 0000000..5e557cb
--- /dev/null
+++ b/DIMS/Utils/identify_noisepeaks.R
@@ -0,0 +1,103 @@
+## adapted from ident.hires.noise.HPC
+# refactor: remove variables slope, incpt, ppm_iso_fixed
+# combine with function get_element_info
+# modified identify function to also look for adducts and their isotopes
+identify_noisepeaks <- function(peakgroup_list, all_adducts, scanmode = "Negative", look4 = c("Cl", "Ac"),
+                                noise_mz = NULL, resol = 140000, slope = 0, incpt = 0, ppm_fixed, ppm_iso_fixed) {
+  #' Replace intensities that are zero with random value
+  #'
+  #' @param peakgroup_list: Peak group list (matrix)
+  #' @param all_adducts: List of adducts to take into account (list of strings)
+  #' @param scanmode: Scan mode, positive or negative (string)
+  #' @param look4: List of adducts to look for (list of strings)
+  #' @param noise_mz: All known noise peaks (matrix)
+  #' @param resol: Value for resolution (integer)
+  #' @param slope: Value for slope for mass correction (float)
+  #' @param incpt: Value for intercept for mass correction (float)
+  #' @param ppm_fixed: Value for distance between two values of mass (integer)
+  #' @param ppm_iso_fixed: Value for distance between two values of mass for isotope peaks (integer)
+  #'
+  #' @return final_outlist: peak group list with filled-in intensities (matrix)
+
+  options(stringsAsFactors = FALSE)
+  metlin <- assi <- iso <- rep("", nrow(peakgroup_list))
+  theormz <- nisos <- expint <- conf <- rep(0, nrow(peakgroup_list))
+
+  # add adducts to identification list
+  if (scanmode == "Positive") {
+    adduct_scanmode <-  "+"
+  } else {
+    adduct_scanmode <- "-"
+  }
+  # make a copy of noise_mz
+  noise_mz_orig <- noise_mz
+
+  # loop over type of adduct
+  for (adduct_index in 1:length(look4)) {
+    noise_mz_adduct <- noise_mz_orig
+    noise_mz_adduct[, "CompoundName"] <- as.character(noise_mz_orig[, "CompoundName"])
+
+    if (look4[adduct_index] == "H2O") {
+      add2label <- paste0("[M-", look4[adduct_index], "]", adduct_scanmode)
+    } else {
+      add2label <- paste0("[M+", look4[adduct_index], "]", adduct_scanmode)
+    }
+
+    noise_mz_adduct[, "CompoundName"] <- paste0(noise_mz_adduct[, "CompoundName "], add2label)
+    adduct_info <- get_element_info(look4[adduct_index], all_adducts)
+    if (scanmode == "Positive") {
+      adduct_mass <- adduct_info$mass[1] + adduct_info$isotope$mass[1] - hydrogen_mass
+    } else {
+      adduct_mass <- adduct_info$mass[1] + adduct_info$isotope$mass[1] + hydrogen_mass
+    }
+
+    # loop over compounds in database
+    for (compound_index in 1:nrow(noise_mz_adduct)) {
+      # construct information for compound + adduct:
+      if (scanmode == "Positive") {
+        noise_mz_adduct[compound_index, "Mpos"] <-  as.numeric(noise_mz_adduct[compound_index, "Mpos"]) + adduct_mass
+        noise_mz_adduct[compound_index, "MNeg"] <-  0
+      } else {
+        noise_mz_adduct[compound_index, "Mpos"] <-  0
+        noise_mz_adduct[compound_index, "MNeg"] <-  as.numeric(noise_mz_adduct[compound_index, "MNeg"]) + adduct_mass
+      }
+    }
+    noise_mz <- rbind(noise_mz, noise_mz_adduct)
+  }
+
+  if (scanmode == "Positive") {
+    theor_mcol <- as.numeric(noise_mz[, "Mpos"])
+  } else {
+    theor_mcol <- as.numeric(noise_mz[, "MNeg"])
+  }
+
+  # get mz information from peakgroup_list
+  mcol <- peakgroup_list[, "mzmed.pgrp"]
+  # if column with average intensities is missing, calculate it:
+  if (!("avg.int" %in% colnames(peakgroup_list))) {
+    mzmaxcol <- which(colnames(peakgroup_list) == "mzmax.pgrp")
+    endcol <- ncol(peakgroup_list)
+    peakgroup_list[, "avg.int"] <- apply(peakgroup_list[, (mzmaxcol + 1):(endcol)], 1, mean)
+  }
+
+  # do indentification using own database:
+  for (row_index in 1:nrow(noise_mz)) {
+    theor_mz <- theor_mcol[row_index]
+
+    # set tolerance for mz accuracy of main peak
+    mtol <- theor_mz * ppm_fixed / 1000000
+    # find main peak
+    selp <- which(mcol > (theor_mz - mtol) & mcol < (theor_mz + mtol))
+    # if there is more than one candidate peak for main, select best one based on mz_diff
+    if (length(selp) > 1) {
+      selp <- selp[abs(mcol[selp] - theor_mz) == min(abs(mcol[selp] - theor_mz))]
+    }
+    if (length(selp) == 1) {
+      assi[selp] <- paste(assi[selp], as.character(noise_mz[row_index, "CompoundName"]), sep = ";")
+      theormz[selp] <- theor_mz
+    }
+  }
+
+  final_outlist <- cbind(peakgroup_list, assi, theormz, conf, nisos, iso, expint, metlin)
+  return(final_outlist)
+}
diff --git a/DIMS/Utils/merge_duplicate_rows.R b/DIMS/Utils/merge_duplicate_rows.R
new file mode 100644
index 0000000..da3bc52
--- /dev/null
+++ b/DIMS/Utils/merge_duplicate_rows.R
@@ -0,0 +1,59 @@
+## adapted from mergeDuplicatedRows.R
+merge_duplicate_rows <- function(peakgroup_list) {
+  #' Merge identification info for peak groups with the same mass
+  #'
+  #' @param peakgroup_list: Peak group list (matrix)
+  #'
+  #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix)
+
+  collapse <- function(column_label, peakgroup_list, index_dup) {
+    #' Collapse identification info for peak groups with the same mass
+    #'
+    #' @param column_label: Name of column in peakgroup_list (string)
+    #' @param peakgroup_list: Peak group list (matrix)
+    #' @param index_dup: Index of duplicate peak group (integer)
+    #'
+    #' @return collapsed_items: Semicolon-separated list of info (string)
+    # get the item(s) that need to be collapsed
+    list_items <- as.vector(peakgroup_list[index_dup, column_label])
+    # remove NA
+    if (length(which(is.na(list_items))) > 0) list_items <- list_items[-which(is.na(list_items))]
+    collapsed_items <- paste(list_items, collapse = ";")
+    return(collapsed_items)
+  }
+
+  options(digits = 16)
+  collect <- NULL
+  remove <- NULL
+
+  # check for peak groups with identical mass
+  index_dup <- which(duplicated(peakgroup_list[, "mzmed.pgrp"]))
+
+  while (length(index_dup) > 0) {
+    # get the index for the peak group which is double
+    peaklist_index <- which(peakgroup_list[, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])
+    single_peakgroup <- peakgroup_list[peaklist_index[1], , drop = FALSE]
+
+    # use function collapse to concatenate info
+    single_peakgroup[, "assi_HMDB"] <- collapse("assi_HMDB", peakgroup_list, peaklist_index)
+    single_peakgroup[, "iso_HMDB"] <- collapse("iso_HMDB", peakgroup_list, peaklist_index)
+    single_peakgroup[, "HMDB_code"] <- collapse("HMDB_code", peakgroup_list, peaklist_index)
+    single_peakgroup[, "assi_noise"] <- collapse("assi_noise", peakgroup_list, peaklist_index)
+    if (single_peakgroup[, "assi_noise"] == ";") single_peakgroup[, "assi_noise"] <- NA
+    single_peakgroup[, "theormz_noise"] <- collapse("theormz_noise", peakgroup_list, peaklist_index)
+    if (single_peakgroup[,"theormz_noise"] == "0;0") single_peakgroup[, "theormz_noise"] <- NA
+
+    # keep track of deduplicated entries
+    collect <- rbind(collect, single_peakgroup)
+    remove <- c(remove, peaklist_index)
+
+    # remove current entry from index
+    index_dup <- index_dup[-which(peakgroup_list[index_dup, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])]
+  }
+
+  # remove duplicate entries
+  if (!is.null(remove)) peakgroup_list <- peakgroup_list[-remove, ]
+  # append deduplicated entries
+  peakgroup_list_dedup <- rbind(peakgroup_list, collect)
+  return(peakgroup_list_dedup)
+}
diff --git a/DIMS/Utils/optimize_gaussfit.R b/DIMS/Utils/optimize_gaussfit.R
new file mode 100644
index 0000000..2d7f95d
--- /dev/null
+++ b/DIMS/Utils/optimize_gaussfit.R
@@ -0,0 +1,23 @@
+## adapted from optimizeGauss.R
+optimize_gaussfit <- function(mass_vector, int_vector, sigma, mass_max) {
+  #' Optimize fit of Gaussian curve to small region of m/z
+  #'
+  #' @param mass_vector: Vector of m/z values for a region of interest (float)
+  #' @param int_vector: Vector of intensities for a region of interest (float)
+  #' @param sigma: Value for standard deviation (float)
+  #' @param mass_max: Value for mass at center of peak (float)
+  #'
+  #' @return opt_fit: list of fit values for region of interest (list)
+
+  # define optimization function for optim based on normal distribution
+  opt_f <- function(p, mass_vector, int_vector, sigma, mass_max) {
+    curve <- p * dnorm(mass_vector, mass_max, sigma)
+    return((max(curve) - max(int_vector))^2)
+  }
+
+  # get optimal value for fitted Gaussian curve
+  opt_fit <- optimize(opt_f, c(0, 100000), tol = 0.0001, mass_vector, int_vector, sigma, mass_max)
+
+  return(opt_fit$minimum)
+}
+
diff --git a/DIMS/Utils/replace_zeros.R b/DIMS/Utils/replace_zeros.R
new file mode 100644
index 0000000..cf65340
--- /dev/null
+++ b/DIMS/Utils/replace_zeros.R
@@ -0,0 +1,64 @@
+## adapted from replaceZeros.R
+# this function does two things: replace zeros with random value and identify noise peaks
+# refactor: split into two functions
+# remove parameters outdir and thresh
+# make hard-coded path to file with noise peaks into variable
+# remove variables outdir, thresh
+replace_zeros <- function(peakgroup_list, repl_pattern, scanmode, resol, outdir, thresh, ppm) {
+  #' Replace intensities that are zero with random value
+  #'
+  #' @param peakgroup_list: Peak group list (matrix)
+  #' @param repl_pattern: Replication pattern (list of strings)
+  #' @param scanmode: Scan mode, positive or negative (string)
+  #' @param resol: Value for resolution (integer)
+  #' @param outdir: Path for output directory (string)
+  #' @param thresh: Value for threshold (integer)
+  #' @param ppm: Value for distance between two values of mass (integer)
+  #'
+  #' @return final_outlist: peak group list with filled-in intensities (matrix)
+
+  # replace zeros
+  if (!is.null(peakgroup_list)) {
+    for (sample_index in 1:length(names(repl_pattern))) {
+      sample_peaks <- peakgroup_list[, names(repl_pattern)[sample_index]]
+      zero_intensity <- which(sample_peaks <= 0)
+      if (!length(zero_intensity)) {
+        next
+      }
+      for (zero_index in 1:length(zero_intensity)) {
+        area <- generate_gaussian(peakgroup_list[zero_intensity[zero_index], "mzmed.pgrp"], thresh,
+                                  resol, FALSE, scanmode, int_factor = 1 * 10^5, 1, 1)$area
+        peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1, mean = area,
+                                                                                               sd = 0.25 * area)
+      }
+    }
+
+    # Add column with average intensity
+    peakgroup_list <- cbind(peakgroup_list, "avg.int" = apply(peakgroup_list[, 7:(ncol(peakgroup_list) - 4)], 1, mean))
+
+    if (scanmode == "negative") {
+      label <- "MNeg"
+      label2 <- "Negative"
+      # look for adducts in negative mode
+      look4_adducts <- c("Cl", "Cl37", "For", "NaCl", "KCl", "H2PO4", "HSO4", "Na-H", "K-H", "H2O", "I")
+    } else {
+      label <- "Mpos"
+      label2 <- "Positive"
+      # look for adducts in positive mode
+      look4_adducts <- c("Na", "K", "NaCl", "NH4", "2Na-H", "CH3OH", "KCl", "NaK-H")
+    }
+
+    # Identify noise peaks
+    noise_mz <- read.table(file = "/hpc/dbg_mz/tools/db/TheoreticalMZ_NegPos_incNaCl.txt",
+                           sep = "\t", header = TRUE, quote = "")
+    noise_mz <- noise_mz[(noise_mz[, label] != 0), 1:4]
+    outlist_withnoise <- identify_noisepeaks(peakgroup_list, all_adducts, scanmode = label2,
+                                             noise_mz, look4 = look4_adducts, resol = resol,
+                                             slope = 0, incpt = 0, ppm_fixed = ppm, ppm_iso_fixed = ppm)
+    noise_info <- outlist_withnoise[, c("assi", "theormz")]
+    colnames(noise_info) <- c("assi_noise",  "theormz_noise")
+
+    final_outlist <- cbind(peakgroup_list, noise_info)
+    return(final_outlist)
+  }
+}
diff --git a/DIMS/Utils/search_mzrange.R b/DIMS/Utils/search_mzrange.R
index aa0abbd..ba6920e 100644
--- a/DIMS/Utils/search_mzrange.R
+++ b/DIMS/Utils/search_mzrange.R
@@ -21,10 +21,6 @@ search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, r
   #'
   #' @return allpeaks_values: list of m/z regions of interest
 
-  # initialise list to store results for all peaks. Currently in do_peakfinding
-  # allpeaks_values <- list("mean" = NULL, "area" = NULL, "nr" = NULL,
-  #                         "min" = NULL, "max" = NULL, "qual" = NULL, "spikes" = 0)
-
   # find indices where intensity is not equal to zero
   nonzero_indices <- as.vector(which(ints_fullrange != 0))
 
@@ -76,8 +72,8 @@ search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, r
 
           if (roi_values$qual[1] == 1) {
             # get optimized fit values
-            roi_values <- generateGaussian(mass_vector, int_vector, resol, plot,
-                                           scanmode, int_factor, width, height)
+            roi_values <- fit_optim(mass_vector, int_vector, resol, plot,
+                                    scanmode, int_factor, width, height)
             # add region of interest to list of all peaks
             allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean)
             allpeaks_values$area <- c(allpeaks_values$area, roi_values$area)
@@ -100,8 +96,8 @@ search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, r
 
         } else {
 
-          roi_values <- generateGaussian(mass_vector, int_vector, resol,
-                                         plot, scanmode, int_factor, width, height)
+          roi_values <- fit_optim(mass_vector, int_vector, resol,
+                                  plot, scanmode, int_factor, width, height)
 
           allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean)
           allpeaks_values$area <- c(allpeaks_values$area, roi_values$area)
@@ -148,8 +144,8 @@ search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, r
                                mz_index, start_index, end_index)
 
         if (roi_values$qual[1] == 1) {
-          roi_values <- generateGaussian(mass_vector, int_vector, resol,
-                                         plot, scanmode, int_factor, width, height)
+          roi_values <- fit_optim(mass_vector, int_vector, resol,
+                                  plot, scanmode, int_factor, width, height)
 
           allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean)
           allpeaks_values$area <- c(allpeaks_values$area, roi_values$area)
@@ -170,8 +166,8 @@ search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, r
           }
         }
       } else {
-        roi_values <- generateGaussian(mass_vector, int_vector, resol,
-                                       plot, scanmode, int_factor, width, height)
+        roi_values <- fit_optim(mass_vector, int_vector, resol,
+                                plot, scanmode, int_factor, width, height)
 
         allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean)
         allpeaks_values$area <- c(allpeaks_values$area, roi_values$area)
diff --git a/DIMS/Utils/sum_curves.R b/DIMS/Utils/sum_curves.R
new file mode 100644
index 0000000..542026b
--- /dev/null
+++ b/DIMS/Utils/sum_curves.R
@@ -0,0 +1,35 @@
+## adapted from sumCurves.R
+# variables with fixed values will be removed from function parameters
+# plot
+# parameter half_max not used
+sum_curves <- function(mean1, mean2, scale1, scale2, sigma1, sigma2, mass_vector2, mass_vector, resol, plot) {
+  #' Sum two curves
+  #'
+  #' @param mean1: Value for mean m/z of first peak (float)
+  #' @param mean2: Value for mean m/z of second peak (float)
+  #' @param scale1: Initial value used to estimate scaling parameter for first peak (integer)
+  #' @param scale2: Initial value used to estimate scaling parameter for second peak (integer)
+  #' @param sigma1: Value for standard deviation for first peak (float)
+  #' @param sigma2: Value for standard deviation for second peak (float)
+  #' @param mass_vector2: Vector of equally spaced m/z values (float)
+  #' @param mass_vector: Vector of m/z values for a region of interest (float)
+  #' @param resol: Value for resolution (integer)
+  #' @param plot: Parameter indicating whether plots should be made (boolean)
+  #'
+  #' @return list_params: list of parameters indicating quality of fit (list)
+
+  sum_fit <- (scale1 * dnorm(mass_vector2, mean1, sigma1)) + (scale2 * dnorm(mass_vector2, mean2, sigma2))
+
+  mean1_plus2 <- weighted.mean(c(mean1, mean2), c(max(scale1 * dnorm(mass_vector2, mean1, sigma1)),
+                                                  max(scale2 * dnorm(mass_vector2, mean2, sigma2))))
+
+  # get new values for parameters
+  fwhm <- get_fwhm(mean1_plus2, resol)
+  area <- max(sum_fit)
+  scale <- scale1 + scale2
+  sigma <- (fwhm / 2) * 0.85
+
+  list_params <- list("mean" = mean1_plus2, "area" = area, "scale" = scale, "sigma" = sigma)
+  return(list_params)
+}
+
diff --git a/DIMS/Utils/within_ppm.R b/DIMS/Utils/within_ppm.R
new file mode 100644
index 0000000..abdb0d4
--- /dev/null
+++ b/DIMS/Utils/within_ppm.R
@@ -0,0 +1,64 @@
+## adapted from isWithinXppm.R
+# variables with fixed values will be removed from function parameters
+# plot
+within_ppm <- function(mean, scale, sigma, area, mass_vector2, mass_vector, ppm = 4, resol, plot) {
+  #' Test whether two mass ranges are within ppm distance of each other
+  #'
+  #' @param mean: Value for mean m/z (float)
+  #' @param scale: Initial value used to estimate scaling parameter (integer)
+  #' @param sigma: Value for standard deviation (float)
+  #' @param area: Value for area under the curve (float)
+  #' @param mass_vector2: Vector of equally spaced m/z values (float)
+  #' @param mass_vector: Vector of m/z values for a region of interest (float)
+  #' @param ppm: Value for distance between two values of mass (integer)
+  #' @param resol: Value for resolution (integer)
+  #' @param plot: Parameter indicating whether plots should be made (boolean)
+  #'
+  #' @return list_params: list of parameters indicating quality of fit (list)
+
+  # sort
+  index <- order(mean)
+  mean <- mean[index]
+  scale <- scale[index]
+  sigma <- sigma[index]
+  area <- area[index]
+
+  summed <- NULL
+  remove <- NULL
+
+  if (length(mean) > 1) {
+    for (i in 2:length(mean)) {
+      if ((abs(mean[i - 1] - mean[i]) / mean[i - 1]) * 10^6 < ppm) {
+
+        # avoid double occurance in sum
+        if ((i - 1) %in% summed) next
+
+        result_values <- sum_curves(mean[i - 1], mean[i], scale[i - 1], scale[i], sigma[i - 1], sigma[i],
+                                    mass_vector2, mass_vector, resol, plot)
+        summed <- c(summed, i - 1, i)
+        if (is.nan(result_values$mean)) result_values$mean <- 0
+        mean[i - 1] <- result_values$mean
+        mean[i] <- result_values$mean
+        area[i - 1] <- result_values$area
+        area[i] <- result_values$area
+        scale[i - 1] <- result_values$scale
+        scale[i] <- result_values$scale
+        sigma[i - 1] <- result_values$sigma
+        sigma[i] <- result_values$sigma
+
+        remove <- c(remove, i)
+      }
+    }
+  }
+
+  if (length(remove) != 0) {
+    mean <- mean[-c(remove)]
+    area <- area[-c(remove)]
+    scale <- scale[-c(remove)]
+    sigma <- sigma[-c(remove)]
+  }
+
+  list_params <- list("mean" = mean, "area" = area, "scale" = scale, "sigma" = sigma, "qual" = NULL)
+  return(list_params)
+}
+

From 0a8171971baf97600f5100f58d8cde954bbe489f Mon Sep 17 00:00:00 2001
From: Anne Luesink <anneluesink@live.nl>
Date: Fri, 26 Apr 2024 14:24:53 +0200
Subject: [PATCH 62/73] TIC plots and TRFP flag

---
 DIMS/AverageTechReplicates.R | 6 ++++--
 DIMS/ThermoRawFileParser.nf  | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R
index b243025..bf27b3e 100644
--- a/DIMS/AverageTechReplicates.R
+++ b/DIMS/AverageTechReplicates.R
@@ -46,7 +46,7 @@ load(init_file)
 
 # lower the threshold below which a sample will be removed for DBS and for high m/z
 if (dims_matrix == "DBS") {
-  thresh2remove <- 50000000
+  thresh2remove <- 500000000
 }
 if (highest_mz > 700) {
   thresh2remove <- 1000000
@@ -66,6 +66,7 @@ for (sample_nr in 1:length(repl_pattern)) {
   nr_neg <- 0
   for (file_nr in 1:length(tech_reps)) {
     load(paste(tech_reps[file_nr], ".RData", sep = ""))
+    cat("\n\nParsing", tech_reps[file_nr])
     # negative scanmode
     cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1]))
     if (sum(peak_list$neg[, 1]) < thresh2remove) {
@@ -146,7 +147,8 @@ for (sample_nr in c(1:length(repl_pattern))) {
   sample_name <- names(repl_pattern)[sample_nr]
   for (file_nr in 1:length(tech_reps)) {
     plot_nr <- plot_nr + 1
-    repl1_nr <- read.table(tic_files[file_nr])
+    repl1_nr <- read.table(paste(paste(outdir, "2-pklist/", sep = "/"), tech_reps[file_nr], "_TIC.txt", sep = ""))
+    # repl1_nr <- read.table(tic_files[file_nr])
     bad_color_pos <- tech_reps[file_nr] %in% remove_pos[[1]]
     bad_color_neg <- tech_reps[file_nr] %in% remove_neg[[1]]
     if (bad_color_neg & bad_color_pos) {
diff --git a/DIMS/ThermoRawFileParser.nf b/DIMS/ThermoRawFileParser.nf
index 6e51984..022a2c9 100644
--- a/DIMS/ThermoRawFileParser.nf
+++ b/DIMS/ThermoRawFileParser.nf
@@ -13,6 +13,6 @@ process ConvertRawFile {
 
         """
         source /hpc/dbg_mz/tools/mono/etc/profile
-        mono /hpc/dbg_mz/tools/ThermoRawFileParser_1.1.11/ThermoRawFileParser.exe -i=${raw_file} --output=./
+        mono /hpc/dbg_mz/tools/ThermoRawFileParser_1.1.11/ThermoRawFileParser.exe -i=${raw_file} --output=./ -p
         """
 }

From 4041d978fd9989114b930462b757a91c3a9a3ed2 Mon Sep 17 00:00:00 2001
From: Anne Luesink <anneluesink@live.nl>
Date: Fri, 26 Apr 2024 15:18:30 +0200
Subject: [PATCH 63/73] Fix TIC plot pdf

---
 DIMS/AverageTechReplicates.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R
index bf27b3e..81f6b46 100644
--- a/DIMS/AverageTechReplicates.R
+++ b/DIMS/AverageTechReplicates.R
@@ -147,8 +147,8 @@ for (sample_nr in c(1:length(repl_pattern))) {
   sample_name <- names(repl_pattern)[sample_nr]
   for (file_nr in 1:length(tech_reps)) {
     plot_nr <- plot_nr + 1
-    repl1_nr <- read.table(paste(paste(outdir, "2-pklist/", sep = "/"), tech_reps[file_nr], "_TIC.txt", sep = ""))
-    # repl1_nr <- read.table(tic_files[file_nr])
+    # repl1_nr <- read.table(paste(paste(outdir, "2-pklist/", sep = "/"), tech_reps[file_nr], "_TIC.txt", sep = ""))
+    repl1_nr <- read.table(paste0(tech_reps[file_nr], "_TIC.txt"))
     bad_color_pos <- tech_reps[file_nr] %in% remove_pos[[1]]
     bad_color_neg <- tech_reps[file_nr] %in% remove_neg[[1]]
     if (bad_color_neg & bad_color_pos) {

From 2089babdd517e1024746874336d3b0416a5e9b30 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 24 May 2024 10:46:20 +0200
Subject: [PATCH 64/73] corrected error in aggregate lines

---
 DIMS/AssignToBins.R | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R
index 7c7a95a..472b1e6 100644
--- a/DIMS/AssignToBins.R
+++ b/DIMS/AssignToBins.R
@@ -76,19 +76,23 @@ bin_indices_neg <- cut(
 if (nrow(pos_raw_data_matrix) > 0) {
   # set NA in intensities to zero
   pos_raw_data_matrix[is.na(pos_raw_data_matrix[, "intensity"]), "intensity"] <- 0
-  # use only values above dims_thresh
-  pos_intensity_above_threshold <- pos_raw_data_matrix[which(pos_raw_data_matrix[, "intensity"] > dims_thresh), "intensity"]
-  # aggregate intensities, calculate mean
-  aggr_int_pos <- stats::aggregate(pos_intensity_above_threshold, list(bin_indices_pos), mean)
+  # aggregate intensities, calculate mean, use only values above dims_thresh
+  aggr_int_pos <- stats::aggregate(pos_raw_data_matrix[, "intensity"],
+				   list(bin_indices_pos),
+				   FUN = function(x) { mean(x[which(x > dims_thresh)]) })
+  # set NA to zero in second column
+  aggr_int_pos[is.na(aggr_int_pos[, 2]), 2] <- 0
   pos_bins[aggr_int_pos[, 1]] <- aggr_int_pos[, 2]
 }
 if (nrow(neg_raw_data_matrix) > 0) {
   # set NA in intensities to zero
   neg_raw_data_matrix[is.na(neg_raw_data_matrix[, "intensity"]), "intensity"] <- 0
-  # use only values above dims_thresh
-  neg_intensity_above_threshold <- neg_raw_data_matrix[which(neg_raw_data_matrix[, "intensity"] > dims_thresh), "intensity"]
-  # aggregate intensities, calculate mean
-  aggr_int_neg <- stats::aggregate(neg_intensity_above_threshold, list(bin_indices_neg), mean)
+  # aggregate intensities, calculate mean, use only values above dims_thresh
+  aggr_int_neg <- stats::aggregate(neg_raw_data_matrix[, "intensity"],
+				   list(bin_indices_neg),
+				   FUN = function(x) { mean(x[which(x > dims_thresh)]) })
+  # set NA to zero in second column
+  aggr_int_neg[is.na(aggr_int_neg[, 2]), 2] <- 0
   neg_bins[aggr_int_neg[, 1]] <- aggr_int_neg[, 2]
 }
 

From a9c267c6629ef2dcdbad52b813b854ef6cb968e7 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 24 May 2024 14:57:07 +0200
Subject: [PATCH 65/73] code review changes applied

---
 DIMS/AssignToBins.R             |  1 -
 DIMS/AverageTechReplicates.R    |  1 -
 DIMS/AverageTechReplicates.nf   |  3 +-
 DIMS/CollectFilled.R            |  5 ++-
 DIMS/CollectSumAdducts.R        |  1 -
 DIMS/FillMissing.R              | 21 ++++++------
 DIMS/GenerateBreaks.R           |  1 -
 DIMS/GenerateExcel.R            |  2 --
 DIMS/GenerateViolinPlots.R      |  1 -
 DIMS/HMDBparts.R                |  1 -
 DIMS/HMDBparts_main.R           |  1 -
 DIMS/MakeInit.R                 |  1 -
 DIMS/PeakFinding.R              | 59 +++++++++++++--------------------
 DIMS/PeakGrouping.R             |  1 -
 DIMS/SpectrumPeakFinding.R      |  3 +-
 DIMS/SumAdducts.R               |  1 -
 DIMS/UnidentifiedCalcZscores.R  |  1 -
 DIMS/UnidentifiedCollectPeaks.R |  1 -
 DIMS/UnidentifiedFillMissing.R  |  1 -
 DIMS/UnidentifiedPeakGrouping.R |  1 -
 20 files changed, 38 insertions(+), 69 deletions(-)

diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R
index 472b1e6..8479c09 100644
--- a/DIMS/AssignToBins.R
+++ b/DIMS/AssignToBins.R
@@ -1,4 +1,3 @@
-#!/usr/bin/Rscript
 ## adapted from 2-DIMS.R
 
 # load required packages
diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R
index 81f6b46..6a773f5 100644
--- a/DIMS/AverageTechReplicates.R
+++ b/DIMS/AverageTechReplicates.R
@@ -1,4 +1,3 @@
-#!/usr/bin/Rscript
 # adapted from 3-AverageTechReplicates.R
 
 # load packages
diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf
index 9e47af3..5387e61 100644
--- a/DIMS/AverageTechReplicates.nf
+++ b/DIMS/AverageTechReplicates.nf
@@ -24,7 +24,8 @@ process AverageTechReplicates {
         """
         Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_file \
                                                                       $params.nr_replicates \
-                                                                      $analysis_id $matrix \
+                                                                      $analysis_id \
+                                                                      $matrix \
                                                                       $highest_mz_file
         """
 }
diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R
index f28e1c9..48fba8c 100755
--- a/DIMS/CollectFilled.R
+++ b/DIMS/CollectFilled.R
@@ -1,4 +1,3 @@
-#!/usr/bin/Rscript
 ## adapted from 10-collectSamplesFilled.R
 
 # define parameters
@@ -8,8 +7,8 @@ scripts_dir <- cmd_args[1]
 ppm <- as.numeric(cmd_args[2])
 z_score <- as.numeric(cmd_args[3])
 
-source(paste0(scripts_dir, "mergeDuplicatedRows.R"))
-source(paste0(scripts_dir, "statistics_z.R"))
+source(paste0(scripts_dir, "merge_duplicate_rows.R"))
+source(paste0(scripts_dir, "calculate_zscores.R"))
 
 # for each scan mode, collect all filled peak group lists
 scanmodes <- c("positive", "negative")
diff --git a/DIMS/CollectSumAdducts.R b/DIMS/CollectSumAdducts.R
index dcd05af..f4f4723 100755
--- a/DIMS/CollectSumAdducts.R
+++ b/DIMS/CollectSumAdducts.R
@@ -1,4 +1,3 @@
-#!/usr/bin/Rscript
 ## adapted from 12-collectSamplesAdded.R
 
 # define parameters 
diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R
index f96e257..a76c163 100755
--- a/DIMS/FillMissing.R
+++ b/DIMS/FillMissing.R
@@ -1,5 +1,4 @@
-#!/usr/bin/Rscript
-# adapted from 9-runFillMissing.R
+## adapted from 9-runFillMissing.R
 
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
@@ -12,15 +11,15 @@ ppm <- as.numeric(cmd_args[5])
 outdir <- "./"
 
 # load in function scripts
-source(paste0(scripts_dir, "replaceZeros.R"))
-source(paste0(scripts_dir, "generateGaussian.R"))
-source(paste0(scripts_dir, "getFwhm.R"))
-source(paste0(scripts_dir, "getSD.R"))
-source(paste0(scripts_dir, "getArea.R"))
-source(paste0(scripts_dir, "optimizeGauss.R"))
-source(paste0(scripts_dir, "ident.hires.noise.HPC.R"))
-source(paste0(scripts_dir, "elementInfo.R"))
-source(paste0(scripts_dir, "globalAssignments.HPC.R"))
+source(paste0(scripts_dir, "replace_zeros.R"))
+source(paste0(scripts_dir, "fit_optim.R"))
+source(paste0(scripts_dir, "get_fwhm.R"))
+source(paste0(scripts_dir, "get_stdev.R"))
+source(paste0(scripts_dir, "estimate_area.R"))
+source(paste0(scripts_dir, "optimize_gaussfit.R"))
+source(paste0(scripts_dir, "identify_noisepeaks.R"))
+source(paste0(scripts_dir, "get_element_info.R"))
+source(paste0(scripts_dir, "atomic_info.R"))
 
 # determine scan mode
 if (grepl("_pos", peakgrouplist_file)) {
diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R
index 3e38f1c..e475b34 100644
--- a/DIMS/GenerateBreaks.R
+++ b/DIMS/GenerateBreaks.R
@@ -1,4 +1,3 @@
-#!/usr/bin/Rscript
 ## adapted from 1-generateBreaksFwhm.HPC.R ##
 
 # load required package
diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R
index b2edf5c..7d34204 100644
--- a/DIMS/GenerateExcel.R
+++ b/DIMS/GenerateExcel.R
@@ -1,4 +1,3 @@
-#!/usr/bin/Rscript
 ## adapted from 13-excelExport.R
 
 # load required packages
@@ -290,7 +289,6 @@ is_summed <- is_list[c(names(repl_pattern), "HMDB_code")]
 is_summed$HMDB.name <- is_list$name
 is_summed <- reshape2::melt(is_summed, id.vars = c("HMDB_code", "HMDB.name"))
 colnames(is_summed) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity")
-is_summed$Intensity <- as.numeric(is_summed$Intensity)
 is_summed$Matrix <- dims_matrix
 is_summed$Rundate <- rundate
 is_summed$Project <- project
diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R
index 562c794..c7026a9 100644
--- a/DIMS/GenerateViolinPlots.R
+++ b/DIMS/GenerateViolinPlots.R
@@ -7,7 +7,6 @@
 #    corresponding Z-scores.
 # 2. All files from github: https://github.com/UMCUGenetics/DIMS
 
-#!/usr/bin/Rscript
 ## adapted from 15-dIEM_violin.R
 
 # load packages
diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R
index 4cb343f..520286b 100644
--- a/DIMS/HMDBparts.R
+++ b/DIMS/HMDBparts.R
@@ -1,4 +1,3 @@
-#!/usr/bin/Rscript
 # adapted from hmdb_parts.R
 
 # define parameters 
diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R
index 0486b03..932c9e4 100644
--- a/DIMS/HMDBparts_main.R
+++ b/DIMS/HMDBparts_main.R
@@ -1,4 +1,3 @@
-#!/usr/bin/Rscript
 ## adapted from hmdb_part_adductSums.R
 
 # define parameters 
diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R
index b01a31b..44d4996 100644
--- a/DIMS/MakeInit.R
+++ b/DIMS/MakeInit.R
@@ -1,4 +1,3 @@
-#!/usr/bin/env Rscript
 ## adapted from makeInit in old pipeline
 
 # define parameters
diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R
index 08eb546..697f8f7 100644
--- a/DIMS/PeakFinding.R
+++ b/DIMS/PeakFinding.R
@@ -1,5 +1,4 @@
-#!/usr/bin/Rscript
-# adapted from 4-peakFinding.R
+## adapted from 4-peakFinding.R
 
 # define parameters
 cmd_args <- commandArgs(trailingOnly = TRUE)
@@ -12,27 +11,22 @@ thresh <- 2000
 outdir <- "./"
 
 # load in function scripts
-source(paste0(scripts_dir, "findPeaks.Gauss.HPC.R"))
-source(paste0(scripts_dir, "searchMZRange.R"))
-source(paste0(scripts_dir, "generateGaussian.R"))
-source(paste0(scripts_dir, "fitGaussian.R"))
-source(paste0(scripts_dir, "fitGaussianInit.R"))
-source(paste0(scripts_dir, "getFwhm.R"))
-source(paste0(scripts_dir, "getSD.R"))
-source(paste0(scripts_dir, "optimizeGauss.R"))
-source(paste0(scripts_dir, "fit1Peak.R"))
-source(paste0(scripts_dir, "fit2peaks.R"))
-source(paste0(scripts_dir, "fit3peaks.R"))
-source(paste0(scripts_dir, "fit4peaks.R"))
-source(paste0(scripts_dir, "fitG.R"))
-source(paste0(scripts_dir, "fit2G.R"))
-source(paste0(scripts_dir, "fit3G.R"))
-source(paste0(scripts_dir, "fit4G.R"))
-source(paste0(scripts_dir, "getArea.R"))
-source(paste0(scripts_dir, "getFitQuality.R"))
-source(paste0(scripts_dir, "checkOverlap.R"))
-source(paste0(scripts_dir, "sumCurves.R"))
-source(paste0(scripts_dir, "isWithinXppm.R"))
+source(paste0(scripts_dir, "do_peakfinding.R"))
+source(paste0(scripts_dir, "check_overlap.R"))
+source(paste0(scripts_dir, "search_mzrange.R"))
+source(paste0(scripts_dir, "fit_optim.R"))
+source(paste0(scripts_dir, "fit_gaussian.R"))
+source(paste0(scripts_dir, "fit_init.R"))
+source(paste0(scripts_dir, "get_fwhm.R"))
+source(paste0(scripts_dir, "get_stdev.R"))
+source(paste0(scripts_dir, "optimize_gaussfit.R"))
+source(paste0(scripts_dir, "fit_peaks.R"))
+source(paste0(scripts_dir, "fit_gaussians.R"))
+source(paste0(scripts_dir, "estimate_area.R"))
+source(paste0(scripts_dir, "get_fit_quality.R"))
+source(paste0(scripts_dir, "check_overlap.R"))
+source(paste0(scripts_dir, "sum_curves.R"))
+source(paste0(scripts_dir, "within_ppm.R"))
 
 load(breaks_file)
 
@@ -46,21 +40,14 @@ if (grepl("_pos", sample_file)) {
 
 # Initialize
 options(digits = 16)
-int_factor <- 1 * 10^5 # Number used to calculate area under Gaussian curve
-scale <- 2 # Initial value used to estimate scaling parameter
+# Number used to calculate area under Gaussian curve
+int_factor <- 1 * 10^5 
+# Initial value used to estimate scaling parameter
+scale <- 2
 width <- 1024
 height <- 768
 
 # run the findPeaks function
-print(head(sample_avgtechrepl))
-print(head(breaks_fwhm))
-print(int_factor)
-print(scale)
-print(resol)
-print(outdir)
-print(scanmode)
-print(thresh)
-print(width)
-print(height)
 
-do_peakfinding(sample_avgtechrepl, breaks_fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height)
+# do_peakfinding(sample_avgtechrepl, breaks_fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height)
+do_peakfinding(sample_avgtechrepl, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height)
diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R
index 8da74d9..dc29e76 100644
--- a/DIMS/PeakGrouping.R
+++ b/DIMS/PeakGrouping.R
@@ -1,4 +1,3 @@
-#!/usr/bin/Rscript
 # adapted from 6-peakGrouping.R
 
 # define parameters
diff --git a/DIMS/SpectrumPeakFinding.R b/DIMS/SpectrumPeakFinding.R
index cf6665c..143299e 100644
--- a/DIMS/SpectrumPeakFinding.R
+++ b/DIMS/SpectrumPeakFinding.R
@@ -1,5 +1,4 @@
-#!/usr/bin/Rscript
-# adapted from 5-collectSamples.R
+## adapted from 5-collectSamples.R
 
 # define parameters
 scanmodes <- c("positive", "negative")
diff --git a/DIMS/SumAdducts.R b/DIMS/SumAdducts.R
index 870a914..877b3c8 100755
--- a/DIMS/SumAdducts.R
+++ b/DIMS/SumAdducts.R
@@ -1,4 +1,3 @@
-#!/usr/bin/Rscript
 ## adapted from 11-runSumAdducts.R
 
 # define parameters
diff --git a/DIMS/UnidentifiedCalcZscores.R b/DIMS/UnidentifiedCalcZscores.R
index e58063a..0192d0f 100755
--- a/DIMS/UnidentifiedCalcZscores.R
+++ b/DIMS/UnidentifiedCalcZscores.R
@@ -1,4 +1,3 @@
-#!/usr/bin/Rscript
 ## adapted from 10-collectSamplesFilled.R
 
 # define parameters
diff --git a/DIMS/UnidentifiedCollectPeaks.R b/DIMS/UnidentifiedCollectPeaks.R
index be1ac0e..7376eb6 100755
--- a/DIMS/UnidentifiedCollectPeaks.R
+++ b/DIMS/UnidentifiedCollectPeaks.R
@@ -1,4 +1,3 @@
-#!/usr/bin/Rscript
 ## adapted from 7-collectSamplesGroupedHMDB.R
 
 # load required packages
diff --git a/DIMS/UnidentifiedFillMissing.R b/DIMS/UnidentifiedFillMissing.R
index 0fbdea4..1611986 100755
--- a/DIMS/UnidentifiedFillMissing.R
+++ b/DIMS/UnidentifiedFillMissing.R
@@ -1,4 +1,3 @@
-#!/usr/bin/Rscript
 # adapted from 9-runFillMissing.R
 
 # define parameters
diff --git a/DIMS/UnidentifiedPeakGrouping.R b/DIMS/UnidentifiedPeakGrouping.R
index 4600820..08da5d2 100755
--- a/DIMS/UnidentifiedPeakGrouping.R
+++ b/DIMS/UnidentifiedPeakGrouping.R
@@ -1,4 +1,3 @@
-#!/usr/bin/Rscript
 ## adapted from 8-peakGrouping.rest.R
 
 # define parameters

From 79bc26a8ba09e60b0d8a553e31f7363e2b8f0381 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 24 May 2024 14:58:23 +0200
Subject: [PATCH 66/73] code review changes applied

---
 DIMS/HMDBparts_old.R             | 126 -------------------------------
 DIMS/UnidentifiedCollectPeaks.nf |   3 +-
 2 files changed, 1 insertion(+), 128 deletions(-)
 delete mode 100644 DIMS/HMDBparts_old.R

diff --git a/DIMS/HMDBparts_old.R b/DIMS/HMDBparts_old.R
deleted file mode 100644
index 10fdc37..0000000
--- a/DIMS/HMDBparts_old.R
+++ /dev/null
@@ -1,126 +0,0 @@
-#!/usr/bin/Rscript
-
-# load required packages 
-# none 
-
-# define parameters 
-cmd_args <- commandArgs(trailingOnly = TRUE)
-for (arg in cmd_args) cat("  ", arg, "\n")
-
-# outdir <- cmd_args[1]
-# scanmode <- cmd_args[2]
-db_path <- cmd_args[1] # location of HMDB db file
-breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData
-standard_run  <- "no" # cmd_args[5] # "yes"
-
-# Cut up entire HMDB into small parts based on the new binning/breaks 
-
-# load(paste(outdir, "breaks.fwhm.RData", sep = "/"))
-load(breaks_filepath)
-#outdir_hmdb <- paste(outdir, "hmdb_part", sep = "/")
-#dir.create(outdir_hmdb, showWarnings = FALSE)
-
-# New: in case of a standard run (m/z 69-606) use external HMDB parts
-min_mz <- round(breaks.fwhm[1])
-max_mz <- round(breaks.fwhm[length(breaks.fwhm)])
-# test if standard mz range is used
-if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 600 & max_mz < 610) {
-  # skip generating HMDB parts
-  use_external_HMDB <- TRUE
-  save(use_external_HMDB, file="./using_external_HMDB_parts.RData")
-  hmdb_parts_dir <- cmd_args[6] # "/hpc/dbg_mz/production/DIMS/hmdb_preparts/"
-  hmdb_parts <- list.files(hmdb_parts_dir, pattern=scanmode) # only positive or negative files
-  # save(HMDBstukken, file=paste(outdir, "HMDBstukken.RData", sep="/"))
-  for (hmdb_file in hmdb_parts) {
-    file.copy(paste(hmdb_parts_dir, hmdb_file, sep="/"), outdir_hmdb, recursive = TRUE)
-  }
-} else { 
-  # generate HMDB parts in case of non-standard mz range
-  use_external_HMDB <- FALSE
-  save(use_external_HMDB, file="not_using_external_HMDB_parts.RData")
-  # db <- cmd_args[3]
-  load(db_path)
-  ppm <- as.numeric(cmd_args[4])
-  if (scanmode=="negative"){
-    label = "MNeg"
-    HMDB_add_iso=HMDB_add_iso.Neg
-  } else {
-    label = "Mpos"
-    HMDB_add_iso=HMDB_add_iso.Pos
-  }
-
-  # filter mass range meassured!!!
-  HMDB_add_iso = HMDB_add_iso[which(HMDB_add_iso[,label]>=breaks.fwhm[1] & HMDB_add_iso[,label]<=breaks.fwhm[length(breaks.fwhm)]),]
-
-  # sort on mass
-  outlist = HMDB_add_iso[order(as.numeric(HMDB_add_iso[,label])),]
-
-  n=dim(outlist)[1]
-  sub=20000 # max rows per file
-  end=0
-  min_1_last=sub
-  check=0
-  outlist_part=NULL
-
-
-  if (n < sub) {
-    outlist_part <- outlist
-    save(outlist_part, file = paste(outdir_hmdb, paste0(scanmode, "_hmdb.1.RData"), sep = "/"))
-  } else {
-  
-    if (n >= sub & (floor(n/sub) - 1) >= 2){
-      for (i in 2:floor(n/sub) - 1){
-        start <- -(sub - 1) + i*sub
-        end <- i*sub
-      
-        if (i > 1){
-          outlist_i = outlist[c(start:end),]
-        
-          n_moved = 0
-        
-          # Calculate 3ppm and replace border, avoid cut within peakgroup!
-          while ((as.numeric(outlist_i[1,label]) - as.numeric(outlist_part[min_1_last,label]))*1e+06/as.numeric(outlist_i[1,label]) < ppm) {
-            outlist_part <- rbind(outlist_part, outlist_i[1,])
-            outlist_i <- outlist_i[-1,]
-            n_moved <- n_moved + 1
-          }
-        
-          # message(paste("Process", i-1,":", dim(outlist_part)[1]))
-          save(outlist_part, file = paste(outdir_hmdb, paste(scanmode, paste("hmdb",i-1,"RData", sep="."), sep="_"), sep = "/"))
-          check <- check + dim(outlist_part)[1]
-        
-          outlist_part <- outlist_i
-          min_1_last <- dim(outlist_part)[1]
-        
-        } else {
-          outlist_part <- outlist[c(start:end),]
-        }
-      }
-    }
-  
-    start <- end + 1
-    end <- n
-    outlist_i <- outlist[c(start:end),]
-    n_moved <- 0
-  
-    if (!is.null(outlist_part)) {
-      # Calculate 3ppm and replace border, avoid cut within peakgroup!
-      while ((as.numeric(outlist_i[1,label]) - as.numeric(outlist_part[min_1_last,label]))*1e+06/as.numeric(outlist_i[1,label]) < ppm) {
-        outlist_part = rbind(outlist_part, outlist_i[1,])
-        outlist_i = outlist_i[-1,]
-        n_moved = n_moved + 1
-      }
-    
-      # message(paste("Process", i+1-1,":", dim(outlist_part)[1]))
-      save(outlist_part, file = paste(scanmode, paste("hmdb", i, "RData", sep = "."), sep = "_"))
-      check <- check + dim(outlist_part)[1]
-    }
-  
-    outlist_part <- outlist_i
-    # message(paste("Process", i+2-1,":", dim(outlist_part)[1]))
-    save(outlist_part, file = paste(scanmode, paste("hmdb", i + 1, "RData", sep="."), sep="_"))
-    check <- check + dim(outlist_part)[1]
-    cat("\n", "Check", check == dim(outlist)[1])
-  
-  }
-}
diff --git a/DIMS/UnidentifiedCollectPeaks.nf b/DIMS/UnidentifiedCollectPeaks.nf
index 4f419a7..0983e9e 100644
--- a/DIMS/UnidentifiedCollectPeaks.nf
+++ b/DIMS/UnidentifiedCollectPeaks.nf
@@ -9,11 +9,10 @@ process UnidentifiedCollectPeaks {
        path(peaklist_identified)
 
     output:
-       // path('SpectrumPeaks_*_Unidentified.RData')
        path('*.RData')
 
     script:
         """
-        Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedCollectPeaks.R $spectrumpeaks_file $params.ppm
+        Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedCollectPeaks.R $params.ppm
         """
 }

From 40da954972d7557dd717bc09c1fa8c661ada29b7 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Mon, 27 May 2024 12:59:16 +0200
Subject: [PATCH 67/73] fixed names of Utils function calls

---
 DIMS/Utils/fit_gaussian.R  | 17 +++++++++++----
 DIMS/Utils/fit_gaussians.R | 43 ++++++++++++++++++++++++++++----------
 DIMS/Utils/fit_init.R      |  6 ++----
 DIMS/Utils/fit_optim.R     |  7 ++++---
 DIMS/Utils/fit_peaks.R     | 18 ++++++++--------
 DIMS/Utils/get_stdev.R     |  3 +--
 6 files changed, 61 insertions(+), 33 deletions(-)

diff --git a/DIMS/Utils/fit_gaussian.R b/DIMS/Utils/fit_gaussian.R
index 35275cd..b22ba01 100644
--- a/DIMS/Utils/fit_gaussian.R
+++ b/DIMS/Utils/fit_gaussian.R
@@ -40,6 +40,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
     # determine fit values for 1 Gaussian peak (mean, scale, sigma, qual)
     fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                             plot, fit_quality1, use_bounds)
+  print("nu in fit_gaussian, if force == 1")
+  print(fit_values)
     # set initial value for scale factor
     scale <- 2
     # test if the mean is outside the m/z range
@@ -47,7 +49,7 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
       # run this function again with fixed boundaries
       return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                           outdir, force = 1, use_bounds = TRUE, plot, scanmode, int_factor, width, height))
-
+    print("nu in fit_gaussian, mean outside m/z range")
     } else {
       # test if the fit is bad
       if (fit_values$qual > fit_quality1) {
@@ -62,10 +64,11 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
         return(fit_gaussian(mass_vector2, mass_vector, int_vector, new_index,
                             scale, resol, outdir, force = 2, use_bounds = FALSE,
                             plot, scanmode, int_factor, width, height))
+    print("nu in fit_gaussian, bad fit ")
       # good fit
       } else {
         peak_mean <- c(peak_mean, fit_values$mean)
-        peak_area <- c(peak_area, getArea(fit_values$mean, resol, fit_values$scale,
+        peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale,
                                           fit_values$sigma, int_factor))
         peak_qual <- fit_values$qual
         peak_min <- mass_vector[1]
@@ -76,8 +79,10 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
   #### Two local maxima; need at least 6 data points for this ####
   } else if (force == 2 && (length(mass_vector) > 6)) {
     # determine fit values for 2 Gaussian peaks (mean, scale, sigma, qual)
-    fit_values <- fit_2peaks(mass_vector2, mass_vector, int_vector, new_index, scale, resol,
+    fit_values <- fit_2peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                              use_bounds, plot, fit_quality, int_factor)
+  print("nu in fit_gaussian force == 2")
+  print(fit_values)
     # test if one of the means is outside the m/z range
     if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] ||
         fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)]) {
@@ -140,6 +145,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
     # determine fit values for 3 Gaussian peaks (mean, scale, sigma, qual)
     fit_values <- fit_3peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                              use_bounds, plot, fit_quality, int_factor)
+  print("nu in fit_gaussian force == 3")
+  print(fit_values)
     # test if one of the means is outside the m/z range
     if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] ||
         fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] ||
@@ -204,6 +211,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
     # determine fit values for 4 Gaussian peaks (mean, scale, sigma, qual)
     fit_values <- fit_4peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                              use_bounds, plot, fit_quality, int_factor)
+  print("nu in fit_gaussian force == 4")
+  print(fit_values)
     # test if one of the means is outside the m/z range
     if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] ||
         fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] ||
@@ -279,7 +288,7 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
       peak_qual <- 0
     } else {
       peak_mean <- c(peak_mean, fit_values$mean)
-      peak_area <- c(peak_area, get_area(fit_values$mean, resol, fit_values$scale, fit_values$sigma, int_factor))
+      peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale, fit_values$sigma, int_factor))
       peak_qual <- fit_values$qual
       peak_min <- mass_vector[1]
       peak_max <- mass_vector[length(mass_vector)]
diff --git a/DIMS/Utils/fit_gaussians.R b/DIMS/Utils/fit_gaussians.R
index c4924e1..1a4c34e 100644
--- a/DIMS/Utils/fit_gaussians.R
+++ b/DIMS/Utils/fit_gaussians.R
@@ -17,15 +17,21 @@ fit_1gaussian <- function(mass_vector, int_vector, sigma, query_mass, scale, use
     d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma)
     sum((d - int_vector) ^ 2)
   }
-
+print("nu in fit_1gaussian")
   if (use_bounds) {
     # determine lower and upper boundaries
     lower <- c(mass_vector[1], 0, mass_vector[1], 0)
     upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf)
     # get optimal value for fitted Gaussian curve
-    opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)),
+    tryCatch(opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)),
                      opt_f, control = list(maxit = 10000), method = "L-BFGS-B",
-                     lower = lower, upper = upper)
+                     lower = lower, upper = upper),
+	     error = function(e) {
+		     # in case of error, use regular optim without boundaries
+                     opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)),
+                                      opt_f, control = list(maxit = 10000))
+                     write.table(opt_fit, file = paste0("tryCatch_error_", query_mass, ".txt"), row.names = FALSE)
+             } )
   } else {
     opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)),
                      opt_f, control = list(maxit = 10000))
@@ -58,6 +64,7 @@ fit_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2,
     sum((d - int_vector) ^ 2)
   }
 
+print("nu in fit_2gaussians")
   if (use_bounds) {
     # determine lower and upper boundaries
     lower <- c(mass_vector[1], 0, mass_vector[1], 0)
@@ -65,15 +72,27 @@ fit_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2,
     # get optimal value for 2 fitted Gaussian curves
     if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) {
       sigma2 <- sigma1
-      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
-                         as.numeric(query_mass1), as.numeric(scale1)),
-                       opt_f, control = list(maxit = 10000),
-                       method = "L-BFGS-B", lower = lower, upper = upper)
+      tryCatch(opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
+                                as.numeric(query_mass1), as.numeric(scale1)),
+                                opt_f, control = list(maxit = 10000),
+                                method = "L-BFGS-B", lower = lower, upper = upper),
+	       error = function(e) {
+                     # in case of error, use regular optim without boundaries
+                     opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
+                                      as.numeric(query_mass1), as.numeric(scale1)),
+                                      opt_f, control = list(maxit = 10000))
+                     write.table(res, file = paste0("tryCatch_error_2gauss_", query_mass, ".txt"), row.names = FALSE) } )
     } else {
-      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
-                         as.numeric(query_mass2), as.numeric(scale2)),
-                       opt_f, control = list(maxit = 10000),
-                       method = "L-BFGS-B", lower = lower, upper = upper)
+      tryCatch(opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
+                                as.numeric(query_mass2), as.numeric(scale2)),
+                                opt_f, control = list(maxit = 10000),
+                                method = "L-BFGS-B", lower = lower, upper = upper),
+	       error = function(e) {
+                     # in case of error, use regular optim without boundaries
+                     opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
+                                      as.numeric(query_mass1), as.numeric(scale1)),
+                                      opt_f, control = list(maxit = 10000))
+                     write.table(res, file = paste0("tryCatch_error_2gauss_else_", query_mass, ".txt"), row.names = FALSE) } )
     }
   } else {
     if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) {
@@ -120,6 +139,7 @@ fit_3gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sigma3,
     sum((d - int_vector) ^ 2)
   }
 
+print("nu in fit_3gaussians")
   if (use_bounds) {
     # determine lower and upper boundaries
     lower <- c(mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0)
@@ -174,6 +194,7 @@ fit_4gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sigma3, sigm
     sum((d - int_vector) ^ 2)
   }
 
+print("nu in fit_4gaussians")
   if (use_bounds) {
     # determine lower and upper boundaries
     lower <- c(mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0)
diff --git a/DIMS/Utils/fit_init.R b/DIMS/Utils/fit_init.R
index c761194..0c4e513 100644
--- a/DIMS/Utils/fit_init.R
+++ b/DIMS/Utils/fit_init.R
@@ -2,7 +2,6 @@
 # variables with fixed values will be removed from function parameters
 # scale, outdir, plot, width, height
 # mz_index, start_index, end_index, sample_name not used.
-# use_bounds is used, but not defined.
 # fit_gaussian should be defined before this function is called.
 fit_init <- function(mass_vector, int_vector, int_factor, scale, resol,
                      outdir, sample_name, scanmode, plot, width, height,
@@ -34,17 +33,16 @@ fit_init <- function(mass_vector, int_vector, int_factor, scale, resol,
 
   # Find the index in int_vector with the highest intensity
   max_index <- which(int_vector == max(int_vector))
-
+print("nu in fit_init")
   roi_values <- fit_gaussian(mass_vector2, mass_vector, int_vector, max_index,
                              scale, resol, outdir, force = length(max_index),
-                             useBounds = FALSE, plot, scanmode, int_factor, width, height)
+                             use_bounds = FALSE, plot, scanmode, int_factor, width, height)
   # put all values for this region of interest into a list
   roi_value_list <- list("mean" = roi_values$mean,
                          "area" = roi_values$area,
                          "qual" = roi_values$qual,
                          "min" = roi_values$min,
                          "max" = roi_values$max)
-
   return(roi_value_list)
 }
 
diff --git a/DIMS/Utils/fit_optim.R b/DIMS/Utils/fit_optim.R
index e280b57..ed71622 100644
--- a/DIMS/Utils/fit_optim.R
+++ b/DIMS/Utils/fit_optim.R
@@ -32,17 +32,18 @@ fit_optim <- function(mass_vector, int_vector, resol,
   mass_diff <- mass_max_simple[length(mass_max_simple)] - mass_max_simple[1]
   # generate a second mass_vector with equally spaced m/z values
   mass_vector2 <- seq(mass_max_simple[1], mass_max_simple[length(mass_max_simple)], 
-                      length = mz_diff * int_factor)
+                      length = mass_diff * int_factor)
   sigma <- get_stdev(mass_vector2, int_max_simple)
   scale <- optimize_gaussfit(mass_vector2, int_max_simple, sigma, mass_max)
 
   # get an estimate of the area under the peak
-  area <- get_area(mass_max, resol, scale, sigma, int_factor)
-  
+  area <- estimate_area(mass_max, resol, scale, sigma, int_factor)
+print("nu in fit_optim")  
   # put all values for this region of interest into a list
   roi_value_list <- list("mean" = mass_max,
                          "area" = area,
                          "min" = mass_vector2[1],
                          "max" = mass_vector2[length(mass_vector2)])
+print(roi_value_list)
   return(roi_value_list)
 }
diff --git a/DIMS/Utils/fit_peaks.R b/DIMS/Utils/fit_peaks.R
index 92e08d6..02d5408 100644
--- a/DIMS/Utils/fit_peaks.R
+++ b/DIMS/Utils/fit_peaks.R
@@ -151,8 +151,8 @@ fit_2peaks <- function(mass_vector2, mass_vector, int_vector, max_index, scale,
                         resol, sum_fit = sum_fit)$fq_new
 
   # get parameter values
-  area1 <- get_area(pc[1], resol, pc[2], sigma1, int_factor)
-  area2 <- get_area(pc[3], resol, pc[4], sigma2, int_factor)
+  area1 <- estimate_area(pc[1], resol, pc[2], sigma1, int_factor)
+  area2 <- estimate_area(pc[3], resol, pc[4], sigma2, int_factor)
   peak_area <- c(peak_area, area1)
   peak_area <- c(peak_area, area2)
   peak_mean <- c(peak_mean, pc[1])
@@ -245,9 +245,9 @@ fit_3peaks <- function(mass_vector2, mass_vector, int_vector, max_index, scale,
                         resol, sum_fit = sum_fit)$fq_new
 
   # get parameter values
-  area1 <- get_area(pc[1], resol, pc[2], sigma1, int_factor)
-  area2 <- get_area(pc[3], resol, pc[4], sigma2, int_factor)
-  area3 <- get_area(pc[5], resol, pc[6], sigma3, int_factor)
+  area1 <- estimate_area(pc[1], resol, pc[2], sigma1, int_factor)
+  area2 <- estimate_area(pc[3], resol, pc[4], sigma2, int_factor)
+  area3 <- estimate_area(pc[5], resol, pc[6], sigma3, int_factor)
   peak_area <- c(peak_area, area1)
   peak_area <- c(peak_area, area2)
   peak_area <- c(peak_area, area3)
@@ -354,10 +354,10 @@ fit_4peaks <- function(mass_vector2, mass_vector, int_vector, max_index, scale,
                         resol, sum_fit = sum_fit)$fq_new
 
   # get parameter values
-  area1 <- get_area(pc[1], resol, pc[2], sigma1, int_factor)
-  area2 <- get_area(pc[3], resol, pc[4], sigma2, int_factor)
-  area3 <- get_area(pc[5], resol, pc[6], sigma3, int_factor)
-  area4 <- get_area(pc[7], resol, pc[8], sigma4, int_factor)
+  area1 <- estimate_area(pc[1], resol, pc[2], sigma1, int_factor)
+  area2 <- estimate_area(pc[3], resol, pc[4], sigma2, int_factor)
+  area3 <- estimate_area(pc[5], resol, pc[6], sigma3, int_factor)
+  area4 <- estimate_area(pc[7], resol, pc[8], sigma4, int_factor)
   peak_area <- c(peak_area, area1)
   peak_area <- c(peak_area, area2)
   peak_area <- c(peak_area, area3)
diff --git a/DIMS/Utils/get_stdev.R b/DIMS/Utils/get_stdev.R
index cfe0aa4..a385187 100644
--- a/DIMS/Utils/get_stdev.R
+++ b/DIMS/Utils/get_stdev.R
@@ -7,7 +7,6 @@ get_stdev <- function(mass_vector, int_vector, resol = 140000) {
   #' @param resol: Value for resolution (integer)
   #'
   #' @return stdev: Value for standard deviation
-
   # find maximum intensity in vector
   max_index <- which(int_vector == max(int_vector))
   # find corresponding mass at maximum intensity
@@ -15,7 +14,7 @@ get_stdev <- function(mass_vector, int_vector, resol = 140000) {
   # calculate resolution at given m/z value
   resol_mz <- resol * (1 / sqrt(2) ^ (log2(max_mass / 200)))
   # calculate full width at half maximum
-  fwhm <- mean / resol_mz
+  fwhm <- max_mass / resol_mz
   # calculate standard deviation
   stdev <- (fwhm / 2) * 0.85
   return(stdev)

From c37ddb8e97c15b3a4f870a8c0444ab8edec1205e Mon Sep 17 00:00:00 2001
From: Anne Luesink <anneluesink@live.nl>
Date: Thu, 30 May 2024 15:58:25 +0200
Subject: [PATCH 68/73] Added Zscore if statement

---
 DIMS/GenerateViolinPlots.R | 879 ++++++++++++++++++-------------------
 1 file changed, 429 insertions(+), 450 deletions(-)

diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R
index c7026a9..29f41cf 100644
--- a/DIMS/GenerateViolinPlots.R
+++ b/DIMS/GenerateViolinPlots.R
@@ -29,480 +29,459 @@ file_expected_biomarkers_iem <- cmd_args[6]
 file_explanation <- cmd_args[7]
 file_isomers <- cmd_args[8]
 
-print("arguments")
-print(run_name)
-print(scripts_dir)
-print(z_score)
-print(path_metabolite_groups)
-print(file_ratios_metabolites)
-print(file_expected_biomarkers_iem)
-print(file_explanation)
-print(file_isomers)
-print("end arguments")
-
-# path: output folder for dIEM and violin plots
-output_dir <- "./"
-
-file.copy(file_isomers, output_dir)
-
-# folder for all metabolite lists (.txt)
-# path_metabolite_groups <- "/hpc/dbg_mz/tools/db/metabolite_groups"
-# file for ratios step 3
-# file_ratios_metabolites <- "/hpc/dbg_mz/tools/db/dIEM/Ratios_between_metabolites.csv"
-# file for algorithm step 4
-# file_expected_biomarkers_iem <- "/hpc/dbg_mz/tools/db/dIEM/Expected_biomarkers_IEM.csv"
-# explanation: file with text to be included in violin plots
-# file_explanation <- "/hpc/dbg_mz/tools/Explanation_violin_plots.txt"
-
-# copy list of isomers to project folder.
-# file.copy("/hpc/dbg_mz/tools/isomers.txt", output_dir)
-
-# load functions
-source(paste0(scripts_dir, "check_same_samplename.R"))
-source(paste0(scripts_dir, "prepare_data.R"))
-source(paste0(scripts_dir, "prepare_data_perpage.R"))
-source(paste0(scripts_dir, "prepare_toplist.R"))
-source(paste0(scripts_dir, "create_violin_plots.R"))
-source(paste0(scripts_dir, "prepare_alarmvalues.R"))
-source(paste0(scripts_dir, "output_helix.R"))
-source(paste0(scripts_dir, "get_patient_data_to_helix.R"))
-source(paste0(scripts_dir, "add_lab_id_and_onderzoeksnummer.R"))
-source(paste0(scripts_dir, "is_diagnostic_patient.R"))
-
-# number of diseases that score highest in algorithm to plot
-top_nr_iem <- 5
-# probability score cut-off for plotting the top diseases
-threshold_iem <- 5
-# z-score cutoff of axis on the left for top diseases
-ratios_cutoff <- -5
-# number of violin plots per page in PDF
-nr_plots_perpage <- 20
-
-# binary variable: run function, yes(1) or no(0)
-if (z_score == 1) { 
-  algorithm <- ratios <- violin <- 1 
-} else {
-  algorithm <- ratios <- violin <- 0 
-}	
-# are the sample names headers on row 1 or row 2 in the DIMS excel? (default 1)
-header_row <- 1
-# column name where the data starts (default B)
-col_start <- "B"
-zscore_cutoff <- 5
-xaxis_cutoff <- 20
-protocol_name <- "DIMS_PL_DIAG"
-
-#### STEP 1: Preparation ####
-# in: run_name, path_dims_file, header_row ||| out: output_dir, DIMS
-
-# path to DIMS excel file
-path_dims_file <- paste0(run_name, ".xlsx")
-
-# Load the excel file.
-dims_xls <- readWorkbook(xlsxFile = path_dims_file, sheet = 1, startRow = header_row)
-if (exists("dims_xls")) {
-  cat(paste0("\nThe excel file is succesfully loaded:\n -> ", path_dims_file))
-} else {
-  cat(paste0("\n\n**** Error: Could not find an Excel file. Please check location of file:\n -> ", path_dims_file, "\n"))
-}
-
-#### STEP 2: Edit DIMS data #####
-# in: dims_xls ||| out: Data, nr_contr, nr_pat
-# Input: the xlsx file that comes out of the pipeline with format:
-# [plots] [C] [P] [summary columns] [C_Zscore] [P_Zscore]
-# Output: "_CSV.csv" file that is suited for the algorithm in shiny.
-
-# Determine the number of Contols and Patients in column names:
-nr_contr <- length(grep("C", names(dims_xls))) / 2
-nr_pat   <- length(grep("P", names(dims_xls))) / 2
-# total number of samples
-nrsamples <- nr_contr + nr_pat
-# check whether the number of intensity columns equals the number of Zscore columns
-if (nr_contr + nr_pat != length(grep("_Zscore", names(dims_xls)))) {
-  cat("\n**** Error: there aren't as many intensities listed as Zscores")
-}
-cat(paste0("\n\n------------\n", nr_contr, " controls \n", nr_pat, " patients\n------------\n\n"))
-
-# Move the columns HMDB_code and HMDB_name to the beginning.
-hmdb_info_cols <- c(which(colnames(dims_xls) == "HMDB_code"), which(colnames(dims_xls) == "HMDB_name"))
-other_cols <- seq_along(1:ncol(dims_xls))[-hmdb_info_cols]
-dims_xls_copy <- dims_xls[, c(hmdb_info_cols, other_cols)]
-# Remove the columns from 'name' to 'pathway'
-from_col <- which(colnames(dims_xls_copy) == "name")
-to_col <- which(colnames(dims_xls_copy) == "pathway")
-dims_xls_copy <- dims_xls_copy[, -c(from_col:to_col)]
-# in case the excel had an empty "plots" column, remove it
-if ("plots" %in% colnames(dims_xls_copy)) {
-  dims_xls_copy <- dims_xls_copy[, -grep("plots", colnames(dims_xls_copy))]
-}
-# Rename columns
-names(dims_xls_copy) <- gsub("avg.ctrls", "Mean_controls", names(dims_xls_copy))
-names(dims_xls_copy) <- gsub("sd.ctrls",  "SD_controls", names(dims_xls_copy))
-names(dims_xls_copy) <- gsub("HMDB_code", "HMDB.code", names(dims_xls_copy))
-names(dims_xls_copy) <- gsub("HMDB_name", "HMDB.name", names(dims_xls_copy))
-
-# intensity columns and mean and standard deviation of controls
-numeric_cols <- c(3:ncol(dims_xls_copy))
-# make sure all values are numeric
-dims_xls_copy[, numeric_cols] <- sapply(dims_xls_copy[, numeric_cols], as.numeric)
-
-if (exists("dims_xls_copy") & (length(dims_xls_copy) < length(dims_xls))) {
-  cat("\n### Step 2 # Edit dims data is done.\n")
-} else {
-  cat("\n**** Error: Could not execute step 2 \n")
-}
-
-#### STEP 3: Calculate ratios of intensities for metabolites ####
-# in: ratios, file_ratios_metabolites, dims_xls_copy, nr_contr, nr_pat ||| out: Zscore (+file)
-# This script loads the file with Ratios (file_ratios_metabolites) and calculates
-# the ratios of the intensities of the given metabolites. It also calculates
-# Zs-cores based on the avg and sd of the ratios of the controls.
-
-# Input: dataframe with intenstities and Zscores of controls and patients:
-# [HMDB.code] [HMDB.name] [C] [P] [Mean_controls] [SD_controls] [C_Zscore] [P_Zscore]
-
-# Output: "_CSV.csv" file that is suited for the algorithm, with format:
-# "_Ratios_CSV.csv" file, same file as above, but with ratio rows added.
-
-if (ratios == 1) {
-  cat(paste0("\nloading ratios file:\n ->  ", file_ratios_metabolites, "\n"))
-  ratio_input <- read.csv(file_ratios_metabolites, sep = ";", stringsAsFactors = FALSE)
-
-  # Prepare empty data frame to fill with ratios
-  ratio_list <- setNames(data.frame(matrix(
-    ncol = ncol(dims_xls_copy),
-    nrow = nrow(ratio_input)
-  )), colnames(dims_xls_copy))
-  ratio_list <- as.data.frame(ratio_list)
-
-  # put HMDB info into first two columns of ratio_list
-  ratio_list[, 1:2] <- ratio_input[, 1:2]
-
-  # look for intensity columns (exclude Zscore columns)
-  control_cols <- grep("C", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")])
-  patient_cols <- grep("P", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")])
-  intensity_cols <- c(control_cols, patient_cols)
-  # calculate each of the ratios of intensities
-  for (ratio_index in 1:nrow(ratio_input)) {
-    ratio_numerator <- ratio_input[ratio_index, "HMDB_numerator"]
-    ratio_numerator <- strsplit(ratio_numerator, "plus")[[1]]
-    ratio_denominator <- ratio_input[ratio_index, "HMDB_denominator"]
-    ratio_denominator <- strsplit(ratio_denominator, "plus")[[1]]
-    # find these HMDB IDs in dataset. Could be a sum of multiple metabolites
-    sel_denominator <- sel_numerator <- c()
-    for (numerator_index in 1:length(ratio_numerator)) {
-      sel_numerator <- c(sel_numerator, which(dims_xls_copy[, "HMDB.code"] == ratio_numerator[numerator_index]))
-    }
-    for (denominator_index in 1:length(ratio_denominator)) {
-      # special case for sum of metabolites (dividing by one)
+if (z_score == 1){
+  # path: output folder for dIEM and violin plots
+  output_dir <- "./"
+
+  file.copy(file_isomers, output_dir)
+
+  # load functions
+  source(paste0(scripts_dir, "check_same_samplename.R"))
+  source(paste0(scripts_dir, "prepare_data.R"))
+  source(paste0(scripts_dir, "prepare_data_perpage.R"))
+  source(paste0(scripts_dir, "prepare_toplist.R"))
+  source(paste0(scripts_dir, "create_violin_plots.R"))
+  source(paste0(scripts_dir, "prepare_alarmvalues.R"))
+  source(paste0(scripts_dir, "output_helix.R"))
+  source(paste0(scripts_dir, "get_patient_data_to_helix.R"))
+  source(paste0(scripts_dir, "add_lab_id_and_onderzoeksnummer.R"))
+  source(paste0(scripts_dir, "is_diagnostic_patient.R"))
+
+  # number of diseases that score highest in algorithm to plot
+  top_nr_iem <- 5
+  # probability score cut-off for plotting the top diseases
+  threshold_iem <- 5
+  # z-score cutoff of axis on the left for top diseases
+  ratios_cutoff <- -5
+  # number of violin plots per page in PDF
+  nr_plots_perpage <- 20
+
+  # binary variable: run function, yes(1) or no(0)
+  if (z_score == 1) { 
+    algorithm <- ratios <- violin <- 1 
+  } else {
+    algorithm <- ratios <- violin <- 0 
+  }	
+  # are the sample names headers on row 1 or row 2 in the DIMS excel? (default 1)
+  header_row <- 1
+  # column name where the data starts (default B)
+  col_start <- "B"
+  zscore_cutoff <- 5
+  xaxis_cutoff <- 20
+  protocol_name <- "DIMS_PL_DIAG"
+
+  #### STEP 1: Preparation ####
+  # in: run_name, path_dims_file, header_row ||| out: output_dir, DIMS
+
+  # path to DIMS excel file
+  path_dims_file <- paste0(run_name, ".xlsx")
+
+  # Load the excel file.
+  dims_xls <- readWorkbook(xlsxFile = path_dims_file, sheet = 1, startRow = header_row)
+  if (exists("dims_xls")) {
+    cat(paste0("\nThe excel file is succesfully loaded:\n -> ", path_dims_file))
+  } else {
+    cat(paste0("\n\n**** Error: Could not find an Excel file. Please check location of file:\n -> ", path_dims_file, "\n"))
+  }
+
+  #### STEP 2: Edit DIMS data #####
+  # in: dims_xls ||| out: Data, nr_contr, nr_pat
+  # Input: the xlsx file that comes out of the pipeline with format:
+  # [plots] [C] [P] [summary columns] [C_Zscore] [P_Zscore]
+  # Output: "_CSV.csv" file that is suited for the algorithm in shiny.
+
+  # Determine the number of Contols and Patients in column names:
+  nr_contr <- length(grep("C", names(dims_xls))) / 2
+  nr_pat   <- length(grep("P", names(dims_xls))) / 2
+  # total number of samples
+  nrsamples <- nr_contr + nr_pat
+  # check whether the number of intensity columns equals the number of Zscore columns
+  if (nr_contr + nr_pat != length(grep("_Zscore", names(dims_xls)))) {
+    cat("\n**** Error: there aren't as many intensities listed as Zscores")
+  }
+  cat(paste0("\n\n------------\n", nr_contr, " controls \n", nr_pat, " patients\n------------\n\n"))
+
+  # Move the columns HMDB_code and HMDB_name to the beginning.
+  hmdb_info_cols <- c(which(colnames(dims_xls) == "HMDB_code"), which(colnames(dims_xls) == "HMDB_name"))
+  other_cols <- seq_along(1:ncol(dims_xls))[-hmdb_info_cols]
+  dims_xls_copy <- dims_xls[, c(hmdb_info_cols, other_cols)]
+  # Remove the columns from 'name' to 'pathway'
+  from_col <- which(colnames(dims_xls_copy) == "name")
+  to_col <- which(colnames(dims_xls_copy) == "pathway")
+  dims_xls_copy <- dims_xls_copy[, -c(from_col:to_col)]
+  # in case the excel had an empty "plots" column, remove it
+  if ("plots" %in% colnames(dims_xls_copy)) {
+    dims_xls_copy <- dims_xls_copy[, -grep("plots", colnames(dims_xls_copy))]
+  }
+  # Rename columns
+  names(dims_xls_copy) <- gsub("avg.ctrls", "Mean_controls", names(dims_xls_copy))
+  names(dims_xls_copy) <- gsub("sd.ctrls",  "SD_controls", names(dims_xls_copy))
+  names(dims_xls_copy) <- gsub("HMDB_code", "HMDB.code", names(dims_xls_copy))
+  names(dims_xls_copy) <- gsub("HMDB_name", "HMDB.name", names(dims_xls_copy))
+
+  # intensity columns and mean and standard deviation of controls
+  numeric_cols <- c(3:ncol(dims_xls_copy))
+  # make sure all values are numeric
+  dims_xls_copy[, numeric_cols] <- sapply(dims_xls_copy[, numeric_cols], as.numeric)
+
+  if (exists("dims_xls_copy") & (length(dims_xls_copy) < length(dims_xls))) {
+    cat("\n### Step 2 # Edit dims data is done.\n")
+  } else {
+    cat("\n**** Error: Could not execute step 2 \n")
+  }
+
+  #### STEP 3: Calculate ratios of intensities for metabolites ####
+  # in: ratios, file_ratios_metabolites, dims_xls_copy, nr_contr, nr_pat ||| out: Zscore (+file)
+  # This script loads the file with Ratios (file_ratios_metabolites) and calculates
+  # the ratios of the intensities of the given metabolites. It also calculates
+  # Zs-cores based on the avg and sd of the ratios of the controls.
+
+  # Input: dataframe with intenstities and Zscores of controls and patients:
+  # [HMDB.code] [HMDB.name] [C] [P] [Mean_controls] [SD_controls] [C_Zscore] [P_Zscore]
+
+  # Output: "_CSV.csv" file that is suited for the algorithm, with format:
+  # "_Ratios_CSV.csv" file, same file as above, but with ratio rows added.
+
+  if (ratios == 1) {
+    cat(paste0("\nloading ratios file:\n ->  ", file_ratios_metabolites, "\n"))
+    ratio_input <- read.csv(file_ratios_metabolites, sep = ";", stringsAsFactors = FALSE)
+
+    # Prepare empty data frame to fill with ratios
+    ratio_list <- setNames(data.frame(matrix(
+      ncol = ncol(dims_xls_copy),
+      nrow = nrow(ratio_input)
+    )), colnames(dims_xls_copy))
+    ratio_list <- as.data.frame(ratio_list)
+
+    # put HMDB info into first two columns of ratio_list
+    ratio_list[, 1:2] <- ratio_input[, 1:2]
+
+    # look for intensity columns (exclude Zscore columns)
+    control_cols <- grep("C", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")])
+    patient_cols <- grep("P", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")])
+    intensity_cols <- c(control_cols, patient_cols)
+    # calculate each of the ratios of intensities
+    for (ratio_index in 1:nrow(ratio_input)) {
+      ratio_numerator <- ratio_input[ratio_index, "HMDB_numerator"]
+      ratio_numerator <- strsplit(ratio_numerator, "plus")[[1]]
+      ratio_denominator <- ratio_input[ratio_index, "HMDB_denominator"]
+      ratio_denominator <- strsplit(ratio_denominator, "plus")[[1]]
+      # find these HMDB IDs in dataset. Could be a sum of multiple metabolites
+      sel_denominator <- sel_numerator <- c()
+      for (numerator_index in 1:length(ratio_numerator)) {
+        sel_numerator <- c(sel_numerator, which(dims_xls_copy[, "HMDB.code"] == ratio_numerator[numerator_index]))
+      }
+      for (denominator_index in 1:length(ratio_denominator)) {
+        # special case for sum of metabolites (dividing by one)
+        if (ratio_denominator[denominator_index] != "one") {
+          sel_denominator <- c(sel_denominator, which(dims_xls_copy[, "HMDB.code"] == ratio_denominator[denominator_index]))
+        }
+      }
+      # calculate ratio
       if (ratio_denominator[denominator_index] != "one") {
-        sel_denominator <- c(sel_denominator, which(dims_xls_copy[, "HMDB.code"] == ratio_denominator[denominator_index]))
+        ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) /
+          apply(dims_xls_copy[sel_denominator, intensity_cols], 2, sum)
+      } else {
+        # special case for sum of metabolites (dividing by one)
+        ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum)
       }
+      # calculate log of ratio
+      ratio_list[ratio_index, intensity_cols] <- log2(ratio_list[ratio_index, intensity_cols])
     }
-    # calculate ratio
-    if (ratio_denominator[denominator_index] != "one") {
-      ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) /
-        apply(dims_xls_copy[sel_denominator, intensity_cols], 2, sum)
-    } else {
-      # special case for sum of metabolites (dividing by one)
-      ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum)
-    }
-    # calculate log of ratio
-    ratio_list[ratio_index, intensity_cols] <- log2(ratio_list[ratio_index, intensity_cols])
-  }
 
-  # Calculate means and SD's of the calculated ratios for Controls
-  ratio_list[, "Mean_controls"] <- apply(ratio_list[, control_cols], 1, mean)
-  ratio_list[, "SD_controls"]   <- apply(ratio_list[, control_cols], 1, sd)
-
-  # Calc z-scores with the means and SD's of Controls
-  zscore_cols <- grep("Zscore", colnames(ratio_list))
-  for (sample_index in 1:length(zscore_cols)) {
-    zscore_col <- zscore_cols[sample_index]
-    # matching intensity column
-    int_col <- intensity_cols[sample_index]
-    # test on column names
-    if (check_same_samplename(colnames(ratio_list)[int_col], colnames(ratio_list)[zscore_col])) {
-      # calculate Z-scores
-      ratio_list[, zscore_col] <- (ratio_list[, int_col] - ratio_list[, "Mean_controls"]) / ratio_list[, "SD_controls"]
+    # Calculate means and SD's of the calculated ratios for Controls
+    ratio_list[, "Mean_controls"] <- apply(ratio_list[, control_cols], 1, mean)
+    ratio_list[, "SD_controls"]   <- apply(ratio_list[, control_cols], 1, sd)
+
+    # Calc z-scores with the means and SD's of Controls
+    zscore_cols <- grep("Zscore", colnames(ratio_list))
+    for (sample_index in 1:length(zscore_cols)) {
+      zscore_col <- zscore_cols[sample_index]
+      # matching intensity column
+      int_col <- intensity_cols[sample_index]
+      # test on column names
+      if (check_same_samplename(colnames(ratio_list)[int_col], colnames(ratio_list)[zscore_col])) {
+        # calculate Z-scores
+        ratio_list[, zscore_col] <- (ratio_list[, int_col] - ratio_list[, "Mean_controls"]) / ratio_list[, "SD_controls"]
+      }
     }
-  }
 
-  # Add rows of the ratio hmdb codes to the data of zscores from the pipeline.
-  dims_xls_ratios <- rbind(ratio_list, dims_xls_copy)
-
-  # Edit the DIMS output Zscores of all patients in format:
-  # HMDB_code patientname1  patientname2
-  names(dims_xls_ratios) <- gsub("HMDB.code", "HMDB_code", names(dims_xls_ratios))
-  names(dims_xls_ratios) <- gsub("HMDB.name", "HMDB_name", names(dims_xls_ratios))
-
-  # for debugging:
-  write.table(dims_xls_ratios, file = paste0(output_dir, "/ratios.txt"), sep = "\t")
-
-  # Select only the cols with zscores of the patients
-  zscore_patients <- dims_xls_ratios[, c(1, 2, zscore_cols[grep("P", colnames(dims_xls_ratios)[zscore_cols])])]
-  # Select only the cols with zscores of the controls
-  zscore_controls <- dims_xls_ratios[, c(1, 2, zscore_cols[grep("C", colnames(dims_xls_ratios)[zscore_cols])])]
-
-}
-
-#### STEP 4: Run the IEM algorithm #########
-# in: algorithm, file_expected_biomarkers_iem, zscore_patients ||| out: prob_score (+file)
-# algorithm taken from DOI: 10.3390/ijms21030979
-
-if (algorithm == 1) {
-  # Load data
-  cat(paste0("\nloading expected file:\n ->  ", file_expected_biomarkers_iem, "\n"))
-  expected_biomarkers <- read.csv(file_expected_biomarkers_iem, sep = ";", stringsAsFactors = FALSE)
-  # modify column names
-  names(expected_biomarkers) <- gsub("HMDB.code", "HMDB_code",  names(expected_biomarkers))
-  names(expected_biomarkers) <- gsub("Metabolite", "HMDB_name", names(expected_biomarkers))
-
-  # prepare dataframe scaffold rank_patients
-  rank_patients <- zscore_patients
-  # Fill df rank_patients with the ranks for each patient
-  for (patient_index in 3:ncol(zscore_patients)) {
-    # number of positive zscores in patient
-    pos <- sum(zscore_patients[, patient_index] > 0)
-    # sort the column on zscore; NB: this sorts the entire object, not just one column
-    rank_patients <- rank_patients[order(-rank_patients[patient_index]), ]
-    # Rank all positive zscores highest to lowest
-    rank_patients[1:pos, patient_index] <- as.numeric(ordered(-rank_patients[1:pos, patient_index]))
-    # Rank all negative zscores lowest to highest
-    rank_patients[(pos + 1):nrow(rank_patients), patient_index] <- as.numeric(ordered(rank_patients[(pos + 1):
-										      nrow(rank_patients), patient_index]))
-  }
+    # Add rows of the ratio hmdb codes to the data of zscores from the pipeline.
+    dims_xls_ratios <- rbind(ratio_list, dims_xls_copy)
 
-  # Calculate metabolite score, using the dataframes with only values, and later add the cols without values (1&2).
-  expected_zscores <- merge(x = expected_biomarkers, y = zscore_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code"))
-  expected_zscores_original <- expected_zscores
-
-  # determine which columns contain Z-scores and which contain disease info
-  select_zscore_cols <- grep("_Zscore", colnames(expected_zscores))
-  select_info_cols <- 1:(min(select_zscore_cols) - 1)
-  # set some zscores to zero
-  select_incr_indisp <- which(expected_zscores$Change == "Increase" & expected_zscores$Dispensability == "Indispensable")
-  expected_zscores[select_incr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_incr_indisp, 
-								     select_zscore_cols], function(x) ifelse (x <= 1.6, 0, x))
-  select_decr_indisp <- which(expected_zscores$Change == "Decrease" & expected_zscores$Dispensability == "Indispensable")
-  expected_zscores[select_decr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_decr_indisp, 
-								     select_zscore_cols], function(x) ifelse (x >= -1.2, 0, x))
-
-  # calculate rank score:
-  expected_ranks <- merge(x = expected_biomarkers, y = rank_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code"))
-  rank_scores <- expected_zscores[order(expected_zscores$HMDB_code), select_zscore_cols] / 
-	         (expected_ranks[order(expected_ranks$HMDB_code), select_zscore_cols] * 0.9)
-  # combine disease info with rank scores
-  expected_metabscore <- cbind(expected_ranks[order(expected_zscores$HMDB_code), select_info_cols], rank_scores)
-
-  # multiply weight score and rank score
-  weight_score <- expected_zscores
-  weight_score[, select_zscore_cols] <- expected_metabscore$Total_Weight * expected_metabscore[, select_zscore_cols]
-
-  # sort table on Disease and Absolute_Weight
-  weight_score <- weight_score[order(weight_score$Disease, weight_score$Absolute_Weight, decreasing = TRUE), ]
-
-  # select columns to check duplicates
-  dup <- weight_score[, c("Disease", "M.z")]
-  uni <- weight_score[!duplicated(dup) | !duplicated(dup, fromLast = FALSE), ]
-
-  # calculate probability score
-  prob_score <-  aggregate(uni[, select_zscore_cols], uni["Disease"], sum)
-
-  # list of all diseases that have at least one metabolite Zscore at 0
-  for (patient_index in 2:ncol(prob_score)) {
-    patient_zscore_colname <- colnames(prob_score)[patient_index]
-    matching_colname_expected <- which(colnames(expected_zscores) == patient_zscore_colname)
-    # determine which Zscores are 0 for this patient
-    zscores_zero <- which(expected_zscores[, matching_colname_expected] == 0)
-    # get Disease for these
-    disease_zero <- unique(expected_zscores[zscores_zero, "Disease"])
-    # set the probability score of these diseases to 0
-    prob_score[which(prob_score$Disease %in% disease_zero), patient_index] <- 0
-  }
+    # Edit the DIMS output Zscores of all patients in format:
+    # HMDB_code patientname1  patientname2
+    names(dims_xls_ratios) <- gsub("HMDB.code", "HMDB_code", names(dims_xls_ratios))
+    names(dims_xls_ratios) <- gsub("HMDB.name", "HMDB_name", names(dims_xls_ratios))
+
+    # for debugging:
+    write.table(dims_xls_ratios, file = paste0(output_dir, "/ratios.txt"), sep = "\t")
+
+    # Select only the cols with zscores of the patients
+    zscore_patients <- dims_xls_ratios[, c(1, 2, zscore_cols[grep("P", colnames(dims_xls_ratios)[zscore_cols])])]
+    # Select only the cols with zscores of the controls
+    zscore_controls <- dims_xls_ratios[, c(1, 2, zscore_cols[grep("C", colnames(dims_xls_ratios)[zscore_cols])])]
 
-  # determine disease rank per patient
-  disease_rank <- prob_score
-  # rank diseases in decreasing order
-  disease_rank[2:ncol(disease_rank)] <- lapply(2:ncol(disease_rank), function(x) 
-					       as.numeric(ordered(-disease_rank[1:nrow(disease_rank), x])))
-  # modify column names, Zscores have now been converted to probability scores
-  colnames(prob_score) <- gsub("_Zscore", "_prob_score", colnames(prob_score))
-  colnames(disease_rank) <- gsub("_Zscore", "", colnames(disease_rank))
-
-  # Create conditional formatting for output Excel sheet. Colors according to values.
-  wb <- createWorkbook()
-  addWorksheet(wb, "Probability Scores")
-  writeData(wb, "Probability Scores", prob_score)
-  conditionalFormatting(wb, "Probability Scores", cols = 2:ncol(prob_score), rows = 1:nrow(prob_score), 
-			type = "colourScale", style = c("white", "#FFFDA2", "red"), rule = c(1, 10, 100))
-  saveWorkbook(wb, file = paste0(output_dir, "/dIEM_algoritme_output_", run_name, ".xlsx"), overwrite = TRUE)
-  # check whether prob_score df exists and has expected dimensions.
-  if (exists("expected_biomarkers") & (length(disease_rank) == length(prob_score))) {
-    cat("\n### Step 4 # Running the IEM algorithm is done.\n\n")
-  } else {
-    cat("\n**** Error: Could not run IEM algorithm. Check if path to expected_biomarkers csv-file is correct. \n")
   }
 
-  rm(wb)
-}
-
-#### STEP 5: Make violin plots #####
-# in: algorithm / zscore_patients, violin, nr_contr, nr_pat, Data, path_textfiles, zscore_cutoff, xaxis_cutoff, 
-# top_diseases, top_metab, output_dir ||| out: pdf file, Helix csv file
-
-if (violin == 1) {
-
-  # preparation
-  zscore_patients_copy <- zscore_patients
-  # for robust scaler, rename Z-score columns
-  colnames(zscore_patients) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_patients))
-  colnames(zscore_controls) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_controls))
-  colnames(zscore_patients) <- gsub("_Zscore", "", colnames(zscore_patients))
-  colnames(zscore_controls) <- gsub("_Zscore", "", colnames(zscore_controls))
-
-  # Make patient list for violin plots
-  patient_list <- names(zscore_patients)[-c(1, 2)]
-
-  # from table expected_biomarkers, choose selected columns
-  select_columns <- c("Disease", "HMDB_code", "HMDB_name")
-  select_col_nrs <- which(colnames(expected_biomarkers) %in% select_columns)
-  expected_biomarkers_select <- expected_biomarkers[, select_col_nrs]
-  # remove duplicates
-  expected_biomarkers_select <- expected_biomarkers_select[!duplicated(expected_biomarkers_select[, c(1, 2)]), ]
-
-  # load file with explanatory information to be included in PDF.
-  explanation <- readLines(file_explanation)
-
-  # first step: normal violin plots
-  # Find all text files in the given folder, which contain metabolite lists of which
-  # each file will be a page in the pdf with violin plots.
-  # Make a PDF file for each of the categories in metabolite_dirs
-  metabolite_dirs <- list.files(path = path_metabolite_groups, full.names = FALSE, recursive = FALSE)
-  for (metabolite_dir in metabolite_dirs) {
-    # create a directory for the output PDFs
-    pdf_dir <- paste(output_dir, metabolite_dir, sep = "/")
-    dir.create(pdf_dir, showWarnings = FALSE)
-    cat("making plots in category:", metabolite_dir, "\n")
-
-    # get a list of all metabolite files
-    metabolite_files <- list.files(path = paste(path_metabolite_groups, metabolite_dir, sep = "/"), 
-				   pattern = "*.txt", full.names = FALSE, recursive = FALSE)
-    # put all metabolites into one list
-    metab_list_all <- list()
-    metab_list_names <- c()
-    cat("making plots from the input files:")
-    # open the text files and add each to a list of dataframes (metab_list_all)
-    for (file_index in seq_along(metabolite_files)) {
-      infile <- metabolite_files[file_index]
-      metab_list <- read.table(paste(path_metabolite_groups, metabolite_dir, infile, sep = "/"), 
-			       sep = "\t", header = TRUE, quote = "")
-      # put into list of all lists
-      metab_list_all[[file_index]] <- metab_list
-      metab_list_names <- c(metab_list_names, strsplit(infile, ".txt")[[1]][1])
-      cat(paste0("\n", infile))
+  #### STEP 4: Run the IEM algorithm #########
+  # in: algorithm, file_expected_biomarkers_iem, zscore_patients ||| out: prob_score (+file)
+  # algorithm taken from DOI: 10.3390/ijms21030979
+
+  if (algorithm == 1) {
+    # Load data
+    cat(paste0("\nloading expected file:\n ->  ", file_expected_biomarkers_iem, "\n"))
+    expected_biomarkers <- read.csv(file_expected_biomarkers_iem, sep = ";", stringsAsFactors = FALSE)
+    # modify column names
+    names(expected_biomarkers) <- gsub("HMDB.code", "HMDB_code",  names(expected_biomarkers))
+    names(expected_biomarkers) <- gsub("Metabolite", "HMDB_name", names(expected_biomarkers))
+
+    # prepare dataframe scaffold rank_patients
+    rank_patients <- zscore_patients
+    # Fill df rank_patients with the ranks for each patient
+    for (patient_index in 3:ncol(zscore_patients)) {
+      # number of positive zscores in patient
+      pos <- sum(zscore_patients[, patient_index] > 0)
+      # sort the column on zscore; NB: this sorts the entire object, not just one column
+      rank_patients <- rank_patients[order(-rank_patients[patient_index]), ]
+      # Rank all positive zscores highest to lowest
+      rank_patients[1:pos, patient_index] <- as.numeric(ordered(-rank_patients[1:pos, patient_index]))
+      # Rank all negative zscores lowest to highest
+      rank_patients[(pos + 1):nrow(rank_patients), patient_index] <- as.numeric(ordered(rank_patients[(pos + 1):
+                            nrow(rank_patients), patient_index]))
     }
-    # include list of classes in metabolite list
-    names(metab_list_all) <- metab_list_names
-
-    # prepare list of metabolites; max nr_plots_perpage on one page
-    metab_interest_sorted <- prepare_data(metab_list_all, zscore_patients)
-    metab_interest_controls <- prepare_data(metab_list_all, zscore_controls)
-    metab_perpage <- prepare_data_perpage(metab_interest_sorted, metab_interest_controls, nr_plots_perpage, nr_pat, nr_contr)
-
-    # for Diagnostics metabolites to be saved in Helix
-    if(grepl("Diagnost", pdf_dir)) {
-      # get table that combines DIMS results with stofgroepen/Helix table
-      dims_helix_table <- get_patient_data_to_helix(metab_interest_sorted, metab_list_all)
-      
-      # check if run contains Diagnostics patients (e.g. "P2024M"), not for research runs
-      if(any(is_diagnostic_patient(dims_helix_table$Patient))){
-        # get output file for Helix
-        output_helix <- output_for_helix(protocol_name, dims_helix_table)
-        # write output to file
-        path_helixfile <- paste0(output_dir, "/output_Helix_", run_name,".csv")
-        write.csv(output_helix, path_helixfile, quote = F, row.names = F)
-      }
-    }
-    
-    # make violin plots per patient
-    for (pt_nr in 1:length(patient_list)) {
-      pt_name <- patient_list[pt_nr]
-      # for category Diagnostics, make list of metabolites that exceed alarm values for this patient
-      # for category Other, make list of top highest and lowest Z-scores for this patient
-      if (grepl("Diagnost", pdf_dir)) {
-        top_metab_pt <- prepare_alarmvalues(pt_name, dims_helix_table)
-      } else {
-        top_metab_pt <- prepare_toplist(pt_name, zscore_patients)
-      }
 
-      # generate normal violin plots
-      create_violin_plots(pdf_dir, pt_name, metab_perpage, top_metab_pt)
+    # Calculate metabolite score, using the dataframes with only values, and later add the cols without values (1&2).
+    expected_zscores <- merge(x = expected_biomarkers, y = zscore_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code"))
+    expected_zscores_original <- expected_zscores
+
+    # determine which columns contain Z-scores and which contain disease info
+    select_zscore_cols <- grep("_Zscore", colnames(expected_zscores))
+    select_info_cols <- 1:(min(select_zscore_cols) - 1)
+    # set some zscores to zero
+    select_incr_indisp <- which(expected_zscores$Change == "Increase" & expected_zscores$Dispensability == "Indispensable")
+    expected_zscores[select_incr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_incr_indisp, 
+                      select_zscore_cols], function(x) ifelse (x <= 1.6, 0, x))
+    select_decr_indisp <- which(expected_zscores$Change == "Decrease" & expected_zscores$Dispensability == "Indispensable")
+    expected_zscores[select_decr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_decr_indisp, 
+                      select_zscore_cols], function(x) ifelse (x >= -1.2, 0, x))
+
+    # calculate rank score:
+    expected_ranks <- merge(x = expected_biomarkers, y = rank_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code"))
+    rank_scores <- expected_zscores[order(expected_zscores$HMDB_code), select_zscore_cols] / 
+            (expected_ranks[order(expected_ranks$HMDB_code), select_zscore_cols] * 0.9)
+    # combine disease info with rank scores
+    expected_metabscore <- cbind(expected_ranks[order(expected_zscores$HMDB_code), select_info_cols], rank_scores)
+
+    # multiply weight score and rank score
+    weight_score <- expected_zscores
+    weight_score[, select_zscore_cols] <- expected_metabscore$Total_Weight * expected_metabscore[, select_zscore_cols]
+
+    # sort table on Disease and Absolute_Weight
+    weight_score <- weight_score[order(weight_score$Disease, weight_score$Absolute_Weight, decreasing = TRUE), ]
+
+    # select columns to check duplicates
+    dup <- weight_score[, c("Disease", "M.z")]
+    uni <- weight_score[!duplicated(dup) | !duplicated(dup, fromLast = FALSE), ]
+
+    # calculate probability score
+    prob_score <-  aggregate(uni[, select_zscore_cols], uni["Disease"], sum)
+
+    # list of all diseases that have at least one metabolite Zscore at 0
+    for (patient_index in 2:ncol(prob_score)) {
+      patient_zscore_colname <- colnames(prob_score)[patient_index]
+      matching_colname_expected <- which(colnames(expected_zscores) == patient_zscore_colname)
+      # determine which Zscores are 0 for this patient
+      zscores_zero <- which(expected_zscores[, matching_colname_expected] == 0)
+      # get Disease for these
+      disease_zero <- unique(expected_zscores[zscores_zero, "Disease"])
+      # set the probability score of these diseases to 0
+      prob_score[which(prob_score$Disease %in% disease_zero), patient_index] <- 0
+    }
 
+    # determine disease rank per patient
+    disease_rank <- prob_score
+    # rank diseases in decreasing order
+    disease_rank[2:ncol(disease_rank)] <- lapply(2:ncol(disease_rank), function(x) 
+                  as.numeric(ordered(-disease_rank[1:nrow(disease_rank), x])))
+    # modify column names, Zscores have now been converted to probability scores
+    colnames(prob_score) <- gsub("_Zscore", "_prob_score", colnames(prob_score))
+    colnames(disease_rank) <- gsub("_Zscore", "", colnames(disease_rank))
+
+    # Create conditional formatting for output Excel sheet. Colors according to values.
+    wb <- createWorkbook()
+    addWorksheet(wb, "Probability Scores")
+    writeData(wb, "Probability Scores", prob_score)
+    conditionalFormatting(wb, "Probability Scores", cols = 2:ncol(prob_score), rows = 1:nrow(prob_score), 
+        type = "colourScale", style = c("white", "#FFFDA2", "red"), rule = c(1, 10, 100))
+    saveWorkbook(wb, file = paste0(output_dir, "/dIEM_algoritme_output_", run_name, ".xlsx"), overwrite = TRUE)
+    # check whether prob_score df exists and has expected dimensions.
+    if (exists("expected_biomarkers") & (length(disease_rank) == length(prob_score))) {
+      cat("\n### Step 4 # Running the IEM algorithm is done.\n\n")
+    } else {
+      cat("\n**** Error: Could not run IEM algorithm. Check if path to expected_biomarkers csv-file is correct. \n")
     }
 
+    rm(wb)
   }
 
-  # Second step: dIEM plots in separate directory
-  diem_plot_dir <- paste(output_dir, "dIEM_plots", sep = "/")
-  dir.create(diem_plot_dir)
-
-  # Select the metabolites that are associated with the top highest scoring IEM, for each patient
-  # disease_rank is from step 4: the dIEM algorithm. The lower the value, the more likely.
-  for (pt_nr in 1:length(patient_list)) {
-    pt_name <- patient_list[pt_nr]
-    # get top diseases for this patient
-    pt_colnr <- which(colnames(disease_rank) == pt_name)
-    pt_top_indices <- which(disease_rank[, pt_colnr] <= top_nr_iem)
-    pt_iems <- disease_rank[pt_top_indices, "Disease"]
-    pt_top_iems <- pt_prob_score_top_iems <- c()
-    for (single_iem in pt_iems) {
-      # get the probability score
-      prob_score_iem <- prob_score[which(prob_score$Disease == single_iem), pt_colnr]
-      # use only diseases for which probability score is above threshold
-      if (prob_score_iem >= threshold_iem) {
-        pt_top_iems <- c(pt_top_iems, single_iem)
-        pt_prob_score_top_iems <- c(pt_prob_score_top_iems, prob_score_iem)
+  #### STEP 5: Make violin plots #####
+  # in: algorithm / zscore_patients, violin, nr_contr, nr_pat, Data, path_textfiles, zscore_cutoff, xaxis_cutoff, 
+  # top_diseases, top_metab, output_dir ||| out: pdf file, Helix csv file
+
+  if (violin == 1) {
+
+    # preparation
+    zscore_patients_copy <- zscore_patients
+    # for robust scaler, rename Z-score columns
+    colnames(zscore_patients) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_patients))
+    colnames(zscore_controls) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_controls))
+    colnames(zscore_patients) <- gsub("_Zscore", "", colnames(zscore_patients))
+    colnames(zscore_controls) <- gsub("_Zscore", "", colnames(zscore_controls))
+
+    # Make patient list for violin plots
+    patient_list <- names(zscore_patients)[-c(1, 2)]
+
+    # from table expected_biomarkers, choose selected columns
+    select_columns <- c("Disease", "HMDB_code", "HMDB_name")
+    select_col_nrs <- which(colnames(expected_biomarkers) %in% select_columns)
+    expected_biomarkers_select <- expected_biomarkers[, select_col_nrs]
+    # remove duplicates
+    expected_biomarkers_select <- expected_biomarkers_select[!duplicated(expected_biomarkers_select[, c(1, 2)]), ]
+
+    # load file with explanatory information to be included in PDF.
+    explanation <- readLines(file_explanation)
+
+    # first step: normal violin plots
+    # Find all text files in the given folder, which contain metabolite lists of which
+    # each file will be a page in the pdf with violin plots.
+    # Make a PDF file for each of the categories in metabolite_dirs
+    metabolite_dirs <- list.files(path = path_metabolite_groups, full.names = FALSE, recursive = FALSE)
+    for (metabolite_dir in metabolite_dirs) {
+      # create a directory for the output PDFs
+      pdf_dir <- paste(output_dir, metabolite_dir, sep = "/")
+      dir.create(pdf_dir, showWarnings = FALSE)
+      cat("making plots in category:", metabolite_dir, "\n")
+
+      # get a list of all metabolite files
+      metabolite_files <- list.files(path = paste(path_metabolite_groups, metabolite_dir, sep = "/"), 
+            pattern = "*.txt", full.names = FALSE, recursive = FALSE)
+      # put all metabolites into one list
+      metab_list_all <- list()
+      metab_list_names <- c()
+      cat("making plots from the input files:")
+      # open the text files and add each to a list of dataframes (metab_list_all)
+      for (file_index in seq_along(metabolite_files)) {
+        infile <- metabolite_files[file_index]
+        metab_list <- read.table(paste(path_metabolite_groups, metabolite_dir, infile, sep = "/"), 
+              sep = "\t", header = TRUE, quote = "")
+        # put into list of all lists
+        metab_list_all[[file_index]] <- metab_list
+        metab_list_names <- c(metab_list_names, strsplit(infile, ".txt")[[1]][1])
+        cat(paste0("\n", infile))
+      }
+      # include list of classes in metabolite list
+      names(metab_list_all) <- metab_list_names
+
+      # prepare list of metabolites; max nr_plots_perpage on one page
+      metab_interest_sorted <- prepare_data(metab_list_all, zscore_patients)
+      metab_interest_controls <- prepare_data(metab_list_all, zscore_controls)
+      metab_perpage <- prepare_data_perpage(metab_interest_sorted, metab_interest_controls, nr_plots_perpage, nr_pat, nr_contr)
+
+      # for Diagnostics metabolites to be saved in Helix
+      if(grepl("Diagnost", pdf_dir)) {
+        # get table that combines DIMS results with stofgroepen/Helix table
+        dims_helix_table <- get_patient_data_to_helix(metab_interest_sorted, metab_list_all)
+        
+        # check if run contains Diagnostics patients (e.g. "P2024M"), not for research runs
+        if(any(is_diagnostic_patient(dims_helix_table$Patient))){
+          # get output file for Helix
+          output_helix <- output_for_helix(protocol_name, dims_helix_table)
+          # write output to file
+          path_helixfile <- paste0(output_dir, "/output_Helix_", run_name,".csv")
+          write.csv(output_helix, path_helixfile, quote = F, row.names = F)
+        }
       }
+      
+      # make violin plots per patient
+      for (pt_nr in 1:length(patient_list)) {
+        pt_name <- patient_list[pt_nr]
+        # for category Diagnostics, make list of metabolites that exceed alarm values for this patient
+        # for category Other, make list of top highest and lowest Z-scores for this patient
+        if (grepl("Diagnost", pdf_dir)) {
+          top_metab_pt <- prepare_alarmvalues(pt_name, dims_helix_table)
+        } else {
+          top_metab_pt <- prepare_toplist(pt_name, zscore_patients)
+        }
+
+        # generate normal violin plots
+        create_violin_plots(pdf_dir, pt_name, metab_perpage, top_metab_pt)
+
+      }
+
     }
 
-    # prepare data for plotting dIEM violin plots
-    # If prob_score_top_iem is an empty list, don't make a plot
-    if (length(pt_top_iems) > 0) {
-      # Sorting from high to low, both prob_score_top_iems and pt_top_iems.
-      pt_prob_score_order <- order(-pt_prob_score_top_iems)
-      pt_prob_score_top_iems <- round(pt_prob_score_top_iems, 1)
-      pt_prob_score_top_iem_sorted <- pt_prob_score_top_iems[pt_prob_score_order]
-      pt_top_iem_sorted <- pt_top_iems[pt_prob_score_order]
-      # getting metabolites for each top_iem disease exactly like in metab_list_all
-      metab_iem_all <- list()
-      metab_iem_names <- c()
-      for (single_iem_index in 1:length(pt_top_iem_sorted)) {
-        single_iem <- pt_top_iem_sorted[single_iem_index]
-        single_prob_score <- pt_prob_score_top_iem_sorted[single_iem_index]
-        select_rows <- which(expected_biomarkers_select$Disease == single_iem)
-        metab_list <- expected_biomarkers_select[select_rows, ]
-        metab_iem_names <- c(metab_iem_names, paste0(single_iem, ", probability score ", single_prob_score))
-        metab_list <- metab_list[, -1]
-        metab_iem_all[[single_iem_index]] <- metab_list
+    # Second step: dIEM plots in separate directory
+    diem_plot_dir <- paste(output_dir, "dIEM_plots", sep = "/")
+    dir.create(diem_plot_dir)
+
+    # Select the metabolites that are associated with the top highest scoring IEM, for each patient
+    # disease_rank is from step 4: the dIEM algorithm. The lower the value, the more likely.
+    for (pt_nr in 1:length(patient_list)) {
+      pt_name <- patient_list[pt_nr]
+      # get top diseases for this patient
+      pt_colnr <- which(colnames(disease_rank) == pt_name)
+      pt_top_indices <- which(disease_rank[, pt_colnr] <= top_nr_iem)
+      pt_iems <- disease_rank[pt_top_indices, "Disease"]
+      pt_top_iems <- pt_prob_score_top_iems <- c()
+      for (single_iem in pt_iems) {
+        # get the probability score
+        prob_score_iem <- prob_score[which(prob_score$Disease == single_iem), pt_colnr]
+        # use only diseases for which probability score is above threshold
+        if (prob_score_iem >= threshold_iem) {
+          pt_top_iems <- c(pt_top_iems, single_iem)
+          pt_prob_score_top_iems <- c(pt_prob_score_top_iems, prob_score_iem)
+        }
       }
-      # put all metabolites into one list
-      names(metab_iem_all) <- metab_iem_names
 
-      # get Zscore information from zscore_patients_copy, similar to normal violin plots
-      metab_iem_sorted <- prepare_data(metab_iem_all, zscore_patients_copy)
-      metab_iem_controls <- prepare_data(metab_iem_all, zscore_controls)
-      # make sure every page has 20 metabolites
-      diem_metab_perpage <- prepare_data_perpage(metab_iem_sorted, metab_iem_controls, nr_plots_perpage, nr_pat)
+      # prepare data for plotting dIEM violin plots
+      # If prob_score_top_iem is an empty list, don't make a plot
+      if (length(pt_top_iems) > 0) {
+        # Sorting from high to low, both prob_score_top_iems and pt_top_iems.
+        pt_prob_score_order <- order(-pt_prob_score_top_iems)
+        pt_prob_score_top_iems <- round(pt_prob_score_top_iems, 1)
+        pt_prob_score_top_iem_sorted <- pt_prob_score_top_iems[pt_prob_score_order]
+        pt_top_iem_sorted <- pt_top_iems[pt_prob_score_order]
+        # getting metabolites for each top_iem disease exactly like in metab_list_all
+        metab_iem_all <- list()
+        metab_iem_names <- c()
+        for (single_iem_index in 1:length(pt_top_iem_sorted)) {
+          single_iem <- pt_top_iem_sorted[single_iem_index]
+          single_prob_score <- pt_prob_score_top_iem_sorted[single_iem_index]
+          select_rows <- which(expected_biomarkers_select$Disease == single_iem)
+          metab_list <- expected_biomarkers_select[select_rows, ]
+          metab_iem_names <- c(metab_iem_names, paste0(single_iem, ", probability score ", single_prob_score))
+          metab_list <- metab_list[, -1]
+          metab_iem_all[[single_iem_index]] <- metab_list
+        }
+        # put all metabolites into one list
+        names(metab_iem_all) <- metab_iem_names
+
+        # get Zscore information from zscore_patients_copy, similar to normal violin plots
+        metab_iem_sorted <- prepare_data(metab_iem_all, zscore_patients_copy)
+        metab_iem_controls <- prepare_data(metab_iem_all, zscore_controls)
+        # make sure every page has 20 metabolites
+        diem_metab_perpage <- prepare_data_perpage(metab_iem_sorted, metab_iem_controls, nr_plots_perpage, nr_pat)
+
+        # generate dIEM violin plots
+        create_violin_plots(diem_plot_dir, pt_name, diem_metab_perpage, top_metab_pt)
 
-      # generate dIEM violin plots
-      create_violin_plots(diem_plot_dir, pt_name, diem_metab_perpage, top_metab_pt)
+      } else {
+        cat(paste0("\n\n**** This patient had no prob_scores higher than ", threshold_iem, ".
+                    Therefore, this pdf was not made:\t ", pt_name, "_iem \n"))
+      }
 
-    } else {
-      cat(paste0("\n\n**** This patient had no prob_scores higher than ", threshold_iem, ".
-                   Therefore, this pdf was not made:\t ", pt_name, "_iem \n"))
     }
 
   }
-
-}
+}
\ No newline at end of file

From 3f7b6909d4748d1e20895e8ff3ca5c6306cf54a9 Mon Sep 17 00:00:00 2001
From: Anne Luesink <anneluesink@live.nl>
Date: Thu, 30 May 2024 15:58:56 +0200
Subject: [PATCH 69/73] Fixed missing negative HMDB part

---
 DIMS/HMDBparts_main.R | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R
index 932c9e4..14335bf 100644
--- a/DIMS/HMDBparts_main.R
+++ b/DIMS/HMDBparts_main.R
@@ -25,10 +25,8 @@ for (scanmode in scanmodes) {
              HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ]
 
   # remove adducts and isotopes, put internal standard at the beginning
-  outlist_is <- outlist[grep("IS", outlist[ , "CompoundName"], fixed = TRUE), ]
   outlist <- outlist[grep("HMDB", rownames(outlist), fixed = TRUE), ]
   outlist <- outlist[-grep("_", rownames(outlist), fixed = TRUE), ]
-  outlist <- rbind(outlist_is, outlist)
   # sort on m/z value
   outlist <- outlist[order(outlist[ , column_label]), ]
   nr_rows <- dim(outlist)[1]
@@ -47,11 +45,12 @@ for (scanmode in scanmodes) {
       save(outlist_part, file=paste0(scanmode, "_hmdb_main.", i, ".RData"))
     }
   }
-}
 
-# finish last hmdb part
-start <- end + 1
-end <- nr_rows
+  # finish last hmdb part
+  start <- end + 1
+  end <- nr_rows
 
-outlist_part <- outlist[c(start:end), ]
-save(outlist_part, file = paste0(scanmode, "_hmdb_main.", i + 1, ".RData"))
+  outlist_part <- outlist[c(start:end), ]
+  save(outlist_part, file = paste0(scanmode, "_hmdb_main.", i + 1, ".RData"))
+
+}
\ No newline at end of file

From b7b56d8dc084fc196c6f6f928f1f229b8025df71 Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 31 May 2024 15:09:23 +0200
Subject: [PATCH 70/73] code review changes applied

---
 DIMS/AssignToBins.R             | 12 ++++-----
 DIMS/GenerateBreaks.R           |  2 +-
 DIMS/GenerateExcel.R            | 46 ++++++++++++++++-----------------
 DIMS/HMDBparts.R                | 13 +++++-----
 DIMS/UnidentifiedCalcZscores.R  |  4 +--
 DIMS/UnidentifiedPeakGrouping.R | 20 +++++++-------
 DIMS/Utils/estimate_area.R      |  2 +-
 DIMS/Utils/search_mzrange.R     | 10 +++----
 8 files changed, 53 insertions(+), 56 deletions(-)

diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R
index 8479c09..4ca7fce 100644
--- a/DIMS/AssignToBins.R
+++ b/DIMS/AssignToBins.R
@@ -25,7 +25,7 @@ pos_results <- NULL
 neg_results <- NULL
 
 # read in the data for 1 sample
-raw_data <- suppressMessages(xcmsRaw(mzml_filepath))
+raw_data <- suppressMessages(xcms::xcmsRaw(mzml_filepath))
 
 # for TIC plots: prepare txt files with data for plots
 tic_intensity_persample <- cbind(round(raw_data@scantime, 2), raw_data@tic)
@@ -38,7 +38,7 @@ pos_bins <- bins
 neg_bins <- bins
 
 # Generate a matrix
-raw_data_matrix <- rawMat(raw_data)
+raw_data_matrix <- xcms::rawMat(raw_data)
 
 # Get time values for positive and negative scans
 pos_times <- raw_data@scantime[raw_data@polarity == "positive"]
@@ -111,13 +111,13 @@ rownames(pos_results_transpose) <- sample_name
 rownames(neg_results_transpose) <- sample_name
 
 # delete the last value of breaks_fwhm_avg to match dimensions of pos_results and neg_results
-breaks_fwhm_avg_minus1 <- breaks_fwhm_avg[-length(breaks_fwhm_avg)]
+breaks_fwhm_avg_minuslast <- breaks_fwhm_avg[-length(breaks_fwhm_avg)]
 # Format as string and show precision of float to 5 digits
-breaks_fwhm_avg_minus1 <- sprintf("%.5f", breaks_fwhm_avg_minus1)
+breaks_fwhm_avg_minuslast <- sprintf("%.5f", breaks_fwhm_avg_minuslast)
 
 # Use this as the column names
-colnames(pos_results_transpose) <- breaks_fwhm_avg_minus1
-colnames(neg_results_transpose) <- breaks_fwhm_avg_minus1
+colnames(pos_results_transpose) <- breaks_fwhm_avg_minuslast
+colnames(neg_results_transpose) <- breaks_fwhm_avg_minuslast
 
 # transpose back
 pos_results_final <- t(pos_results_transpose)
diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R
index e475b34..6aacfe8 100644
--- a/DIMS/GenerateBreaks.R
+++ b/DIMS/GenerateBreaks.R
@@ -19,7 +19,7 @@ breaks_fwhm_avg <- NULL
 bins <- NULL
 
 # read in mzML file
-raw_data <- suppressMessages(xcmsRaw(filepath))
+raw_data <- suppressMessages(xcms::xcmsRaw(filepath))
 
 # trim (remove) scans at the start and end
 trim_left  <- round(raw_data@scantime[length(raw_data@scantime) * trim])
diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R
index 7d34204..d36ac29 100644
--- a/DIMS/GenerateExcel.R
+++ b/DIMS/GenerateExcel.R
@@ -37,10 +37,10 @@ robust_scaler <- function(control_intensities, control_col_ids, perc = 5) {
   #' @param perc: Percentage of outliers which will be removed from controls (float)
   #'
   #' @return trimmed_control_intensities: Intensities trimmed for outliers
-  nr_toremove <- ceiling(length(control_col_ids) * perc / 100)
+  nr_to_remove <- ceiling(length(control_col_ids) * perc / 100)
   sorted_control_intensities <- sort(as.numeric(control_intensities))
-  trimmed_control_intensities <- sorted_control_intensities[(nr_toremove + 1) : 
-							    (length(sorted_control_intensities) - nr_toremove)]
+  trimmed_control_intensities <- sorted_control_intensities[(nr_to_remove + 1) : 
+							    (length(sorted_control_intensities) - nr_to_remove)]
   return(trimmed_control_intensities)
 }
 
@@ -233,37 +233,37 @@ if (z_score == 1) {
 
     file_png <- paste0(plot_dir, "/", hmdb_name, "_box.png")
     if (is.null(temp_png)) {
-      temp_png <- readPng(file_png)
+      temp_png <- loder::readPng(file_png)
       img_dim <- dim(temp_png)[c(1, 2)]
       cell_dim <- img_dim * imagesize_multiplier
       setColWidths(wb, filelist, cols = 1, widths = cell_dim[2] / 20)
     }
 
-    insertImage(wb,
-                filelist,
-                file_png,
-                startRow = p + 1,
-                startCol = 1,
-                height = cell_dim[1],
-                width = cell_dim[2],
-                units = "px")
+    openxlsx::insertImage(wb,
+                          filelist,
+                          file_png,
+                          startRow = p + 1,
+                          startCol = 1,
+                          height = cell_dim[1],
+                          width = cell_dim[2],
+                          units = "px")
 
     if (p %% 100 == 0) {
       cat("at row: ", p, "\n")
     }
   }
 
-  setRowHeights(wb, filelist, rows = c(1:nrow(outlist) + 1), heights = cell_dim[1] / 4)
-  setColWidths(wb, filelist, cols = c(2:ncol(outlist)), widths = 20)
+  openxlsx::setRowHeights(wb, filelist, rows = c(1:nrow(outlist) + 1), heights = cell_dim[1] / 4)
+  openxlsx::setColWidths(wb, filelist, cols = c(2:ncol(outlist)), widths = 20)
 } else {
-  setRowHeights(wb, filelist, rows = c(1:nrow(outlist)), heights = 18)
-  setColWidths(wb, filelist, cols = c(1:ncol(outlist)), widths = 20)
+  openxlsx::setRowHeights(wb, filelist, rows = c(1:nrow(outlist)), heights = 18)
+  openxlsx::setColWidths(wb, filelist, cols = c(1:ncol(outlist)), widths = 20)
 }
 
 # write Excel file
-writeData(wb, sheet = 1, outlist, startCol = 1)
+openxlsx::writeData(wb, sheet = 1, outlist, startCol = 1)
 xlsx_name <- paste0(outdir, "/", project, ".xlsx")
-saveWorkbook(wb, xlsx_name, overwrite = TRUE)
+openxlsx::saveWorkbook(wb, xlsx_name, overwrite = TRUE)
 rm(wb)
 
 #### INTERNAL STANDARDS ####
@@ -324,7 +324,7 @@ sample_count <- length(repl_pattern)
 
 # change the order of the x-axis summed plots to a natural sorted one
 sample_naturalorder <- unique(as.character(is_summed$Sample))
-sample_naturalorder <- str_sort(sample_naturalorder, numeric = TRUE)
+sample_naturalorder <- stringr::str_sort(sample_naturalorder, numeric = TRUE)
 is_summed$Sample_level <- factor(is_summed$Sample, levels = c(sample_naturalorder))
 is_pos$Sample_level <- factor(is_pos$Sample, levels = c(sample_naturalorder))
 is_neg$Sample_level <- factor(is_neg$Sample, levels = c(sample_naturalorder))
@@ -333,10 +333,10 @@ is_neg$Sample_level <- factor(is_neg$Sample, levels = c(sample_naturalorder))
 # theme for all IS bar plots
 theme_is_bar <- function(my_plot) {
   my_plot +
-    scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
-    theme(legend.position = "none", 
-	  axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6), 
-	  axis.text.y = element_text(size = 6)
+    ggplot2::scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) +
+    ggplot2::theme(legend.position = "none", 
+	           axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6), 
+               	   axis.text.y = element_text(size = 6)
     )
 }
 
diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R
index 520286b..3e7356d 100644
--- a/DIMS/HMDBparts.R
+++ b/DIMS/HMDBparts.R
@@ -12,7 +12,8 @@ load(breaks_file)
 min_mz <- round(breaks_fwhm[1])
 max_mz <- round(breaks_fwhm[length(breaks_fwhm)])
 
-# In case of a standard run (m/z 69-606) use external HMDB parts
+# In case of a standard run use external HMDB parts
+# m/z is approximately 70 to 600: set limits between 68-71 for min and 599-610 for max
 if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < 610) {
   # skip generating HMDB parts
   hmdb_parts_path <- cmd_args[3] 
@@ -30,18 +31,18 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz <
   for (scanmode in scanmodes) {
     if (scanmode == "negative") {
       column_label <- "MNeg"
-      HMDB_add_iso <- HMDB_add_iso.Neg
+      hmdb_add_iso <- hmdb_add_iso.Neg
     } else if (scanmode == "positive") {
       column_label <- "Mpos"
-      HMDB_add_iso <- HMDB_add_iso.Pos
+      hmdb_add_iso <- hmdb_add_iso.Pos
     }
 
     # filter mass range meassured
-    HMDB_add_iso = HMDB_add_iso[which(HMDB_add_iso[ , column_label] >= breaks_fwhm[1] &
-                                      HMDB_add_iso[ , column_label] <= breaks_fwhm[length(breaks_fwhm)]), ]
+    hmdb_add_iso = hmdb_add_iso[which(hmdb_add_iso[ , column_label] >= breaks_fwhm[1] &
+                                      hmdb_add_iso[ , column_label] <= breaks_fwhm[length(breaks_fwhm)]), ]
 
     # sort on mass
-    outlist <- HMDB_add_iso[order(as.numeric(HMDB_add_iso[ , column_label])),]
+    outlist <- hmdb_add_iso[order(as.numeric(hmdb_add_iso[ , column_label])),]
     nr_rows <- dim(outlist)[1]
 
     # maximum number of rows per file
diff --git a/DIMS/UnidentifiedCalcZscores.R b/DIMS/UnidentifiedCalcZscores.R
index 0192d0f..e62776c 100755
--- a/DIMS/UnidentifiedCalcZscores.R
+++ b/DIMS/UnidentifiedCalcZscores.R
@@ -7,8 +7,8 @@ scripts_dir <- cmd_args[1]
 ppm <- as.numeric(cmd_args[2])
 z_score <- as.numeric(cmd_args[3])
 
-source(paste0(scripts_dir, "mergeDuplicatedRows.R"))
-source(paste0(scripts_dir, "statistics_z.R"))
+source(paste0(scripts_dir, "merge_duplicate_rows.R"))
+source(paste0(scripts_dir, "calculate_zscores.R"))
 
 # for each scan mode, collect all filled peak group lists
 scanmodes <- c("positive", "negative")
diff --git a/DIMS/UnidentifiedPeakGrouping.R b/DIMS/UnidentifiedPeakGrouping.R
index 08da5d2..edeb2f7 100755
--- a/DIMS/UnidentifiedPeakGrouping.R
+++ b/DIMS/UnidentifiedPeakGrouping.R
@@ -25,32 +25,32 @@ grouping_rest <- function(outdir, unidentified_peaklist, scanmode, ppm) {
 
   # while (dim(outlist_copy)[1] > 0) {
   while (dim(outlist_copy)[1] > nrow_div) {
-    sel <- which(as.numeric(outlist_copy[, "height.pkt"]) == max(as.numeric(outlist_copy[, "height.pkt"])))[1]
+    select_max_height <- which(as.numeric(outlist_copy[, "height.pkt"]) == max(as.numeric(outlist_copy[, "height.pkt"])))[1]
 
     # ppm range around max
-    mzref <- as.numeric(outlist_copy[sel, "mzmed.pkt"])
+    mzref <- as.numeric(outlist_copy[select_max_height, "mzmed.pkt"])
     pkmin <- -(range * mzref - mzref)
     pkmax <- 2 * mzref - pkmin
 
-    selp <- as.numeric(outlist_copy[, "mzmed.pkt"]) > pkmin & as.numeric(outlist_copy[, "mzmed.pkt"]) < pkmax
-    tmplist <- outlist_copy[selp, , drop = FALSE]
+    select_peaks_within_range <- as.numeric(outlist_copy[, "mzmed.pkt"]) > pkmin & as.numeric(outlist_copy[, "mzmed.pkt"]) < pkmax
+    tmplist <- outlist_copy[select_peaks_within_range, , drop = FALSE]
 
     nrsamples <- length(unique(tmplist[, "samplenr"]))
     if (nrsamples > 0) {
-      mzmed_pgrp <- mean(as.numeric(outlist_copy[selp, "mzmed.pkt"]))
+      mzmed_pgrp <- mean(as.numeric(outlist_copy[select_peaks_within_range, "mzmed.pkt"]))
       mzmin_pgrp <- -(range * mzmed_pgrp - mzmed_pgrp)
       mzmax_pgrp <- 2 * mzmed_pgrp - mzmin_pgrp
       # select peaks within mz range
-      selp <- as.numeric(outlist_copy[, "mzmed.pkt"]) > mzmin_pgrp & as.numeric(outlist_copy[, "mzmed.pkt"]) < mzmax_pgrp
-      tmplist <- outlist_copy[selp, , drop = FALSE]
+      select_peaks_within_range <- as.numeric(outlist_copy[, "mzmed.pkt"]) > mzmin_pgrp & as.numeric(outlist_copy[, "mzmed.pkt"]) < mzmax_pgrp
+      tmplist <- outlist_copy[select_peaks_within_range, , drop = FALSE]
 
       # remove used peaks
       tmp <- as.vector(which(tmplist[, "height.pkt"] == -1))
       if (length(tmp) > 0) tmplist <- tmplist[-tmp, , drop = FALSE]
 
       nrsamples <- length(unique(tmplist[, "samplenr"]))
-      fq_worst_pgrp <- as.numeric(max(outlist_copy[selp, "fq"]))
-      fq_best_pgrp  <- as.numeric(min(outlist_copy[selp, "fq"]))
+      fq_worst_pgrp <- as.numeric(max(outlist_copy[select_peaks_within_range, "fq"]))
+      fq_best_pgrp  <- as.numeric(min(outlist_copy[select_peaks_within_range, "fq"]))
       ints_allsamps <- rep(0, length(names(repl_pattern_filtered)))
       names(ints_allsamps) <- names(repl_pattern_filtered)
 
@@ -65,7 +65,7 @@ grouping_rest <- function(outdir, unidentified_peaklist, scanmode, ppm) {
 					mzmin_pgrp, mzmax_pgrp, ints_allsamps, NA, NA, NA, NA))
     }
 
-    outlist_copy <- outlist_copy[-which(selp == TRUE), , drop = FALSE]
+    outlist_copy <- outlist_copy[-which(select_peaks_within_range == TRUE), , drop = FALSE]
   }
 
   outpgrlist <- as.data.frame(outpgrlist)
diff --git a/DIMS/Utils/estimate_area.R b/DIMS/Utils/estimate_area.R
index 1d24de2..b191f55 100644
--- a/DIMS/Utils/estimate_area.R
+++ b/DIMS/Utils/estimate_area.R
@@ -20,7 +20,7 @@ estimate_area <- function(mass_max, resol, scale, sigma, int_factor) {
   mz_min <- mass_max - 2 * fwhm
   mz_max <- mass_max + 2 * fwhm
   mz_range <- mz_max - mz_min 
-  mass_vector2 <- seq(mz_min, mz_max, length = mz_range * int.factor)
+  mass_vector2 <- seq(mz_min, mz_max, length = mz_range * int_factor)
 
   # estimate area under the curve
   area_curve <- sum(scale * dnorm(mass_vector2, mass_max, sigma)) / 100
diff --git a/DIMS/Utils/search_mzrange.R b/DIMS/Utils/search_mzrange.R
index ba6920e..dcdc42a 100644
--- a/DIMS/Utils/search_mzrange.R
+++ b/DIMS/Utils/search_mzrange.R
@@ -69,7 +69,7 @@ search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, r
           roi_values <- fit_init(mass_vector, int_vector, int_factor, scale, resol,
                                  outdir, sample_name, scanmode, plot, width, height,
                                  mz_index, start_index, end_index)
-
+	  print(roi_values)
           if (roi_values$qual[1] == 1) {
             # get optimized fit values
             roi_values <- fit_optim(mass_vector, int_vector, resol, plot,
@@ -98,7 +98,6 @@ search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, r
 
           roi_values <- fit_optim(mass_vector, int_vector, resol,
                                   plot, scanmode, int_factor, width, height)
-
           allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean)
           allpeaks_values$area <- c(allpeaks_values$area, roi_values$area)
           allpeaks_values$nr <- c(allpeaks_values$nr, sample_name)
@@ -138,11 +137,9 @@ search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, r
         # Check only zeros
         if (sum(int_vector) == 0) next
 
-        roi_values <- fit_init(mass_vector, int_vector, int_factor, scale,
-                               resol, outdir, sample_name, scanmode,
-                               plot, width, height,
+        roi_values <- fit_init(mass_vector, int_vector, int_factor, scale, resol, 
+			       outdir, sample_name, scanmode, plot, width, height,
                                mz_index, start_index, end_index)
-
         if (roi_values$qual[1] == 1) {
           roi_values <- fit_optim(mass_vector, int_vector, resol,
                                   plot, scanmode, int_factor, width, height)
@@ -168,7 +165,6 @@ search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, r
       } else {
         roi_values <- fit_optim(mass_vector, int_vector, resol,
                                 plot, scanmode, int_factor, width, height)
-
         allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean)
         allpeaks_values$area <- c(allpeaks_values$area, roi_values$area)
         allpeaks_values$nr <- c(allpeaks_values$nr, sample_name)

From 72e53d5d265b9c53e063db77c45dc1a055f65d3a Mon Sep 17 00:00:00 2001
From: Mia Pras-Raves <M.L.Raves-2@umcutrecht.nl>
Date: Fri, 31 May 2024 15:11:25 +0200
Subject: [PATCH 71/73] code review changes applied

---
 DIMS/Utils/do_peakfinding.R |  5 +---
 DIMS/Utils/fit_gaussian.R   | 47 +++++++++++++++++++++++++++----------
 DIMS/Utils/fit_gaussians.R  | 30 +++++------------------
 3 files changed, 42 insertions(+), 40 deletions(-)

diff --git a/DIMS/Utils/do_peakfinding.R b/DIMS/Utils/do_peakfinding.R
index 004d96f..fff6afe 100644
--- a/DIMS/Utils/do_peakfinding.R
+++ b/DIMS/Utils/do_peakfinding.R
@@ -43,10 +43,7 @@ do_peakfinding <- function(sample_avgtechrepl, int_factor, scale, resol, outdir,
                              "height.pkt" = allpeaks_values$area)
 
   # remove peaks with height = 0
-  index <- which(outlist_persample[, "height.pkt"] == 0)
-  if (length(index) > 0) {
-    outlist_persample <- outlist_persample[-index, ]
-  }
+  outlist_persample <- outlist_persample[outlist_persample[, "height.pkt"] != 0, ]
 
   # save output to file
   save(outlist_persample, file = paste0(sample_name, "_", scanmode, ".RData"))
diff --git a/DIMS/Utils/fit_gaussian.R b/DIMS/Utils/fit_gaussian.R
index b22ba01..ed3cd0c 100644
--- a/DIMS/Utils/fit_gaussian.R
+++ b/DIMS/Utils/fit_gaussian.R
@@ -40,19 +40,20 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
     # determine fit values for 1 Gaussian peak (mean, scale, sigma, qual)
     fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                             plot, fit_quality1, use_bounds)
-  print("nu in fit_gaussian, if force == 1")
-  print(fit_values)
+    print("fit_gaussian, force == 1, values fit_values:")
+    print(fit_values)
     # set initial value for scale factor
     scale <- 2
     # test if the mean is outside the m/z range
     if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)]) {
       # run this function again with fixed boundaries
+      print("nu in fit_gaussian, mean outside m/z range")
       return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                           outdir, force = 1, use_bounds = TRUE, plot, scanmode, int_factor, width, height))
-    print("nu in fit_gaussian, mean outside m/z range")
     } else {
       # test if the fit is bad
       if (fit_values$qual > fit_quality1) {
+        print("fit_gaussian, force == 1, qual > fit_quality")
         # Try to fit two curves; find two local maxima
         new_index <- which(diff(sign(diff(int_vector))) == -2) + 1
         # test if there are two indices in new_index
@@ -64,9 +65,9 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
         return(fit_gaussian(mass_vector2, mass_vector, int_vector, new_index,
                             scale, resol, outdir, force = 2, use_bounds = FALSE,
                             plot, scanmode, int_factor, width, height))
-    print("nu in fit_gaussian, bad fit ")
       # good fit
       } else {
+        print("fit_gaussian, force == 1, qual < fit_quality")
         peak_mean <- c(peak_mean, fit_values$mean)
         peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale,
                                           fit_values$sigma, int_factor))
@@ -81,18 +82,21 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
     # determine fit values for 2 Gaussian peaks (mean, scale, sigma, qual)
     fit_values <- fit_2peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                              use_bounds, plot, fit_quality, int_factor)
-  print("nu in fit_gaussian force == 2")
-  print(fit_values)
+    print("fit_gaussian force == 2, values fit_values:")
+    print(fit_values)
     # test if one of the means is outside the m/z range
     if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] ||
         fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)]) {
+      print("fit_gaussian, force == 2, mean outside m/z range")
       # check if fit quality is bad
       if (fit_values$qual > fit_quality) {
+        print("fit_gaussian, force == 2, qual > fit_quality")
         # run this function again with fixed boundaries
         return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                             outdir, force = 2, use_bounds = TRUE,
                             plot, scanmode, int_factor, width, height))
       } else {
+        print("fit_gaussian, fource == 2, qual < fit_quality")
         # check which mean is outside range and remove it from the list of means
         # NB: peak_mean and other variables have not been given values from 2-peak fit yet!
         for (i in 1:length(fit_values$mean)){
@@ -110,8 +114,10 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
       }
     # if all means are within range
     } else {
+      print("fit_gaussian, force == 2, means within m/z range")
       # check for bad fit
       if (fit_values$qual > fit_quality) {
+        print("fit_gaussian, force == 2, qual > fit_quality")
         # Try to fit three curves; find three local maxima
         new_index <- which(diff(sign(diff(int_vector))) == -2) + 1
         # test if there are three indices in new_index
@@ -125,6 +131,7 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
                             plot, scanmode, int_factor, width, height))
       # good fit, all means are within m/z range
       } else {
+        print("fit_gaussian, force == 2, qual < fit_quality")
         # check if means are within 3 ppm and sum if so
         tmp <- fit_values$qual
         nr_means_new <- -1
@@ -145,20 +152,22 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
     # determine fit values for 3 Gaussian peaks (mean, scale, sigma, qual)
     fit_values <- fit_3peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                              use_bounds, plot, fit_quality, int_factor)
-  print("nu in fit_gaussian force == 3")
-  print(fit_values)
+    print("fit_gaussian force == 3, values fit_values:")
+    print(fit_values)
     # test if one of the means is outside the m/z range
     if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] ||
         fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] ||
         fit_values$mean[3] < mass_vector[1] || fit_values$mean[3] > mass_vector[length(mass_vector)]) {
-
+      print("fit_gaussian, force == 3, mean outside mz range")
       # check if fit quality is bad
       if (fit_values$qual > fit_quality) {
+        print("fit_gaussian, force == 3, qual > fit_quality")
         # run this function again with fixed boundaries
         return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                             outdir, force, use_bounds = TRUE,
                             plot, scanmode, int_factor, width, height))
       } else {
+        print("fit_gaussian, force == 3, qual < fit_quality")
         # check which mean is outside range and remove it from the list of means
         # NB: peak_mean and other variables have not been given values from 2-peak fit yet!
         for (i in 1:length(fit_values$mean)) {
@@ -176,8 +185,10 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
       }
     # if all means are within range
     } else {
+      print("fit_gaussian, force == 3, means within range")
       # check for bad fit
       if (fit_values$qual > fit_quality) {
+        print("fit_gaussian, force == 3, qual > fit_quality")
         # Try to fit four curves; find four local maxima
         new_index <- which(diff(sign(diff(int_vector))) == -2) + 1
         # test if there are four indices in new_index
@@ -191,6 +202,7 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
                             int_factor, width, height))
       # good fit, all means are within m/z range
       } else {
+        print("fit_gaussian, force == 3, qual < fit_quality")
         # check if means are within 4 ppm and sum if so  
         tmp <- fit_values$qual
         nr_means_new <- -1
@@ -211,22 +223,24 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
     # determine fit values for 4 Gaussian peaks (mean, scale, sigma, qual)
     fit_values <- fit_4peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                              use_bounds, plot, fit_quality, int_factor)
-  print("nu in fit_gaussian force == 4")
-  print(fit_values)
+    print("fit_gaussian force == 4, values fit_values:")
+    print(fit_values)
     # test if one of the means is outside the m/z range
     if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] ||
         fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] ||
         fit_values$mean[3] < mass_vector[1] || fit_values$mean[3] > mass_vector[length(mass_vector)] ||
         fit_values$mean[4] < mass_vector[1] || fit_values$mean[4] > mass_vector[length(mass_vector)]) {
-
+      print("fit_gaussian, force == 4, mean outside mz range")
       # check if quality of fit is bad
       if (fit_values$qual > fit_quality) {
+	print("fit_gaussian, force == 4, qual > fit_quality")
         # run this function again with fixed boundaries
         return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                             outdir, force, use_bounds = TRUE,
                             plot, scanmode, int_factor, width, height))
 
       } else {
+        print("fit_gaussian, force == 4, qual < fit_quality")
         # check which mean is outside range and remove it from the list of means
         # NB: peak_mean and other variables have not been given values from 2-peak fit yet!
         for (i in 1:length(fit_values$mean)) {
@@ -244,14 +258,17 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
       }
     # if all means are within range
     } else {
+      print("fit_gaussian, force == 4, means within range")
       # check for bad fit
       if (fit_values$qual > fit_quality) {
+        print("fit_gaussian, force == 4, qual > fit_quality")
         # Try to fit 1 curve, force = 5
         return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                             outdir, force = 5, use_bounds = FALSE,
                             plot, scanmode, int_factor, width, height))
       # good fit, all means are within m/z range
       } else {
+        print("fit_gaussian, force == 4, qual < fit_quality")
         # check if means are within 4 ppm and sum if so
         tmp <- fit_values$qual 
         nr_means_new <- -1
@@ -275,8 +292,11 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
     max_index <- which(int_vector == max(int_vector))
     fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                             plot, fit_quality1, use_bounds)
+    print("fit_gaussian, force > 4, values fit_values:")
+    print(fit_values)
     # check for bad fit
     if (fit_values$qual > fit_quality1) {
+      print("fit_gaussian, force > 4, qual > fit_quality1")
       # remove
       if (plot) dev.off()    
       # get fit values from fit_optim
@@ -287,6 +307,7 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
       peak_max <- fit_values$max
       peak_qual <- 0
     } else {
+      print("fit_gaussian, force > 4, qual < fit_quality1")
       peak_mean <- c(peak_mean, fit_values$mean)
       peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale, fit_values$sigma, int_factor))
       peak_qual <- fit_values$qual
@@ -301,6 +322,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
                          "qual" = peak_qual,
                          "min" = peak_min,
                          "max" = peak_max)
+  print("fit_gaussian, end script, values roi_value_list:")
+  print(roi_value_list)
   return(roi_value_list)
 }
 
diff --git a/DIMS/Utils/fit_gaussians.R b/DIMS/Utils/fit_gaussians.R
index 1a4c34e..2e811fd 100644
--- a/DIMS/Utils/fit_gaussians.R
+++ b/DIMS/Utils/fit_gaussians.R
@@ -23,15 +23,9 @@ print("nu in fit_1gaussian")
     lower <- c(mass_vector[1], 0, mass_vector[1], 0)
     upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf)
     # get optimal value for fitted Gaussian curve
-    tryCatch(opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)),
+    opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)),
                      opt_f, control = list(maxit = 10000), method = "L-BFGS-B",
-                     lower = lower, upper = upper),
-	     error = function(e) {
-		     # in case of error, use regular optim without boundaries
-                     opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)),
-                                      opt_f, control = list(maxit = 10000))
-                     write.table(opt_fit, file = paste0("tryCatch_error_", query_mass, ".txt"), row.names = FALSE)
-             } )
+                     lower = lower, upper = upper)
   } else {
     opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)),
                      opt_f, control = list(maxit = 10000))
@@ -72,27 +66,15 @@ print("nu in fit_2gaussians")
     # get optimal value for 2 fitted Gaussian curves
     if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) {
       sigma2 <- sigma1
-      tryCatch(opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
+      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
                                 as.numeric(query_mass1), as.numeric(scale1)),
                                 opt_f, control = list(maxit = 10000),
-                                method = "L-BFGS-B", lower = lower, upper = upper),
-	       error = function(e) {
-                     # in case of error, use regular optim without boundaries
-                     opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
-                                      as.numeric(query_mass1), as.numeric(scale1)),
-                                      opt_f, control = list(maxit = 10000))
-                     write.table(res, file = paste0("tryCatch_error_2gauss_", query_mass, ".txt"), row.names = FALSE) } )
+                                method = "L-BFGS-B", lower = lower, upper = upper)
     } else {
-      tryCatch(opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
+      opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
                                 as.numeric(query_mass2), as.numeric(scale2)),
                                 opt_f, control = list(maxit = 10000),
-                                method = "L-BFGS-B", lower = lower, upper = upper),
-	       error = function(e) {
-                     # in case of error, use regular optim without boundaries
-                     opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1),
-                                      as.numeric(query_mass1), as.numeric(scale1)),
-                                      opt_f, control = list(maxit = 10000))
-                     write.table(res, file = paste0("tryCatch_error_2gauss_else_", query_mass, ".txt"), row.names = FALSE) } )
+                                method = "L-BFGS-B", lower = lower, upper = upper)
     }
   } else {
     if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) {

From c7b44f6c6421cff96437c8268cc327de9843fa11 Mon Sep 17 00:00:00 2001
From: Anne Luesink <anneluesink@live.nl>
Date: Tue, 4 Jun 2024 10:59:39 +0200
Subject: [PATCH 72/73] Changes for using Utils

---
 DIMS/CollectFilled.R             |   4 +-
 DIMS/GenerateViolinPlots.nf      |  10 ++--
 DIMS/SpectrumPeakFinding.R       |   6 +-
 DIMS/SumAdducts.R                |   2 +-
 DIMS/UnidentifiedCalcZscores.R   |   4 +-
 DIMS/UnidentifiedCollectPeaks.R  |   9 ---
 DIMS/UnidentifiedFillMissing.R   |  20 +++----
 DIMS/Utils/calculate_zscores.R   |   6 +-
 DIMS/Utils/fit_gaussian.R        | 100 ++++++++++++++-----------------
 DIMS/Utils/fit_gaussians.R       |   4 --
 DIMS/Utils/fit_init.R            |   1 -
 DIMS/Utils/fit_optim.R           |   2 -
 DIMS/Utils/get_element_info.R    |   4 +-
 DIMS/Utils/identify_noisepeaks.R |   2 +-
 DIMS/Utils/replace_zeros.R       |   4 +-
 15 files changed, 76 insertions(+), 102 deletions(-)

diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R
index 48fba8c..a0ad358 100755
--- a/DIMS/CollectFilled.R
+++ b/DIMS/CollectFilled.R
@@ -22,7 +22,7 @@ for (scanmode in scanmodes) {
     outlist_total <- rbind(outlist_total, peakgrouplist_filled)
   }
   # remove duplicates; peak groups with exactly the same m/z
-  outlist_total <- mergeDuplicatedRows(outlist_total)
+  outlist_total <- merge_duplicate_rows(outlist_total)
   # sort on mass
   outlist_total <- outlist_total[order(outlist_total[, "mzmed.pgrp"]), ]
   # load replication pattern
@@ -30,7 +30,7 @@ for (scanmode in scanmodes) {
   repl_pattern <- get(load(pattern_file))
   # calculate Z-scores
   if (z_score == 1) {
-    outlist_stats <- statistics_z(outlist_total, sortCol = NULL, adducts = FALSE)
+    outlist_stats <- calculate_zscores(outlist_total, adducts = FALSE)
     nr_removed_samples <- length(which(repl_pattern[] == "character(0)"))
     order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)])
     outlist_stats_more <- cbind(
diff --git a/DIMS/GenerateViolinPlots.nf b/DIMS/GenerateViolinPlots.nf
index 641d27a..44323ac 100644
--- a/DIMS/GenerateViolinPlots.nf
+++ b/DIMS/GenerateViolinPlots.nf
@@ -9,11 +9,11 @@ process GenerateViolinPlots {
        val(analysis_id)
 
     output:
-       path('Diagnostics/*.pdf'), emit: diag_plot_files
-       path('Other/*.pdf'), emit: other_plot_files
-       path('dIEM_plots/*.pdf'), emit: diem_plot_files
-       path('*.xlsx'), emit: excel_file
-       path('*.csv'), emit: helix_file
+       path('Diagnostics/*.pdf'), emit: diag_plot_files, optional: true
+       path('Other/*.pdf'), emit: other_plot_files, optional: true
+       path('dIEM_plots/*.pdf'), emit: diem_plot_files, optional: true
+       path('*.xlsx'), emit: excel_file, optional: true
+       path('*.csv'), emit: helix_file, optional: true
 
     script:
         """
diff --git a/DIMS/SpectrumPeakFinding.R b/DIMS/SpectrumPeakFinding.R
index 143299e..8b249e2 100644
--- a/DIMS/SpectrumPeakFinding.R
+++ b/DIMS/SpectrumPeakFinding.R
@@ -26,7 +26,7 @@ for (scanmode in scanmodes) {
   for (file_nr in 1:length(peaklist_files)) {
     cat("\n", peaklist_files[file_nr])
     load(peaklist_files[file_nr])
-    if (is.null(outlist.persample) || (dim(outlist.persample)[1] == 0)) {
+    if (is.null(outlist_persample) || (dim(outlist_persample)[1] == 0)) {
       tmp <- strsplit(peaklist_files[file_nr], "/")[[1]]
       fname <- tmp[length(tmp)]
       fname <- strsplit(fname, ".RData")[[1]][1]
@@ -38,9 +38,9 @@ for (scanmode in scanmodes) {
       }
     } else {
       if (file_nr == 1) {
-        outlist_total <- outlist.persample
+        outlist_total <- outlist_persample
       } else {
-        outlist_total <- rbind(outlist_total, outlist.persample)
+        outlist_total <- rbind(outlist_total, outlist_persample)
       }
     }
   }
diff --git a/DIMS/SumAdducts.R b/DIMS/SumAdducts.R
index 877b3c8..699da4a 100755
--- a/DIMS/SumAdducts.R
+++ b/DIMS/SumAdducts.R
@@ -43,7 +43,7 @@ sum_adducts <- function(peaklist, theor_mz, grpnames_long, adducts, batch_number
         int_cols <- c(int_cols_C, int_cols_P)
         ints <- peaklist[metab, int_cols]
       } else {
-        ints <- peaklist[metab, c(7:(length(grpnames_long) + 6))]
+        ints <- peaklist[metab, c(3:(length(grpnames_long) + 2))]
       }
       total <- apply(ints, 2, sum)
 
diff --git a/DIMS/UnidentifiedCalcZscores.R b/DIMS/UnidentifiedCalcZscores.R
index e62776c..ec31a3f 100755
--- a/DIMS/UnidentifiedCalcZscores.R
+++ b/DIMS/UnidentifiedCalcZscores.R
@@ -23,7 +23,7 @@ for (scanmode in scanmodes) {
   }
 
   # remove duplicates; peak groups with exactly the same m/z
-  outlist_total <- mergeDuplicatedRows(outlist_total)
+  outlist_total <- merge_duplicate_rows(outlist_total)
 
   # sort on mass
   outlist_total <- outlist_total[order(outlist_total[, "mzmed.pgrp"]), ]
@@ -34,7 +34,7 @@ for (scanmode in scanmodes) {
 
   if (z_score == 1) {
     # calculate Z-scores
-    outlist_stats <- statistics_z(outlist_total, sortCol = NULL, adducts = FALSE)
+    outlist_stats <- calculate_zscores(outlist_total, adducts = FALSE)
     nr_removed_samples <- length(which(repl_pattern[] == "character(0)"))
     order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)])
     outlist_stats_more <- cbind(outlist_stats[, 1:7],
diff --git a/DIMS/UnidentifiedCollectPeaks.R b/DIMS/UnidentifiedCollectPeaks.R
index 7376eb6..1536213 100755
--- a/DIMS/UnidentifiedCollectPeaks.R
+++ b/DIMS/UnidentifiedCollectPeaks.R
@@ -52,15 +52,6 @@ for (scanmode in scanmodes) {
     }
 
     outlist_part <- outlist[c(start_part:end_part), ]
-    # # add ppm extra before start
-    # if (part != 1) {
-    #   mz_start <- outlist_part[1, "mzmed.pkt"]
-    #   mz_ppm_range <- ppm * as.numeric(mz_start) / 1e+06
-    #   mz_start_min_ppm <- mz_start - mz_ppm_range
-    #   outlist_before_part <- outlist %>% filter(mzmed.pkt >= mz_start_min_ppm & mzmed.pkt < mz_start)
-
-    #   outlist_part <- rbind(outlist_before_part, outlist_part)
-    # }
 
     # add ppm extra after end
     if (part != num_parts) {
diff --git a/DIMS/UnidentifiedFillMissing.R b/DIMS/UnidentifiedFillMissing.R
index 1611986..78b7951 100755
--- a/DIMS/UnidentifiedFillMissing.R
+++ b/DIMS/UnidentifiedFillMissing.R
@@ -12,15 +12,15 @@ ppm <- as.numeric(cmd_args[5])
 outdir  <- "./"
 
 # load in function scripts
-source(paste0(scripts_dir, "replaceZeros.R"))
-source(paste0(scripts_dir, "generateGaussian.R"))
-source(paste0(scripts_dir, "getFwhm.R"))
-source(paste0(scripts_dir, "getSD.R"))
-source(paste0(scripts_dir, "getArea.R"))
-source(paste0(scripts_dir, "optimizeGauss.R"))
-source(paste0(scripts_dir, "ident.hires.noise.HPC.R"))
-source(paste0(scripts_dir, "elementInfo.R"))
-source(paste0(scripts_dir, "globalAssignments.HPC.R"))
+source(paste0(scripts_dir, "replace_zeros.R"))
+source(paste0(scripts_dir, "fit_optim.R"))
+source(paste0(scripts_dir, "get_fwhm.R"))
+source(paste0(scripts_dir, "get_stdev.R"))
+source(paste0(scripts_dir, "estimate_area.R"))
+source(paste0(scripts_dir, "optimize_gaussfit.R"))
+source(paste0(scripts_dir, "identify_noisepeaks.R"))
+source(paste0(scripts_dir, "get_element_info.R"))
+source(paste0(scripts_dir, "atomic_info.R"))
 
 # peakgrouplist_files <- c(peakgrouplist_file1, peakgrouplist_file2)
 # for (peakgrouplist_file in peakgrouplist_files) {
@@ -41,7 +41,7 @@ outpgrlist_identified <- get(load(peakgrouplist_file))
 outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file)
 
 # replace missing values (zeros) with random noise
-peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm)
+peakgrouplist_filled <- replace_zeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm)
 
 # save output
 save(peakgrouplist_filled, file = outputfile_name)
diff --git a/DIMS/Utils/calculate_zscores.R b/DIMS/Utils/calculate_zscores.R
index 9791e45..d687376 100644
--- a/DIMS/Utils/calculate_zscores.R
+++ b/DIMS/Utils/calculate_zscores.R
@@ -1,8 +1,7 @@
 ## adapted from statistics_z.R
 # refactor: change column names from avg.ctrls to avg_ctrls, sd.ctrls to sd_ctrls
-# remove parameter sort_col
 # check logic of parameter adducts
-calculate_zscores <- function(peakgroup_list, sort_col, adducts) {
+calculate_zscores <- function(peakgroup_list, adducts) {
   #' Calculate Z-scores for peak groups based on average and standard deviation of controls
   #'
   #' @param peakgroup_list: Peak group list (matrix)
@@ -13,6 +12,8 @@ calculate_zscores <- function(peakgroup_list, sort_col, adducts) {
 
   case_label <- "P"
   control_label <- "C"
+  # get index for new column names
+  startcol <- ncol(peakgroup_list) + 3
 
   # calculate mean and standard deviation for Control group
   ctrl_cols <- grep(control_label, colnames(peakgroup_list), fixed = TRUE)
@@ -35,7 +36,6 @@ calculate_zscores <- function(peakgroup_list, sort_col, adducts) {
   }
 
   # apply new column names to columns at end plus avg and sd columns
-  startcol <- ncol(peakgroup_list) + 3
   colnames(peakgroup_list)[startcol:ncol(peakgroup_list)] <- colnames_zscores
 
   # add ppm deviation column
diff --git a/DIMS/Utils/fit_gaussian.R b/DIMS/Utils/fit_gaussian.R
index ed3cd0c..6be9660 100644
--- a/DIMS/Utils/fit_gaussian.R
+++ b/DIMS/Utils/fit_gaussian.R
@@ -5,8 +5,8 @@
 # remove plot sections (commented out)
 # several functions need to be loaded before this function can run
 fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
-                        outdir, force, use_bounds, plot, scanmode,
-                        int_factor, width, height) {
+                         outdir, force, use_bounds, plot, scanmode,
+                         int_factor, width, height) {
   #' Fit 1, 2, 3 or 4 Gaussian peaks in small region of m/z
   #'
   #' @param mass_vector2: Vector of equally spaced m/z values (float)
@@ -40,20 +40,16 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
     # determine fit values for 1 Gaussian peak (mean, scale, sigma, qual)
     fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                             plot, fit_quality1, use_bounds)
-    print("fit_gaussian, force == 1, values fit_values:")
-    print(fit_values)
     # set initial value for scale factor
     scale <- 2
     # test if the mean is outside the m/z range
     if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)]) {
       # run this function again with fixed boundaries
-      print("nu in fit_gaussian, mean outside m/z range")
       return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                           outdir, force = 1, use_bounds = TRUE, plot, scanmode, int_factor, width, height))
     } else {
       # test if the fit is bad
       if (fit_values$qual > fit_quality1) {
-        print("fit_gaussian, force == 1, qual > fit_quality")
         # Try to fit two curves; find two local maxima
         new_index <- which(diff(sign(diff(int_vector))) == -2) + 1
         # test if there are two indices in new_index
@@ -65,12 +61,11 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
         return(fit_gaussian(mass_vector2, mass_vector, int_vector, new_index,
                             scale, resol, outdir, force = 2, use_bounds = FALSE,
                             plot, scanmode, int_factor, width, height))
-      # good fit
+        # good fit
       } else {
-        print("fit_gaussian, force == 1, qual < fit_quality")
         peak_mean <- c(peak_mean, fit_values$mean)
         peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale,
-                                          fit_values$sigma, int_factor))
+                                                fit_values$sigma, int_factor))
         peak_qual <- fit_values$qual
         peak_min <- mass_vector[1]
         peak_max <- mass_vector[length(mass_vector)]
@@ -82,21 +77,16 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
     # determine fit values for 2 Gaussian peaks (mean, scale, sigma, qual)
     fit_values <- fit_2peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                              use_bounds, plot, fit_quality, int_factor)
-    print("fit_gaussian force == 2, values fit_values:")
-    print(fit_values)
     # test if one of the means is outside the m/z range
     if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] ||
         fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)]) {
-      print("fit_gaussian, force == 2, mean outside m/z range")
       # check if fit quality is bad
       if (fit_values$qual > fit_quality) {
-        print("fit_gaussian, force == 2, qual > fit_quality")
         # run this function again with fixed boundaries
         return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                             outdir, force = 2, use_bounds = TRUE,
                             plot, scanmode, int_factor, width, height))
       } else {
-        print("fit_gaussian, fource == 2, qual < fit_quality")
         # check which mean is outside range and remove it from the list of means
         # NB: peak_mean and other variables have not been given values from 2-peak fit yet!
         for (i in 1:length(fit_values$mean)){
@@ -112,12 +102,10 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
         peak_min <- mass_vector[1]
         peak_max <- mass_vector[length(mass_vector)]
       }
-    # if all means are within range
+      # if all means are within range
     } else {
-      print("fit_gaussian, force == 2, means within m/z range")
       # check for bad fit
       if (fit_values$qual > fit_quality) {
-        print("fit_gaussian, force == 2, qual > fit_quality")
         # Try to fit three curves; find three local maxima
         new_index <- which(diff(sign(diff(int_vector))) == -2) + 1
         # test if there are three indices in new_index
@@ -129,9 +117,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
         return(fit_gaussian(mass_vector2, mass_vector, int_vector, new_index,
                             scale, resol, outdir, force = 3, use_bounds = FALSE,
                             plot, scanmode, int_factor, width, height))
-      # good fit, all means are within m/z range
+        # good fit, all means are within m/z range
       } else {
-        print("fit_gaussian, force == 2, qual < fit_quality")
         # check if means are within 3 ppm and sum if so
         tmp <- fit_values$qual
         nr_means_new <- -1
@@ -144,30 +131,33 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
         }
         # restore original quality score
         fit_values$qual <- tmp
-      }  
+
+        for (i in 1:length(fit_values$mean)){
+          peak_mean <- c(peak_mean, fit_values$mean[i])
+          peak_area <- c(peak_area, fit_values$area[i])
+        }
+        peak_qual <- fit_values$qual
+        peak_min <- mass_vector[1]
+        peak_max <- mass_vector[length(mass_vector)]
+      }
     }
 
-  # Three local maxima; need at least 6 data points for this 
+    # Three local maxima; need at least 6 data points for this 
   } else if (force == 3 && (length(mass_vector) > 6)) {
     # determine fit values for 3 Gaussian peaks (mean, scale, sigma, qual)
     fit_values <- fit_3peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                              use_bounds, plot, fit_quality, int_factor)
-    print("fit_gaussian force == 3, values fit_values:")
-    print(fit_values)
     # test if one of the means is outside the m/z range
     if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] ||
         fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] ||
         fit_values$mean[3] < mass_vector[1] || fit_values$mean[3] > mass_vector[length(mass_vector)]) {
-      print("fit_gaussian, force == 3, mean outside mz range")
       # check if fit quality is bad
       if (fit_values$qual > fit_quality) {
-        print("fit_gaussian, force == 3, qual > fit_quality")
         # run this function again with fixed boundaries
         return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                             outdir, force, use_bounds = TRUE,
                             plot, scanmode, int_factor, width, height))
       } else {
-        print("fit_gaussian, force == 3, qual < fit_quality")
         # check which mean is outside range and remove it from the list of means
         # NB: peak_mean and other variables have not been given values from 2-peak fit yet!
         for (i in 1:length(fit_values$mean)) {
@@ -183,12 +173,10 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
         peak_min <- mass_vector[1]
         peak_max <- mass_vector[length(mass_vector)]
       }
-    # if all means are within range
+      # if all means are within range
     } else {
-      print("fit_gaussian, force == 3, means within range")
       # check for bad fit
       if (fit_values$qual > fit_quality) {
-        print("fit_gaussian, force == 3, qual > fit_quality")
         # Try to fit four curves; find four local maxima
         new_index <- which(diff(sign(diff(int_vector))) == -2) + 1
         # test if there are four indices in new_index
@@ -200,10 +188,9 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
         return(fit_gaussian(mass_vector2, mass_vector, int_vector, new_index, scale, resol,
                             outdir, force = 4, use_bounds = FALSE, plot, scanmode,
                             int_factor, width, height))
-      # good fit, all means are within m/z range
+        # good fit, all means are within m/z range
       } else {
-        print("fit_gaussian, force == 3, qual < fit_quality")
-        # check if means are within 4 ppm and sum if so  
+        # check if means are within 4 ppm and sum if so
         tmp <- fit_values$qual
         nr_means_new <- -1
         nr_means <- length(fit_values$mean)
@@ -215,32 +202,36 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
         }
         # restore original quality score
         fit_values$qual <- tmp
+
+        for (i in 1:length(fit_values$mean)){
+          peak_mean <- c(peak_mean, fit_values$mean[i])
+          peak_area <- c(peak_area, fit_values$area[i])
+        }
+        peak_qual <- fit_values$qual
+        peak_min <- mass_vector[1]
+        peak_max <- mass_vector[length(mass_vector)]
+
       }
     }
-    
-  #### Four local maxima; need at least 6 data points for this ####  
+
+    #### Four local maxima; need at least 6 data points for this ####  
   } else if (force == 4 && (length(mass_vector) > 6)) {
     # determine fit values for 4 Gaussian peaks (mean, scale, sigma, qual)
     fit_values <- fit_4peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                              use_bounds, plot, fit_quality, int_factor)
-    print("fit_gaussian force == 4, values fit_values:")
-    print(fit_values)
     # test if one of the means is outside the m/z range
     if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] ||
         fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] ||
         fit_values$mean[3] < mass_vector[1] || fit_values$mean[3] > mass_vector[length(mass_vector)] ||
         fit_values$mean[4] < mass_vector[1] || fit_values$mean[4] > mass_vector[length(mass_vector)]) {
-      print("fit_gaussian, force == 4, mean outside mz range")
       # check if quality of fit is bad
       if (fit_values$qual > fit_quality) {
-	print("fit_gaussian, force == 4, qual > fit_quality")
         # run this function again with fixed boundaries
         return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                             outdir, force, use_bounds = TRUE,
                             plot, scanmode, int_factor, width, height))
 
       } else {
-        print("fit_gaussian, force == 4, qual < fit_quality")
         # check which mean is outside range and remove it from the list of means
         # NB: peak_mean and other variables have not been given values from 2-peak fit yet!
         for (i in 1:length(fit_values$mean)) {
@@ -256,19 +247,16 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
         peak_min <- mass_vector[1]
         peak_max <- mass_vector[length(mass_vector)]
       }
-    # if all means are within range
+      # if all means are within range
     } else {
-      print("fit_gaussian, force == 4, means within range")
       # check for bad fit
       if (fit_values$qual > fit_quality) {
-        print("fit_gaussian, force == 4, qual > fit_quality")
         # Try to fit 1 curve, force = 5
         return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                             outdir, force = 5, use_bounds = FALSE,
                             plot, scanmode, int_factor, width, height))
-      # good fit, all means are within m/z range
+        # good fit, all means are within m/z range
       } else {
-        print("fit_gaussian, force == 4, qual < fit_quality")
         # check if means are within 4 ppm and sum if so
         tmp <- fit_values$qual 
         nr_means_new <- -1
@@ -281,10 +269,19 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
         }
         # restore original quality score
         fit_values$qual <- tmp
-      }  
+
+        for (i in 1:length(fit_values$mean)){
+          peak_mean <- c(peak_mean, fit_values$mean[i])
+          peak_area <- c(peak_area, fit_values$area[i])
+        }
+        peak_qual <- fit_values$qual
+        peak_min <- mass_vector[1]
+        peak_max <- mass_vector[length(mass_vector)]
+
+      }
     }
-    
-  # More than four local maxima; fit 1 peak.
+
+    # More than four local maxima; fit 1 peak.
   } else {
     scale <- 2
     fit_quality1 <- 0.40
@@ -292,13 +289,10 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
     max_index <- which(int_vector == max(int_vector))
     fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol,
                             plot, fit_quality1, use_bounds)
-    print("fit_gaussian, force > 4, values fit_values:")
-    print(fit_values)
     # check for bad fit
     if (fit_values$qual > fit_quality1) {
-      print("fit_gaussian, force > 4, qual > fit_quality1")
       # remove
-      if (plot) dev.off()    
+      if (plot) dev.off()
       # get fit values from fit_optim
       fit_values <- fit_optim(mass_vector, int_vector, resol, plot, scanmode, int_factor, width, height)
       peak_mean <- c(peak_mean, fit_values$mean)
@@ -307,7 +301,6 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
       peak_max <- fit_values$max
       peak_qual <- 0
     } else {
-      print("fit_gaussian, force > 4, qual < fit_quality1")
       peak_mean <- c(peak_mean, fit_values$mean)
       peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale, fit_values$sigma, int_factor))
       peak_qual <- fit_values$qual
@@ -322,8 +315,5 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale
                          "qual" = peak_qual,
                          "min" = peak_min,
                          "max" = peak_max)
-  print("fit_gaussian, end script, values roi_value_list:")
-  print(roi_value_list)
   return(roi_value_list)
 }
-
diff --git a/DIMS/Utils/fit_gaussians.R b/DIMS/Utils/fit_gaussians.R
index 2e811fd..05bff86 100644
--- a/DIMS/Utils/fit_gaussians.R
+++ b/DIMS/Utils/fit_gaussians.R
@@ -17,7 +17,6 @@ fit_1gaussian <- function(mass_vector, int_vector, sigma, query_mass, scale, use
     d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma)
     sum((d - int_vector) ^ 2)
   }
-print("nu in fit_1gaussian")
   if (use_bounds) {
     # determine lower and upper boundaries
     lower <- c(mass_vector[1], 0, mass_vector[1], 0)
@@ -58,7 +57,6 @@ fit_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2,
     sum((d - int_vector) ^ 2)
   }
 
-print("nu in fit_2gaussians")
   if (use_bounds) {
     # determine lower and upper boundaries
     lower <- c(mass_vector[1], 0, mass_vector[1], 0)
@@ -121,7 +119,6 @@ fit_3gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sigma3,
     sum((d - int_vector) ^ 2)
   }
 
-print("nu in fit_3gaussians")
   if (use_bounds) {
     # determine lower and upper boundaries
     lower <- c(mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0)
@@ -176,7 +173,6 @@ fit_4gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sigma3, sigm
     sum((d - int_vector) ^ 2)
   }
 
-print("nu in fit_4gaussians")
   if (use_bounds) {
     # determine lower and upper boundaries
     lower <- c(mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0)
diff --git a/DIMS/Utils/fit_init.R b/DIMS/Utils/fit_init.R
index 0c4e513..ecae337 100644
--- a/DIMS/Utils/fit_init.R
+++ b/DIMS/Utils/fit_init.R
@@ -33,7 +33,6 @@ fit_init <- function(mass_vector, int_vector, int_factor, scale, resol,
 
   # Find the index in int_vector with the highest intensity
   max_index <- which(int_vector == max(int_vector))
-print("nu in fit_init")
   roi_values <- fit_gaussian(mass_vector2, mass_vector, int_vector, max_index,
                              scale, resol, outdir, force = length(max_index),
                              use_bounds = FALSE, plot, scanmode, int_factor, width, height)
diff --git a/DIMS/Utils/fit_optim.R b/DIMS/Utils/fit_optim.R
index ed71622..404cc7a 100644
--- a/DIMS/Utils/fit_optim.R
+++ b/DIMS/Utils/fit_optim.R
@@ -38,12 +38,10 @@ fit_optim <- function(mass_vector, int_vector, resol,
 
   # get an estimate of the area under the peak
   area <- estimate_area(mass_max, resol, scale, sigma, int_factor)
-print("nu in fit_optim")  
   # put all values for this region of interest into a list
   roi_value_list <- list("mean" = mass_max,
                          "area" = area,
                          "min" = mass_vector2[1],
                          "max" = mass_vector2[length(mass_vector2)])
-print(roi_value_list)
   return(roi_value_list)
 }
diff --git a/DIMS/Utils/get_element_info.R b/DIMS/Utils/get_element_info.R
index c4bdd92..a619f6a 100644
--- a/DIMS/Utils/get_element_info.R
+++ b/DIMS/Utils/get_element_info.R
@@ -10,7 +10,7 @@ get_element_info <- function(name, elements = NULL) {
 
   # get information on all elements
   if (!is.list(elements) || length(elements) == 0 ) {
-    all_elements <- initializePSE() 
+    elements <- initializePSE()
   }
   # extract information for a particular adduct
   if (name == "CH3OH+H") {
@@ -19,6 +19,6 @@ get_element_info <- function(name, elements = NULL) {
   } else {
     regular_expr <- paste0("^", name, "$")
   }
-  element_info <- all_elements[[grep(regular_expr, sapply(all_elements, function(x) { x$name }))]]
+  element_info <- elements[[grep(regular_expr, sapply(elements, function(x) { x$name }))]]
   return(element_info)
 }
\ No newline at end of file
diff --git a/DIMS/Utils/identify_noisepeaks.R b/DIMS/Utils/identify_noisepeaks.R
index 5e557cb..cf79279 100644
--- a/DIMS/Utils/identify_noisepeaks.R
+++ b/DIMS/Utils/identify_noisepeaks.R
@@ -43,7 +43,7 @@ identify_noisepeaks <- function(peakgroup_list, all_adducts, scanmode = "Negativ
       add2label <- paste0("[M+", look4[adduct_index], "]", adduct_scanmode)
     }
 
-    noise_mz_adduct[, "CompoundName"] <- paste0(noise_mz_adduct[, "CompoundName "], add2label)
+    noise_mz_adduct[, "CompoundName"] <- paste0(noise_mz_adduct[, "CompoundName"], add2label)
     adduct_info <- get_element_info(look4[adduct_index], all_adducts)
     if (scanmode == "Positive") {
       adduct_mass <- adduct_info$mass[1] + adduct_info$isotope$mass[1] - hydrogen_mass
diff --git a/DIMS/Utils/replace_zeros.R b/DIMS/Utils/replace_zeros.R
index cf65340..0ef06a3 100644
--- a/DIMS/Utils/replace_zeros.R
+++ b/DIMS/Utils/replace_zeros.R
@@ -26,8 +26,8 @@ replace_zeros <- function(peakgroup_list, repl_pattern, scanmode, resol, outdir,
         next
       }
       for (zero_index in 1:length(zero_intensity)) {
-        area <- generate_gaussian(peakgroup_list[zero_intensity[zero_index], "mzmed.pgrp"], thresh,
-                                  resol, FALSE, scanmode, int_factor = 1 * 10^5, 1, 1)$area
+        area <- fit_optim(peakgroup_list[zero_intensity[zero_index], "mzmed.pgrp"], thresh,
+                          resol, FALSE, scanmode, int_factor = 1 * 10^5, 1, 1)$area
         peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1, mean = area,
                                                                                                sd = 0.25 * area)
       }

From 3a0fcbb467edeaca541375b978ff16bd2276ad48 Mon Sep 17 00:00:00 2001
From: Robert Ernst <rfernst@gmail.com>
Date: Tue, 18 Jun 2024 10:18:29 +0200
Subject: [PATCH 73/73] Revert "Feature/giabeval"

---
 CheckQC/CheckQC.nf                            |   6 +-
 CheckQC/check_qc.py                           | 194 +-----------------
 CheckQC/test_check_qc.py                      | 194 ++++++------------
 ExomeDepth/CallCNV.nf                         |   2 +-
 ExomeDepth/GetRefset.nf                       |   2 +-
 ExomeDepth/Summary.nf                         |   2 +-
 ExonCov/ImportBam.nf                          |   2 +-
 ExonCov/SampleQC.nf                           |   2 +-
 Kinship/Kinship.nf                            |   2 +-
 .../1.0.0/test_get_gender_from_bam_chrx.py    |   4 +-
 TrendAnalysis/TrendAnalysis.nf                |   2 +-
 UPD/IGV.nf                                    |   2 +-
 Utils/CreateHSmetricsSummary.nf               |   2 +-
 Utils/EditSummaryFileHappy.nf                 |  30 ---
 Utils/GetStatsFromFlagstat.nf                 |   2 +-
 Utils/ParseChildFromFullTrio.nf               |   2 +-
 Utils/SavePedFile.nf                          |   2 +-
 Utils/VersionLog.nf                           |   5 +-
 Utils/create_hsmetrics_summary.py             |   2 +-
 Utils/get_stats_from_flagstat.py              |  24 +--
 20 files changed, 92 insertions(+), 391 deletions(-)
 delete mode 100644 Utils/EditSummaryFileHappy.nf

diff --git a/CheckQC/CheckQC.nf b/CheckQC/CheckQC.nf
index 1cc6ee6..0dbf4d7 100644
--- a/CheckQC/CheckQC.nf
+++ b/CheckQC/CheckQC.nf
@@ -6,13 +6,13 @@ process CheckQC {
 
     input:
         val(identifier)
-        path(input_files, stageAs: "?/*")
+        path(input_files)
      
     output:
-        path("${identifier}_checkqc_summary.csv", emit: qc_output)
+        path("${identifier}_summary.csv", emit: qc_output)
 
     script:
         """
-        python ${projectDir}/CustomModules/CheckQC/check_qc.py ${params.qc_settings_path} '.' ${identifier}_checkqc ${input_files}
+        python ${projectDir}/CustomModules/CheckQC/check_qc.py ${params.qc_settings_path} '.' ${identifier} ${input_files}
         """
 }
diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py
index d0b86f7..a47b14a 100644
--- a/CheckQC/check_qc.py
+++ b/CheckQC/check_qc.py
@@ -5,7 +5,6 @@
 from os import strerror as os_strerror
 from pathlib import Path
 import re
-from string import punctuation
 import sys
 import warnings
 
@@ -15,19 +14,6 @@
 
 
 def non_empty_existing_path(file_or_dir):
-    """
-    This function checks whether the provided file or dir exists and is not empty.
-
-    Args:
-        file_or_dir (string): Input file or directory
-
-    Raises:
-        FileNotFoundError: If input string file_or_dir is neither a file nor a dir.
-        OSError: If input is not a dir and file is empty.
-
-    Returns:
-        string: Provided input file or directory. If dir, suffix '/' might be added.
-    """
     input_path = Path(file_or_dir)
     if not input_path.is_file() and not input_path.is_dir():
         raise FileNotFoundError(errno_ENOENT, os_strerror(errno_ENOENT), file_or_dir)
@@ -39,15 +25,6 @@ def non_empty_existing_path(file_or_dir):
 
 
 def parse_arguments_and_check(args_in):
-    """
-    Parses arguments and validates / checks format of input.
-
-    Args:
-        args_in (list of strings): Commandline input arguments.
-
-    Returns:
-        Namespace: Convert argument strings to objects and assign them as attributes of the namespace.
-    """
     parser = argparse.ArgumentParser(
         description="Check and summarize sample quality using qc metrics and their thresholds."
     )
@@ -75,18 +52,6 @@ def parse_arguments_and_check(args_in):
 
 
 def read_yaml(yaml_file):
-    """
-    Read input yaml
-
-    Args:
-        yaml_file (string): String with path to yaml file
-
-    Raises:
-        ValueError: If reading the file returns None (not recognized as YAML format).
-
-    Returns:
-        Object: Content of the YAML file.
-    """
     yaml_loaded = yaml.safe_load(open(yaml_file))
     if not yaml_loaded:
         raise ValueError("Could not load YAML.")
@@ -94,52 +59,19 @@ def read_yaml(yaml_file):
 
 
 def check_allowed_operators(qc_operator):
-    """
-    Check if provided qc_operator is allowed.
-
-    Args:
-        qc_operator (string): (Custom / math) operator
-
-    Raises:
-        ValueError: If provided qc_operator is invalid / unsupported.
-    """
     operators = ["<", "<=", ">", ">=", "==", "!=", "match"]
     if qc_operator not in operators:
         raise ValueError(f"Unsupported operator provided: {qc_operator}. Please select from: {operators}")
 
 
-def check_required_keys_metrics(qc_metrics):
-    """
-    Check if all required settings are included in the qc_settings
-
-    Args:
-        qc_metrics (list with dicts): qc settings per qc metric
-
-    Raises:
-        KeyError: Required key is not provided for the qc metric settings
-    """
+def check_required_keys_metrics(qc_settings):
     for req_key in ["filename", "qc_col", "threshold", "operator", "report_cols"]:
-        if any([req_key not in setting.keys() for setting in qc_metrics]):
+        if any([req_key not in setting.keys() for setting in qc_settings["metrics"]]):
             raise KeyError(f"Required key {req_key} not in all metrics settings.")
 
 
 def select_metrics(filename, input_files):
-    """
-    Using regular expression to match the qc metric filename with the input files
-
-    Args:
-        filename (string): Filename of qc metric, could be a regex.
-        input_files (list): All qc metrics input files.
-
-    Returns:
-        list: Input files matching the given filename.
-    """
-    # Change filename without special characters (except for '_', '-' and '.') into
-    # a regex pattern to match absolute and relative paths in input_files.
-    special_symbols = set(punctuation) - set(".", "_", "-")
-    if not any([char in special_symbols for char in set(filename)]):
-        filename = ".*" + filename
-    metrics = list(filter(re.compile(f"{filename}").match, input_files))
+    metrics = list(filter(re.compile(f".*{filename}").match, input_files))
     if not metrics:
         warnings.warn(UserWarning(f"No input file provided with filename pattern {filename}"))
         return None
@@ -147,21 +79,6 @@ def select_metrics(filename, input_files):
 
 
 def get_columns_to_report(qc_report_cols, qc_metric_cols, qc_col):
-    """
-    Get valid columns to include in final output report
-
-    Args:
-        qc_report_cols (list, string): column name(s) to include in report.
-        qc_metric_cols (list, string): column name(s) in qc metric.
-        qc_col (string): column name that contains the qc value/score.
-
-    Raises:
-        TypeError: qc_report_cols is neither a string nor list.
-        ValueError: Provided column names in qc_report_cols does not exist in the qc metric.
-
-    Returns:
-        list of strings: Valid column names to include in report.
-    """
     not_existing_cols = list(set(qc_report_cols) - set(qc_metric_cols))
     if qc_report_cols == "@all":
         qc_report_cols = qc_metric_cols
@@ -177,19 +94,6 @@ def get_columns_to_report(qc_report_cols, qc_metric_cols, qc_col):
 
 
 def add_and_rename_columns(qc_metric, qc_title, qc_col, qc_operator, qc_threshold):
-    """
-    Add and rename columns in qc_metric.
-
-    Args:
-        qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values
-        qc_title (string): Title of the qc check
-        qc_col (string): qc column name with qc value/score
-        qc_operator (string): (Custom / math) operator
-        qc_threshold (string, int or float): qc threshold
-
-    Returns:
-        Pandas DataFrame: DataFrame with qc metric.
-    """
     qc_metric_assigned = qc_metric.assign(
         qc_title=qc_title.lower(),
         qc_status="PASS",
@@ -201,21 +105,6 @@ def add_and_rename_columns(qc_metric, qc_title, qc_col, qc_operator, qc_threshol
 
 
 def get_failed_rows(qc_metric, qc_col, qc_operator, qc_threshold):
-    """
-    Get rows that fail provided qc threshold
-
-    Args:
-        qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values
-        qc_col (string): column name that contains the qc value/score.
-        qc_operator (string): (Custom / math) operator
-        qc_threshold (string, int or float): qc threshold
-
-    Raises:
-        TypeError: If qc threshold is neither 'match', str, int or float.
-
-    Returns:
-        object: The indexes of failed rows in qc_metric
-    """
     # Select failed rows using qc_threshold regex pattern and qc_operator 'match'
     if qc_operator == "match" and isinstance(qc_threshold, str):
         return qc_metric[qc_col].str.match(qc_threshold)
@@ -232,19 +121,6 @@ def get_failed_rows(qc_metric, qc_col, qc_operator, qc_threshold):
 
 
 def add_failed_samples_metric(qc_metric, failed_rows, report_cols, sample_cols):
-    """
-    Failed samples are added to the output metric, and removed from qc_metric.
-
-    Args:
-        qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values
-        failed_rows (object): Object with indexes of failed rows in qc_metric
-        report_cols (list): Valid column names (strings) to include in report.
-        sample_cols (list): Columnames (strings) of sample names.
-
-    Returns:
-        qc_metric (DataFrame): DataFrame of qc metric without failed rows
-        qc_metric_out (DataFrame): DataFrame of qc metric to report with failed rows
-    """
     qc_metric_out = DataFrame(columns=["sample", "qc_check", "qc_status", "qc_msg", "qc_value"])
     failed_samples = []
     if failed_rows.to_list():
@@ -276,17 +152,6 @@ def add_failed_samples_metric(qc_metric, failed_rows, report_cols, sample_cols):
 
 
 def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols):
-    """
-    Passed samples are added to the output metric
-
-    Args:
-        qc_metric (DataFrame): DataFrame of qc metric without failed rows
-        qc_metric_out (DataFrame): DataFrame of qc metric to report with failed rows
-        sample_cols (list): Columnames (strings) of sample names.
-
-    Returns:
-        pandas DataFrame: Sorted qc metric for both passed and failed samples, without duplicates.
-    """
     # Add passed samples to output
     for sample_col in sample_cols:
         qc_metric_out = concat([
@@ -297,28 +162,12 @@ def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols):
                 .loc[:, qc_metric_out.columns]
             )
         ])
-    # Some qc metrics did result in errors when merging the tables.
-    # The merge failed when the column qc_value has floats stored as strings.
-    # Try to convert column qc_value to float.
-    # If ValueError is raised, probably because column is a string, continue.
-    try:
-        qc_metric_out["qc_value"] = qc_metric_out["qc_value"].astype("float")
-    except ValueError:
-        pass
     # In case 'multiple sample qc check',
     # output could contain duplicate rows for individual samples used in multiple comparisons.
     return qc_metric_out.sort_values(by=["qc_check", "qc_status"]).drop_duplicates(keep="first")
 
 
 def create_and_write_output(qc_output, output_path, output_prefix):
-    """
-    Joined qc metrics is created and written to output file.
-
-    Args:
-        qc_output (pandas DataFrame): Sorted judged qc metric for both passed and failed samples, without duplicates.
-        output_path (string): Relative or absolute path where output should be saved.
-        output_prefix (string): Output prefix for output file.
-    """
     # Add qc_summary
     qc_output.insert(1, "qc_summary", "PASS")
     qc_output.loc[qc_output.isin(["FAIL"]).any(axis=1), "qc_summary"] = "FAIL"
@@ -327,20 +176,8 @@ def create_and_write_output(qc_output, output_path, output_prefix):
 
 
 def read_and_judge_metrics(qc, metrics):
-    """
-    Read and judge each single qc metric and join results.
-
-    Args:
-        qc (dict): qc settings of the metric
-        metrics (list): List of input files specific for single qc metric
-
-    Returns:
-        pandas DataFrame: Joined and judged qc metrics.
-    """
     for qc_file in metrics:
-        qc_metric_raw = read_csv(
-            qc_file, comment=qc.get("comment", None), delimiter=qc.get("delim", "\t"), quotechar=qc.get("quotechar", '"')
-        )
+        qc_metric_raw = read_csv(qc_file, comment=qc.get("comment", None), delimiter="\t", quotechar='"')
         report_cols = get_columns_to_report(qc["report_cols"], qc_metric_raw.columns.to_list(), qc["qc_col"])
         qc_metric_edit = add_and_rename_columns(qc_metric_raw, qc["title"], qc["qc_col"], qc["operator"], qc["threshold"])
         failed_rows = get_failed_rows(qc_metric_edit, "qc_value", qc["operator"], qc["threshold"])
@@ -374,30 +211,9 @@ def read_and_judge_metrics(qc, metrics):
 
 
 def check_qc(input_files, settings, output_path, output_prefix):
-    """
-    Main function to judge input files on configured qc settings.
-    It creates a single results table, each row representing
-        sample (string): sample name
-        qc_summary: Summarized status of all qcs for sample (pass or fail)
-        qc columns (5 per each qc metric);
-            qc_check (string): QC check consiting of qc title, operator and threshold
-            qc_status (string): Status of performed qc check (pass or fail)
-            qc_msg (string): String with human readable message if sample failed qc check, empty if passed.
-            qc_value (string, int, float): qc value/score to check.
-
-    Args:
-        input_files (list): All qc metrics input files.
-        settings (string): Path to yaml file
-        output_path (string): Relative or absolute path where output should be saved.
-        output_prefix (string): Output prefix for output file.
-
-    Raises:
-        ValueError: No input files found to match any qc metric patterns defined in settings.
-        ValueError: Duplicated samples with different values found in some of the input files.
-    """
     # A single qc metric file can be used multiple times, by defining a metric section for each check in the qc settings.
     qc_settings = read_yaml(settings)
-    check_required_keys_metrics(qc_settings["metrics"])
+    check_required_keys_metrics(qc_settings)
     duplicated_sample_file = []
     for qc_metric_settings in qc_settings["metrics"]:
         check_allowed_operators(qc_metric_settings["operator"])
diff --git a/CheckQC/test_check_qc.py b/CheckQC/test_check_qc.py
index baedf3d..d6ef6ae 100644
--- a/CheckQC/test_check_qc.py
+++ b/CheckQC/test_check_qc.py
@@ -15,7 +15,7 @@
 @pytest.fixture(scope="module", autouse=True)
 def setup_test_path(tmp_path_factory):
     test_tmp_path = str(tmp_path_factory.mktemp("test")) + "/"
-    # Create empty files
+    # create empty files
     open(str(test_tmp_path) + "/empty.txt", "a").close()
     open(str(test_tmp_path) + "/empty.yaml", "a").close()
     return test_tmp_path
@@ -66,95 +66,44 @@ def test_not_existing_operator(self):
 
 class TestCheckRequiredKeysMetrics():
     def test_required_keys_present(self):
-        qc_metrics = [
+        qc_settings = {"metrics": [
             {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake",  "report_cols": "fake"},
-        ]
-        check_qc.check_required_keys_metrics(qc_metrics)
+        ]}
+        check_qc.check_required_keys_metrics(qc_settings)
         assert True
 
     @pytest.mark.parametrize(
-        "incomplete_qc_metrics",
+        "incomplete_qc_settings",
         [
-            [{"filename": "fakename"}],
-            [
+            {"metrics": [{"filename": "fakename"}]},
+            {"metrics": [
                 {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake",  "report_cols": "fake"},
-                # Missing report_cols
-                {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"},
-            ],
-            [
-                # Missing report_cols
-                {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"},
-                # Missing report_cols
-                {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"},
-            ]
+                {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"},  # missing report_cols
+            ]},
+            {"metrics": [
+                {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"},  # missing report_cols
+                {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"},  # missing report_cols
+            ]}
         ]
     )
-    def test_missing_keys(self, incomplete_qc_metrics):
+    def test_missing_keys(self, incomplete_qc_settings):
         with pytest.raises(KeyError) as required_error:
-            check_qc.check_required_keys_metrics(incomplete_qc_metrics)
+            check_qc.check_required_keys_metrics(incomplete_qc_settings)
         error_val = str(required_error.value)
         assert "not in all metrics settings." in error_val
         assert error_val.split(" ")[2] in ["filename", "qc_col", "threshold", "operator", "report_cols"]
 
 
 class TestSelectMetrics():
-    @pytest.mark.parametrize("filename_or_regex,input_files,expected", [
-        # Multi match
-        ("test", ["test1.txt", "test2.txt"], ["test1.txt", "test2.txt"]),
-        # Single match
-        ("test", ["test1.txt", "fake2.txt"],  ["test1.txt"]),
-        # Match with relative path
-        ("test", ["./random/path/to/test1.txt"],  ["./random/path/to/test1.txt"]),
-        # Match with absolute path
-        ("test", ["/random/path/to/test1.txt"],  ["/random/path/to/test1.txt"]),
-        # Match regex: kinship file suffix
-        (
-            ".*.kinship_check.out$",
-            ["240101_A00295_0001_AHWCFKDSX7_CREv4_1.kinship_check.out"],
-            ["240101_A00295_0001_AHWCFKDSX7_CREv4_1.kinship_check.out"]
-        ),
-        # Match on word truth and SNP
-        (
-            ".*truth.*SNP",
-            [
-                '2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv',
-                '12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'
-            ],
-            ['2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv']
-        ),
-        # Match on word truth and SNP
-        (
-            ".*truth.*SNP",
-            [
-                'U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv',
-                'U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'
-            ],
-            ['U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv']
-        ),
-        # Match if 'truth' is absent and contains 'SNP'
-        # ?: Match expression but do not capture it
-        # ?! Match if 'truth' is absent.
-        (
-            "(?:(?!truth).)*SNP.*$",
-            [
-                '2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv',
-                '12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'
-            ],
-            ['12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv']
-        ),
-
+    @pytest.mark.parametrize("input_files,expected", [
+        (["test1.txt", "test2.txt"], ["test1.txt", "test2.txt"]),  # multi match
+        (["test1.txt", "fake2.txt"],  ["test1.txt"]),  # single match
     ])
-    def test_select_metric(self, filename_or_regex, input_files, expected):
-        metrics = check_qc.select_metrics(filename_or_regex, input_files)
+    def test_select_metric(self, input_files, expected):
+        metrics = check_qc.select_metrics("test", input_files)
         assert metrics == expected
 
-    @pytest.mark.parametrize("filename_or_regex,input_files", [
-        # No match
-        ("test", ["fake1.txt", "fake2.txt"]),
-        # No match, filename with specialchar assumed to be a regex
-        ("specialchar_@", ["12/specialchar_@.txt"]),
-    ])
-    def test_no_match(self, filename_or_regex, input_files):
+    def test_no_match(self):
         with pytest.warns(UserWarning) as match_warning:
             return_val = check_qc.select_metrics("test", ["fake1.txt", "fake2.txt"])
         warn_msg = match_warning[0].message.args[0]
@@ -166,14 +115,10 @@ def test_no_match(self, filename_or_regex, input_files):
 class TestGetColumnsToReport():
     @pytest.mark.parametrize("report_cols,metric_cols,qc_col,expected", [
         (["col1"], ["col1"], "col1", ["qc_title", "qc_value"]),
-        # Additional report col
-        (["col1", "col2"], ["col1", "col2"], "col1", ["qc_title", "qc_value", "col2"]),
-        # Different order output
-        (["col1", "col2"], ["col1", "col2"], "col2", ["qc_title", "qc_value", "col1"]),
-        # Additional metric col
-        (["col1"], ["col1", "col3"], "col1", ["qc_title", "qc_value"]),
-        # Special @all option
-        ("@all", ["col1", "col2"], "col1", ["qc_title", "qc_value", "col2"]),
+        (["col1", "col2"], ["col1", "col2"], "col1", ["qc_title", "qc_value", "col2"]),  # additional report col
+        (["col1", "col2"], ["col1", "col2"], "col2", ["qc_title", "qc_value", "col1"]),  # different order output
+        (["col1"], ["col1", "col3"], "col1", ["qc_title", "qc_value"]),  # additional metric col
+        ("@all", ["col1", "col2"], "col1", ["qc_title", "qc_value", "col2"]),  # special @all option
     ])
     def test_get_columns_to_report(self, report_cols, metric_cols, qc_col, expected):
         qc_report_cols = check_qc.get_columns_to_report(report_cols, metric_cols, qc_col)
@@ -195,11 +140,11 @@ class TestAddAndRenameColumns():
     def test_add_and_rename_columns(self):
         fake_qc_metric = DataFrame({"sample": ["sample1"], "fake_qc_col": ["0.01"]})
         qc_metric_out = check_qc.add_and_rename_columns(fake_qc_metric, "FAKE_title", "fake_qc_col", "fake_op", "fake_thres")
-        # Assert expected column values
+        # assert expected column values
         assert qc_metric_out["qc_title"].values == "fake_title"
         assert qc_metric_out["qc_status"].values == "PASS"
         assert qc_metric_out["qc_check"].values == "fake_thres fake_op fake_qc_col"
-        # Assert all expected columns exist
+        # assert all expected columns exist
         assert not list(
             set(['sample', 'qc_value', 'qc_title', 'qc_status', 'qc_check', 'qc_msg']) - set(qc_metric_out.columns)
         )
@@ -208,14 +153,10 @@ def test_add_and_rename_columns(self):
 
 class TestGetFailedRows():
     @pytest.mark.parametrize("qc_op,qc_thres", [
-        # Test match
-        ("match", "fake_thres"),
-        # Test string
-        ("==", "FAIL"),
-        # Test int
-        ("==", 1),
-        # Test float
-        ("==", 0.1),
+        ("match", "fake_thres"),  # test match
+        ("==", "FAIL"),  # test string
+        ("==", 1),  # test int
+        ("==", 0.1),  # test float
     ])
     def test_correct(self, qc_op, qc_thres):
         fake_qc_metric = DataFrame({"sample": ["sample1"], "fake_qc_col": [qc_thres]})
@@ -240,12 +181,9 @@ def test_add_failed_samples_single_sample_col(self):
         failed_rows = fake_qc_metric.loc[fake_qc_metric["sample_col"] == "sample2"].index
         qc_metric, qc_metric_out = check_qc.add_failed_samples_metric(
             fake_qc_metric, failed_rows, fake_qc_metric.columns.to_list(), ["sample_col"])
-        # Test rename column
-        assert "sample" in qc_metric_out.columns.to_list()
-        # Test removal failed sample
-        assert "sample2" not in qc_metric["sample_col"].to_list()
-        # Test added failed sample
-        assert "sample2" in qc_metric_out["sample"].to_list()
+        assert "sample" in qc_metric_out.columns.to_list()  # test rename column
+        assert "sample2" not in qc_metric["sample_col"].to_list()  # test removal failed sample
+        assert "sample2" in qc_metric_out["sample"].to_list()  # test added failed sample
         assert len(qc_metric) == 1 and len(qc_metric_out) == 1
         assert qc_metric_out["qc_status"].values == "FAIL"
 
@@ -255,21 +193,18 @@ def test_add_failed_samples_multi_sample_col(self):
             columns=["sample_col1", "sample_col2"]
         )
         fake_kinship_metric = fake_kinship_metric.assign(qc_check="checks", qc_value="wrong")
-        # Define sample1 vs sample2 and sample1 vs sample3 as failed
-        failed_rows = fake_kinship_metric.iloc[0:2].index
+        failed_rows = fake_kinship_metric.iloc[0:2].index  # define sample1 vs sample2 and sample1 vs sample3 as failed
         qc_metric, qc_metric_out = check_qc.add_failed_samples_metric(
             fake_kinship_metric, failed_rows, fake_kinship_metric.columns.to_list(), ["sample_col1", "sample_col2"])
         for failed_sample in ["sample1", "sample2", "sample3"]:
-            # Test removal failed sample
+            # test removal failed sample
             assert failed_sample not in list(qc_metric[["sample_col1", "sample_col2"]].values.ravel())
-            # Test added failed sample
-            assert failed_sample in qc_metric_out["sample"].to_list()
+            assert failed_sample in qc_metric_out["sample"].to_list()  # test added failed sample
         assert qc_metric_out["qc_status"].values.all() == "FAIL"
         assert len(qc_metric) == 1
         twice_failed = qc_metric_out.loc[qc_metric_out["sample"] == "sample1"]
-        # Assert join with ';' on column qc_value
-        assert "wrong;wrong" == twice_failed["qc_value"].item()
-        # Assert join with ';' on column qc_msg
+        assert "wrong;wrong" == twice_failed["qc_value"].item()  # assert join with ';' on column qc_value
+        # assert join with ';' on column qc_msg
         assert "sample1 sample2 checks wrong;sample1 sample3 checks wrong" == twice_failed["qc_msg"].item()
         for passed_sample in ["sample4", "sample5"]:
             assert passed_sample in list(qc_metric[["sample_col1", "sample_col2"]].values.ravel())
@@ -299,23 +234,20 @@ def test_add_passed_samples_multi_sample_col(self):
         )
         qc_metric_out = check_qc.add_passed_samples_metric(
             fake_qc_metric, fake_sample_qc, ["sample_col1", "sample_col2"])
-        # Test rename column
-        assert "sample" in qc_metric_out.columns.to_list()
-        # Test removal duplicates
-        assert qc_metric_out["sample"].to_list().count("s4") == 1
-        # Test additional columns ignored
-        assert "new_col" not in qc_metric_out.columns.to_list()
+        assert "sample" in qc_metric_out.columns.to_list()  # test rename column
+        assert qc_metric_out["sample"].to_list().count("s4") == 1  # test removal duplicates
+        assert "new_col" not in qc_metric_out.columns.to_list()  # test additional columns ignored
 
 
 class TestCreateAndWriteOutput():
     @pytest.mark.parametrize("exp_summary,qc_output", [
-        # All qc checks passed
+        # all qc checks passed
         ("PASS", DataFrame({"sample": ["s1"], "qc_status_cov": ["PASS"], "qc_status_kinship": ["PASS"]})),
-        # Single qc check failed
+        # single qc check failed
         ("FAIL", DataFrame({"sample": ["s1"], "qc_status_cov": ["PASS"], "qc_status_kinship": ["FAIL"]})),
-        # All qc check failed
+        # all qc check failed
         ("FAIL", DataFrame({"sample": ["s1"], "qc_status_cov": ["FAIL"], "qc_status_kinship": ["FAIL"]})),
-        # Not restricted to qc_status_<check> column name
+        # not restricted to qc_status_<check> column name
         ("PASS", DataFrame({"sample": ["s1"], "random_col1": ["PASS"], "random_col2": ["PASS"]})),
     ])
     def test_create_and_write_output(self, setup_test_path, exp_summary, qc_output):
@@ -330,61 +262,61 @@ def test_create_and_write_output(self, setup_test_path, exp_summary, qc_output):
 
 class TestGetOutputMetrics():
     @pytest.mark.parametrize("data_in,nr_rows", [
-        # Single sample
+        # single sample
         (["sample1_fake_check.txt"], 1),
-        # Multiple single samples
+        # multiple single samples
         (["sample1_fake_check.txt", "sample2_fake_check.txt"], 2),
-        # Single multi samples
+        # single multi samples
         (["240101_fake_check.txt"], 2),
-        # Multiple multi samples
+        # multiple multi samples
         (["240101_fake_check.txt", "240102_fake_check.txt"], 4),
-        # Multi and single sample
+        # multi and single sample
         (["sample1_fake_check.txt", "240101_fake_check.txt"], 3),
     ])
     def test_input_ok(self, data_in, nr_rows, dataset, datadir):
         datadir_files = [f"{datadir}/{filename}" for filename in data_in]
+        # input1 = datadir / "sample1_fake_check.txt"
         df_output = check_qc.read_and_judge_metrics(dataset["settings_single_metric"]["metrics"][0], datadir_files)
         assert not df_output.empty
         observed_cols = df_output.columns.to_list()
-        # Shape results in tuple with no. rows and no. cols
-        assert df_output.shape[0] == nr_rows
+        assert df_output.shape[0] == nr_rows  # shape results in tuple with no. rows and no. cols
         assert len(observed_cols) == 5
         assert observed_cols == ['sample', 'qc_check_fc', 'qc_status_fc', 'qc_msg_fc', 'qc_value_fc']
 
     @pytest.mark.parametrize("data_in,nr_rows,exp_warn_msg", [
-        # Single sample duplicate
+        # single sample duplicate
         (["sample1_fake_check.txt"]*2, 1, "Sample IDs occur multiple times in input:"),
-        # Single multi samples duplicate
+        # single multi samples duplicate
         (["240101_fake_check.txt"]*2, 2, "Sample IDs occur multiple times in input:"),
-        # Multiple multi samples, duplicate samples
+        # multiple multi samples, duplicate samples
         (["240101_fake_check.txt", "240101_v2_fake_check.txt"], 4, "Different qc values for duplicated sample IDs in input:"),
     ])
     def test_input_warn(self, data_in, nr_rows, exp_warn_msg, dataset, datadir):
         datadir_files = [f"{datadir}/{filename}" for filename in data_in]
+        # input1 = datadir / "sample1_fake_check.txt"
         with pytest.warns(UserWarning) as match_warning:
             df_output = check_qc.read_and_judge_metrics(dataset["settings_single_metric"]["metrics"][0], datadir_files)
         warn_msg = match_warning[0].message.args[0]
         assert exp_warn_msg in warn_msg
         assert not df_output.empty
         observed_cols = df_output.columns.to_list()
-        # Shape: tuple with no. rows and no. cols
-        assert df_output.shape[0] == nr_rows
+        assert df_output.shape[0] == nr_rows  # Shape: tuple with no. rows and no. cols
         assert len(observed_cols) == 5
         assert observed_cols == ['sample', 'qc_check_fc', 'qc_status_fc', 'qc_msg_fc', 'qc_value_fc']
 
 
 class TestCheckQc():
     @pytest.mark.parametrize("settings,data_in,exp_shape", [
-        # Single metric, single sample input
+        # single metric, single sample input
         ("settings_single_metric", ["sample1_fake_check.txt"], (1, 5)),
-        # Two metrics, single sample input
+        # two metrics, single sample input
         ("settings_two_metrics", ["sample1_fake_check.txt"], (1, 9)),
-        # Single metric, multiple samples input
+        # single metric, multiple samples input
         ("settings_single_metric", ["240101_fake_check.txt"], (2, 5)),
         ("settings_single_metric", ["240101_fake_check.txt", "240102_fake_check.txt"], (4, 5)),
-        # Two metrics, multiple sample input
+        # two metrics, multiple sample input
         ("settings_two_metrics", ["240101_fake_check.txt", "240102_fake_check.txt"], (4, 9)),
-        # Two metricS, multi and single sample input
+        # two metric, multi and single sample input
         ("settings_two_metrics", ["sample1_fake_check.txt", "240101_fake_check.txt"], (3, 9)),
     ])
     def test_ok(self, settings, data_in, exp_shape, datadir, dataset, mocker, ):
diff --git a/ExomeDepth/CallCNV.nf b/ExomeDepth/CallCNV.nf
index fda8ef1..9a3197e 100644
--- a/ExomeDepth/CallCNV.nf
+++ b/ExomeDepth/CallCNV.nf
@@ -22,4 +22,4 @@ process CallCNV {
         source ${params.dx_resources_path}/${params.exomedepth_path}/venv/bin/activate
         python ${params.dx_resources_path}/${params.exomedepth_path}/run_ExomeDepth.py callcnv ./ ${bam_file} ${analysis_id} ${sample_id}
         """
-}
+}
\ No newline at end of file
diff --git a/ExomeDepth/GetRefset.nf b/ExomeDepth/GetRefset.nf
index fbe9d7b..efff299 100644
--- a/ExomeDepth/GetRefset.nf
+++ b/ExomeDepth/GetRefset.nf
@@ -18,4 +18,4 @@ process GetRefset {
         python ${params.dx_resources_path}/${params.exomedepth_path}/exomedepth_db.py add_sample_return_refset_bam ${bam_file} --print_refset_stdout | \
         tr -d '\n'
         """
-}
+}
\ No newline at end of file
diff --git a/ExomeDepth/Summary.nf b/ExomeDepth/Summary.nf
index a883376..3244d8c 100644
--- a/ExomeDepth/Summary.nf
+++ b/ExomeDepth/Summary.nf
@@ -17,4 +17,4 @@ process Summary {
         source ${params.dx_resources_path}/${params.exomedepth_path}/venv/bin/activate
         python ${params.dx_resources_path}/${params.exomedepth_path}/run_ExomeDepth.py summary ${exomedepth_logs} > ${analysis_id}_exomedepth_summary.txt
         """
-}
+}
\ No newline at end of file
diff --git a/ExonCov/ImportBam.nf b/ExonCov/ImportBam.nf
index dc26f50..6afbfe8 100644
--- a/ExonCov/ImportBam.nf
+++ b/ExonCov/ImportBam.nf
@@ -21,4 +21,4 @@ process ImportBam {
         --exon_bed ${params.dxtracks_path}/${params.exoncov_bed} \
         ${analysis_id} WES ${bam_file} | tr -d '\n'
         """
-}
+}
\ No newline at end of file
diff --git a/ExonCov/SampleQC.nf b/ExonCov/SampleQC.nf
index cde757f..b0a2b95 100644
--- a/ExonCov/SampleQC.nf
+++ b/ExonCov/SampleQC.nf
@@ -19,4 +19,4 @@ process SampleQC {
         flask --app ${params.exoncov_path}/ExonCov sample_qc \
         -s ${samples} -p ${panels} > ${analysis_id}.ExonCovQC_check.out
         """
-}
+}
\ No newline at end of file
diff --git a/Kinship/Kinship.nf b/Kinship/Kinship.nf
index fd051fe..9241dbf 100644
--- a/Kinship/Kinship.nf
+++ b/Kinship/Kinship.nf
@@ -19,4 +19,4 @@ process Kinship {
         cp king.kin0 ${analysis_id}.kinship
         python ${projectDir}/CustomModules/Kinship/check_kinship.py ${analysis_id}.kinship ${ped_file} > ${analysis_id}.kinship_check.out
         """
-}
+}
\ No newline at end of file
diff --git a/MosaicHunter/1.0.0/test_get_gender_from_bam_chrx.py b/MosaicHunter/1.0.0/test_get_gender_from_bam_chrx.py
index 7674f45..0a494ea 100644
--- a/MosaicHunter/1.0.0/test_get_gender_from_bam_chrx.py
+++ b/MosaicHunter/1.0.0/test_get_gender_from_bam_chrx.py
@@ -26,9 +26,7 @@ class TestGetGenderFromBam:
         ("./test_bam.bam", 20, "X:2699520-154931044", 5.5, 7.5, ("M", False)),
         ("./test_bam.bam", 20, "X:2699520-154931044", 4.5, 6.5, ("F", True)),
     ])
-    def test_get_gender_from_bam(
-            self, bam, mapping_qual, locus_x, ratio_x_threshold_male, ratio_x_threshold_female, expected_outcome
-    ):
+    def test_get_gender_from_bam(self, bam, mapping_qual, locus_x, ratio_x_threshold_male, ratio_x_threshold_female, expected_outcome):
         assert expected_outcome == get_gender_from_bam_chrx.get_gender_from_bam_chrx(
             bam, mapping_qual, locus_x, ratio_x_threshold_male, ratio_x_threshold_female)
 
diff --git a/TrendAnalysis/TrendAnalysis.nf b/TrendAnalysis/TrendAnalysis.nf
index 5009861..c8968ab 100644
--- a/TrendAnalysis/TrendAnalysis.nf
+++ b/TrendAnalysis/TrendAnalysis.nf
@@ -12,4 +12,4 @@ process TrendAnalysis {
         source ${params.trend_analysis_path}/venv/bin/activate
         python ${params.trend_analysis_path}/trend_analysis.py upload processed_data ${analysis_id} .
         """
-}
+}
\ No newline at end of file
diff --git a/UPD/IGV.nf b/UPD/IGV.nf
index f2974c7..2ea72af 100644
--- a/UPD/IGV.nf
+++ b/UPD/IGV.nf
@@ -19,4 +19,4 @@ process IGV {
         source ${params.upd_path}/venv/bin/activate
         python ${params.upd_path}/make_UPD_igv.py ${ped_file} ${analysis_id} $trio_sample ${vcf_files} -c
         """
-}
+}
\ No newline at end of file
diff --git a/Utils/CreateHSmetricsSummary.nf b/Utils/CreateHSmetricsSummary.nf
index 45553cb..2ca443f 100644
--- a/Utils/CreateHSmetricsSummary.nf
+++ b/Utils/CreateHSmetricsSummary.nf
@@ -14,4 +14,4 @@ process CreateHSmetricsSummary {
         """
         python2 ${projectDir}/CustomModules/Utils/create_hsmetrics_summary.py ${hsmetrics_files} > HSMetrics_summary.txt
         """
-}
+}
\ No newline at end of file
diff --git a/Utils/EditSummaryFileHappy.nf b/Utils/EditSummaryFileHappy.nf
deleted file mode 100644
index a4aba58..0000000
--- a/Utils/EditSummaryFileHappy.nf
+++ /dev/null
@@ -1,30 +0,0 @@
-process EditSummaryFileHappy {
-    tag "$meta.id"
-    label 'EditSummaryFileHappy'
-    shell = ['/bin/bash', '-euo', 'pipefail']
-
-    input:
-        // meta should have the keys 'id', 'query' and 'truth'
-        tuple(val(meta), path(summary_csv))
-
-    output:
-        path("${meta.truth}_${meta.query}_INDEL_PASS.csv"), emit: indel_pass_csv
-        path("${meta.truth}_${meta.query}_INDEL_ALL.csv"), emit: indel_all_csv
-        path("${meta.truth}_${meta.query}_SNP_PASS.csv"), emit: snp_pass_csv
-        path("${meta.truth}_${meta.query}_SNP_ALL.csv"), emit: snp_all_csv
-
-    script:
-        """
-        # Add samplenames as columns (header and row values) at start of line
-        sed '1s/^/samples,sample_truth,sample_query,/; 2,\$s/^/${meta.truth}_${meta.query},${meta.truth},${meta.query},/' ${summary_csv} > ${summary_csv}.tmp
-        
-        # Split file including header (first line)
-        awk -F',' 'FNR==1{hdr=\$0;next} {
-            print hdr>"${meta.truth}_${meta.query}_"\$4"_"\$5".csv";
-            print \$0>>"${meta.truth}_${meta.query}_"\$4"_"\$5".csv"
-        }' ${summary_csv}.tmp
-        
-        # Remove tmp files
-        rm ${summary_csv}.tmp     
-        """
-}
diff --git a/Utils/GetStatsFromFlagstat.nf b/Utils/GetStatsFromFlagstat.nf
index 27e6bbe..99018ed 100644
--- a/Utils/GetStatsFromFlagstat.nf
+++ b/Utils/GetStatsFromFlagstat.nf
@@ -14,4 +14,4 @@ process GetStatsFromFlagstat {
         """
         python2 ${projectDir}/CustomModules/Utils/get_stats_from_flagstat.py ${flagstat_files} > run_stats.txt
         """
-}
+}
\ No newline at end of file
diff --git a/Utils/ParseChildFromFullTrio.nf b/Utils/ParseChildFromFullTrio.nf
index b9377bd..9963277 100644
--- a/Utils/ParseChildFromFullTrio.nf
+++ b/Utils/ParseChildFromFullTrio.nf
@@ -17,4 +17,4 @@ process ParseChildFromFullTrio {
         """
         python2 ${projectDir}/CustomModules/Utils/parse_child_from_fulltrio.py ${ped_file} ${sample_ids} | tr -d '\n'
         """
-}
+}
\ No newline at end of file
diff --git a/Utils/SavePedFile.nf b/Utils/SavePedFile.nf
index 00f45a3..1c49c07 100644
--- a/Utils/SavePedFile.nf
+++ b/Utils/SavePedFile.nf
@@ -14,4 +14,4 @@ process SavePedFile {
         """
         cp --remove-destination "\$(readlink ${ped_file})" ./
         """
-}
+}
\ No newline at end of file
diff --git a/Utils/VersionLog.nf b/Utils/VersionLog.nf
index cff4f49..4a295b1 100644
--- a/Utils/VersionLog.nf
+++ b/Utils/VersionLog.nf
@@ -10,7 +10,6 @@ process VersionLog {
 
     output:
         path('repository_version.log')
-        path("versions.yml", emit: versions)
 
     script:
         """
@@ -18,8 +17,6 @@ process VersionLog {
         do
             echo "\${git_dir}" >> repository_version.log
             git --git-dir=\${git_dir}/.git log --pretty=oneline --decorate -n 2 >> repository_version.log
-            described_tags=\$(git --git-dir=\${git_dir}/.git describe --tags)
-            echo "\${git_dir}: \"\${described_tags}\"" >> versions.yml
         done
         """
-}
+}
\ No newline at end of file
diff --git a/Utils/create_hsmetrics_summary.py b/Utils/create_hsmetrics_summary.py
index 582b61e..405e1ce 100644
--- a/Utils/create_hsmetrics_summary.py
+++ b/Utils/create_hsmetrics_summary.py
@@ -8,7 +8,7 @@
     parser.add_argument('hsmetrics_files', type=argparse.FileType('r'), nargs='*', help='HSMetric file')
     arguments = parser.parse_args()
 
-    interval_files_pattern = re.compile(r"BAIT_INTERVALS=\[(\S*)\].TARGET_INTERVALS=\[(\S*)\]")
+    interval_files_pattern = re.compile("BAIT_INTERVALS=\[(\S*)\].TARGET_INTERVALS=\[(\S*)\]")
     summary_header = []
     summary_data = {}
     for hsmetrics_file in arguments.hsmetrics_files:
diff --git a/Utils/get_stats_from_flagstat.py b/Utils/get_stats_from_flagstat.py
index 52560bc..2f46abb 100644
--- a/Utils/get_stats_from_flagstat.py
+++ b/Utils/get_stats_from_flagstat.py
@@ -43,24 +43,12 @@
 
         print("\n\t{0} %duplication\n".format(100*sample_dups/sample_mapped))
 
-    print(
-        "Total raw reads: {total:,} reads (Total throughput, 75bp={total_75bp:,} bp, 100bp={total_100bp:,} bp, "
-        "150bp={total_150bp:,} bp)".format(
-            total=counts['total'],
-            total_75bp=counts['total']*75,
-            total_100bp=counts['total']*100,
-            total_150bp=counts['total']*150
-        )
-    )
-    print(
-        "Total mapped reads: {total:,} reads (Total throughput, 75bp={total_75bp:,} bp, 100bp={total_100bp:,} bp, "
-        "150bp={total_150bp:,} bp)".format(
-            total=counts['mapped'],
-            total_75bp=counts['mapped']*75,
-            total_100bp=counts['mapped']*100,
-            total_150bp=counts['mapped']*150
-        )
-    )
+    print("Total raw reads: {total:,} reads (Total throughput, 75bp={total_75bp:,} bp, 100bp={total_100bp:,} bp, 150bp={total_150bp:,} bp)".format(
+        total=counts['total'], total_75bp=counts['total']*75, total_100bp=counts['total']*100, total_150bp=counts['total']*150
+    ))
+    print("Total mapped reads: {total:,} reads (Total throughput, 75bp={total_75bp:,} bp, 100bp={total_100bp:,} bp, 150bp={total_150bp:,} bp)".format(
+        total=counts['mapped'], total_75bp=counts['mapped']*75, total_100bp=counts['mapped']*100, total_150bp=counts['mapped']*150
+    ))
     print("Average mapped per lib: {:,} reads".format(int(round(float(counts['mapped'])/float(counts['files'])))))
     print("Average dups per lib: {:,} reads".format(int(round(float(counts['dups'])/float(counts['files'])))))
     print("Average dups % per lib: {:.2f} %".format(100*float(counts['dups'])/float(counts['mapped'])))