From b28bfd33a5d0ceeda2f311d5c9214943b70658c8 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 30 Jun 2023 10:47:13 +0200 Subject: [PATCH 01/73] added functionality for AssignToBins --- DIMS/AssignToBins.R | 126 +++++++++++++++++++++++++++++++++++++++++++ DIMS/AssignToBins.nf | 20 +++++++ 2 files changed, 146 insertions(+) create mode 100644 DIMS/AssignToBins.R create mode 100644 DIMS/AssignToBins.nf diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R new file mode 100644 index 0000000..0d9fd0b --- /dev/null +++ b/DIMS/AssignToBins.R @@ -0,0 +1,126 @@ +#!/usr/bin/Rscript + +#.libPaths(new = "/hpc/local/CentOS7/dbg_mz/R_libs/3.6.2") + +# load required packages +suppressPackageStartupMessages(library("xcms")) + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) cat(" ", arg, "\n", sep="") + +filepath <- cmd_args[1] # location of mzML file +breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData +resol <- as.numeric(cmd_args[3]) # 140000 +trim <- 0.1 +dimsThresh <- 100 +print(filepath) +print(breaks_filepath) +print(resol) + +#sampname <- sub('\\..*$', '', basename(filepath)) +#cat(paste0("\n", sampname)) +sampname <- "AssignToBins" + +#suppressPackageStartupMessages(library("Cairo")) +options(digits=16) + +### process one sample at a time and find peaks FOR BOTH SCAN MODES! # +int.factor=1*10^5 # Number of x used to calc area under Gaussian (is not analytic) +scale=2 # Initial value used to estimate scaling parameter +width=1024 +height=768 + +# Aggregate +trimLeft=NULL +trimRight=NULL +breaks.fwhm=NULL +breaks.fwhm.avg=NULL +bins=NULL +pos_results=NULL +neg_results=NULL + +# read in the data for 1 sample +raw_data <- suppressMessages(xcmsRaw(filepath)) + +# load breaks.fwhm +load(breaks_filepath) + +# Create empty placeholders for later use +bins <- rep(0, length(breaks.fwhm) - 1) +pos_bins <- bins +neg_bins <- bins + +# Generate a matrix +raw_data_matrix <- rawMat(raw_data) + +# Get time values for positive and negative scans +pos_times <- raw_data@scantime[raw_data@polarity == "positive"] +neg_times <- raw_data@scantime[raw_data@polarity == "negative"] +# Select scans between trimLeft and trimRight +pos_times <- pos_times[pos_times > trimLeft & pos_times < trimRight] +neg_times <- neg_times[neg_times > trimLeft & neg_times < trimRight] + +# Generate an index with which to select values for each mode +pos_index <- which(raw_data_matrix[ ,"time"] %in% pos_times) +neg_index <- which(raw_data_matrix[ ,"time"] %in% neg_times) +# Separate each mode into its own matrix +pos_raw_data_matrix <- raw_data_matrix[pos_index, ] +neg_raw_data_matrix <- raw_data_matrix[neg_index, ] + +# Get index for binning intensity values +bin_indices_pos <- cut(pos_raw_data_matrix[ ,"mz"], breaks.fwhm, include.lowest=TRUE, right=TRUE, labels=FALSE) +bin_indices_neg <- cut(neg_raw_data_matrix[ ,"mz"], breaks.fwhm, include.lowest=TRUE, right=TRUE, labels=FALSE) + +# Get the list of intensity values for each bin, and add the +# intensity values which are in the same bin +if (nrow(pos_raw_data_matrix) > 0) { + # set NA in intensities to zero + pos_raw_data_matrix[is.na(pos_raw_data_matrix[,"intensity"]), "intensity"] <- 0 + # use only values above dimsThresh + pos_intensity_above_threshold <- pos_raw_data_matrix[which(pos_raw_data_matrix[ ,"intensity"] > dimsThresh), "intensity"] + # aggregate intensities, calculate mean + aggr_int_pos <- stats::aggregate(pos_intensity_above_threshold, list(bin_indices_pos), mean) + pos_bins[aggr_int_pos[ ,1]] <- aggr_int_pos[ ,2] +} +if (nrow(neg_raw_data_matrix) > 0) { + # set NA in intensities to zero + neg_raw_data_matrix[is.na(neg_raw_data_matrix[,"intensity"]), "intensity"] <- 0 + # use only values above dimsThresh + neg_intensity_above_threshold <- neg_raw_data_matrix[which(neg_raw_data_matrix[ ,"intensity"] > dimsThresh), "intensity"] + # aggregate intensities, calculate mean + aggr_int_neg <- stats::aggregate(neg_intensity_above_threshold, list(bin_indices_neg), mean) + neg_bins[aggr_int_neg[ ,1]] <- aggr_int_neg[ ,2] +} + +# Zero any values that are below the threshold +pos_bins[pos_bins < dimsThresh] <- 0 +neg_bins[neg_bins < dimsThresh] <- 0 + +pos_results = cbind(pos_results, pos_bins) +neg_results = cbind(neg_results, neg_bins) + +# transpose +pos_results_transpose = t(pos_results) +neg_results_transpose = t(neg_results) + +# Add file names as row names +rownames(pos_results_transpose) = sampname +rownames(neg_results_transpose) = sampname + +# delete the last value of breaks.fwhm.avg to match dimensions of pos_results and neg_results +breaks.fwhm.avg.minus1 <- breaks.fwhm.avg[-length(breaks.fwhm.avg)] +# Format as string and show precision of float to 5 digits +breaks.fwhm.avg.minus1 <- sprintf("%.5f", breaks.fwhm.avg.minus1) + +# Use this as the column names +colnames(pos_results_transpose) <- breaks.fwhm.avg.minus1 +colnames(neg_results_transpose) <- breaks.fwhm.avg.minus1 + +# transpose back +pos_results_final <- t(pos_results_transpose) +neg_results_final <- t(neg_results_transpose) + +pklist <- list("pos"=pos_results_final, "neg"=neg_results_final, "breaksFwhm"=breaks.fwhm) + +save(pklist, file=paste("./", sampname, ".RData", sep="")) diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf new file mode 100644 index 0000000..872e32f --- /dev/null +++ b/DIMS/AssignToBins.nf @@ -0,0 +1,20 @@ +process AssignToBins { + label 'AssignToBins' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + tuple(file_id, path(mzML_filename)) + file(breaks_file) + val(resolution) + + output: + file 'AssignToBins.RData' + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_filename $breaks_file $resolution + """ +} + + From dcbfbcf2cffd91ed8f009022cd1a57ddcc017533 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 30 Jun 2023 10:48:55 +0200 Subject: [PATCH 02/73] added functionality for GenerateBreaks --- DIMS/GenerateBreaks.R | 57 ++++++++++++++++++++++++++++++++++++++++++ DIMS/GenerateBreaks.nf | 17 +++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 DIMS/GenerateBreaks.R create mode 100644 DIMS/GenerateBreaks.nf diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R new file mode 100644 index 0000000..85065ac --- /dev/null +++ b/DIMS/GenerateBreaks.R @@ -0,0 +1,57 @@ +#### GenerateBreaks.R #### +## adapted from 1-generateBreaksFwhm.HPC.R ## +#!/usr/bin/Rscript + +# .libPaths(new = "/hpc/local/CentOS7/dbg_mz/R_libs/3.2.2") + +# load required package +suppressPackageStartupMessages(library("xcms")) + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) cat(" ", arg, "\n", sep="") + +filepath <- cmd_args[1] # 1 of the mzML files +outdir <- cmd_args[2] +trim <- as.numeric(cmd_args[3]) # 0.1 +resol <- as.numeric(cmd_args[4]) # 140000 + +# initialize +trimLeft = NULL +trimRight = NULL +breaks.fwhm = NULL +breaks.fwhm.avg = NULL +bins = NULL +posRes = NULL +negRes = NULL + +# read in mzML file +raw_data <- suppressMessages(xcmsRaw(filepath)) + +# trim scans at the start and end +trimLeft = round(raw_data@scantime[length(raw_data@scantime)*trim]) +trimRight = round(raw_data@scantime[length(raw_data@scantime)*(1-trim)]) + +# Mass range m/z +lowMZ = raw_data@mzrange[1] +highMZ = raw_data@mzrange[2] + +# determine number of segments (bins) +nsegment = 2*(highMZ-lowMZ) +segment = seq(from=lowMZ, to=highMZ, length.out=nsegment+1) + +# determine start and end of each bin. +for (i in 1:nsegment) { + startsegm <- segment[i] + endsegm <- segment[i+1] + resol.mz <- resol*(1/sqrt(2)^(log2(startsegm/200))) + fwhmsegm <- startsegm/resol.mz + breaks.fwhm <- c(breaks.fwhm, seq(from=(startsegm + fwhmsegm), to=endsegm, by=0.2*fwhmsegm)) + # average the m/z instead of start value + range = seq(from=(startsegm + fwhmsegm), to=endsegm, by=0.2*fwhmsegm) + deltaMZ = range[2] - range[1] + breaks.fwhm.avg <- c(breaks.fwhm.avg, range + 0.5*deltaMZ) +} + +save(breaks.fwhm, breaks.fwhm.avg, trimLeft, trimRight, file=paste(outdir, "breaks.fwhm.RData", sep="/")) + diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf new file mode 100644 index 0000000..f73ad54 --- /dev/null +++ b/DIMS/GenerateBreaks.nf @@ -0,0 +1,17 @@ +process GenerateBreaks { + label 'GenerateBreaks' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + tuple(file_id, path(mzML_file)) + + + output: + file 'breaks.fwhm.RData' + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/GenerateBreaks.R $mzML_file ./ $params.trim $params.resolution + """ +} From ca3ae4431a82cad812f377ae3a7f36ec3e0bcf77 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 30 Jun 2023 13:27:36 +0200 Subject: [PATCH 03/73] reference to old file added --- DIMS/AssignToBins.R | 1 + 1 file changed, 1 insertion(+) diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index 0d9fd0b..90f510a 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -1,4 +1,5 @@ #!/usr/bin/Rscript +## adapted from 2-DIMS.R #.libPaths(new = "/hpc/local/CentOS7/dbg_mz/R_libs/3.6.2") From 0e666355e93ff283a9612184eeebe6245ca99b6b Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 30 Jun 2023 13:28:39 +0200 Subject: [PATCH 04/73] added functionality for MakeInit file --- DIMS/MakeInit.R | 34 ++++++++++++++++++++++++++++++++++ DIMS/MakeInit.nf | 16 ++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 DIMS/MakeInit.R create mode 100644 DIMS/MakeInit.nf diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R new file mode 100644 index 0000000..2657b66 --- /dev/null +++ b/DIMS/MakeInit.R @@ -0,0 +1,34 @@ +#!/usr/bin/env Rscript +## adapted from makeInit in old pipeline + +# used for when init.RData has to be created manually +# arg1 : path to sampleNames.txt or whatever the name of the samplesheet txt file may be +# arg2 : amount of technical replicates (usually 3) + +args <- commandArgs(trailingOnly=TRUE) +sample_sheet <- read.csv(args[1], sep="\t") +nr_replicates <- as.numeric(args[2]) + +sampleNames <- trimws(as.vector(unlist(sample_sheet[1]))) +nr_sampgrps <- length(sampleNames)/nr_replicates +groupNames <- trimws(as.vector(unlist(sample_sheet[2]))) +groupNames <- gsub('[^-.[:alnum:]]', '_', groupNames) +groupNamesUnique <- unique(groupNames) +#groupNamesNotUnique <- groupNames[duplicated(groupNames)] + +repl.pattern <- c() +for (sampgrp in 1:nr_sampgrps) { + tmp <- c() + for (repl in nr_replicates:1) { + index <- ((sampgrp*nr_replicates) - repl) + 1 + tmp <- c(tmp, sampleNames[index]) + } + repl.pattern <- c(repl.pattern, list(tmp)) +} + +names(repl.pattern) <- groupNamesUnique + +# just to preview +head(repl.pattern) + +save(repl.pattern, file="init.RData") diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf new file mode 100644 index 0000000..064225c --- /dev/null +++ b/DIMS/MakeInit.nf @@ -0,0 +1,16 @@ +process MakeInit { + label 'MakeInit' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + tuple(path(samplesheet), val(nr_replicates)) + + output: + file 'init.RData' + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/MakeInit.R $samplesheet $nr_replicates + """ +} From e5a015446a78096f67221f0b06e60bfa79bbe7df Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 11 Aug 2023 15:07:17 +0200 Subject: [PATCH 05/73] added new files --- .../10-collectSamplesFilled_extraoutput.R | 94 +++++ DIMS/AddOnFunctions/checkOverlap.R | 16 + DIMS/AddOnFunctions/elementInfo.R | 6 + DIMS/AddOnFunctions/export.R | 13 + DIMS/AddOnFunctions/fit1Peak.R | 120 +++++++ DIMS/AddOnFunctions/fit2G.R | 43 +++ DIMS/AddOnFunctions/fit2peaks.R | 100 ++++++ DIMS/AddOnFunctions/fit3G.R | 18 + DIMS/AddOnFunctions/fit3peaks.R | 122 +++++++ DIMS/AddOnFunctions/fit4G.R | 18 + DIMS/AddOnFunctions/fit4peaks.R | 149 ++++++++ DIMS/AddOnFunctions/fitG.R | 18 + DIMS/AddOnFunctions/fitGaussian.R | 327 ++++++++++++++++++ DIMS/AddOnFunctions/fitGaussianInit.R | 60 ++++ DIMS/AddOnFunctions/generateBreaksFwhm.R | 26 ++ DIMS/AddOnFunctions/generateExcelFile.R | 24 ++ DIMS/AddOnFunctions/generateGaussian.R | 48 +++ DIMS/AddOnFunctions/getArea.R | 23 ++ DIMS/AddOnFunctions/getDeltaMZ.R | 5 + DIMS/AddOnFunctions/getFitQuality.R | 65 ++++ DIMS/AddOnFunctions/getFwhm.R | 24 ++ DIMS/AddOnFunctions/getPatients.R | 9 + DIMS/AddOnFunctions/getSD.R | 9 + DIMS/AddOnFunctions/globalAssignments.HPC.R | 114 ++++++ DIMS/AddOnFunctions/iden.code.R | 45 +++ DIMS/AddOnFunctions/ident.hires.noise.HPC.R | 242 +++++++++++++ DIMS/AddOnFunctions/isWithinXppm.R | 51 +++ DIMS/AddOnFunctions/mergeDuplicatedRows.R | 49 +++ DIMS/AddOnFunctions/normalization_2.1.R | 75 ++++ DIMS/AddOnFunctions/optimizeGauss.R | 11 + DIMS/AddOnFunctions/peak.grouping.Gauss.HPC.R | 88 +++++ DIMS/AddOnFunctions/remove.dupl.2.1.R | 32 ++ DIMS/AddOnFunctions/remove.dupl.R | 39 +++ DIMS/AddOnFunctions/replaceZeros.R | 108 ++++++ DIMS/AddOnFunctions/replaceZeros_setseed.R | 108 ++++++ DIMS/AddOnFunctions/run.vbs | 22 ++ DIMS/AddOnFunctions/runVBAMacro.R | 48 +++ DIMS/AddOnFunctions/searchMZRange.R | 187 ++++++++++ DIMS/AddOnFunctions/sourceDir.R | 8 + DIMS/AddOnFunctions/statistics_z.R | 69 ++++ DIMS/AddOnFunctions/sumCurves.R | 39 +++ DIMS/AddOnFunctions/trimZeros.R | 8 + DIMS/AssignToBins.R | 4 +- DIMS/AssignToBins.nf | 5 +- DIMS/AverageTechReplicates.R | 119 +++++++ DIMS/AverageTechReplicates.nf | 24 ++ DIMS/GenerateBreaks.R | 2 + DIMS/GenerateBreaks.nf | 2 +- DIMS/HMDBparts.R | 126 +++++++ DIMS/HMDBparts.nf | 20 ++ DIMS/MakeInit.nf | 2 +- DIMS/PeakFinding.R | 68 ++++ DIMS/PeakFinding.nf | 23 ++ DIMS/PeakGroupingIdentified.R | 180 ++++++++++ DIMS/PeakGroupingIdentified.nf | 18 + DIMS/SpectrumPeakFinding.R | 77 +++++ DIMS/SpectrumPeakFinding.nf | 17 + 57 files changed, 3360 insertions(+), 7 deletions(-) create mode 100644 DIMS/AddOnFunctions/10-collectSamplesFilled_extraoutput.R create mode 100644 DIMS/AddOnFunctions/checkOverlap.R create mode 100644 DIMS/AddOnFunctions/elementInfo.R create mode 100644 DIMS/AddOnFunctions/export.R create mode 100644 DIMS/AddOnFunctions/fit1Peak.R create mode 100644 DIMS/AddOnFunctions/fit2G.R create mode 100644 DIMS/AddOnFunctions/fit2peaks.R create mode 100644 DIMS/AddOnFunctions/fit3G.R create mode 100644 DIMS/AddOnFunctions/fit3peaks.R create mode 100644 DIMS/AddOnFunctions/fit4G.R create mode 100644 DIMS/AddOnFunctions/fit4peaks.R create mode 100644 DIMS/AddOnFunctions/fitG.R create mode 100644 DIMS/AddOnFunctions/fitGaussian.R create mode 100644 DIMS/AddOnFunctions/fitGaussianInit.R create mode 100644 DIMS/AddOnFunctions/generateBreaksFwhm.R create mode 100644 DIMS/AddOnFunctions/generateExcelFile.R create mode 100644 DIMS/AddOnFunctions/generateGaussian.R create mode 100644 DIMS/AddOnFunctions/getArea.R create mode 100644 DIMS/AddOnFunctions/getDeltaMZ.R create mode 100644 DIMS/AddOnFunctions/getFitQuality.R create mode 100644 DIMS/AddOnFunctions/getFwhm.R create mode 100644 DIMS/AddOnFunctions/getPatients.R create mode 100644 DIMS/AddOnFunctions/getSD.R create mode 100644 DIMS/AddOnFunctions/globalAssignments.HPC.R create mode 100644 DIMS/AddOnFunctions/iden.code.R create mode 100644 DIMS/AddOnFunctions/ident.hires.noise.HPC.R create mode 100644 DIMS/AddOnFunctions/isWithinXppm.R create mode 100644 DIMS/AddOnFunctions/mergeDuplicatedRows.R create mode 100644 DIMS/AddOnFunctions/normalization_2.1.R create mode 100644 DIMS/AddOnFunctions/optimizeGauss.R create mode 100644 DIMS/AddOnFunctions/peak.grouping.Gauss.HPC.R create mode 100644 DIMS/AddOnFunctions/remove.dupl.2.1.R create mode 100644 DIMS/AddOnFunctions/remove.dupl.R create mode 100644 DIMS/AddOnFunctions/replaceZeros.R create mode 100644 DIMS/AddOnFunctions/replaceZeros_setseed.R create mode 100644 DIMS/AddOnFunctions/run.vbs create mode 100644 DIMS/AddOnFunctions/runVBAMacro.R create mode 100644 DIMS/AddOnFunctions/searchMZRange.R create mode 100644 DIMS/AddOnFunctions/sourceDir.R create mode 100644 DIMS/AddOnFunctions/statistics_z.R create mode 100644 DIMS/AddOnFunctions/sumCurves.R create mode 100644 DIMS/AddOnFunctions/trimZeros.R create mode 100644 DIMS/AverageTechReplicates.R create mode 100644 DIMS/AverageTechReplicates.nf create mode 100644 DIMS/HMDBparts.R create mode 100644 DIMS/HMDBparts.nf create mode 100644 DIMS/PeakFinding.R create mode 100644 DIMS/PeakFinding.nf create mode 100644 DIMS/PeakGroupingIdentified.R create mode 100644 DIMS/PeakGroupingIdentified.nf create mode 100644 DIMS/SpectrumPeakFinding.R create mode 100644 DIMS/SpectrumPeakFinding.nf diff --git a/DIMS/AddOnFunctions/10-collectSamplesFilled_extraoutput.R b/DIMS/AddOnFunctions/10-collectSamplesFilled_extraoutput.R new file mode 100644 index 0000000..21ccdce --- /dev/null +++ b/DIMS/AddOnFunctions/10-collectSamplesFilled_extraoutput.R @@ -0,0 +1,94 @@ +#!/usr/bin/Rscript + +.libPaths(new = "/hpc/local/CentOS7/dbg_mz/R_libs/3.2.2") + +# load required packages +# none + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) cat(" ", arg, "\n") + +outdir <- cmd_args[1] +scanmode <- cmd_args[2] +normalization <- cmd_args[3] +scripts <- cmd_args[4] +z_score <- as.numeric(cmd_args[5]) +ppm <- as.numeric(cmd_args[6]) + +#outdir <- "/Users/nunen/Documents/Metab/processed/test_old" +#scanmode <- "negative" +#normalization <- "disabled" +#scripts <- "/Users/nunen/Documents/Metab/DIMS/scripts" +#db <- "/Users/nunen/Documents/Metab/DIMS/db/HMDB_add_iso_corrNaCl_withIS_withC5OH.RData" +#z_score <- 0 + +object.files = list.files(paste(outdir, "9-samplePeaksFilled", sep="/"), full.names=TRUE, pattern=scanmode) +outlist.tot=NULL +for (i in 1:length(object.files)) { + load(object.files[i]) + print(print(object.files[i])) + outlist.tot = rbind(outlist.tot, final.outlist.idpat3) +} + +source(paste(scripts, "AddOnFunctions/sourceDir.R", sep="/")) +sourceDir(paste(scripts, "AddOnFunctions", sep="/")) + +# remove duplicates +outlist.tot = mergeDuplicatedRows(outlist.tot) + +# sort on mass +outlist.tot = outlist.tot[order(outlist.tot[,"mzmed.pgrp"]),] + +# normalization +load(paste0(outdir, "/repl.pattern.",scanmode,".RData")) + +if (normalization != "disabled") { + outlist.tot = normalization_2.1(outlist.tot, fileName, names(repl.pattern.filtered), on=normalization, assi_label="assi_HMDB") +} + +if (z_score == 1) { + outlist.stats = statistics_z(outlist.tot, sortCol=NULL, adducts=FALSE) + nr.removed.samples=length(which(repl.pattern.filtered[]=="character(0)")) + order.index.int=order(colnames(outlist.stats)[8:(length(repl.pattern.filtered)-nr.removed.samples+7)]) + outlist.stats.more = cbind(outlist.stats[,1:7], + outlist.stats[,(length(repl.pattern.filtered)-nr.removed.samples+8):(length(repl.pattern.filtered)-nr.removed.samples+8+6)], + outlist.stats[,8:(length(repl.pattern.filtered)-nr.removed.samples+7)][order.index.int], + outlist.stats[,(length(repl.pattern.filtered)-nr.removed.samples+5+10):ncol(outlist.stats)]) + + tmp.index=grep("_Zscore", colnames(outlist.stats.more), fixed = TRUE) + tmp.index.order=order(colnames(outlist.stats.more[,tmp.index])) + tmp = outlist.stats.more[,tmp.index[tmp.index.order]] + outlist.stats.more=outlist.stats.more[,-tmp.index] + outlist.stats.more=cbind(outlist.stats.more,tmp) + outlist.tot = outlist.stats.more +} + +# filter identified compounds +index.1=which((outlist.tot[,"assi_HMDB"]!="") & (!is.na(outlist.tot[,"assi_HMDB"]))) +index.2=which((outlist.tot[,"iso_HMDB"]!="") & (!is.na(outlist.tot[,"iso_HMDB"]))) +index=union(index.1,index.2) +outlist.ident = outlist.tot[index,] +outlist.not.ident = outlist.tot[-index,] + +if (z_score == 1) { + outlist.ident$ppmdev=as.numeric(outlist.ident$ppmdev) + outlist.ident <- outlist.ident[which(outlist.ident["ppmdev"] >= -ppm & outlist.ident["ppmdev"] <= ppm),] +} +# NAs in theormz_noise <======================================================================= uitzoeken!!! +outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] = 0 +outlist.ident$theormz_noise=as.numeric(outlist.ident$theormz_noise) +outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] = 0 +outlist.ident$theormz_noise=as.numeric(outlist.ident$theormz_noise) + +save(outlist.not.ident, outlist.ident, file=paste(outdir, "/outlist_identified_", scanmode, ".RData", sep="")) + +# Extra output in Excel-readable format: +remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp") +remove_colindex <- which(colnames(outlist.ident) %in% remove_columns) +outlist.ident <- outlist.ident[ , -remove_colindex] +write.table(outlist.ident, file=paste0(outdir, "/outlist_identified_", scanmode, ".txt"), sep="\t", row.names = FALSE) +remove_colindex <- which(colnames(outlist.not.ident) %in% remove_columns) +outlist.not.ident <- outlist.not.ident[ , -remove_colindex] +write.table(outlist.not.ident, file=paste0(outdir, "/outlist_not_identified_", scanmode, ".txt"), sep="\t", row.names = FALSE) + diff --git a/DIMS/AddOnFunctions/checkOverlap.R b/DIMS/AddOnFunctions/checkOverlap.R new file mode 100644 index 0000000..f7ef818 --- /dev/null +++ b/DIMS/AddOnFunctions/checkOverlap.R @@ -0,0 +1,16 @@ +checkOverlap <- function(range1,range2){ + if (length(intersect(range1,range2))==2) { + # Overlap + # message("Overlap, smaller range is used") + if (length(range1) >= length(range2)){ + range1=range1[-length(range1)] + } else { + range2=range2[-1] + } + } else if (length(intersect(range1,range2))==3){ + # message("Overlap, smaller range is used") + range1=range1[-length(range1)] + range2=range2[-1] + } + return(list("range1"=range1,"range2"=range2)) +} diff --git a/DIMS/AddOnFunctions/elementInfo.R b/DIMS/AddOnFunctions/elementInfo.R new file mode 100644 index 0000000..18eaeff --- /dev/null +++ b/DIMS/AddOnFunctions/elementInfo.R @@ -0,0 +1,6 @@ +elementInfo <- function(name, elements = NULL) { # from Rdisop function .getElement + if (!is.list(elements) || length(elements)==0 ) { + elements <- initializePSE() } + if (name=="CH3OH+H"){rex<-"^CH3OH\\+H$"}else{rex <- paste ("^",name,"$", sep="")} + elements [[grep (rex, sapply (elements, function(x) {x$name}))]] +} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/export.R b/DIMS/AddOnFunctions/export.R new file mode 100644 index 0000000..cab79a0 --- /dev/null +++ b/DIMS/AddOnFunctions/export.R @@ -0,0 +1,13 @@ +export <- function(peaklist, plotdir, adducts, control_label, case_label, patients, sub, fileName){ + # peaklist = outlist.adducts + # adducts=TRUE + # control_label="C" + # case_label="P" + # patients = getPatients(outlist.adducts) + # sub=3000 + + # peaklist = statistics_z_4export(as.data.frame(peaklist), plotdir, patients, adducts, control_label, case_label) + + # generateExcelFile(peaklist, file.path(plotdir), imageNum=2, fileName, subName=c("","_box"), sub, adducts) + +} diff --git a/DIMS/AddOnFunctions/fit1Peak.R b/DIMS/AddOnFunctions/fit1Peak.R new file mode 100644 index 0000000..b09ce20 --- /dev/null +++ b/DIMS/AddOnFunctions/fit1Peak.R @@ -0,0 +1,120 @@ +fit1Peak <- function(x2,x,y,index,scale,resol,plot,FQ,useBounds) { + #FQ=FQ1 + + if (length(y)<3){ + message("Range to small, no fit possible!") + } else { + + if ((length(y)==4)) { + mu = weighted.mean(x,y) + sigma = getSD(x,y) + fitP = fitG_2(x,y,sigma,mu,scale,useBounds) + } else { + + if ((length(x) - length(index)) < 2) { + range1=c((length(x)-4):length(x)) + } else if (length(index) < 2) { + range1=c(1:5) + } else { + range1=c(index[1]-2,index[1]-1,index[1],index[1]+1,index[1]+2) + } + + if (range1[1]==0) range1=range1[-1] + + # remove NA + if (length(which(is.na(y[range1])))!=0) range1=range1[-which(is.na(y[range1]))] + + mu = weighted.mean(x[range1],y[range1]) + sigma = getSD(x[range1],y[range1]) + fitP = fitG_2(x,y,sigma,mu,scale,useBounds) + } + + p2 = fitP$par + + #fq_new = abs(sum(y) - sum(p2[2]*dnorm(x,p2[1],sigma)))/sum(y) + fq_new = getFitQuality(x,y,p2[1],p2[1],resol,p2[2],sigma)$fq_new + + if (plot & (fq_new < FQ)) lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="green") + + scale_new = 1.2*scale + # cat(fq_new) + + if (fq_new > FQ) { # <=== bad fit? + # optimize scaling factor + fq = 0 + scale = 0 + + if (sum(y)>sum(p2[2]*dnorm(x,p2[1],sigma))){ + while ((round(fq, digits = 3) != round(fq_new, digits = 3)) & (scale_new<10000)) { + fq = fq_new + scale = scale_new + + #cat(scale) + fitP = fitG_2(x,y,sigma,mu,scale,useBounds) + p2 = fitP$par + + #fq_new = abs(sum(y) - sum(p2[2]*dnorm(x,p2[1],sigma)))/sum(y) + fq_new = getFitQuality(x,y,p2[1],p2[1],resol,p2[2],sigma)$fq_new + scale_new=1.2*scale + + if (plot & (fq_new < FQ)) lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="green") + # cat(paste("fq_new: ", fq_new)) + # cat(paste("scale_new: ", scale_new)) + # (round(fq, digits = 4) != round(fq_new, digits = 4)) + } + } else { + while ((round(fq, digits = 3) != round(fq_new, digits = 3)) & (scale_new<10000)) { + fq = fq_new + scale = scale_new + + # cat(scale) + fitP = fitG_2(x,y,sigma,mu,scale,useBounds) + p2 = fitP$par + + #fq_new = abs(sum(y) - sum(p2[2]*dnorm(x,p2[1],sigma)))/sum(y) + fq_new = getFitQuality(x,y,p2[1],p2[1],resol,p2[2],sigma)$fq_new + scale_new=0.8*scale + + if (plot & (fq_new < FQ)) lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="green") + # cat(paste("fq_new: ", fq_new)) + } + } + + if (fq < fq_new) { + # cat(paste("fq_new: ", fq_new)) + # cat(paste("fq: ", fq)) + # cat(paste("scale_new: ", scale_new)) + # cat(paste("scale: ", scale)) + + fitP = fitG_2(x,y,sigma,mu,scale,useBounds) + p2 = fitP$par + fq_new = fq + # cat(paste("==> fq_new: ", fq_new)) + if (plot & (fq_new < FQ)) lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="dark green") + + } + } + + if (plot & (fq_new < FQ)) { + # plot ################### + #lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="green") + fwhm = getFwhm(p2[1],resol) + half_max = max(p2[2]*dnorm(x2,p2[1],sigma))*0.5 + lines(c(p2[1] - 0.5*fwhm, p2[1] + 0.5*fwhm),c(half_max,half_max),col="orange") + abline(v = p2[1], col="green") + h=c(paste("mean =", p2[1], sep=" "), + paste("fq =", fq_new, sep=" ")) + legend("topright", legend=h) + ########################## + + + # abline(v = x[6], col="red") + # fwhm = getFwhm(x[6]) + # abline(v =x[6] + 0.6*fwhm, col="red") + # abline(v =x[6] - 0.6*fwhm, col="red") + + } + } + + return(list("mean"=p2[1], "scale"=p2[2], "sigma"=sigma, "qual"=fq_new)) +} diff --git a/DIMS/AddOnFunctions/fit2G.R b/DIMS/AddOnFunctions/fit2G.R new file mode 100644 index 0000000..9dc76dd --- /dev/null +++ b/DIMS/AddOnFunctions/fit2G.R @@ -0,0 +1,43 @@ +fit2G_2 <- function(x,y,sig1,sig2,mu1,scale1,mu2,scale2,useBounds){ + + f = function(p){ + d = p[2]*dnorm(x,mean=p[1],sd=sig1) + p[4]*dnorm(x,mean=p[3],sd=sig2) + sum((d-y)^2) + } + + if (useBounds){ + lower = c(x[1],0,x[1],0) + upper = c(x[length(x)],Inf,x[length(x)],Inf) + + if (is.null(mu2) && is.null(scale2) && is.null(sig2)){ + sig2=sig1 + optim(c(as.numeric(mu1), + as.numeric(scale1), + as.numeric(mu1), + as.numeric(scale1)), + f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper) + } else { + optim(c(as.numeric(mu1), + as.numeric(scale1), + as.numeric(mu2), + as.numeric(scale2)), + f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper) + } + + } else { + if (is.null(mu2) && is.null(scale2) && is.null(sig2)){ + sig2=sig1 + optim(c(as.numeric(mu1), + as.numeric(scale1), + as.numeric(mu1), + as.numeric(scale1)), + f,control=list(maxit=10000)) + } else{ + optim(c(as.numeric(mu1), + as.numeric(scale1), + as.numeric(mu2), + as.numeric(scale2)), + f,control=list(maxit=10000)) + } + } +} diff --git a/DIMS/AddOnFunctions/fit2peaks.R b/DIMS/AddOnFunctions/fit2peaks.R new file mode 100644 index 0000000..4d9c64d --- /dev/null +++ b/DIMS/AddOnFunctions/fit2peaks.R @@ -0,0 +1,100 @@ +fit2peaks <- function(x2,x,y,index,scale,resol,useBounds=FALSE,plot=FALSE,FQ,int.factor){ + + peak.mean = NULL + peak.area = NULL + peak.scale = NULL + peak.sigma = NULL + + range1=c(index[1]-2,index[1]-1,index[1],index[1]+1,index[1]+2) + if (range1[1]==0) range1=range1[-1] + + range2=c(index[2]-2,index[2]-1,index[2],index[2]+1,index[2]+2) + + if (length(x)0) range1=range1[-remove] + remove=which(range2<1) + if (length(remove)>0) range2=range2[-remove] + + # remove NA + if (length(which(is.na(y[range1])))!=0) range1=range1[-which(is.na(y[range1]))] + if (length(which(is.na(y[range2])))!=0) range2=range2[-which(is.na(y[range2]))] + + mu1 = weighted.mean(x[range1],y[range1]) + sigma1 = getSD(x[range1],y[range1]) + + # message(paste("fit2peaks mu =>", mu1)) + # message(paste("fit2peaks sigma =>", sigma1)) + # message(paste("fit2peaks scale =>", scale)) + + fitP = fitG_2(x[range1],y[range1],sigma1,mu1,scale,useBounds) + p = fitP$par + + mu2 = weighted.mean(x[range2],y[range2]) + sigma2 = getSD(x[range2],y[range2]) + fitP = fitG_2(x[range2],y[range2],sigma2,mu2,scale,useBounds) + p2 = fitP$par + + fit2P = fit2G_2(x, y, sigma1, sigma2, p[1], p[2], p2[1], p2[2],useBounds) + p3 = fit2P$par + + if (is.null(sigma2)) sigma2=sigma1 + + + # plot ################### + sumFit2 = (p3[2]*dnorm(x2,p3[1],sigma1))+(p3[4]*dnorm(x2,p3[3],sigma2)) + sumFit = (p3[2]*dnorm(x,p3[1],sigma1))+(p3[4]*dnorm(x,p3[3],sigma2)) + fq=getFitQuality(x,y,sort(c(p3[1],p3[3]))[1],sort(c(p3[1],p3[3]))[2],resol,sumFit=sumFit)$fq_new + + fwhm = getFwhm(p3[1],resol) + half_max = max(p3[2]*dnorm(x2,p3[1],sigma1))*0.5 + if (plot & (fq < FQ)) lines(c(p3[1] - 0.5*fwhm, p3[1] + 0.5*fwhm),c(half_max,half_max),col="orange") + if (plot & (fq < FQ)) lines(x2,p3[4]*dnorm(x2,p3[3],sigma2),col="grey") + if (plot & (fq < FQ)) abline(v = p3[3], col="grey") + + fwhm = getFwhm(p3[3],resol) + half_max = max(p3[4]*dnorm(x2,p3[3],sigma2))*0.5 + if (plot & (fq < FQ)) lines(c(p3[3] - 0.5*fwhm, p3[3] + 0.5*fwhm),c(half_max,half_max),col="orange") + + if (plot & (fq < FQ)) lines(x2,sumFit2,col="black") + + if (plot & (fq < FQ)) lines(x2,p3[2]*dnorm(x2,p3[1],sigma1),col="grey") + if (plot & (fq < FQ)) abline(v = p3[1], col="grey") + + + h2=c(paste("mean =", p3[1], sep=" "), + paste("mean =", p3[3], sep=" "), + paste("fq =", fq, sep=" ")) + + if (plot & (fq < FQ)) legend("topright", legend=h2) + ########################## + + # lines(x2,p3[4]*dnorm(x2,p3[3],sigma2),col="red") + # area1 = sum(p3[2]*dnorm(x2,p3[1],sigma1)) + # area2 = sum(p3[4]*dnorm(x2,p3[3],sigma2)) + + # area1 = max(p3[2]*dnorm(x2,p3[1],sigma1)) + # area2 = max(p3[4]*dnorm(x2,p3[3],sigma2)) + + area1 = getArea(p3[1],resol,p3[2],sigma1,int.factor) + area2 = getArea(p3[3],resol,p3[4],sigma2,int.factor) + + peak.area = c(peak.area, area1) + peak.area = c(peak.area, area2) + + peak.mean = c(peak.mean, p3[1]) + peak.mean = c(peak.mean, p3[3]) + + peak.scale = c(peak.scale, p3[2]) + peak.scale = c(peak.scale, p3[4]) + + peak.sigma = c(peak.sigma, sigma1) + peak.sigma = c(peak.sigma, sigma2) + + return(list("mean"=peak.mean, "scale"=peak.scale, "sigma"=peak.sigma, "area"=peak.area, "qual"=fq)) +} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/fit3G.R b/DIMS/AddOnFunctions/fit3G.R new file mode 100644 index 0000000..05543c5 --- /dev/null +++ b/DIMS/AddOnFunctions/fit3G.R @@ -0,0 +1,18 @@ +fit3G_2 <- function(x,y,sig1,sig2,sig3,mu1,scale1,mu2,scale2,mu3,scale3,useBounds){ + + f = function(p){ + d = p[2]*dnorm(x,mean=p[1],sd=sig1) + p[4]*dnorm(x,mean=p[3],sd=sig2) + p[6]*dnorm(x,mean=p[5],sd=sig3) + sum((d-y)^2) + } + + if (useBounds){ + lower = c(x[1],0,x[1],0,x[1],0) + upper = c(x[length(x)],Inf,x[length(x)],Inf,x[length(x)],Inf) + + optim(c(mu1,scale1,mu2,scale2,mu3,scale3),f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper) + + } else { + optim(c(mu1,scale1,mu2,scale2,mu3,scale3),f,control=list(maxit=10000)) + } + +} diff --git a/DIMS/AddOnFunctions/fit3peaks.R b/DIMS/AddOnFunctions/fit3peaks.R new file mode 100644 index 0000000..e373f14 --- /dev/null +++ b/DIMS/AddOnFunctions/fit3peaks.R @@ -0,0 +1,122 @@ +fit3peaks <- function(x2,x,y,index,scale,resol,useBounds=FALSE,plot=FALSE,FQ,int.factor){ + + peak.mean = NULL + peak.area = NULL + peak.scale = NULL + peak.sigma = NULL + + range1=c(index[1]-2,index[1]-1,index[1],index[1]+1,index[1]+2) + range2=c(index[2]-2,index[2]-1,index[2],index[2]+1,index[2]+2) + range3=c(index[3]-2,index[3]-1,index[3],index[3]+1,index[3]+2) + + remove=which(range1<1) + if (length(remove)>0) { + range1=range1[-remove] + } + remove=which(range2<1) + if (length(remove)>0) { + range2=range2[-remove] + } + + if (length(x)0) range1=range1[-remove] + remove=which(range2<1) + if (length(remove)>0) range2=range2[-remove] + remove=which(range3<1) + if (length(remove)>0) range3=range3[-remove] + + # remove NA + if (length(which(is.na(y[range1])))!=0) range1=range1[-which(is.na(y[range1]))] + if (length(which(is.na(y[range2])))!=0) range2=range2[-which(is.na(y[range2]))] + if (length(which(is.na(y[range3])))!=0) range3=range3[-which(is.na(y[range3]))] + + mu1 = weighted.mean(x[range1],y[range1]) + sigma1 = getSD(x[range1],y[range1]) + fitP = fitG_2(x[range1],y[range1],sigma1,mu1,scale,useBounds) + p = fitP$par + + mu2 = weighted.mean(x[range2],y[range2]) + sigma2 = getSD(x[range2],y[range2]) + fitP = fitG_2(x[range2],y[range2],sigma2,mu2,scale,useBounds) + p2 = fitP$par + + mu3 = weighted.mean(x[range3],y[range3]) + sigma3 = getSD(x[range3],y[range3]) + fitP = fitG_2(x[range3],y[range3],sigma3,mu3,scale,useBounds) + p3 = fitP$par + + fit3P = fit3G_2(x, y, sigma1, sigma2, sigma3, p[1], p[2], p2[1], p2[2], p3[1], p3[2], useBounds) + p4 = fit3P$par + + # plot ############################## + sumFit2 = (p4[2]*dnorm(x2,p4[1],sigma1))+(p4[4]*dnorm(x2,p4[3],sigma2))+(p4[6]*dnorm(x2,p4[5],sigma3)) + sumFit = (p4[2]*dnorm(x,p4[1],sigma1))+(p4[4]*dnorm(x,p4[3],sigma2))+(p4[6]*dnorm(x,p4[5],sigma3)) + fq=getFitQuality(x,y,sort(c(p4[1],p4[3],p4[5]))[1],sort(c(p4[1],p4[3],p4[5]))[3],resol,sumFit=sumFit)$fq_new + + if (plot & (fq < FQ)) lines(x2,p4[2]*dnorm(x2,p4[1],sigma1),col="yellow") + if (plot & (fq < FQ)) abline(v = p4[1], col="yellow") + fwhm = getFwhm(p4[1],resol) + half_max = max(p4[2]*dnorm(x2,p4[1],sigma1))*0.5 + if (plot & (fq < FQ)) lines(c(p4[1] - 0.5*fwhm, p4[1] + 0.5*fwhm),c(half_max,half_max),col="orange") + + if (plot & (fq < FQ)) lines(x2,p4[4]*dnorm(x2,p4[3],sigma2),col="yellow") + if (plot & (fq < FQ)) abline(v = p4[3], col="yellow") + fwhm = getFwhm(p4[3],resol) + half_max = max(p4[4]*dnorm(x2,p4[3],sigma2))*0.5 + if (plot & (fq < FQ)) lines(c(p4[3] - 0.5*fwhm, p4[3] + 0.5*fwhm),c(half_max,half_max),col="orange") + + if (plot & (fq < FQ)) lines(x2,p4[6]*dnorm(x2,p4[5],sigma3),col="yellow") + if (plot & (fq < FQ)) abline(v = p4[5], col="yellow") + fwhm = getFwhm(p4[5],resol) + half_max = max(p4[6]*dnorm(x2,p4[5],sigma3))*0.5 + if (plot & (fq < FQ)) lines(c(p4[5] - 0.5*fwhm, p4[5] + 0.5*fwhm),c(half_max,half_max),col="orange") + + if (plot & (fq < FQ)) lines(x2,sumFit2,col="red") + + h2=c(paste("mean =", p4[1], sep=" "), + paste("mean =", p4[3], sep=" "), + paste("mean =", p4[5], sep=" "), + paste("fq =", fq, sep=" ")) + + if (plot & (fq < FQ)) legend("topright", legend=h2) + ######################################### + + # area1 = sum(p4[2]*dnorm(x2,p4[1],sigma1)) + # area2 = sum(p4[4]*dnorm(x2,p4[3],sigma2)) + # area3 = sum(p4[6]*dnorm(x2,p4[5],sigma3)) + + # area1 = max(p4[2]*dnorm(x2,p4[1],sigma1)) + # area2 = max(p4[4]*dnorm(x2,p4[3],sigma2)) + # area3 = max(p4[6]*dnorm(x2,p4[5],sigma3)) + + area1 = getArea(p4[1],resol,p4[2],sigma1,int.factor) + area2 = getArea(p4[3],resol,p4[4],sigma2,int.factor) + area3 = getArea(p4[5],resol,p4[6],sigma3,int.factor) + + peak.area = c(peak.area, area1) + peak.area = c(peak.area, area2) + peak.area = c(peak.area, area3) + + peak.mean = c(peak.mean, p4[1]) + peak.mean = c(peak.mean, p4[3]) + peak.mean = c(peak.mean, p4[5]) + + peak.scale = c(peak.scale, p4[2]) + peak.scale = c(peak.scale, p4[4]) + peak.scale = c(peak.scale, p4[6]) + + peak.sigma = c(peak.sigma, sigma1) + peak.sigma = c(peak.sigma, sigma2) + peak.sigma = c(peak.sigma, sigma3) + + return(list("mean"=peak.mean, "scale"=peak.scale, "sigma"=peak.sigma, "area"=peak.area, "qual"=fq)) + +} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/fit4G.R b/DIMS/AddOnFunctions/fit4G.R new file mode 100644 index 0000000..6cad63d --- /dev/null +++ b/DIMS/AddOnFunctions/fit4G.R @@ -0,0 +1,18 @@ +fit4G_2 <- function(x,y,sig1,sig2,sig3,sig4,mu1,scale1,mu2,scale2,mu3,scale3,mu4,scale4,useBounds){ + + f = function(p){ + d = p[2]*dnorm(x,mean=p[1],sd=sig1) + p[4]*dnorm(x,mean=p[3],sd=sig2) + p[6]*dnorm(x,mean=p[5],sd=sig3) + p[8]*dnorm(x,mean=p[7],sd=sig4) + sum((d-y)^2) + } + + if (useBounds){ + lower = c(x[1],0,x[1],0,x[1],0,x[1],0) + upper = c(x[length(x)],Inf,x[length(x)],Inf,x[length(x)],Inf,x[length(x)],Inf) + + optim(c(mu1,scale1,mu2,scale2,mu3,scale3,mu4,scale4),f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper) + + } else { + optim(c(mu1,scale1,mu2,scale2,mu3,scale3,mu4,scale4),f,control=list(maxit=10000)) + } + +} diff --git a/DIMS/AddOnFunctions/fit4peaks.R b/DIMS/AddOnFunctions/fit4peaks.R new file mode 100644 index 0000000..a506541 --- /dev/null +++ b/DIMS/AddOnFunctions/fit4peaks.R @@ -0,0 +1,149 @@ +fit4peaks <- function(x2,x,y,index,scale,resol,useBounds=FALSE,plot=FALSE,FQ,int.factor) { + + peak.mean = NULL + peak.area = NULL + peak.scale = NULL + peak.sigma = NULL + + range1=c(index[1]-2,index[1]-1,index[1],index[1]+1,index[1]+2) + range2=c(index[2]-2,index[2]-1,index[2],index[2]+1,index[2]+2) + range3=c(index[3]-2,index[3]-1,index[3],index[3]+1,index[3]+2) + range4=c(index[4]-2,index[4]-1,index[4],index[4]+1,index[4]+2) + if (range1[1]==0) range1=range1[-1] + if (length(x)length(x)) + if (length(remove)>0) { + range4=range4[-remove] + # message(length(range4)) + } + + # check for negative or 0 + remove=which(range1<1) + if (length(remove)>0) range1=range1[-remove] + remove=which(range2<1) + if (length(remove)>0) range2=range2[-remove] + remove=which(range3<1) + if (length(remove)>0) range3=range3[-remove] + remove=which(range4<1) + if (length(remove)>0) range4=range4[-remove] + + # remove NA + if (length(which(is.na(y[range1])))!=0) range1=range1[-which(is.na(y[range1]))] + if (length(which(is.na(y[range2])))!=0) range2=range2[-which(is.na(y[range2]))] + if (length(which(is.na(y[range3])))!=0) range3=range3[-which(is.na(y[range3]))] + if (length(which(is.na(y[range4])))!=0) range4=range4[-which(is.na(y[range4]))] + + mu1 = weighted.mean(x[range1],y[range1]) + sigma1 = getSD(x[range1],y[range1]) + fitP = fitG_2(x[range1],y[range1],sigma1,mu1,scale,useBounds) + p = fitP$par + + mu2 = weighted.mean(x[range2],y[range2]) + sigma2 = getSD(x[range2],y[range2]) + fitP = fitG_2(x[range2],y[range2],sigma2,mu2,scale,useBounds) + p2 = fitP$par + + mu3 = weighted.mean(x[range3],y[range3]) + sigma3 = getSD(x[range3],y[range3]) + fitP = fitG_2(x[range3],y[range3],sigma3,mu3,scale,useBounds) + p3 = fitP$par + + mu4 = weighted.mean(x[range4],y[range4]) + sigma4 = getSD(x[range4],y[range4]) + fitP = fitG_2(x[range4],y[range4],sigma4,mu4,scale,useBounds) + p4 = fitP$par + + fit4P = fit4G_2(x, y, sigma1, sigma2, sigma3, sigma3, p[1], p[2], p2[1], p2[2], p3[1], p3[2], p4[1], p4[2], useBounds) + p5 = fit4P$par + + # plot ##################################### + sumFit2 = (p5[2]*dnorm(x2,p5[1],sigma1))+(p5[4]*dnorm(x2,p5[3],sigma2))+(p5[6]*dnorm(x2,p5[5],sigma3))+(p5[8]*dnorm(x2,p5[7],sigma3)) + sumFit = (p5[2]*dnorm(x,p5[1],sigma1))+(p5[4]*dnorm(x,p5[3],sigma2))+(p5[6]*dnorm(x,p5[5],sigma3))+(p5[8]*dnorm(x,p5[7],sigma3)) + fq=getFitQuality(x,y,sort(c(p5[1],p5[3],p5[5],p5[7]))[1],sort(c(p5[1],p5[3],p5[5],p5[7]))[4],resol,sumFit=sumFit)$fq_new + + if (plot & (fq < FQ)) lines(x2,p5[2]*dnorm(x2,p5[1],sigma1),col="purple") + if (plot & (fq < FQ)) abline(v = p5[1], col="purple") + fwhm = getFwhm(p5[1],resol) + half_max = max(p5[2]*dnorm(x2,p5[1],sigma1))*0.5 + if (plot & (fq < FQ)) lines(c(p5[1] - 0.5*fwhm, p5[1] + 0.5*fwhm),c(half_max,half_max),col="orange") + + if (plot & (fq < FQ)) lines(x2,p5[4]*dnorm(x2,p5[3],sigma2),col="purple") + if (plot & (fq < FQ)) abline(v = p5[3], col="purple") + fwhm = getFwhm(p5[3],resol) + half_max = max(p5[4]*dnorm(x2,p5[3],sigma2))*0.5 + if (plot & (fq < FQ)) lines(c(p5[3] - 0.5*fwhm, p5[3] + 0.5*fwhm),c(half_max,half_max),col="orange") + + if (plot & (fq < FQ)) lines(x2,p5[6]*dnorm(x2,p5[5],sigma3),col="purple") + if (plot & (fq < FQ)) abline(v = p5[5], col="purple") + fwhm = getFwhm(p5[5],resol) + half_max = max(p5[6]*dnorm(x2,p5[5],sigma3))*0.5 + if (plot & (fq < FQ)) lines(c(p5[5] - 0.5*fwhm, p5[5] + 0.5*fwhm),c(half_max,half_max),col="orange") + + if (plot & (fq < FQ)) lines(x2,p5[8]*dnorm(x2,p5[7],sigma3),col="purple") + if (plot & (fq < FQ)) abline(v = p5[7], col="purple") + fwhm = getFwhm(p5[7],resol) + half_max = max(p5[8]*dnorm(x2,p5[7],sigma3))*0.5 + if (plot & (fq < FQ)) lines(c(p5[7] - 0.5*fwhm, p5[7] + 0.5*fwhm),c(half_max,half_max),col="orange") + + if (plot & (fq < FQ)) lines(x2,sumFit2,col="blue") + + #fq = abs(sum(y) - sum(sumFit))/sum(y) + #fq=abs(sum(y) - sum(sumFit))/sum(y) + #fq=mean(abs(sumFit - y)/sumFit) + + + h2=c(paste("mean =", p5[1], sep=" "), + paste("mean =", p5[3], sep=" "), + paste("mean =", p5[5], sep=" "), + paste("mean =", p5[7], sep=" "), + paste("fq =", fq, sep=" ")) + + if (plot & (fq < FQ)) legend("topright", legend=h2) + ############################################# + + # area1 = sum(p5[2]*dnorm(x2,p5[1],sigma1)) + # area2 = sum(p5[4]*dnorm(x2,p5[3],sigma2)) + # area3 = sum(p5[6]*dnorm(x2,p5[5],sigma3)) + # area4 = sum(p5[8]*dnorm(x2,p5[7],sigma4)) + + # area1 = max(p5[2]*dnorm(x2,p5[1],sigma1)) + # area2 = max(p5[4]*dnorm(x2,p5[3],sigma2)) + # area3 = max(p5[6]*dnorm(x2,p5[5],sigma3)) + # area4 = max(p5[8]*dnorm(x2,p5[7],sigma4)) + + area1 = getArea(p5[1],resol,p5[2],sigma1,int.factor) + area2 = getArea(p5[3],resol,p5[4],sigma2,int.factor) + area3 = getArea(p5[5],resol,p5[6],sigma3,int.factor) + area4 = getArea(p5[7],resol,p5[8],sigma4,int.factor) + + peak.area = c(peak.area, area1) + peak.area = c(peak.area, area2) + peak.area = c(peak.area, area3) + peak.area = c(peak.area, area4) + + peak.mean = c(peak.mean, p5[1]) + peak.mean = c(peak.mean, p5[3]) + peak.mean = c(peak.mean, p5[5]) + peak.mean = c(peak.mean, p5[7]) + + peak.scale = c(peak.scale, p5[2]) + peak.scale = c(peak.scale, p5[4]) + peak.scale = c(peak.scale, p5[6]) + peak.scale = c(peak.scale, p5[8]) + + peak.sigma = c(peak.sigma, sigma1) + peak.sigma = c(peak.sigma, sigma2) + peak.sigma = c(peak.sigma, sigma3) + peak.sigma = c(peak.sigma, sigma4) + + return(list("mean"=peak.mean, "scale"=peak.scale, "sigma"=peak.sigma, "area"=peak.area, "qual"=fq)) + +} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/fitG.R b/DIMS/AddOnFunctions/fitG.R new file mode 100644 index 0000000..b1b595d --- /dev/null +++ b/DIMS/AddOnFunctions/fitG.R @@ -0,0 +1,18 @@ +fitG_2 <- function(x,y,sig,mu,scale,useBounds) { + + f = function(p) { + d = p[2]*dnorm(x,mean=p[1],sd=sig) + sum((d-y)^2) + } + + if (useBounds){ + lower = c(x[1],0,x[1],0) + upper = c(x[length(x)],Inf,x[length(x)],Inf) + + optim(c(as.numeric(mu), as.numeric(scale)), + f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper) + } else { + #optim(c(mu,scale),f) + optim(c(as.numeric(mu),as.numeric(scale)),f,control=list(maxit=10000)) + } +} diff --git a/DIMS/AddOnFunctions/fitGaussian.R b/DIMS/AddOnFunctions/fitGaussian.R new file mode 100644 index 0000000..3c5027d --- /dev/null +++ b/DIMS/AddOnFunctions/fitGaussian.R @@ -0,0 +1,327 @@ +fitGaussian <- function(x2,x,y,index,scale,resol,outdir,force,useBounds,plot,scanmode,int.factor,width,height) { + # force=length(index) + # useBounds=FALSE + + peak.mean = NULL + peak.area = NULL + peak.qual = NULL + peak.min = NULL + peak.max = NULL + + FQ1 = 0.15 + FQ = 0.2 + + # One local max + if (force==1){ + + retVal = fit1Peak(x2,x,y,index,scale,resol,plot,FQ1,useBounds) + + scale = 2 + + if (retVal$mean[1]x[length(x)]) { # <=== mean outside range + + # do it again + return(fitGaussian(x2,x,y,index,scale,resol,outdir,force=1,useBounds=TRUE,plot,scanmode,int.factor,width,height)) + + } else { # <=== mean within range + + if (retVal$qual > FQ1){ # <=== bad fit + + # Try to fit two curves + + # diff(diff(x)) essentially computes the discrete analogue of the second derivative, so should be negative at local maxima. + # The +1 below takes care of the fact that the result of diff is shorter than the input vector. + new_index=which(diff(sign(diff(y)))==-2)+1 + + if (length(new_index)!=2) { + new_index = round(length(x)/3) + new_index = c(new_index, 2*new_index) + } + + #length(new_index) + return(fitGaussian(x2,x,y,new_index,scale,resol,outdir,force=2,useBounds=FALSE,plot,scanmode,int.factor,width,height)) + + } else { # <=== good fit + + peak.mean = c(peak.mean, retVal$mean) + #peak.area = c(peak.area, sum(retVal$scale*dnorm(x2,retVal$mean,retVal$sigma))) + # "Centroid mode" + # peak.area = c(peak.area, max(retVal$scale*dnorm(x2,retVal$mean,retVal$sigma))) + peak.area = c(peak.area, getArea(retVal$mean,resol,retVal$scale,retVal$sigma,int.factor)) + peak.qual = retVal$qual + peak.min = x[1] + peak.max = x[length(x)] + } + } + + # Two local max + } else if (force==2 & (length(x)>6)) { + + # fit two curves + retVal = fit2peaks(x2,x,y,index,scale,resol,useBounds,plot,FQ,int.factor) # <=== fit 2 curves + + if (retVal$mean[1]x[length(x)] | # <=== one of means outside range + retVal$mean[2]x[length(x)]) { + + # check quality + if (retVal$qual > FQ) { # <=== bad fit + + # do it again + return(fitGaussian(x2,x,y,index,scale,resol,outdir,force=2,useBounds=TRUE,plot,scanmode,int.factor,width,height)) + + # good fit + } else { + + # check which mean is outside range + # Todo ========> check this!!!!!!!!!!!!!!! + for (i in 1:length(retVal$mean)){ + if (retVal$mean[i]x[length(x)] ) { + peak.mean = c(peak.mean, -i) + peak.area = c(peak.area, -i) + } else { + peak.mean = c(peak.mean, retVal$mean[i]) + peak.area = c(peak.area, retVal$area[i]) + } + } + peak.qual = retVal$qual + peak.min = x[1] + peak.max = x[length(x)] + } + } else { # <=== all means within range + + if (retVal$qual > FQ) { # <=== bad fit + + # Try to fit three curves + new_index=which(diff(sign(diff(y)))==-2)+1 + + if (length(new_index)!=3) { + new_index = round(length(x)/4) + new_index = c(new_index, 2*new_index, 3*new_index) + } + + #length(new_index) + return(fitGaussian(x2,x,y,new_index,scale,resol,outdir,force=3,useBounds=FALSE,plot,scanmode,int.factor,width,height)) + + } else { # <======== good fit + + # check if means are within 3 ppm and sum if so + tmp = retVal$qual + + nMeanNew = -1 + nMean = length(retVal$mean) + while (nMean!=nMeanNew){ + nMean = length(retVal$mean) + retVal = isWithinXppm(retVal$mean, retVal$scale, retVal$sigma, retVal$area, x2, x, ppm=4, resol, plot) + nMeanNew = length(retVal$mean) + } + + retVal$qual = tmp + + h2=NULL + + for (i in 1:length(retVal$mean)){ + h2 = c(h2, paste("mean =", retVal$mean[i], sep=" ")) + + peak.mean = c(peak.mean, retVal$mean[i]) + peak.area = c(peak.area, retVal$area[i]) + } + peak.qual = retVal$qual + peak.min = x[1] + peak.max = x[length(x)] + + h2 = c(h2, paste("fq =", retVal$qual, sep=" ")) + if (plot) legend("topright", legend=h2) + } + } + + # Three local max + } else if (force==3 & (length(x)>6)){ + + retVal = fit3peaks(x2,x,y,index,scale,resol,useBounds,plot,FQ,int.factor) + + # outside range + if (retVal$mean[1]x[length(x)] | # <=== one of means outside range + retVal$mean[2]x[length(x)] | + retVal$mean[3]x[length(x)]) { + + # check quality + if (retVal$qual > FQ) { # <=== bad fit + + # do it again + return(fitGaussian(x2,x,y,index,scale,resol,outdir,force,useBounds=TRUE,plot,scanmode,int.factor,width,height)) + + # good fit + } else { + + # check which mean is outside range + # Todo ========> check this!!!!!!!!!!!!!!! + for (i in 1:length(retVal$mean)){ + if (retVal$mean[i]x[length(x)] ) { + peak.mean = c(peak.mean, -i) + peak.area = c(peak.area, -i) + } else { + peak.mean = c(peak.mean, retVal$mean[i]) + peak.area = c(peak.area, retVal$area[i]) + } + } + peak.qual = retVal$qual + peak.min = x[1] + peak.max = x[length(x)] + } + + } else { # <=== all means within range + + if (retVal$qual > FQ) { # <=== bad fit + + # Try to fit four curves + new_index=which(diff(sign(diff(y)))==-2)+1 + + if (length(new_index)!=4) { + new_index = round(length(x)/5) + new_index = c(new_index, 2*new_index, 3*new_index, 4*new_index) + } + + #length(new_index) + return(fitGaussian(x2,x,y,new_index,scale,resol,outdir,force=4,useBounds=FALSE,plot,scanmode,int.factor,width,height)) + + } else { # <======== good fit + + # check if means are within 3 ppm and sum if so + tmp = retVal$qual + + nMeanNew = -1 + nMean = length(retVal$mean) + while (nMean!=nMeanNew){ + nMean = length(retVal$mean) + retVal = isWithinXppm(retVal$mean, retVal$scale, retVal$sigma, retVal$area, x2, x, ppm=4, resol, plot) + nMeanNew = length(retVal$mean) + } + + retVal$qual = tmp + + h2=NULL + # peak.mean=NULL + # peak.area=NULL + + for (i in 1:length(retVal$mean)){ + h2 = c(h2, paste("mean =", retVal$mean[i], sep=" ")) + + peak.mean = c(peak.mean, retVal$mean[i]) + peak.area = c(peak.area, retVal$area[i]) + } + peak.qual = retVal$qual + peak.min = x[1] + peak.max = x[length(x)] + + h2 = c(h2, paste("fq =", retVal$qual, sep=" ")) + if (plot) legend("topright", legend=h2) + + } + } + + # Four local max + } else if (force==4 & (length(x)>6)){ + + retVal = fit4peaks(x2,x,y,index,scale,resol,useBounds,plot,FQ,int.factor) + + if (retVal$mean[1]x[length(x)] | + retVal$mean[2]x[length(x)] | + retVal$mean[3]x[length(x)] | + retVal$mean[4]x[length(x)]) { + + # check quality + if (retVal$qual > FQ) { # <=== bad fit + + # do it again + return(fitGaussian(x2,x,y,index,scale,resol,outdir,force,useBounds=TRUE,plot,scanmode,int.factor,width,height)) + + # good fit + } else { + + # check which mean is outside range + # Todo ========> check this!!!!!!!!!!!!!!! + for (i in 1:length(retVal$mean)){ + if (retVal$mean[i]x[length(x)] ) { + peak.mean = c(peak.mean, -i) + peak.area = c(peak.area, -i) + } else { + peak.mean = c(peak.mean, retVal$mean[i]) + peak.area = c(peak.area, retVal$area[i]) + } + } + peak.qual = retVal$qual + peak.min = x[1] + peak.max = x[length(x)] + } + + } else { + + if (retVal$qual > FQ) { # <=== bad fit + + # Generate 1 curve + return(fitGaussian(x2,x,y,index,scale,resol,outdir,force=5,useBounds=FALSE,plot,scanmode,int.factor,width,height)) + + } else { # <======== good fit + + # check if means are within 3 ppm and sum if so + tmp = retVal$qual + + nMeanNew = -1 + nMean = length(retVal$mean) + while (nMean!=nMeanNew){ + nMean = length(retVal$mean) + retVal = isWithinXppm(retVal$mean, retVal$scale, retVal$sigma, retVal$area, x2, x, ppm=4, resol, plot) + nMeanNew = length(retVal$mean) + } + + retVal$qual = tmp + + h2=NULL + + for (i in 1:length(retVal$mean)){ + h2 = c(h2, paste("mean =", retVal$mean[i], sep=" ")) + + peak.mean = c(peak.mean, retVal$mean[i]) + peak.area = c(peak.area, retVal$area[i]) + } + peak.qual = retVal$qual + peak.min = x[1] + peak.max = x[length(x)] + + h2 = c(h2, paste("fq =", retVal$qual, sep=" ")) + if (plot) legend("topright", legend=h2) + } + } + + # More then four local max + } else { + + scale=2 + FQ1=0.40 + useBounds=TRUE + index=which(y==max(y)) + retVal = fit1Peak(x2,x,y,index,scale,resol,plot,FQ1,useBounds) + + if (retVal$qual > FQ1){ # <=== bad fit + + if (plot) dev.off() + + rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height) + peak.mean = c(peak.mean, rval$mean) + peak.area = c(peak.area, rval$area) + peak.min = rval$min + peak.max = rval$max + peak.qual = 0 + + } else { # <=== good fit + + peak.mean = c(peak.mean, retVal$mean) + peak.area = c(peak.area, getArea(retVal$mean,resol,retVal$scale,retVal$sigma,int.factor)) + peak.qual = retVal$qual + peak.min = x[1] + peak.max = x[length(x)] + } + } + + return(list("mean"=peak.mean, "area"=peak.area, "qual" = peak.qual, "min"=peak.min, "max"=peak.max)) +} diff --git a/DIMS/AddOnFunctions/fitGaussianInit.R b/DIMS/AddOnFunctions/fitGaussianInit.R new file mode 100644 index 0000000..1c92bce --- /dev/null +++ b/DIMS/AddOnFunctions/fitGaussianInit.R @@ -0,0 +1,60 @@ +fitGaussianInit <- function(x,y,int.factor,scale,resol,outdir,sampname,scanmode,plot,width,height,i,start,end) { + # scanmode="negative" + + mz.range = x[length(x)] - x[1] + x2 = seq(x[1],x[length(x)],length=mz.range*int.factor) + + # # diff(diff(x)) essentially computes the discrete analogue of the second derivative, so should be negative at local maxima. + # # The +1 below takes care of the fact that the result of diff is shorter than the input vector. + # index=which(diff(sign(diff(y)))==-2)+1 + + # Alway try to fit one curve first + index = which(y==max(y)) + + if (scanmode=="positive"){ + plot_label="pos_fit.png" + } else { + plot_label="neg_fit.png" + } + + if (plot) { + CairoPNG(filename=paste(outdir,"Gaussian_fit",paste(sampname, x[1], plot_label, sep="_"), sep="/"), width, height) + plot(x,y,xlab="m/z",ylab="I", ylim=c(0,1.5*max(y)),main=paste(i,start,end, sep=" ")) #, ylim=c(0,1e+05) + } + + retVal = fitGaussian(x2,x,y,index,scale,resol,outdir,force=length(index),useBounds=FALSE,plot,scanmode,int.factor,width,height) + + if (plot) { + if (length(retVal$mean)==1) { + + result = tryCatch(dev.off(), warning = function(w){}, + error=function(e){}, + finally = {}) + + # file.rename(paste(outdir,"Gaussian_fit", paste(sampname, x[1], plot_label, sep="_"), sep="/"), + # paste(outdir,"Gaussian_fit", paste(sampname, round(retVal$mean,digits = 6), plot_label, sep="_"), sep="/")) + } else { + + + h2=NULL + for (i in 1:length(retVal$mean)){ + h2=c(h2, paste("mean =", retVal$mean[i], sep=" ")) + } + h2=c(h2, paste("fq =", retVal$qual, sep=" ")) + legend("topright", legend=h2) + + dev.off() + + for (i in 1:length(retVal$mean)){ + if (retVal$mean[i]!=-1){ + file.copy(paste(outdir,"Gaussian_fit", paste(sampname, x[1], plot_label, sep="_"), sep="/"), + paste(outdir,"Gaussian_fit", paste(sampname, round(retVal$mean[i],digits = 6), plot_label, sep="_"), sep="/")) + } + } + file.remove(paste(outdir,"Gaussian_fit", paste(sampname, x[1], plot_label, sep="_"), sep="/")) + } + } + + return(list("mean"=retVal$mean, "area"=retVal$area, "qual"=retVal$qual, "min"=retVal$min, "max"=retVal$max)) + +} diff --git a/DIMS/AddOnFunctions/generateBreaksFwhm.R b/DIMS/AddOnFunctions/generateBreaksFwhm.R new file mode 100644 index 0000000..1a42631 --- /dev/null +++ b/DIMS/AddOnFunctions/generateBreaksFwhm.R @@ -0,0 +1,26 @@ +# options(digits=16) +# resol = 140000 +# breaks.fwhm = 1 +# i = 1 +# breaks.fwhm.avg = NULL +# +# Sys.time() +# +# while (breaks.fwhm[length(breaks.fwhm)]<1000){ +# +# resol.mz = resol*(1/sqrt(2)^(log2(breaks.fwhm[i]/200))) +# fwhm.0.1 = (breaks.fwhm[i]/resol.mz)/10 +# breaks.fwhm = c(breaks.fwhm, breaks.fwhm[i] + fwhm.0.1) +# breaks.fwhm.avg = c(breaks.fwhm.avg,(breaks.fwhm[i] +breaks.fwhm[i+1])/2) +# +# if (i %% 10000 == 0){ +# cat(paste("i =", i)) +# cat(paste("breaks.fwhm =", breaks.fwhm[length(breaks.fwhm)])) +# } +# +# i = i + 1 +# } +# +# Sys.time() +# save(list=c("breaks.fwhm", "breaks.fwhm.avg"), file="breaks.RData") +# cat("Breaks saved!") diff --git a/DIMS/AddOnFunctions/generateExcelFile.R b/DIMS/AddOnFunctions/generateExcelFile.R new file mode 100644 index 0000000..31474d9 --- /dev/null +++ b/DIMS/AddOnFunctions/generateExcelFile.R @@ -0,0 +1,24 @@ +generateExcelFile <- function(peaklist, fileName, sub, plot = TRUE) { + # plotdir=file.path(plotdir) + # imageNum=2 + # fileName="./results/xls/Pos_allpgrps_identified" + # subName=c("","_box") + + end <- 0 + i <- 0 + + if (dim(peaklist)[1]>=sub & (sub>0)){ + for (i in 1:floor(dim(peaklist)[1]/sub)){ + start=-(sub-1)+i*sub + end=i*sub + message(paste0(start, ":", end)) + + genExcelFileV3(peaklist[c(start:end),], paste(fileName, sep="_"), plot) + } + } + start = end + 1 + end = dim(peaklist)[1] + message(start) + message(end) + genExcelFileV3(peaklist[c(start:end),], paste(fileName, i+1, sep="_"), plot) +} diff --git a/DIMS/AddOnFunctions/generateGaussian.R b/DIMS/AddOnFunctions/generateGaussian.R new file mode 100644 index 0000000..da6dec2 --- /dev/null +++ b/DIMS/AddOnFunctions/generateGaussian.R @@ -0,0 +1,48 @@ +generateGaussian <- function(x,y,resol,plot,scanmode,int.factor,width,height) { + + factor=1.5 + index = which(y==max(y))[1] + x=x[index] + y=y[index] + mu = x + fwhm = getFwhm(mu,resol) + x.p = c(mu-factor*fwhm, x, mu+factor*fwhm) + y.p = c(0, y, 0) + + # if (plot) dir.create("./results/plots",showWarnings = FALSE) + # if (plot) dir.create("./results/plots/Gaussian_fit",showWarnings = FALSE) + + if (plot) { + if (scanmode=="positive"){ + plot_label="pos_fit.png" + } else { + plot_label="neg_fit.png" + } + } + + mz.range = x.p[length(x.p)] - x.p[1] + x2 = seq(x.p[1],x.p[length(x.p)],length=mz.range*int.factor) + sigma = getSD(x.p,y.p) + scale = optimizeGauss(x.p,y.p,sigma,mu) + + if (plot) { + CairoPNG(filename=paste("./results/Gaussian_fit",paste(sampname, mu, plot_label, sep="_"), sep="/"), width, height) + + plot(x.p,y.p,xlab="m/z",ylab="I", ylim=c(0,1.5*max(y))) + lines(x2,scale*dnorm(x2,mu,sigma), col="green") + half_max = max(scale*dnorm(x2,mu,sigma))*0.5 + lines(c(mu - 0.5*fwhm, mu + 0.5*fwhm),c(half_max,half_max),col="orange") + abline(v = mu, col="green") + h=c(paste("mean =", mu, sep=" ")) + legend("topright", legend=h) + + dev.off() + } + + # area = sum(scale*dnorm(x2,mu,sigma)) + # area = max(scale*dnorm(x2,mu,sigma)) + area = getArea(mu,resol,scale,sigma,int.factor) + + return(list("mean"=mu, "area"=area, "min"=x2[1] , "max"=x2[length(x2)])) + +} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/getArea.R b/DIMS/AddOnFunctions/getArea.R new file mode 100644 index 0000000..415b050 --- /dev/null +++ b/DIMS/AddOnFunctions/getArea.R @@ -0,0 +1,23 @@ +getArea <- function(mu,resol,scale,sigma,int.factor){ + + # mu=p3[1] + # scale=p3[2] + # sigma=sigma1 + + # mu=p3[1] + # scale=p3[2] + # sigma=sigma1 + + # avoid too big vector (cannot allocate vector of size ...) + if (mu>1200) return(0) + + fwhm = getFwhm(mu,resol) + mzMin = mu - 2*fwhm + mzMax = mu + 2*fwhm + mz.range = mzMax - mzMin + x_int = seq(mzMin,mzMax,length=mz.range*int.factor) + + #plot(x_int,scale*dnorm(x_int,mu,sigma),col="red",type="l") + + return(sum(scale*dnorm(x_int,mu,sigma))/100) +} diff --git a/DIMS/AddOnFunctions/getDeltaMZ.R b/DIMS/AddOnFunctions/getDeltaMZ.R new file mode 100644 index 0000000..f09a0a4 --- /dev/null +++ b/DIMS/AddOnFunctions/getDeltaMZ.R @@ -0,0 +1,5 @@ +getDeltaMZ <- function(mass, breaks.fwhm){ + index = which(breaks.fwhmintRangeMin) & (xintRangeMin) & (x mu: ",mu)) + # message(paste("====================> resol.mz: ",resol.mz)) + # }) + + # , error = function(e) { + # error-handler-code + # }, finally = { + # cleanup-code + # }) + + return(fwhm) +} diff --git a/DIMS/AddOnFunctions/getPatients.R b/DIMS/AddOnFunctions/getPatients.R new file mode 100644 index 0000000..3b03f0a --- /dev/null +++ b/DIMS/AddOnFunctions/getPatients.R @@ -0,0 +1,9 @@ +getPatients <- function(peaklist){ + patients=colnames(peaklist)[grep("P", colnames(peaklist), fixed = TRUE)] + patients=unique(as.vector(unlist(lapply(strsplit(patients, ".", fixed = TRUE), function(x) x[1])))) + # ToDo: If 2 P's in sample names!!!!!!!!!!!!! + # patients=sort(as.numeric(unique(as.vector(unlist(lapply(strsplit(patients, "_P", fixed = TRUE), function(x) x[2])))))) + patients=sort(as.numeric(unique(as.vector(unlist(lapply(strsplit(patients, "P", fixed = TRUE), function(x) x[2])))))) + + return(patients) +} diff --git a/DIMS/AddOnFunctions/getSD.R b/DIMS/AddOnFunctions/getSD.R new file mode 100644 index 0000000..197a002 --- /dev/null +++ b/DIMS/AddOnFunctions/getSD.R @@ -0,0 +1,9 @@ +getSD <- function(x,y,resol=140000) { + + index = which(y==max(y)) + mean = x[index] + resol.mz = resol*(1/sqrt(2)^(log2(mean/200))) + fwhm = mean/resol.mz + sd = (fwhm/2)*0.85 + return(sd) +} diff --git a/DIMS/AddOnFunctions/globalAssignments.HPC.R b/DIMS/AddOnFunctions/globalAssignments.HPC.R new file mode 100644 index 0000000..9207661 --- /dev/null +++ b/DIMS/AddOnFunctions/globalAssignments.HPC.R @@ -0,0 +1,114 @@ +#.libPaths(new="/hpc/local/CentOS7/dbg_mz/R_libs/3.2.2") +## calculate relative abundancies from theoretical mass and composition + +#library(Rdisop) + +#theor.table <- read.table(file="C:/Users/mraves/Metabolomics/TheoreticalMZ_Negative_composition.txt", sep="\t", header=TRUE) +options(digits=16) + +#library(OrgMassSpecR) + +suppressPackageStartupMessages(library(lattice)) +# The following list was copied from Rdisop elements.R and corrected for C, H, O, Cl, S according to NIST +Ba <- list(name= 'Ba', mass=130, isotope=list(mass=c(-0.093718, 0, -0.094958, 0, -0.095514, -0.094335, -0.095447, -0.094188, -0.094768), abundance=c(0.00106, 0, 0.00101, 0, 0.02417, 0.06592, 0.07854, 0.1123, 0.717))) +Br <- list(name= 'Br', mass=79, isotope=list(mass=c(-0.0816639, 0, -0.083711), abundance=c(0.5069, 0, 0.4931))) +C <- list(name= 'C', mass=12, isotope=list(mass=c(0, 0.003354838, 0.003241989), abundance=c(0.9893, 0.0107, 0))) +Ca <- list(name= 'Ca', mass=40, isotope=list(mass=c(-0.0374094, 0, -0.0413824, -0.0412338, -0.0445194, 0, -0.046311, 0, -0.047467), abundance=c(0.96941, 0, 0.00647, 0.00135, 0.02086, 0, 4e-05, 0, 0.00187))) +Cl <- list(name= 'Cl', mass=35, isotope=list(mass=c(-0.03114732, 0, -0.03409741), abundance=c(0.7576, 0, 0.2424))) +Cr <- list(name= 'Cr', mass=50, isotope=list(mass=c(-0.0539536, 0, -0.0594902, -0.0593487, -0.0611175), abundance=c(0.04345, 0, 0.83789, 0.09501, 0.02365))) +Cu <- list(name= 'Cu', mass=63, isotope=list(mass=c(-0.0704011, 0, -0.0722071), abundance=c(0.6917, 0, 0.3083))) +F <- list(name= 'F', mass=19, isotope=list(mass=c(-0.00159678), abundance=c(1))) +Fe <- list(name= 'Fe', mass=54, isotope=list(mass=c(-0.0603873, 0, -0.0650607, -0.0646042, -0.0667227), abundance=c(0.058, 0, 0.9172, 0.022, 0.0028))) +H <- list(name= 'H', mass=1, isotope=list(mass=c(0.00782503207, 0.014101778, 0.01604928), abundance=c(0.999885, 0.000115, 0))) +Hg <- list(name= 'Hg', mass=196, isotope=list(mass=c(-0.034193, 0, -0.033257, -0.031746, -0.0317, -0.029723, -0.029383, 0, -0.026533), abundance=c(0.0015, 0, 0.0997, 0.1687, 0.231, 0.1318, 0.2986, 0, 0.0687))) +I <- list(name= 'I', mass=127, isotope=list(mass=c(-0.095527), abundance=c(1))) +K <- list(name= 'K', mass=39, isotope=list(mass=c(-0.0362926, -0.0360008, -0.0381746), abundance=c(0.932581, 0.000117, 0.067302))) +Li <- list(name= 'Li', mass=6, isotope=list(mass=c(0.0151214, 0.016003), abundance=c(0.075, 0.925))) +Mg <- list(name= 'Mg', mass=24, isotope=list(mass=c(-0.0149577, -0.0141626, -0.0174063), abundance=c(0.7899, 0.1, 0.1101))) +Mn <- list(name= 'Mn', mass=55, isotope=list(mass=c(-0.0619529), abundance=c(1))) +N <- list(name= 'N', mass=14, isotope=list(mass=c(0.003074002, 0.00010897), abundance=c(0.99634, 0.00366))) +Na <- list(name= 'Na', mass=23, isotope=list(mass=c(-0.0102323), abundance=c(1))) +Ni <- list(name= 'Ni', mass=58, isotope=list(mass=c(-0.0646538, 0, -0.0692116, -0.0689421, -0.0716539, 0, -0.0720321), abundance=c(0.68077, 0, 0.26223, 0.0114, 0.03634, 0, 0.00926))) +O <- list(name= 'O', mass=16, isotope=list(mass=c(-0.00508538044, -0.0008683, -0.0008397), abundance=c(0.99757, 0.000381, 0.00205))) +P <- list(name= 'P', mass=31, isotope=list(mass=c(-0.026238), abundance=c(1))) +S <- list(name= 'S', mass=32, isotope=list(mass=c(-0.027929, -0.02854124, -0.0321331, 0, -0.03291924), abundance=c(0.9499, 0.0075, 0.0425, 0, 1e-04))) +Se <- list(name= 'Se', mass=74, isotope=list(mass=c(-0.0775254, 0, -0.080788, -0.0800875, -0.0826924, 0, -0.0834804, 0, -0.0833022), abundance=c(0.0089, 0, 0.0936, 0.0763, 0.2378, 0, 0.4961, 0, 0.0873))) +Si <- list(name= 'Si', mass=28, isotope=list(mass=c(-0.0230729, -0.0235051, -0.0262293), abundance=c(0.9223, 0.0467, 0.031))) +Sn <- list(name= 'Sn', mass=112, isotope=list(mass=c(-0.095174, 0, -0.097216, -0.096652, -0.098253, -0.097044, -0.098391, -0.09669, -0.0978009, 0, -0.0965596, 0, -0.0947257),abundance=c(0.0097, 0, 0.0065, 0.0034, 0.1453, 0.0768, 0.2423, 0.0859, 0.3259, 0, 0.0463, 0, 0.0579))) +Zn <- list(name= 'Zn', mass=64, isotope=list(mass=c(-0.0708552, 0, -0.0739653, -0.0728709, -0.0751541, 0, -0.074675), abundance=c(0.486, 0, 0.279, 0.041, 0.188, 0, 0.006))) + +NH4 <- list(name= "NH4", mass=18, isotope=list(mass=c(0.03437, 0.03141, -0.95935)), abundance=c(0.995, 0.004, 0.001)) # SISweb: 18.03437 100 19.03141 0.4 19.04065 0.1 +Ac <- list(name= "Ac", mass=60, isotope=list(mass=c(0.02114, 0.02450, 0.02538)), abundance=c(0.975, 0.021, 0.004)) # SISweb: 60.02114 100 61.02450 2.2 62.02538 0.4 +NaCl <- list(name= "NaCl", mass=58, isotope=list(mass=c(-0.04137, 0, -0.04433)), abundance=c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +NaCl2 <- list(name= "NaCl2", mass=116, isotope=list(mass=c(-0.08274, 0, -0.08866)), abundance=c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +NaCl3 <- list(name= "NaCl3", mass=174, isotope=list(mass=c(-0.12411, 0, -0.13299)), abundance=c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +NaCl4 <- list(name= "NaCl4", mass=232, isotope=list(mass=c(-0.16548, 0, -0.17732)), abundance=c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +NaCl5 <- list(name= "NaCl5", mass=290, isotope=list(mass=c(-0.20685, 0, -0.22165)), abundance=c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +For <- list(name= "For", mass=45, isotope=list(mass=c(-0.00233, 0.00103)), abundance=c(0.989, 0.011)) # SISweb: 46.00549 100 47.00885 1.1 (47.0097 0.1) 48.00973 0.4 +Na2 <- list(name= "2Na-H", mass=46, isotope=list(mass=c(-1.0282896)), abundance=c(1)) # SISweb for Na2: 45.97954 100 # minus 1 H ! +Met <- list(name= "CH3OH", mass=32, isotope=list(mass=c(1.034045,1.037405)), abundance=c(0.989,0.011)) # SISweb: 32.02622 100 33.02958 1.1 33.0325 0.1 34.03046 0.2 +CH3OH <- list(name= "CH3OH", mass=32, isotope=list(mass=c(1.034045,1.037405)), abundance=c(0.989,0.011)) # SISweb: 32.02622 100 33.02958 1.1 33.0325 0.1 34.03046 0.2 +Na3 <- list(name= "3Na-2H", mass=69, isotope=list(mass=c(-2.0463469)), abundance=c(1)) # SISweb for Na2: 45.97954 100 # minus 1 H ! +KCl <- list(name= "KCl", mass=74, isotope=list(mass=c(-0.06744,0.92961,-0.06744,0.92772)), abundance=c(0.7047,0.2283,0.0507,0.0162)) # SISweb: 73.93256 100 75.92961 32.4 75.93067 7.2 77.92772 2.3 +H2PO4 <- list(name= "H2PO4", mass=97, isotope=list(mass=c(-0.03091)), abundance=c(1)) +HSO4 <- list(name= "HSO4", mass=97, isotope=list(mass=c(-0.04042,0,-0.04462)), abundance=c(0.96,0,0.04)) +Met2 <- list(name= "Met2", mass=64, isotope=list(mass=c(1.060265,1.013405)), abundance=c(0.978,0.022)) +Met3 <- list(name= "Met3", mass=96, isotope=list(mass=c(1.086485,1.089845)), abundance=c(0.969,0.031)) +Met4 <- list(name= "Met4", mass=128, isotope=list(mass=c(1.112705,1.116065)), abundance=c(0.959,0.041)) +Met5 <- list(name= "Met5", mass=160, isotope=list(mass=c(1.20935,1.142285)), abundance=c(0.949,0.051)) +NaminH <- list(name= "Na-H", mass=21, isotope=list(mass=c(-0.02571416)), abundance=c(1)) +KminH <- list(name= "K-H", mass=37, isotope=list(mass=c(-0.05194,0.94617)), abundance=c(0.9328,0.0672)) +H2O <- list(name= "H2O", mass=-19, isotope=list(mass=c(-0.01894358)), abundance=c(1)) +NaK <- list(name= "NaK-H", mass=61, isotope=list(mass=c(-0.054345,0.943765)), abundance=c(0.9328,0.0672)) +min2H <- list(name= "min2H", mass=-2, isotope=list(mass=c(-0.0151014)), abundance=c(1)) +plus2H <- list(name= "plus2H", mass=2, isotope=list(mass=c(0.0151014)), abundance=c(1)) +plus2Na <- list(name= "plus2Na", mass=46, isotope=list(mass=c(-0.02046), abundance=c(1))) +plusNaH <- list(name= "plusNaH", mass=24, isotope=list(mass=c(-0.00295588), abundance=c(1))) +plusKH <- list(name= "plusKH", mass=40, isotope=list(mass=c(-0.029008,0.969101)), abundance=c(0.9328,0.0672)) +plusHNH <- list(name= "plusHNH", mass=19, isotope=list(mass=c(0.04164642)), abundance=c(1)) +min3H <- list(name= "min3H", mass=-3, isotope=list(mass=c(-0.02182926)), abundance=c(1)) +plus3H <- list(name= "plus3H", mass=3, isotope=list(mass=c(0.02182926)), abundance=c(1)) +plus3Na <- list(name= "plus3Na", mass=68, isotope=list(mass=c(0.96931)), abundance=c(1)) +plus2NaH <- list(name= "plus2NaH", mass=47, isotope=list(mass=c(0.2985712)), abundance=c(1)) +plusNa2H <- list(name= "plusNa2H", mass=25, isotope=list(mass=c(0.00432284)), abundance=c(1)) + +Cl37 <- list(name= "Cl37", mass=37, isotope=list(mass=c(-0.03409741)), abundance=c(1)) + +allelements <- list(Ba, Br, C, Ca, Cl, Cr, Cu, F, Fe, H, Hg, I, K, Li, Mg, Mn, N, Na, Ni, O, P, S, Se, Si, Sn, Zn) +allAdducts <- list(Ba, Br, Ca, Cl, Cl37, Cr, Cu, Fe, Hg, I, K, Li, Mg, Mn, Na, Ni, Se, Si, Sn, Zn, NH4, Ac, NaCl, For,Na2,CH3OH,NaCl2,NaCl3,NaCl4,NaCl5,Na3,KCl,H2PO4,HSO4,Met2,Met3,Met4,Met5,NaminH,KminH,H2O,NaK,min2H,plus2H,plus2Na,plusNaH,plusKH,min3H,plus3H,plusHNH,plus3Na,plus2NaH,plusNa2H) + +#atomsinuse <- c("P", "O", "N", "C", "H", "S", "Cl") +#atomicWts <- c(30.97376163, 15.99491463, 14.0030740052, 12.0000, 1.0078250321, 31.9720707, 34.968852721) +#electron <- 0.00054858 +#Mol.comp <- c(0,4,0,0,1,1,0) +#Mol.exact <- sum(Mol.comp * atomicWts) + electron +#Mol.corr <- Mol.exact - 0.0022 + 0.000007*Mol.exact # mass as found in peak group list +#getMass(Mol) + electron - 0.0022 + 0.000007*getMass(Mol) + +#atomsinuse <- c("P", "O", "N", "C", "H", "S", "Cl", "D", "34S", "18O") +#atomicWts <- c(30.97376163, 15.99491463, 14.0030740052, 12.0000, 1.0078250321, 31.9720707, 34.968852721, 2.0141017778, 33.96786690, 17.9991610) +#electron <- 0.00054858 +#Mol.comp <- c(0,4,0,0,1,1,0,0,0,0) # main peak HSO4 +#Mol.comp <- c(0,4,0,0,0,1,0,1,0,0) # deuterated +#Mol.comp <- c(0,4,0,0,1,0,0,0,1,0) # 34S +#Mol.comp <- c(0,3,0,0,1,1,0,0,0,1) # 18O +#Mol.exact <- sum(Mol.comp * atomicWts) + electron +#Mol.corr <- Mol.exact - 0.0022 + 0.000007*Mol.exact # mass as found in peak group list + +atomsinuse <- c("P", "O", "N", "C", "H", "S", "Cl", "D", "13C", "34S", "18O", "37Cl") +atomicWts <- c(30.97376163, 15.99491463, 14.0030740052, 12.0000, 1.0078250321, 31.9720707, 34.968852721, 2.0141017778, 13.0033548378, 33.96786690, 17.9991610, 36.96590259) +electron <- 0.00054858 +############# P O N C H S Cl D 13C 34S 18O 37Cl +#Mol.comp <- c(0,6,0,6,12,0,1, 0, 0, 0, 0, 0) # main peak Galactose HCl negative ion +#Mol.comp <- c(0,6,0,5,12,0,1, 0, 1, 0, 0, 0) # 13C Galactose HCl negative ion +#Mol.comp <- c(0,6,0,6,11,0,1, 1, 0, 0, 0, 0) # D Galactose HCl negative ion +#Mol.comp <- c(0,5,0,6,12,0,1, 0, 0, 0, 1, 0) # 18O Galactose HCl negative ion +#Mol.comp <- c(0,6,0,6,12,0,0, 0, 0, 0, 0, 1) # 18O Galactose HCl negative ion +#Mol.exact <- sum(Mol.comp * atomicWts) + electron +#Mol.corr <- Mol.exact - 0.0022 + 0.000007*Mol.exact # mass as found in peak group list + +Hmass <- H$mass + H$isotope$mass[1] +Dmass <- H$mass + 1 + H$isotope$mass[2] +Tmass <- H$mass + 2 + H$isotope$mass[3] +C13mass <- C$mass + 1 + C$isotope$mass[2] +N15mass <- N$mass + 1 + N$isotope$mass[2] diff --git a/DIMS/AddOnFunctions/iden.code.R b/DIMS/AddOnFunctions/iden.code.R new file mode 100644 index 0000000..4853647 --- /dev/null +++ b/DIMS/AddOnFunctions/iden.code.R @@ -0,0 +1,45 @@ +iden.code <- function(peaklist, db, ppm, theor_mass_label) { + # theor_mass_label = {"MNeg", "Mpos"} + + mcol <- peaklist[ , "mzmed.pgrp"] + theor.mcol <- db[,theor_mass_label] + assi_HMDB <- iso_HMDB <- HMDB_code <- c(rep("",length(mcol))) + theormz_HMDB <- c(rep(0,length(mcol))) + + for(t in 1:length(mcol)){ + mz<-mcol[t] + mtol <- mz*ppm/1000000 + selp <- which(theor.mcol > (mz - mtol) & theor.mcol < (mz + mtol)) + assinames <- isonames <- HMDBcodes <- "" + + if(length(selp)!=0){ + for(i in 1:length(selp)){ + + if(grepl(" iso ", db[selp[i], "CompoundName"])){ + mainpeak <- strsplit(db[selp[i],"CompoundName"]," iso ")[[1]][1] + + # Check if peak without isotope occurs, this assumes that peaklist is ordered on mass!!! + if(length(grep(mainpeak, assi_HMDB, fixed=TRUE)) > 0){ + isonames <- as.character(paste(isonames,as.character(db[selp[i],"CompoundName"]), sep=";")) + } + + } else { + assinames <- as.character(paste(assinames,as.character(db[selp[i], "CompoundName"]), sep=";")) + HMDBcodes <- as.character(paste(HMDBcodes, as.character(rownames(db)[selp[i]]), sep=";")) + } + } + } + + assi_HMDB[t] <- as.character(assinames) + iso_HMDB[t] <- as.character(isonames) + if ((assinames=="") & (isonames=="")){ + theormz_HMDB[t] <- "" + } else { + theormz_HMDB[t] <- theor.mcol[selp[1]] + } + HMDB_code[t] <- as.character(HMDBcodes) + } + + return(cbind(peaklist, assi_HMDB, iso_HMDB, theormz_HMDB, HMDB_code)) +} + diff --git a/DIMS/AddOnFunctions/ident.hires.noise.HPC.R b/DIMS/AddOnFunctions/ident.hires.noise.HPC.R new file mode 100644 index 0000000..0be43d1 --- /dev/null +++ b/DIMS/AddOnFunctions/ident.hires.noise.HPC.R @@ -0,0 +1,242 @@ +# modified identify function to also look for adducts and their isotopes +ident.hires.noise.HPC <- function(peaklist, allAdducts, scanmode="Negative", look4=c("Cl", "Ac"), identlist=NULL, resol=280000, slope=0, incpt=0, ppm.fixed, ppm.iso.fixed) { + + # peaklist=outlist_Neg + # scanmode="Negative" + # identlist=noise.Neg.MZ + # look4=look4.addN + # resol=resol + # slope=0 + # incpt=0 + # ppm.fixed=3 + # ppm.iso.fixed=2 + + metlin <- assi <- iso <- rep("", nrow(peaklist)) + theormz <- nisos <- expint <- conf <- rep(0, nrow(peaklist)) + # nrH <- nrD <- nrC <- nr13C <- nrN <- nr15N <- nrO <- nrP <- nrS <- nrCl <- rep(0, nrow(peaklist)) + + # add adducts to identification list + if (scanmode == "Positive") { add.mode <- "+" } else { add.mode <- "-" } + # identlist <- theorMZ + identlist.orig <- identlist + + for (p in 1:length(look4)) { # loop over type of adduct + #print(p) + identlist.Adduct <- identlist.orig + identlist.Adduct[ , "CompoundName"] <- as.character(identlist.orig[ , "CompoundName"]) + + #add2label <- paste("[M+", look4[p], "]", add.mode, sep="") + if (look4[p]=="H2O") { + add2label <- paste("[M-", look4[p], "]", add.mode, sep="") + } else { + add2label <- paste("[M+", look4[p], "]", add.mode, sep="") + } + + identlist.Adduct[ , "CompoundName"] <- paste(identlist.Adduct[ , "CompoundName"], add2label, sep=" ") + adductInfo <- elementInfo(look4[p], allAdducts) + if (scanmode == "Positive") { + adductMass <- adductInfo$mass[1] + adductInfo$isotope$mass[1] - Hmass } else { + adductMass <- adductInfo$mass[1] + adductInfo$isotope$mass[1] + Hmass + } + for (q in 1:nrow(identlist.Adduct)) { # loop over compounds in database + # construct information for compound + adduct: + if (scanmode == "Positive") { + identlist.Adduct[q, "Mpos"] <- as.numeric(identlist.Adduct[q, "Mpos"]) + adductMass # + Na - H + identlist.Adduct[q, "MNeg"] <- 0 + } else { + identlist.Adduct[q, "Mpos"] <- 0 + identlist.Adduct[q, "MNeg"] <- as.numeric(identlist.Adduct[q, "MNeg"]) + adductMass # + Cl + H + } + } + + # # modify columns with info for mol. formula: + # if (look4[p] == "2Na-H") { + # identlist.Adduct[, "nrH"] <- as.numeric(identlist.Adduct[, "nrH"]) - 1 + # } else if (look4[p] == "NH4") { + # identlist.Adduct[, "nrH"] <- as.numeric(identlist.Adduct[, "nrH"]) + 3 + # identlist.Adduct[, "nrN"] <- as.numeric(identlist.Adduct[, "nrN"]) + 1 + # } else if (look4[p] == "Cl") { + # identlist.Adduct[, "nrCl"] <- as.numeric(identlist.Adduct[, "nrCl"]) + 1 + # identlist.Adduct[, "nrH"] <- as.numeric(identlist.Adduct[, "nrH"]) + 1 + # } else if (look4[p] == "Ac") { + # identlist.Adduct[ , "nrC"] <- as.numeric(identlist.Adduct[ , "nrC"]) + 2 + # identlist.Adduct[ , "nrH"] <- as.numeric(identlist.Adduct[ , "nrH"]) + 3 + # identlist.Adduct[ , "nrO"] <- as.numeric(identlist.Adduct[ , "nrO"]) + 2 + # } else if (look4[p] == "CH3OH+H") { + # identlist.Adduct[ , "nrC"] <- as.numeric(identlist.Adduct[ , "nrC"]) + 1 + # identlist.Adduct[ , "nrH"] <- as.numeric(identlist.Adduct[ , "nrH"]) + 4 + # identlist.Adduct[ , "nrO"] <- as.numeric(identlist.Adduct[ , "nrO"]) + 1 + # } else { + # identlist.Adduct[, "nrH"] <- as.numeric(identlist.Adduct[, "nrH"]) - 1 + # } + identlist <- rbind(identlist, identlist.Adduct) + } # end for p adducts in look4 + + + if (scanmode == "Positive") { theor.mcol <- as.numeric(identlist[ , "Mpos"]) } else { + theor.mcol <- as.numeric(identlist[ , "MNeg"]) } + # apply correction using regression line obtained with ISses + theor.mcol <- (1+slope)*theor.mcol + incpt + + # get mz information from peaklist + mcol <- peaklist[ , "mzmed.pgrp"] + # if column with average intensities is missing, calculate it: + if (!("avg.int" %in% colnames(peaklist))){ + mzmaxcol <- which(colnames(peaklist) == "mzmax.pgrp") + endcol <- ncol(peaklist) + peaklist[ , "avg.int"] <- apply(peaklist[ ,(mzmaxcol+1):(endcol)], 1, mean) + } + + # # generate URL for Metlin: + # for (p in 1:nrow(peaklist)) { # for each peak # p <- 1 + # mzpeak <- as.numeric(mcol[p]) + # # resolution as function of mz: + # resol.mz <- resol*(1/sqrt(2)^(log2(mzpeak/200))) + # fwhm <- mzpeak/resol.mz + # # massmin <- mzpeak - (0.00003 + 0.000003*mzpeak) - (1.0078250321 - 0.00054858) - fwhm # for Metlin db + # # massmax <- mzpeak - (0.00003 + 0.000003*mzpeak) - (1.0078250321 - 0.00054858) + fwhm # for Metlin db + # if (scanmode == "Positive") { + # massmin <- mzpeak - (1.0078250321 - 0.00054858) - fwhm # for Metlin db + # massmax <- mzpeak - (1.0078250321 - 0.00054858) + fwhm # for Metlin db + # } else { + # massmin <- mzpeak + (1.0078250321 - 0.00054858) - fwhm # for Metlin db + # massmax <- mzpeak + (1.0078250321 - 0.00054858) + fwhm # for Metlin db + # } + # metlin[p] <- paste("http://metlin.scripps.edu/metabo_list.php?mass_min=", massmin, "&mass_max=", massmax, sep="") + # } + + + # do indentification using own database: + for (t in 1:nrow(identlist)) { # theoretical mass # t <- 45 + # for (t in 1:169) { + #print(as.character(identlist[t,"CompoundName"])) + theor.mz <- theor.mcol[t] + + theor.comp <- as.character(identlist[t, "Composition"]) + #theor.comp <- mol.formula(identlist[t, ]) + + # # if there's Deuterium, Tritium, 13C or 15N in the composition: + # mass.incr <- 0 + (as.numeric(identlist[t,"nrD"])*Dmass) + + # (as.numeric(identlist[t,"nrT"])*Tmass) + + # (as.numeric(identlist[t,"nrC13"])*C13mass) + + # (as.numeric(identlist[t,"nrN15"])*N15mass) + # theor.comp <- strsplit(theor.comp, "iso")[[1]][1] + mass.incr <- 0 + + # resolution as function of mz: + resol.mz <- resol*(1/sqrt(2)^(log2(theor.mz/200))) + # calculate fine-grained isotopic distribution using MIDAs + fwhm <- round(theor.mz/resol.mz,6) + + #system(paste("C:/Users/awillem7/tools/MIDAs_New/MIDAs_Example ", theor.comp, " 2 C00 \"\" \"\" ", fwhm, " 1 0 1e-50 2 tmp", sep=""),ignore.stderr=TRUE) + #system(paste("C:/Users/mraves/Metabolomics/MIDAs_New/MIDAs_Example ", theor.comp, " 2 C00 \"\" \"\" ", fwhm, " 1 0 1e-50 2 tmp", sep=""),ignore.stderr=TRUE) + #system(paste(path2MIDAS, theor.comp, " 2 C00 \"\" \"\" ", fwhm, " 1 0 1e-50 2 tmp", sep=""),ignore.stderr=TRUE) + + #system(paste("/data/home/luyf/Metabolomics/MIDAs/MIDAs_Example ", theor.comp, " 2 C00 \"\" \"\" ", fwhm, " 1 0 1e-50 2 tmp", sep=""),ignore.stderr=TRUE) + options(stringsAsFactors = FALSE) + #fgid <- read.table(file="tmp_Fine_Grained_Isotopic_Distribution", header=FALSE) + + # res <- try(fgid <- read.table(file="tmp_Fine_Grained_Isotopic_Distribution", header=FALSE)) + # if(inherits(res, "try-error")) + # { + # #error handling code, maybe just skip this iteration using + # message("Skipped") + # next + # } + # + # # correct mass for D, T, 13C and 15N + # fgid[ ,1] <- as.numeric(fgid[ ,1]) + mass.incr + # # calculate percentage intensities from relative intensities + # firstone <- as.numeric(fgid[1,2]) + # fgid[ ,3] <- as.numeric(fgid[ , 2]) / firstone + # #fgid <- as.matrix(fgid, ncol=3) + # # the mz in the MIDAs file are of the neutral molecule + # if (scanmode == "Positive") { mz.iso <- as.numeric(fgid[ , 1]) + Hmass - electron } + # if (scanmode == "Negative") { mz.iso <- as.numeric(fgid[ , 1]) - Hmass + electron } + # fgid <- cbind(fgid, mz.iso) + # colnames(fgid) <- c("mz","rel.int", "perc.int", "mz.iso") + + # compensate mz for presence of adduct + # Adduct.mass <- theor.mz - mz.iso[1] + # fgid[ , "mz.iso"] <- as.numeric(fgid[ , "mz.iso"]) + Adduct.mass + + # set tolerance for mz accuracy of main peak + mtol <- theor.mz*ppm.fixed/1000000 + # find main peak + selp <- which(mcol > (theor.mz - mtol) & mcol < (theor.mz + mtol)) + # selp <- which(mcol > (theor.mz - 0.01) & mcol < (theor.mz + 0.01)) + # peaklist[selp, c(1:4,(endcol+1))] + + # set tolerance for mz accuracy of isotope peaks + itol <- theor.mz*ppm.iso.fixed/1000000 + + # if (length(selp) > 1) { # more than one candidate peak for main; select best one based on isotope pattern + # #cat(as.character(identlist[t, "CompoundName"])); print(" has >1 candidate peaks") + # conf.local <- rep(0, length(selp)) + # for (p in 1:length(selp)) { # p <- 2 + # # determine isotope pattern for each candidate peak + # # obs.mz <- peaklist[selp[p],"mzmed.pgrp"] + # conf.local[p] <- match.isotope.pattern(peaklist, scanmode, selp[p], fgid, ppm.iso.fixed) + # } + # # selp <- selp[abs(mcol[selp] - theor.mz) == min(abs(mcol[selp] - theor.mz))] } + # selp <- na.exclude(selp[conf.local == max(conf.local)]) + # } + if (length(selp) > 1) { # more than one candidate peak for main; select best one based on mz.diff + selp <- selp[abs(mcol[selp] - theor.mz) == min(abs(mcol[selp] - theor.mz))] + } + if (length(selp) == 1) { # match for main + assi[selp] <- paste(assi[selp], as.character(identlist[t,"CompoundName"]), sep=";") + theormz[selp] <- theor.mz + # conf[selp] <- match.isotope.pattern(peaklist, scanmode, selp, fgid, ppm.iso.fixed) + + # nrH[selp] <- identlist[t,"nrH"] + # nrD[selp] <- identlist[t,"nrD"] + # nrC[selp] <- identlist[t,"nrC"] + # nr13C[selp] <- identlist[t,"nrC13"] + # nrN[selp] <- identlist[t,"nrN"] + # nr15N[selp] <- identlist[t,"nrN15"] + # nrO[selp] <- identlist[t,"nrO"] + # nrP[selp] <- identlist[t,"nrP"] + # nrS[selp] <- identlist[t,"nrS"] + # nrCl[selp] <- identlist[t,"nrCl"] + + # assign isotope peaks + # mz.main <- peaklist[selp, "mzmed.pgrp"] # mz of main peak + # int.main <- peaklist[selp, "avg.int"] # intensity of main peak (= 100%) + # # deviation from theoretical mass: + # diff <- theor.mz - mz.main + # # calculate expected intensities and select isotopes with exp.int > threshold + # fgid[ , "exp.int"] <- fgid[ , "perc.int"] * int.main + # fgid.subset <- fgid[(fgid[ , "exp.int"] > thresh), ] + # nisos[selp] <- nrow(fgid.subset) - 1 + # if (nrow(fgid.subset) > 1) { # avoid error message if fgid.subset has only 1 line + # for (f in 2:nrow(fgid.subset)) { # f <- 2 + # mz.target <- fgid.subset[f, "mz.iso"] - diff + # int.target <- fgid.subset[f, "exp.int"] + # # print(itarget) + # sel.iso <- peaklist[ , "mzmed.pgrp"] > (mz.target - itol) & peaklist[ , "mzmed.pgrp"] < (mz.target + itol) + # # sum(sel.iso) + # # sel.iso <- peaklist[ , "mzmed.pgrp"] > (mz.target - 0.01) & peaklist[ , "mzmed.pgrp"] < (mz.target + 0.01) + # # peaklist[sel.iso, c(1:4,23)] + # if (sum(sel.iso) == 1) { # 2 separate if-statements because of error if sum(sel.iso) = 0 + # if (peaklist[sel.iso, "avg.int"] > (int.target/2)) { # match + # iso[sel.iso] <- paste(paste(iso[sel.iso], as.character(identlist[t,"CompoundName"]), "iso", f, sep=" "),";", sep="") + # # peaklist[sel.iso, ] + # expint[sel.iso] <- fgid.subset[f, "exp.int"] + # } + # } else if (sum(sel.iso) > 1) { + # nrs.iso <- which(sel.iso) + # nr.iso <- nrs.iso[which(abs(peaklist[sel.iso, "avg.int"] - int.target) == min(abs(peaklist[sel.iso, "avg.int"] - int.target)))] + # if (peaklist[nr.iso, "avg.int"] > (int.target/2)) { # match + # # print(peaklist[nr.iso, "avg.int"]) + # iso[nr.iso] <- paste(paste(iso[nr.iso], as.character(identlist[t,"CompoundName"]), "iso", f, sep=" "),";", sep="") + # expint[nr.iso] <- fgid.subset[f, "exp.int"] + # } # end if + # } # end else if + # } # end for f + # } # end if + } # end if + } # end for t + # cbind(peaklist, nrH, nrD, nrC, nr13C, nrN, nr15N, nrO, nrP, nrS, nrCl, assi, theormz, conf, nisos, iso, expint, metlin) + cbind(peaklist, assi, theormz, conf, nisos, iso, expint, metlin) +} diff --git a/DIMS/AddOnFunctions/isWithinXppm.R b/DIMS/AddOnFunctions/isWithinXppm.R new file mode 100644 index 0000000..5da43c6 --- /dev/null +++ b/DIMS/AddOnFunctions/isWithinXppm.R @@ -0,0 +1,51 @@ +isWithinXppm <- function(mean, scale, sigma, area, x2, x, ppm=4, resol, plot) { + # mean=retVal$mean + # scale=retVal$scale + # sigma=retVal$sigma + # area=retVal$area + # ppm=3 + + # sort!!!!!!!!!!!!!!!! + index = order(mean) + mean = mean[index] + scale = scale[index] + sigma = sigma[index] + area = area[index] + + summed = NULL + remove = NULL + + if (length(mean)>1){ + for (i in 2:length(mean)){ + if ((abs(mean[i-1]-mean[i])/mean[i-1])*10^6 < ppm) { + + # avoid double occurance in sum + if ((i-1) %in% summed) next + + retVal = sumCurves(mean[i-1], mean[i], scale[i-1], scale[i], sigma[i-1], sigma[i], x2, x, resol, plot) + summed = c(summed, i-1, i) + if (is.nan(retVal$mean)) retVal$mean=0 + mean[i-1] = retVal$mean + mean[i] = retVal$mean + area[i-1] = retVal$area + area[i] = retVal$area + scale[i-1] = retVal$scale + scale[i] = retVal$scale + sigma[i-1] = retVal$sigma + sigma[i] = retVal$sigma + + remove = c(remove, i) + } + } + } + + if (length(remove)!=0){ + mean=mean[-c(remove)] + area=area[-c(remove)] + scale=scale[-c(remove)] + sigma=sigma[-c(remove)] + } + + return(list("mean"=mean, "area"=area, "scale"=scale, "sigma"=sigma, "qual"=NULL)) + +} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/mergeDuplicatedRows.R b/DIMS/AddOnFunctions/mergeDuplicatedRows.R new file mode 100644 index 0000000..39cb8b4 --- /dev/null +++ b/DIMS/AddOnFunctions/mergeDuplicatedRows.R @@ -0,0 +1,49 @@ +mergeDuplicatedRows <- function(peaklist) { + # peaklist = outlist.tot + # resultDir = "./results" + # scanmode = "positive" + + # peaklist_index=which(peaklist[,"mzmed.pgrp"]=="94.9984524624111") + # peaklist[peaklist_index,] + + collapse <- function(label,pklst,index){ + # label = "iso_HMDB" + # pklst = peaklist + # index = peaklist_index + tmp2=as.vector(pklst[index,label]) + if (length(which(is.na(tmp2)))>0) tmp2=tmp2[-which(is.na(tmp2))] + return(paste(tmp2,collapse = ";")) + } + + options(digits=16) + + collect=NULL + remove=NULL + + index = which(duplicated(peaklist[, "mzmed.pgrp"])) + + while (length(index) > 0){ + + peaklist_index = which(peaklist[, "mzmed.pgrp"] == peaklist[index[1], "mzmed.pgrp"]) + # peaklist[peaklist_index,"iso_HMDB",drop=FALSE] + tmp=peaklist[peaklist_index[1],,drop=FALSE] + + tmp[,"assi_HMDB"]=collapse("assi_HMDB",peaklist,peaklist_index) + tmp[,"iso_HMDB"]=collapse("iso_HMDB",peaklist,peaklist_index) + tmp[,"HMDB_code"]=collapse("HMDB_code",peaklist,peaklist_index) + tmp[,"assi_noise"]=collapse("assi_noise",peaklist,peaklist_index) + if (tmp[,"assi_noise"]==";") tmp[,"assi_noise"]=NA + tmp[,"theormz_noise"]=collapse("theormz_noise",peaklist,peaklist_index) + if (tmp[,"theormz_noise"]=="0;0") tmp[,"theormz_noise"]=NA + + collect = rbind(collect, tmp) + remove = c(remove, peaklist_index) + + index=index[-which(peaklist[index, "mzmed.pgrp"] == peaklist[index[1], "mzmed.pgrp"])] + } + + if (!is.null(remove)) peaklist = peaklist[-remove,] + peaklist = rbind(peaklist,collect) + + return(peaklist) +} diff --git a/DIMS/AddOnFunctions/normalization_2.1.R b/DIMS/AddOnFunctions/normalization_2.1.R new file mode 100644 index 0000000..ed99964 --- /dev/null +++ b/DIMS/AddOnFunctions/normalization_2.1.R @@ -0,0 +1,75 @@ +# normalization_2.1(outlist.pos.id, "Intensity_all_peaks_positive_norm", groupNames.pos, on="total", assi_label="assi_HMDB") + +normalization_2.1 <- function(data, filename, groupNames, on="total", assi_label="assi"){ + # data=outlist.pos.id + # filename = "Intensity_all_peaks_positive_norm" + # groupNames = groupNames.pos + # on="total" + # assi_label="assi_HMDB" + + lastcol = length(groupNames) + 6 + before = data[ ,c(7:lastcol)] + + if (on=="total_IS") { + + assi = which(colnames(data)==assi_label) + data.int <- data[ ,c(assi,7:lastcol)] # assi and samples columns + data.assi = data.int[grep("(IS", data.int[,1], ignore.case=FALSE, fixed = TRUE),] + + } else if (on=="total_ident"){ + + assi = which(colnames(data)==assi_label) + assi.hmdb = which(colnames(data)=="assi.hmdb") + index = sort(union(which(data[,assi]!=""), which(data[,assi.hmdb]!=""))) + + data.int = data[ ,c(assi,7:lastcol)] # assi and samples columns + data.assi = data.int[index,] + + } else if (on=="total") { + + assi = which(colnames(data)==assi_label) + data.int = data[ ,c(assi,7:lastcol)] # assi and samples columns + data.assi = data.int + + } + + sum <- 0 + for (c in 2:ncol(data.assi)) { + sum <- sum + sum(as.numeric(data.assi[,c])) + } + average <- sum/(ncol(data.assi)-1) + for (c in 2:ncol(data.int)) { + factor <- sum(as.numeric(data.assi[,c]))/average + if (factor==0) { + data.int[ ,c]=0 + cat(colnames(data.int)[c]) + cat("factor==0 !!!") + } else { + data.int[ ,c] <- as.numeric(data.int[ ,c])/factor + } + } + + # colnames(data.int[,2:ncol(data.int)]) + + if (dim(data)[2]==lastcol){ + final.outlist.Pos.idpat.norm <- cbind(data[,1:6],data.int[,2:ncol(data.int)]) + } else { + final.outlist.Pos.idpat.norm <- cbind(data[,1:6],data.int[,2:ncol(data.int)],data[,(lastcol + 1):ncol(data)]) + } + + #outdir="./results/normalization" + #dir.create(outdir, showWarnings = FALSE) + + #CairoPNG(filename=paste(outdir, paste(filename, "_before.png", sep=""), sep="/"), width, height) + #sub=apply(before,2, function(x) sum(as.numeric(x))) + #barplot(as.vector(unlist(sub)), main="Not normalized",names.arg = colnames(before),las=2) + #dev.off() + + #CairoPNG(filename=paste(outdir, paste(filename, "_", on, ".png", sep=""), sep="/"), width, height) + #sub=apply(final.outlist.Pos.idpat.norm[,c(7:lastcol)],2, function(x) sum(as.numeric(x))) + #barplot(as.vector(unlist(sub)), main=filename,names.arg = colnames(final.outlist.Pos.idpat.norm[,c(7:lastcol)]),las=2) + #dev.off() + + return(final.outlist.Pos.idpat.norm) + +} diff --git a/DIMS/AddOnFunctions/optimizeGauss.R b/DIMS/AddOnFunctions/optimizeGauss.R new file mode 100644 index 0000000..0fbce75 --- /dev/null +++ b/DIMS/AddOnFunctions/optimizeGauss.R @@ -0,0 +1,11 @@ +optimizeGauss <- function(x,y,sigma,mu) { + + f = function(p,x,y,sigma,mu) { + curve = p*dnorm(x,mu,sigma) + return((max(curve)-max(y))^2) + } + + rval = optimize(f, c(0, 100000), tol = 0.0001,x,y,sigma,mu) + + return(rval$minimum) +} diff --git a/DIMS/AddOnFunctions/peak.grouping.Gauss.HPC.R b/DIMS/AddOnFunctions/peak.grouping.Gauss.HPC.R new file mode 100644 index 0000000..c36259a --- /dev/null +++ b/DIMS/AddOnFunctions/peak.grouping.Gauss.HPC.R @@ -0,0 +1,88 @@ +peak.grouping.Gauss.HPC <- function(outdir, fileIn, scanmode, resol, groupNames) { + # fileIn="./results/specpks_all/positive_outlist_i_min_1.197.RData" + + # ppm/2 + #range = 1.5e-06 + range = 2e-06 + startcol=7 + + # outlist.copy <- read.table(file=fileIn, sep="\t", header=TRUE) + load(fileIn) + outlist.copy = outlist_i_min_1 + batch = strsplit(fileIn, ".",fixed = TRUE)[[1]][3] + + outpgrlist = NULL + + while (max(as.numeric(outlist.copy[ , "height.pkt"])) > 0 ) { + + sel = which(as.numeric(outlist.copy[ , "height.pkt"]) == max(as.numeric(outlist.copy[ , "height.pkt"])))[1] + + # 3ppm range around max + mzref = as.numeric(outlist.copy[sel, "mzmed.pkt"]) + pkmin = -(range*mzref - mzref) + pkmax = 2*mzref-pkmin + + selp = as.numeric(outlist.copy[ , "mzmed.pkt"]) > pkmin & as.numeric(outlist.copy[ , "mzmed.pkt"]) < pkmax + tmplist = outlist.copy[selp, ] + + nrsamples = sum(selp) + if (nrsamples > 1) { + # 3ppm range around mean + mzmed.pgrp = mean(as.numeric(outlist.copy[selp, "mzmed.pkt"])) + mzmin.pgrp = -(range*mzmed.pgrp - mzmed.pgrp) + mzmax.pgrp = 2*mzmed.pgrp - mzmin.pgrp + + selp = as.numeric(outlist.copy[ , "mzmed.pkt"]) > mzmin.pgrp & as.numeric(outlist.copy[ , "mzmed.pkt"]) < mzmax.pgrp + tmplist = outlist.copy[selp, ] + + fq.worst.pgrp = as.numeric(max(outlist.copy[selp, "fq"])) + fq.best.pgrp = as.numeric(min(outlist.copy[selp, "fq"])) + ints.allsamps = rep(0, length(groupNames)) + names(ints.allsamps) = groupNames # same order as sample list!!! + + # if (length(unique(as.vector(tmplist[,"samplenr"]))) != length(as.vector(tmplist[,"samplenr"]))) { + # message(paste("bingo", sel)) + # break + # } + + # Check for each sample if multiple peaks exists, if so take the sum! + labels=unique(tmplist[,"samplenr"]) + ints.allsamps[labels] = as.vector(unlist(lapply(labels, function(x) {sum(as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"]))}))) + + outpgrlist = rbind(outpgrlist, c(mzmed.pgrp, fq.best.pgrp, fq.worst.pgrp, nrsamples, mzmin.pgrp, mzmax.pgrp, ints.allsamps)) + } + outlist.copy[selp, "height.pkt"] = -1 + } + + outpgrlist = as.data.frame(outpgrlist) # ignore warnings of duplicate row names + colnames(outpgrlist)[1:6] = c("mzmed.pgrp", "fq.best", "fq.worst", "nrsamples", "mzmin.pgrp", "mzmax.pgrp") + + # # filtering ################################################################################################################## + # final.outlist=outpgrlist[,c("mzmed.pgrp", "fq.best", "fq.worst","nrsamples","mzmin.pgrp","mzmax.pgrp", sampleNames)] + # # NB: in centroided mode, data files contains many "-1.000" values, from peak finding. Set these to zero. + # final.outlist[final.outlist == -1] = 0 + # + # # keep only peaks which occur in 3 out of 3 technical replicates in at least one sample in peak group list + # # peakFiltering(repl.pattern, final.outlist, nsampgrps, outdir, scanmode, startcol=7) + # # peakFiltering <- function(repl.pattern, final.outlist, nsampgrps, resultDir, scanmode, startcol){ + # nsamp = length(repl.pattern) + # nsampgrps = length(repl.pattern[[1]]) + # + # keep <- rep(0, nrow(final.outlist)) + # for (p in 1:nrow(final.outlist)) { + # for (g in 1:nsampgrps) { # g <- 31 + # if (keep[p] == 0 & sum(final.outlist[p, repl.pattern[[g]]] > 0) == length(repl.pattern[[g]]) ) { keep[p] <- 1 } + # } + # } + # + # tmp <- cbind(final.outlist, keep) + # final.outlist.filt <- tmp[keep == 1, ] + # + # # omit keep column + # final.outlist.filt <- final.outlist.filt[ , -ncol(final.outlist.filt)] + + #save(outpgrlist_part, file=paste(outdir, paste(scanmode, "_", mzstart, "_", mzend, ".RData", sep=""), sep="/")) + # save(final.outlist.filt, file=paste(outdir, "peak_grouping", paste(scanmode, "_",batch,".RData", sep=""), sep="/")) + save(outpgrlist, file=paste(outdir, "peak_grouping", paste(scanmode, "_",batch,".RData", sep=""), sep="/")) + +} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/remove.dupl.2.1.R b/DIMS/AddOnFunctions/remove.dupl.2.1.R new file mode 100644 index 0000000..89e307d --- /dev/null +++ b/DIMS/AddOnFunctions/remove.dupl.2.1.R @@ -0,0 +1,32 @@ +remove.dupl.2.1 <- function(peaklist) { + # peaklist = outlist.tot + # resultDir = "./results" + # scanmode = "positive" + + # peaklist=peaklist[1:100000,] + + collect=NULL + remove=NULL + + index = which(duplicated(peaklist[, "mzmed.pgrp"])) + + while (length(index) > 0){ + + peaklist_index = which(peaklist[, "mzmed.pgrp"] == peaklist[index[1], "mzmed.pgrp"]) + tmp=peaklist[peaklist_index[1],,drop=FALSE] + if (!is.na(peaklist[peaklist_index[1],"assi_HMDB"])) tmp[,"assi_HMDB"]=paste(peaklist[peaklist_index,"assi_HMDB"],collapse = ";") else tmp[,"assi_HMDB"]=NA + if (!is.na(peaklist[peaklist_index[1],"iso_HMDB"])) tmp[,"iso_HMDB"]=paste(peaklist[peaklist_index,"iso_HMDB"],collapse = ";") else tmp[,"iso_HMDB"]=NA + if (!is.na(peaklist[peaklist_index[1],"HMDB_code"])) tmp[,"HMDB_code"]=paste(peaklist[peaklist_index,"HMDB_code"],collapse = ";") else tmp[,"HMDB_code"]=NA + if (peaklist[peaklist_index[1],"assi_noise"]!="") tmp[,"assi_noise"]=paste(peaklist[peaklist_index,"assi_noise"],collapse = ";") else tmp[,"assi_noise"]="" + + collect = rbind(collect, tmp) + remove = c(remove, peaklist_index) + + index=index[-which(index==index[1])] + } + + peaklist = peaklist[-remove,] + peaklist = rbind(peaklist,collect) + + return(peaklist) +} diff --git a/DIMS/AddOnFunctions/remove.dupl.R b/DIMS/AddOnFunctions/remove.dupl.R new file mode 100644 index 0000000..227f207 --- /dev/null +++ b/DIMS/AddOnFunctions/remove.dupl.R @@ -0,0 +1,39 @@ +# remove duplicates, peak groups with mz within a few ppm +# ppmdef should be 2 times bigger than during identification!!! +remove.dupl <- function(peaklist, ppmdef=4, tolint=0.01) { + + # peaklist = outpgrlist + # ppmdef = 2 + # tolint = 0.01 + + int.cols = 7:ncol(peaklist) + + p <- 1 + while (p < nrow(peaklist)) { + mzref <- peaklist[p, "mzmed.pgrp"] + # print(mzref) + dist.ppm <- ppmdef * mzref / 1000000 + sel <- peaklist[ , "mzmed.pgrp"] > (mzref - dist.ppm) & peaklist[ , "mzmed.pgrp"] < (mzref + dist.ppm) + subset <- peaklist[sel, ] + if (nrow(subset) > 1) { + avi <- rep(1, max(int.cols)) + for (c in int.cols) { avi[c] <- max(subset[ ,c])/mean(subset[ ,c]) } + + # remove NaN + avi[which(is.nan(avi))] = 1 + + if (mean(avi) > (1-tolint) & mean(avi < (1+tolint))) { + peaklist <- peaklist[-which(sel), ] + newline <- subset[1, ] + newline[ , "mzmed.pgrp"] <- mean(subset[ , "mzmed.pgrp"]) + newline[ , "mzmin.pgrp"] <- min(subset[ , "mzmin.pgrp"]) + newline[ , "mzmax.pgrp"] <- max(subset[ , "mzmax.pgrp"]) + newline[ , int.cols] <- apply(subset[ , int.cols], 2, max) + # newline[ , "avg.int"] <- mean(as.numeric(newline[ , int.cols])) + peaklist <- rbind(peaklist, newline) + } + } + p <- p + 1 + } + peaklist +} diff --git a/DIMS/AddOnFunctions/replaceZeros.R b/DIMS/AddOnFunctions/replaceZeros.R new file mode 100644 index 0000000..4fafc9a --- /dev/null +++ b/DIMS/AddOnFunctions/replaceZeros.R @@ -0,0 +1,108 @@ +replaceZeros <- function(file,scanmode,resol,outdir,thresh,scriptDir,ppm){ + # file="./results/grouping_rest/negative_1.RData" + # scanmode= "negative" + # scriptDir="./scripts" + # resol=140000 + # thresh=2000 + # outdir="./results" + + control_label="C" + + source(paste(scriptDir, "AddOnFunctions/sourceDir.R", sep="/")) + sourceDir(paste(scriptDir, "AddOnFunctions", sep="/")) + + dir.create(paste(outdir, "9-samplePeaksFilled", sep="/"), showWarnings = FALSE) + + # int.factor=1*10^5 # Number of x used to calc area under Gaussian (is not analytic) + # scale=2 # Initial value used to estimate scaling parameter + # width=1024 + # height=768 + + # message(paste("file", file)) + # message(paste("scanmode", scanmode)) + # message(paste("resol", resol)) + # message(paste("outdir", outdir)) + # message(paste("thresh", thresh)) + # message(paste("scriptDir", scriptDir)) + + load(paste0(outdir, "/repl.pattern.",scanmode, ".RData")) + + name = as.vector(unlist(strsplit(file, "/", fixed=TRUE))) + name = name[length(name)] + # message(paste("File name: ", name)) + + # load samplePeaks + # load outpgrlist + load(file) + + # ################################################################################# + # # filter on at least signal in two control samples + # int.cols = grep(control_label, colnames(outpgrlist),fixed = TRUE) + # # barplot(as.numeric(outpgrlist[753, int.cols])) + # keep = NULL + # keep = apply(outpgrlist, 1, function(x) if (length(which(as.numeric(x[int.cols]) > 0)) > 1) keep=c(keep,TRUE) else keep=c(keep,FALSE)) + # outpgrlist = outpgrlist[keep,] + # ################################################################################# + + ################################################################################ + # For now only replace zeros + if (!is.null(outpgrlist)) { + for (i in 1:length(names(repl.pattern.filtered))){ + samplePeaks=outpgrlist[,names(repl.pattern.filtered)[i]] + index=which(samplePeaks<=0) + if (!length(index)){ + next + } + for (j in 1:length(index)){ + area = generateGaussian(outpgrlist[index[j],"mzmed.pgrp"],thresh,resol,FALSE,scanmode,int.factor=1*10^5,1,1)$area + # for testing purposes, add a fixed random seed + set.seed(123) + outpgrlist[index[j], names(repl.pattern.filtered)[i]] = rnorm(n=1, mean=area, sd=0.25*area) + } + } + + ################################################################################ + + + #################### identification ######################################################### + # load(paste(scriptDir, "../db/HMDB_add_iso_corrNaCl.RData", sep="/")) # E:\Metabolomics\LargeDataBase\Apr25_2016 + + # Add average column + outpgrlist = cbind(outpgrlist, "avg.int"=apply(outpgrlist[, 7:(ncol(outpgrlist)-4)], 1, mean)) + + if (scanmode=="negative"){ + label = "MNeg" + label2 = "Negative" + # take out multiple NaCl adducts + look4.add2 <- c("Cl", "Cl37", "For", "NaCl","KCl","H2PO4","HSO4","Na-H","K-H","H2O","I") # ,"min2H","min3H" + # HMDB_add_iso=HMDB_add_iso.Neg + } else { + label = "Mpos" + label2 = "Positive" + # take out NaCl adducts + look4.add2 <- c("Na", "K", "NaCl", "NH4","2Na-H","CH3OH","KCl","NaK-H") # ,"NaCl2","NaCl3","NaCl4","NaCl5") + # HMDB_add_iso=HMDB_add_iso.Pos + } + + # # Identification using large database + # final.outlist.idpat = iden.code(outpgrlist, HMDB_add_iso, ppm=2, label) + # message(paste(sum(final.outlist.idpat[ , "assi_HMDB"] != ""), "assigned peakgroups")) + # message(paste(sum(final.outlist.idpat[ , "iso_HMDB"] != ""), "assigned isomeres")) + + # Identify noise peaks + noise.MZ <- read.table(file="/hpc/dbg_mz/tools/db/TheoreticalMZ_NegPos_incNaCl.txt", sep="\t", header=TRUE, quote = "") + noise.MZ <- noise.MZ[(noise.MZ[ , label] != 0), 1:4] + + # Replace "Negative" by "negative" in ident.hires.noise + final.outlist.idpat2 = ident.hires.noise.HPC(outpgrlist, allAdducts, scanmode=label2, noise.MZ, look4=look4.add2, resol=resol, slope=0, incpt=0, ppm.fixed=ppm, ppm.iso.fixed=ppm) + # message(paste(sum(final.outlist.idpat2[ , "assi"] != ""), "assigned noise peaks")) + tmp <- final.outlist.idpat2[ , c("assi", "theormz")] + colnames(tmp) <- c("assi_noise", "theormz_noise") + + final.outlist.idpat3 <- cbind(outpgrlist, tmp) + ############################################################################################# + + # message(paste("File saved: ", paste(outdir, "/samplePeaksFilled/", name, sep=""))) + save(final.outlist.idpat3, file=paste(outdir, "/9-samplePeaksFilled/", name, sep="")) + } +} diff --git a/DIMS/AddOnFunctions/replaceZeros_setseed.R b/DIMS/AddOnFunctions/replaceZeros_setseed.R new file mode 100644 index 0000000..4fafc9a --- /dev/null +++ b/DIMS/AddOnFunctions/replaceZeros_setseed.R @@ -0,0 +1,108 @@ +replaceZeros <- function(file,scanmode,resol,outdir,thresh,scriptDir,ppm){ + # file="./results/grouping_rest/negative_1.RData" + # scanmode= "negative" + # scriptDir="./scripts" + # resol=140000 + # thresh=2000 + # outdir="./results" + + control_label="C" + + source(paste(scriptDir, "AddOnFunctions/sourceDir.R", sep="/")) + sourceDir(paste(scriptDir, "AddOnFunctions", sep="/")) + + dir.create(paste(outdir, "9-samplePeaksFilled", sep="/"), showWarnings = FALSE) + + # int.factor=1*10^5 # Number of x used to calc area under Gaussian (is not analytic) + # scale=2 # Initial value used to estimate scaling parameter + # width=1024 + # height=768 + + # message(paste("file", file)) + # message(paste("scanmode", scanmode)) + # message(paste("resol", resol)) + # message(paste("outdir", outdir)) + # message(paste("thresh", thresh)) + # message(paste("scriptDir", scriptDir)) + + load(paste0(outdir, "/repl.pattern.",scanmode, ".RData")) + + name = as.vector(unlist(strsplit(file, "/", fixed=TRUE))) + name = name[length(name)] + # message(paste("File name: ", name)) + + # load samplePeaks + # load outpgrlist + load(file) + + # ################################################################################# + # # filter on at least signal in two control samples + # int.cols = grep(control_label, colnames(outpgrlist),fixed = TRUE) + # # barplot(as.numeric(outpgrlist[753, int.cols])) + # keep = NULL + # keep = apply(outpgrlist, 1, function(x) if (length(which(as.numeric(x[int.cols]) > 0)) > 1) keep=c(keep,TRUE) else keep=c(keep,FALSE)) + # outpgrlist = outpgrlist[keep,] + # ################################################################################# + + ################################################################################ + # For now only replace zeros + if (!is.null(outpgrlist)) { + for (i in 1:length(names(repl.pattern.filtered))){ + samplePeaks=outpgrlist[,names(repl.pattern.filtered)[i]] + index=which(samplePeaks<=0) + if (!length(index)){ + next + } + for (j in 1:length(index)){ + area = generateGaussian(outpgrlist[index[j],"mzmed.pgrp"],thresh,resol,FALSE,scanmode,int.factor=1*10^5,1,1)$area + # for testing purposes, add a fixed random seed + set.seed(123) + outpgrlist[index[j], names(repl.pattern.filtered)[i]] = rnorm(n=1, mean=area, sd=0.25*area) + } + } + + ################################################################################ + + + #################### identification ######################################################### + # load(paste(scriptDir, "../db/HMDB_add_iso_corrNaCl.RData", sep="/")) # E:\Metabolomics\LargeDataBase\Apr25_2016 + + # Add average column + outpgrlist = cbind(outpgrlist, "avg.int"=apply(outpgrlist[, 7:(ncol(outpgrlist)-4)], 1, mean)) + + if (scanmode=="negative"){ + label = "MNeg" + label2 = "Negative" + # take out multiple NaCl adducts + look4.add2 <- c("Cl", "Cl37", "For", "NaCl","KCl","H2PO4","HSO4","Na-H","K-H","H2O","I") # ,"min2H","min3H" + # HMDB_add_iso=HMDB_add_iso.Neg + } else { + label = "Mpos" + label2 = "Positive" + # take out NaCl adducts + look4.add2 <- c("Na", "K", "NaCl", "NH4","2Na-H","CH3OH","KCl","NaK-H") # ,"NaCl2","NaCl3","NaCl4","NaCl5") + # HMDB_add_iso=HMDB_add_iso.Pos + } + + # # Identification using large database + # final.outlist.idpat = iden.code(outpgrlist, HMDB_add_iso, ppm=2, label) + # message(paste(sum(final.outlist.idpat[ , "assi_HMDB"] != ""), "assigned peakgroups")) + # message(paste(sum(final.outlist.idpat[ , "iso_HMDB"] != ""), "assigned isomeres")) + + # Identify noise peaks + noise.MZ <- read.table(file="/hpc/dbg_mz/tools/db/TheoreticalMZ_NegPos_incNaCl.txt", sep="\t", header=TRUE, quote = "") + noise.MZ <- noise.MZ[(noise.MZ[ , label] != 0), 1:4] + + # Replace "Negative" by "negative" in ident.hires.noise + final.outlist.idpat2 = ident.hires.noise.HPC(outpgrlist, allAdducts, scanmode=label2, noise.MZ, look4=look4.add2, resol=resol, slope=0, incpt=0, ppm.fixed=ppm, ppm.iso.fixed=ppm) + # message(paste(sum(final.outlist.idpat2[ , "assi"] != ""), "assigned noise peaks")) + tmp <- final.outlist.idpat2[ , c("assi", "theormz")] + colnames(tmp) <- c("assi_noise", "theormz_noise") + + final.outlist.idpat3 <- cbind(outpgrlist, tmp) + ############################################################################################# + + # message(paste("File saved: ", paste(outdir, "/samplePeaksFilled/", name, sep=""))) + save(final.outlist.idpat3, file=paste(outdir, "/9-samplePeaksFilled/", name, sep="")) + } +} diff --git a/DIMS/AddOnFunctions/run.vbs b/DIMS/AddOnFunctions/run.vbs new file mode 100644 index 0000000..640116a --- /dev/null +++ b/DIMS/AddOnFunctions/run.vbs @@ -0,0 +1,22 @@ +Option Explicit +On Error Resume Next +RunExcelMacro + +Sub RunExcelMacro() + +Dim xlApp +Dim xlBook +Dim xlBook_persenal + +Set xlApp = CreateObject("Excel.Application") +xlApp.DisplayAlerts = FALSE +Set xlBook_persenal = xlApp.Workbooks.Open("C:\Users\awillem8\AppData\Roaming\Microsoft\Excel\XLSTART\PERSONAL.XLSB", 0, True) +Set xlBook = xlApp.Workbooks.Open("C:\Users\mkerkho7\testmap\xls\P181_Aberant_HMDB.xlsx", 0, True) +xlApp.Run "PERSONAL.XLSB!FixAndResize" +xlBook.SaveAs "C:\Users\mkerkho7\testmap\results\P181_Aberant_HMDB.xlsx" +xlBook.Close False +xlApp.Quit +Set xlBook = Nothing +Set xlBook_persenal = Nothing +Set xlApp = Nothing +End Sub diff --git a/DIMS/AddOnFunctions/runVBAMacro.R b/DIMS/AddOnFunctions/runVBAMacro.R new file mode 100644 index 0000000..3836b2f --- /dev/null +++ b/DIMS/AddOnFunctions/runVBAMacro.R @@ -0,0 +1,48 @@ +runVBAMacro <- function(dir, dir2, vb_script) { + +# dir="E:\\Metabolomics\\Lynne_BSP-2015-08-05\\results\\xls\\" +# dir2="E:\\Metabolomics\\Lynne_BSP-2015-08-05\\results\\xls_fixed\\" + +dir.create(dir2) +files=list.files(dir) + +script_1 = paste("Option Explicit +On Error Resume Next +RunExcelMacro + +Sub RunExcelMacro() + +Dim xlApp +Dim xlBook +Dim xlBook_persenal + +Set xlApp = CreateObject(\"Excel.Application\") +xlApp.DisplayAlerts = FALSE +Set xlBook_persenal = xlApp.Workbooks.Open(\"C:\\Users\\awillem8\\AppData\\Roaming\\Microsoft\\Excel\\XLSTART\\PERSONAL.XLSB\", 0, True) +Set xlBook = xlApp.Workbooks.Open(\"", dir, sep="") + +script_2 = "\", 0, True) +xlApp.Run \"PERSONAL.XLSB!FixAndResize\" +xlBook.SaveAs \"" + +script_3 = "\" +xlBook.Close False +xlApp.Quit +Set xlBook = Nothing +Set xlBook_persenal = Nothing +Set xlApp = Nothing +End Sub" + +for (i in 1:length(files)){ + script = paste(script_1, files[i], script_2, dir2, files[i], script_3, sep="") # dir2, files[i], + + message(script) + + fileConn = file("./src/run.vbs") + writeLines(script, fileConn) + close(fileConn) + + system(vb_script) +} +} + diff --git a/DIMS/AddOnFunctions/searchMZRange.R b/DIMS/AddOnFunctions/searchMZRange.R new file mode 100644 index 0000000..43a7171 --- /dev/null +++ b/DIMS/AddOnFunctions/searchMZRange.R @@ -0,0 +1,187 @@ +searchMZRange <- function(range,values,int.factor,scale,resol,outdir,sampname,scanmode,plot,width,height,thresh){ + # range=sub_range + + end=NULL + index=as.vector(which(range!=0)) + + # bad infusion + if (length(index)==0) return(values) + + start=index[1] + subRangeLength = 15 + + for (i in 1:length(index)){ + # for (i in 1:129000){ + + if (i 1){ + + end=index[i] + + # start=395626 + # end=395640 + # i=25824 + + # gaan met de banaan + # 128836 + # start 1272853 + # mz start 316.83302 + # end 1272870 + # mz end 316.84269 + + x = as.numeric(names(range)[c(start:end)]) + y = as.vector(range[c(start:end)]) + # # Trim zeros + # x = as.vector(trimZeros(x,y)[[1]]) + # y = as.vector(trimZeros(x,y)[[2]]) + + if (length(y)!=0) { + + # check if intensity above thresh + if (max(y) < thresh | is.nan(max(y))) { + start=index[i+1] + next + } + + # cat("gaan met de banaan") + # cat(i) + # cat(paste("start", start, sep=" ")) + # cat(paste("mz start", x[1], sep=" ")) + # cat(paste("end", end, sep=" ")) + # cat(paste("mz end", x[length(x)], sep=" ")) + + if (length(y)>subRangeLength) { + + y[which(y3) { + # Check only zeros + if (sum(y)==0) next + + rval = fitGaussianInit(x,y,int.factor,scale,resol,outdir,sampname, scanmode, plot,width,height,i,start,end) + + if (rval$qual[1]==1) { + + rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height) + + values$mean = c(values$mean, rval$mean) + values$area = c(values$area, rval$area) + values$nr = c(values$nr, sampname) + values$min = c(values$min, rval$min) + values$max = c(values$max, rval$max) + values$qual = c(values$qual, 0) + + values$spikes = values$spikes + 1 + + } else { + for (j in 1:length(rval$mean)){ + values$mean = c(values$mean, rval$mean[j]) + values$area = c(values$area, rval$area[j]) + values$nr = c(values$nr, sampname) + values$min = c(values$min, rval$min[1]) + values$max = c(values$max, rval$max[1]) + values$qual = c(values$qual, rval$qual[1]) + } + } + + } else { + + rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height) + + values$mean = c(values$mean, rval$mean) + values$area = c(values$area, rval$area) + values$nr = c(values$nr, sampname) + values$min = c(values$min, rval$min) + values$max = c(values$max, rval$max) + values$qual = c(values$qual, 0) + + values$spikes = values$spikes + 1 + } + } + start=index[i+1] + } + } + + # last little range + end = index[length(index)] + x = as.numeric(names(range)[c(start:end)]) + y = as.vector(range[c(start:end)]) + + # x = as.vector(trimZeros(x,y)[[1]]) + # y = as.vector(trimZeros(x,y)[[2]]) + + if (length(y)!=0) { + + # check if intensity above thresh + if (max(y) < thresh | is.nan(max(y))) { + #start=index[i+1] + # do nothing!! + } else { + + # cat("gaan met de banaan") + # cat(paste("start", start, sep=" ")) + # cat(paste("mz start", x[1], sep=" ")) + # cat(paste("end", end, sep=" ")) + # cat(paste("mz end", x[length(x)], sep=" ")) + + if (length(y)>subRangeLength) { + + y[which(y3) { + + # Check only zeros + if (sum(y)==0) next + + rval = fitGaussianInit(x,y,int.factor,scale,resol,outdir,sampname,scanmode,plot,width,height,i,start,end) + + if (rval$qual[1]==1) { + #cat("Quality = 1!!!") + rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height) + + values$mean = c(values$mean, rval$mean) + values$area = c(values$area, rval$area) + values$nr = c(values$nr, sampname) + values$min = c(values$min, rval$min) + values$max = c(values$max, rval$max) + values$qual = c(values$qual, 0) + + values$spikes = values$spikes + 1 + + } else { + for (j in 1:length(rval$mean)){ + values$mean = c(values$mean, rval$mean[j]) + values$area = c(values$area, rval$area[j]) + values$nr = c(values$nr, sampname) + values$min = c(values$min, rval$min[1]) + values$max = c(values$max, rval$max[1]) + values$qual = c(values$qual, rval$qual[1]) + } + } + } else { + rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height) + + values$mean = c(values$mean, rval$mean) + values$area = c(values$area, rval$area) + values$nr = c(values$nr, sampname) + values$min = c(values$min, rval$min) + values$max = c(values$max, rval$max) + values$qual = c(values$qual, 0) + + values$spikes = values$spikes + 1 + } + } + } + + return(values) + #return(list("mean"=retVal$mean, "area"=retVal$area, "qual"=retVal$qual, "min"=retVal$min, "max"=retVal$max, "nr"=sample.nr, "spikes"=spikes)) +} diff --git a/DIMS/AddOnFunctions/sourceDir.R b/DIMS/AddOnFunctions/sourceDir.R new file mode 100644 index 0000000..abf4c9e --- /dev/null +++ b/DIMS/AddOnFunctions/sourceDir.R @@ -0,0 +1,8 @@ +# source add on functions +sourceDir <- function(path, trace = TRUE, ...) { + for (nm in list.files(path, pattern = "[.][RrSsQq]$")) { + #if(trace) cat(nm,":") + source(file.path(path, nm), ...) + #if(trace) cat("\n") + } +} diff --git a/DIMS/AddOnFunctions/statistics_z.R b/DIMS/AddOnFunctions/statistics_z.R new file mode 100644 index 0000000..0a92d3d --- /dev/null +++ b/DIMS/AddOnFunctions/statistics_z.R @@ -0,0 +1,69 @@ +statistics_z <- function(peaklist, sortCol, adducts){ + # peaklist=as.data.frame(outlist.adducts.HMDB) + # plotdir="./results/plots/adducts" + # filename="./results/allpgrps_stats.txt" + # adducts=TRUE + + # peaklist=outlist.tot + # sortCol=NULL + # adducts=FALSE + + case_label = "P" + control_label = "C" + startcol = dim(peaklist)[2]+3 + + # calculate mean and sd for Control group + ctrl.cols <- grep(control_label, colnames(peaklist),fixed = TRUE) # 5:41 + int.cols <- c(grep(control_label, colnames(peaklist),fixed = TRUE), grep(case_label, colnames(peaklist),fixed = TRUE)) + peaklist[,int.cols][peaklist[,int.cols]==0] = NA + + # tmp = data.matrix(peaklist[ , ctrl.cols], rownames.force = TRUE) + tmp = peaklist[ , ctrl.cols, drop=FALSE] + + peaklist$avg.ctrls = apply(tmp, 1, function(x) mean(as.numeric(x),na.rm = TRUE)) + peaklist$sd.ctrls = apply(tmp, 1, function(x) sd(as.numeric(x),na.rm = TRUE)) + + cnames.z = NULL + + for (i in int.cols) { + # message(i) + cname = colnames(peaklist)[i] + cnames.z = c(cnames.z, paste(cname, "Zscore", sep="_")) + zscores.1col = (as.numeric(as.vector(unlist(peaklist[ , i]))) - peaklist$avg.ctrls) / peaklist$sd.ctrls + peaklist = cbind(peaklist, zscores.1col) + } + + colnames(peaklist)[startcol:ncol(peaklist)] = cnames.z + + z.cols = grep("Zscore", colnames(peaklist),fixed = TRUE) + + if (!adducts){ + if ((dim(peaklist[, z.cols])[2]+6)!=(startcol-1)){ + ppmdev=array(1:dim(peaklist)[1], dim=c(dim(peaklist)[1])) + + # calculate ppm deviation + for (i in 1:dim(peaklist)[1]){ + if (!is.na(peaklist$theormz_HMDB[i]) & !is.null(peaklist$theormz_HMDB[i]) & (peaklist$theormz_HMDB[i]!="")){ + ppmdev[i] = 10^6*(as.numeric(as.vector(peaklist$mzmed.pgrp[i]))-as.numeric(as.vector(peaklist$theormz_HMDB[i])))/as.numeric(as.vector(peaklist$theormz_HMDB[i])) + } else { + ppmdev[i]=NA + } + } + + peaklist = cbind(peaklist[, 1:6], ppmdev=ppmdev, peaklist[ , 7:ncol(peaklist)]) + } + } + + #peaklist = peaklist[order(peaklist[,sortCol]),] + + # # Order on average Z-score + # tmp = peaklist[,grep("Zscore", colnames(peaklist))] + # tmp.p = tmp[,grep("P", colnames(tmp)),drop=FALSE] + # tmp.p.avg = apply(tmp.p, 1, mean) + # + # peaklist = cbind(peaklist, "avg.z.score"=tmp.p.avg) + # peaklist = peaklist[order(abs(tmp.p.avg), decreasing = TRUE),] + + return(peaklist) + +} diff --git a/DIMS/AddOnFunctions/sumCurves.R b/DIMS/AddOnFunctions/sumCurves.R new file mode 100644 index 0000000..bcc1485 --- /dev/null +++ b/DIMS/AddOnFunctions/sumCurves.R @@ -0,0 +1,39 @@ +sumCurves <- function(mean1, mean2, scale1, scale2, sigma1, sigma2, x2, x, resol, plot) { + # mean1=mean[i-1] + # mean2=mean[i] + # scale1=scale[i-1] + # scale2=scale[i] + # sigma1=sigma[i-1] + # sigma2=sigma[i] + + # message("=============================================================> sum 2 curves!") + + sumFit=(scale1*dnorm(x2,mean1,sigma1))+(scale2*dnorm(x2,mean2,sigma2)) + if (plot) lines(x2,sumFit,col="brown") + + #mean1plus2 = mean(c(mean1,mean2)) + mean1plus2 = weighted.mean(c(mean1,mean2),c(max(scale1*dnorm(x2,mean1,sigma1)),max(scale2*dnorm(x2,mean2,sigma2)))) + + if (plot) abline(v = mean1plus2, col="brown") + fwhm = getFwhm(mean1plus2, resol) + half_max = max(sumFit)*0.5 + if (plot) lines(c(mean1plus2 - 0.5*fwhm, mean1plus2 + 0.5*fwhm),c(half_max,half_max),col="orange") + + # sumFit=(scale1*dnorm(x,mean1,sigma1))+(scale2*dnorm(x,mean2,sigma2)) + # fq=abs(sum(y) - sum(sumFit))/sum(y) + + # h2=c(paste("mean =", mean1plus2, sep=" "), + # paste("fq =", fq, sep=" ")) + # + # legend("topright", legend=h2) + + # I assume that the sum of the distributions if also normal, which is not + #area = sum(scale1*dnorm(x2,mean1,sigma1))+sum(scale2*dnorm(x2,mean2,sigma2)) + #area = max(scale1*dnorm(x2,mean1,sigma1))+max(scale2*dnorm(x2,mean2,sigma2)) + area = max(sumFit) + scale = scale1 + scale2 + sigma = (fwhm/2)*0.85 + + return(list("mean"= mean1plus2,"area"=area, "scale"=scale, "sigma"=sigma)) # "qual"=fq + +} diff --git a/DIMS/AddOnFunctions/trimZeros.R b/DIMS/AddOnFunctions/trimZeros.R new file mode 100644 index 0000000..0fc5ea3 --- /dev/null +++ b/DIMS/AddOnFunctions/trimZeros.R @@ -0,0 +1,8 @@ +trimZeros <- function(x, y) { + tmp = which(y==0) + if (length(tmp)!=0){ + y = y[-tmp] + x = x[-tmp] + } + return(list(x,y)) +} diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index 90f510a..a218c4d 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -19,9 +19,9 @@ print(filepath) print(breaks_filepath) print(resol) -#sampname <- sub('\\..*$', '', basename(filepath)) +sampname <- sub('\\..*$', '', basename(filepath)) #cat(paste0("\n", sampname)) -sampname <- "AssignToBins" +# sampname <- "AssignToBins" #suppressPackageStartupMessages(library("Cairo")) options(digits=16) diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf index 872e32f..0bb7855 100644 --- a/DIMS/AssignToBins.nf +++ b/DIMS/AssignToBins.nf @@ -4,12 +4,11 @@ process AssignToBins { shell = ['/bin/bash', '-euo', 'pipefail'] input: - tuple(file_id, path(mzML_filename)) - file(breaks_file) + tuple(file_id, path(mzML_filename), path(breaks_file)) val(resolution) output: - file 'AssignToBins.RData' + path("${file_id}.RData") script: """ diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R new file mode 100644 index 0000000..3bfc484 --- /dev/null +++ b/DIMS/AverageTechReplicates.R @@ -0,0 +1,119 @@ +#!/usr/bin/Rscript +# adapted from 3-AverageTechReplicates.R + +# load required packages +# none + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) cat(" ", arg, "\n", sep="") + +init_filepath <- cmd_args[1] +nr_replicates <- as.numeric(cmd_args[2]) +thresh2remove <- 2000 +dimsThresh <- 100 + +removeFromRepl.pat <- function(bad_samples, repl.pattern, nr_replicates) { + # bad_samples=remove_pos + + tmp = repl.pattern + + removeFromGroup=NULL + + for (i in 1:length(tmp)){ + tmp2 = repl.pattern[[i]] + + remove=NULL + + for (j in 1:length(tmp2)){ + if (tmp2[j] %in% bad_samples){ + #cat(tmp2[j]) + #cat(paste("remove",tmp2[j])) + #cat(paste("remove i",i)) + #cat(paste("remove j",j)) + remove = c(remove, j) + } + } + + if (length(remove)==nr_replicates) removeFromGroup=c(removeFromGroup,i) + if (!is.null(remove)) repl.pattern[[i]]=repl.pattern[[i]][-remove] + } + + if (length(removeFromGroup)!=0) { + repl.pattern=repl.pattern[-removeFromGroup] + } + + return(list("pattern"=repl.pattern)) +} + + +# get repl.pattern +load("./init.RData") + +remove_neg=NULL +remove_pos=NULL +cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n") +for (i in 1:length(repl.pattern)) { + techRepsArray.pos = NULL + techRepsArray.neg = NULL + + tech_reps = as.vector(unlist(repl.pattern[i])) + sum_neg=0 + sum_pos=0 + n_pos=0 + n_neg=0 + cat("\n\nNow sample ", i, " from replication pattern with length ", length(repl.pattern)) + for (j in 1:length(tech_reps)) { + load(paste("./", tech_reps[j], ".RData", sep="")) + cat("\n\nParsing", tech_reps[j]) + + cat("\n\tNegative pklist sum",sum(pklist$neg[,1])) + if (sum(pklist$neg[,1])dimsThresh))==1),1]=0 + if (!is.null(dim(sum_neg))) sum_neg[apply(techRepsArray.neg,1,function(x) length(which(x>dimsThresh))==1),1]=0 + + if (n_neg != 0){ + sum_neg[,1] <- sum_neg[,1]/n_neg + colnames(sum_neg) <- names(repl.pattern)[i] + save(sum_neg, file=paste("./", names(repl.pattern)[i], "_neg.RData", sep="")) + } + if (n_pos != 0) { + sum_pos[,1] <- sum_pos[,1]/n_pos + colnames(sum_pos) <- names(repl.pattern)[i] + save(sum_pos, file=paste("./", names(repl.pattern)[i], "_pos.RData", sep="")) + } +} + +retVal <- removeFromRepl.pat(remove_neg, repl.pattern, nr_replicates) +repl.pattern.filtered <- retVal$pattern +save(repl.pattern.filtered, file="./repl_pattern_negative.RData") +write.table(remove_neg, file="./miss_infusions_neg.txt", row.names=FALSE, col.names=FALSE ,sep= "\t") + +retVal <- removeFromRepl.pat(remove_pos, repl.pattern, nr_replicates) +repl.pattern.filtered <- retVal$pattern +save(repl.pattern.filtered, file="./repl_pattern_positive.RData") +write.table(remove_pos, file="./miss_infusions_pos.txt", row.names=FALSE, col.names=FALSE ,sep= "\t") + + diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf new file mode 100644 index 0000000..aa3b99d --- /dev/null +++ b/DIMS/AverageTechReplicates.nf @@ -0,0 +1,24 @@ +process AverageTechReplicates { + label 'AverageTechReplicates' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(RData_file) + path(init_filepath) + val(nr_replicates) + + output: + path '*_*.RData', emit: binned + path 'repl.pattern.negative.RData' + path 'repl.pattern.positive.RData' + path 'miss_infusions_neg.txt' + path 'miss_infusions_pos.txt' + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_filepath $nr_replicates + """ +} + + diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index 85065ac..22b80c9 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -54,4 +54,6 @@ for (i in 1:nsegment) { } save(breaks.fwhm, breaks.fwhm.avg, trimLeft, trimRight, file=paste(outdir, "breaks.fwhm.RData", sep="/")) +# temporary fix for breaks file: +# save(breaks.fwhm, breaks.fwhm.avg, trimLeft, trimRight, file=paste(outdir/../, "breaks.fwhm.RData", sep="/")) diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf index f73ad54..f60d5cc 100644 --- a/DIMS/GenerateBreaks.nf +++ b/DIMS/GenerateBreaks.nf @@ -8,7 +8,7 @@ process GenerateBreaks { output: - file 'breaks.fwhm.RData' + path 'breaks.fwhm.RData' script: """ diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R new file mode 100644 index 0000000..10fdc37 --- /dev/null +++ b/DIMS/HMDBparts.R @@ -0,0 +1,126 @@ +#!/usr/bin/Rscript + +# load required packages +# none + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) cat(" ", arg, "\n") + +# outdir <- cmd_args[1] +# scanmode <- cmd_args[2] +db_path <- cmd_args[1] # location of HMDB db file +breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData +standard_run <- "no" # cmd_args[5] # "yes" + +# Cut up entire HMDB into small parts based on the new binning/breaks + +# load(paste(outdir, "breaks.fwhm.RData", sep = "/")) +load(breaks_filepath) +#outdir_hmdb <- paste(outdir, "hmdb_part", sep = "/") +#dir.create(outdir_hmdb, showWarnings = FALSE) + +# New: in case of a standard run (m/z 69-606) use external HMDB parts +min_mz <- round(breaks.fwhm[1]) +max_mz <- round(breaks.fwhm[length(breaks.fwhm)]) +# test if standard mz range is used +if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 600 & max_mz < 610) { + # skip generating HMDB parts + use_external_HMDB <- TRUE + save(use_external_HMDB, file="./using_external_HMDB_parts.RData") + hmdb_parts_dir <- cmd_args[6] # "/hpc/dbg_mz/production/DIMS/hmdb_preparts/" + hmdb_parts <- list.files(hmdb_parts_dir, pattern=scanmode) # only positive or negative files + # save(HMDBstukken, file=paste(outdir, "HMDBstukken.RData", sep="/")) + for (hmdb_file in hmdb_parts) { + file.copy(paste(hmdb_parts_dir, hmdb_file, sep="/"), outdir_hmdb, recursive = TRUE) + } +} else { + # generate HMDB parts in case of non-standard mz range + use_external_HMDB <- FALSE + save(use_external_HMDB, file="not_using_external_HMDB_parts.RData") + # db <- cmd_args[3] + load(db_path) + ppm <- as.numeric(cmd_args[4]) + if (scanmode=="negative"){ + label = "MNeg" + HMDB_add_iso=HMDB_add_iso.Neg + } else { + label = "Mpos" + HMDB_add_iso=HMDB_add_iso.Pos + } + + # filter mass range meassured!!! + HMDB_add_iso = HMDB_add_iso[which(HMDB_add_iso[,label]>=breaks.fwhm[1] & HMDB_add_iso[,label]<=breaks.fwhm[length(breaks.fwhm)]),] + + # sort on mass + outlist = HMDB_add_iso[order(as.numeric(HMDB_add_iso[,label])),] + + n=dim(outlist)[1] + sub=20000 # max rows per file + end=0 + min_1_last=sub + check=0 + outlist_part=NULL + + + if (n < sub) { + outlist_part <- outlist + save(outlist_part, file = paste(outdir_hmdb, paste0(scanmode, "_hmdb.1.RData"), sep = "/")) + } else { + + if (n >= sub & (floor(n/sub) - 1) >= 2){ + for (i in 2:floor(n/sub) - 1){ + start <- -(sub - 1) + i*sub + end <- i*sub + + if (i > 1){ + outlist_i = outlist[c(start:end),] + + n_moved = 0 + + # Calculate 3ppm and replace border, avoid cut within peakgroup! + while ((as.numeric(outlist_i[1,label]) - as.numeric(outlist_part[min_1_last,label]))*1e+06/as.numeric(outlist_i[1,label]) < ppm) { + outlist_part <- rbind(outlist_part, outlist_i[1,]) + outlist_i <- outlist_i[-1,] + n_moved <- n_moved + 1 + } + + # message(paste("Process", i-1,":", dim(outlist_part)[1])) + save(outlist_part, file = paste(outdir_hmdb, paste(scanmode, paste("hmdb",i-1,"RData", sep="."), sep="_"), sep = "/")) + check <- check + dim(outlist_part)[1] + + outlist_part <- outlist_i + min_1_last <- dim(outlist_part)[1] + + } else { + outlist_part <- outlist[c(start:end),] + } + } + } + + start <- end + 1 + end <- n + outlist_i <- outlist[c(start:end),] + n_moved <- 0 + + if (!is.null(outlist_part)) { + # Calculate 3ppm and replace border, avoid cut within peakgroup! + while ((as.numeric(outlist_i[1,label]) - as.numeric(outlist_part[min_1_last,label]))*1e+06/as.numeric(outlist_i[1,label]) < ppm) { + outlist_part = rbind(outlist_part, outlist_i[1,]) + outlist_i = outlist_i[-1,] + n_moved = n_moved + 1 + } + + # message(paste("Process", i+1-1,":", dim(outlist_part)[1])) + save(outlist_part, file = paste(scanmode, paste("hmdb", i, "RData", sep = "."), sep = "_")) + check <- check + dim(outlist_part)[1] + } + + outlist_part <- outlist_i + # message(paste("Process", i+2-1,":", dim(outlist_part)[1])) + save(outlist_part, file = paste(scanmode, paste("hmdb", i + 1, "RData", sep="."), sep="_")) + check <- check + dim(outlist_part)[1] + cat("\n", "Check", check == dim(outlist)[1]) + + } +} diff --git a/DIMS/HMDBparts.nf b/DIMS/HMDBparts.nf new file mode 100644 index 0000000..8ed9130 --- /dev/null +++ b/DIMS/HMDBparts.nf @@ -0,0 +1,20 @@ +process HMDBparts { + // Custom process to cut HMDB db into parts + tag {"DIMS HMDBparts"} + label 'HMDBparts' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(hmdb_db_file) + path(breaks_file) + + output: + path('*.RData') + + script: + + """ + Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file + """ +} diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf index 064225c..4908047 100644 --- a/DIMS/MakeInit.nf +++ b/DIMS/MakeInit.nf @@ -7,7 +7,7 @@ process MakeInit { tuple(path(samplesheet), val(nr_replicates)) output: - file 'init.RData' + path 'init.RData' script: """ diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R new file mode 100644 index 0000000..4820ef5 --- /dev/null +++ b/DIMS/PeakFinding.R @@ -0,0 +1,68 @@ +#!/usr/bin/Rscript + +# load required packages +# none + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) cat(" ", arg, "\n", sep="") + +filepath <- cmd_args[1] +resol <- as.numeric(cmd_args[2]) +scripts <- cmd_args[3] +breaks_filepath <- cmd_args[4] # location of breaks.fwhm.RData + +thresh <- 2000 + +# if file extension is .txt, do nothing. +if (grepl(".txt$", filepath) { stop("not run on txt file") } + +# create output folder +dir.create(paste(outdir, "4-specpks", sep="/"),showWarnings = F) + +# load in function scripts +source(paste(scripts, "AddOnFunctions/sourceDir.R", sep="/")) +sourceDir(paste(scripts, "AddOnFunctions", sep="/")) + +# load(paste(outdir, "breaks.fwhm.RData", sep="/")) +load(breaks_filepath) +load(filepath) +if (grepl("_pos", filepath)) { scanmode = "positive" } else + if (grepl("_neg", filepath)) { scanmode = "negative" } + +options(digits=16) +int.factor=1*10^5 # Number of x used to calc area under Gaussian (is not analytic) +scale=2 # Initial value used to estimate scaling parameter +width=1024 +height=768 + +### fit Gaussian estimate mean and integrate to obtain intensity +findPeaks.Gauss.HPC <- function(plist, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, plot, thresh, width, height) { + sampname <- colnames(plist)[1] + + range = as.vector(plist) + names(range) = rownames(plist) + + values = list("mean"=NULL,"area"=NULL,"nr"=NULL,"min"=NULL,"max"=NULL,"qual"=NULL,"spikes"=0) + + values = searchMZRange(range,values,int.factor,scale,resol,outdir,sampname,scanmode,plot,width,height,thresh) + + outlist.persample=NULL + outlist.persample=cbind("samplenr"=values$nr, "mzmed.pkt"=values$mean, "fq"=values$qual, "mzmin.pkt"=values$min, "mzmax.pkt"=values$max, "height.pkt"=values$area) + + index=which(outlist.persample[,"height.pkt"]==0) + if (length(index)>0) { + outlist.persample=outlist.persample[-index,] + } + + save(outlist.persample, file=paste(outdir, "4-specpks", paste(sampname, "_", scanmode, ".RData", sep=""), sep="/")) + + cat(paste("There were", values$spikes, "spikes!")) +} + + +if (scanmode == "negative") { + findPeaks.Gauss.HPC(sum_neg, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) +} else { + findPeaks.Gauss.HPC(sum_pos, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) +} diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf new file mode 100644 index 0000000..fb86239 --- /dev/null +++ b/DIMS/PeakFinding.nf @@ -0,0 +1,23 @@ +process PeakFinding { + label 'PeakFinding' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(RData_file) + val(resolution) + path(scripts) + path(breaks_file) + + output: + path '*_neg.RData' + path '*_pos.RData' + path 'repl.pattern.*.RData', emit: pattern + path 'miss_infusions_neg.txt' + path 'miss_infusions_pos.txt' + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $RData_file $resolution $scripts $breaks_file + """ +} diff --git a/DIMS/PeakGroupingIdentified.R b/DIMS/PeakGroupingIdentified.R new file mode 100644 index 0000000..7c8d998 --- /dev/null +++ b/DIMS/PeakGroupingIdentified.R @@ -0,0 +1,180 @@ +#!/usr/bin/Rscript + +# load required packages +# none + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) cat(" ", arg, "\n", sep="") + +SpecPeaks_file <- cmd_args[1] +#outdir <- cmd_args[2] +#scanmode <- cmd_args[3] +HMDB_part_file <- cmd_args[2] +pattern_file <- cmd_args[3] +resol <- as.numeric(cmd_args[4]) +ppm <- as.numeric(cmd_args[5]) + +options(digits=16) + +cat(paste("File to group:", fileIn)) + + +options(digits=16) +load(HMDB_part_file) # outlist_part(HMDB_add_iso) +HMDB_add_iso = outlist_part + +# Windows and Unix-like +batch = strsplit(fileIn, "/",fixed = TRUE)[[1]] +batch = batch[length(batch)] +batch = strsplit(batch, ".",fixed = TRUE)[[1]][2] + +load(SpecPeaks_file) +outlist.copy = outlist.tot +rm(outlist.tot) + +load(pattern_file) +load("./breaks.fwhm.RData") + +outpgrlist.identified = NULL + +if (scanmode=="negative"){ + label = "MNeg" +} else { + label = "Mpos" +} + +outlist.grouped <- NULL + +# First group on HMDB masses +while (dim(HMDB_add_iso)[1] > 0) { + index = 1 + # message(HMDB_add_iso[index,"CompoundName"]) + + mass = as.numeric(HMDB_add_iso[index,label]) + mtol = (mass*ppm)/10^6 + + mzmed = as.numeric(outlist.copy[,"mzmed.pkt"]) + selp = which((mzmed > (mass - mtol)) & (mzmed < (mass + mtol))) + tmplist = outlist.copy[selp,,drop=FALSE] + outlist.grouped <- rbind(outlist.grouped, tmplist) + + nrsamples = length(selp) + if (nrsamples > 0) { + # message(paste("Bingo ",n)) + + mzmed.pgrp = mean(as.numeric(outlist.copy[selp, "mzmed.pkt"])) + # mzmed.pgrp = median(as.numeric(outlist.copy[selp, "mzmed.pkt"])) + mzmin.pgrp = mass - mtol + mzmax.pgrp = mass + mtol + + fq.worst.pgrp = as.numeric(max(outlist.copy[selp, "fq"])) + fq.best.pgrp = as.numeric(min(outlist.copy[selp, "fq"])) + ints.allsamps = rep(0, length(names(repl.pattern.filtered))) + names(ints.allsamps) = names(repl.pattern.filtered) # same order as sample list!!! + + # # Check for each sample if multiple peaks exists, if so take the sum! + labels=unique(tmplist[,"samplenr"]) + ints.allsamps[labels] = as.vector(unlist(lapply(labels, function(x) {sum(as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"]))}))) + # ints.allsamps[labels] = as.vector(unlist(lapply(labels, function(x) {as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"])}))) + + # Identification + # assi_HMDB = iso_HMDB = HMDB_code = "" + assi_HMDB = iso_HMDB = HMDB_code = NA + tmplist.mass.iso = tmplist.mass.adduct = NULL + # index = which(HMDB_add_iso[,label]==mass) + + # Consider groups off 4ppm + mass.all = as.numeric(HMDB_add_iso[,label]) + index = which((mass.all > (mass - mtol)) & (mass.all < (mass + mtol))) + # index = which(mass.all < (mass + 2*mtol)) + tmplist.mass = HMDB_add_iso[index,,drop=FALSE] + + if (dim(tmplist.mass)[1]>0) { + + index.iso = grep(" iso ", tmplist.mass[, "CompoundName"], fixed = TRUE) + if (length(index.iso)>0){ + tmplist.mass.iso = tmplist.mass[index.iso,,drop=FALSE] + tmplist.mass = tmplist.mass[-index.iso,,drop=FALSE] + } + + if (dim(tmplist.mass)[1]>0) { + index.adduct = grep(" [M", tmplist.mass[, "CompoundName"], fixed = TRUE) + if (length(index.adduct)>0){ + tmplist.mass.adduct = tmplist.mass[index.adduct,,drop=FALSE] + tmplist.mass = tmplist.mass[-index.adduct,,drop=FALSE] + } + } + + # First compouds without adducts or isotopes + if (dim(tmplist.mass)[1]>0) { + + # pure compounds + assi_HMDB = as.character(paste(as.character(tmplist.mass[, "CompoundName"]), collapse = ";")) + HMDB_code = as.character(paste(as.character(rownames(tmplist.mass)), collapse = ";")) + theormz_HMDB = as.numeric(tmplist.mass[1,label]) + + # adducts + if (!is.null(tmplist.mass.adduct)) { + if (dim(tmplist.mass.adduct)[1]>0) { + if (is.na(assi_HMDB)){ + assi_HMDB = as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")) + HMDB_code = as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")) + } else { + assi_HMDB = paste(assi_HMDB, as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")), sep = ";") + HMDB_code = paste(HMDB_code, as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")), sep = ";") + }}} + + # isotopes + if (!is.null(tmplist.mass.iso)) { + if (dim(tmplist.mass.iso)[1]>0) { + iso_HMDB = as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";")) + }} + + # No pure compounts + } else if (!is.null(tmplist.mass.adduct)) { + + theormz_HMDB = as.numeric(tmplist.mass.adduct[1,label]) + + # adducts + if (!is.null(tmplist.mass.adduct)) { + if (dim(tmplist.mass.adduct)[1]>0) { + if (is.na(assi_HMDB)){ + assi_HMDB = as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")) + HMDB_code = as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")) + } else { + assi_HMDB = paste(assi_HMDB, as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")), sep = ";") + HMDB_code = paste(HMDB_code, as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")), sep = ";") + }}} + + # isotopes + if (!is.null(tmplist.mass.iso)) { + if (dim(tmplist.mass.iso)[1]>0) { + iso_HMDB = as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";")) + }} + + # only isotopes + } else if (!is.null(tmplist.mass.iso)) { + + if (dim(tmplist.mass.iso)[1]>0) { + theormz_HMDB = as.numeric(tmplist.mass.iso[1,label]) + iso_HMDB = as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";")) + } + } + + } + + outpgrlist.identified = rbind(outpgrlist.identified, cbind(data.frame(mzmed.pgrp, "fq.best"=fq.best.pgrp, "fq.worst"=fq.worst.pgrp, nrsamples, mzmin.pgrp, mzmax.pgrp), + t(as.matrix(ints.allsamps)), + data.frame(assi_HMDB, iso_HMDB, HMDB_code, theormz_HMDB))) + } + + HMDB_add_iso = HMDB_add_iso[-index,] + +} + +#dir.create(paste(outdir, "6-grouping_hmdb", sep="/"), showWarnings = FALSE) +save(outpgrlist.identified, file=paste(paste(outdir, "6-grouping_hmdb", sep="/"), paste(paste(batch, scanmode, sep="_"), "RData", sep="."), sep="/")) + +#dir.create(paste(outdir, "6-grouping_hmdb_done", sep="/"), showWarnings = FALSE) +save(outlist.grouped, file=paste(paste(outdir, "6-grouping_hmdb_grouped", sep="/"), paste(paste(batch, scanmode, sep="_"), "RData", sep="."), sep="/")) diff --git a/DIMS/PeakGroupingIdentified.nf b/DIMS/PeakGroupingIdentified.nf new file mode 100644 index 0000000..8100780 --- /dev/null +++ b/DIMS/PeakGroupingIdentified.nf @@ -0,0 +1,18 @@ +process PeakGroupingIdentified { + label 'PeakGroupingIdentified' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(HMDBpart_file) + val(resolution) + + output: + path 'negative.RData' + path 'positive.RData' + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/PeakGroupingIdentified.R $HMDBpart_file $resolution $ppm + """ +} diff --git a/DIMS/SpectrumPeakFinding.R b/DIMS/SpectrumPeakFinding.R new file mode 100644 index 0000000..f85d9c5 --- /dev/null +++ b/DIMS/SpectrumPeakFinding.R @@ -0,0 +1,77 @@ +#!/usr/bin/Rscript + +# load required packages +# none + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) cat(" ", arg, "\n") + +# outdir <- cmd_args[1] +# scanmode <- cmd_args[2] +scanmodes <- c("positive", "negative") + +# Check if all jobs terminated correct! +notRun = NULL + +# new: run for each scanmode +for (scanmode in scanmodes) { + load(paste0("./repl.pattern.", scanmode, ".RData")) + groupNames = names(repl.pattern.filtered) + + indir <- "./" + object.files = list.files(indir, full.names=TRUE, pattern="*.RData") + + for (i in 1:length(groupNames)) { + group <- paste0(indir, "/", paste0(paste(groupNames[i], scanmode, sep = "_"), ".RData")) + if (!(group %in% object.files)) { + notRun = c(notRun, group) + } + } + + #if (is.null(notRun)){ + cat("\nCollecting samples!") + + # negative + filepath <- "./") + files <- list.files(filepath, recursive=TRUE, full.names=TRUE, pattern=paste("*_", scanmode, ".RData",sep="")) + + outlist.tot=NULL + for (i in 1:length(files)) { + + cat("\n", files[i]) + load(files[i]) + + if (is.null(outlist.persample) || (dim(outlist.persample)[1]==0)){ + tmp=strsplit(files[i], "/")[[1]] + fname = tmp[length(tmp)] + #fname = strsplit(files[i], "/")[[1]][8] + fname = strsplit(fname, ".RData")[[1]][1] + fname = substr(fname, 13, nchar(fname)) + + if (i == 1) { outlist.tot <- c(fname, rep("-1",5)) } else { outlist.tot <- rbind(outlist.tot, c(fname, rep("-1",5)))} + } else { + if (i == 1) { outlist.tot <- outlist.persample } else { outlist.tot <- rbind(outlist.tot, outlist.persample)} + } + } + + # remove negative values + index=which(outlist.tot[,"height.pkt"]<=0) + if (length(index)>0) outlist.tot = outlist.tot[-index,] + index=which(outlist.tot[,"mzmed.pkt"]<=0) + if (length(index)>0) outlist.tot = outlist.tot[-index,] + + # outdir_specpks <- paste(outdir, "5-specpks_all", sep = "/") + # dir.create(outdir_specpks, showWarnings = F) + # save(outlist.tot, file = paste(outdir_specpks, paste(scanmode, "RData", sep = "."), sep = "/")) + save(outlist.tot, file = paste(scanmode, "RData", sep = ".") + + + if (!is.null(notRun)){ + for (i in 1:length(notRun)){ + message(paste(notRun[i], "was not generated")) + } + } + +} + diff --git a/DIMS/SpectrumPeakFinding.nf b/DIMS/SpectrumPeakFinding.nf new file mode 100644 index 0000000..66e04a8 --- /dev/null +++ b/DIMS/SpectrumPeakFinding.nf @@ -0,0 +1,17 @@ +process SpectrumPeakFinding { + label 'SpectrumPeakFinding' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(RData_file) + + output: + path 'negative.RData' + path 'positive.RData' + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/SpectrumPeakFinding.R $RData_file + """ +} From 5461f865ab93e04852f7c791d79d2c0133fd5355 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 11 Aug 2023 15:23:30 +0200 Subject: [PATCH 06/73] added ThermoRawFileParser.nf to CustomModules --- DIMS/ThermoRawFileParser.nf | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 DIMS/ThermoRawFileParser.nf diff --git a/DIMS/ThermoRawFileParser.nf b/DIMS/ThermoRawFileParser.nf new file mode 100644 index 0000000..af86aa3 --- /dev/null +++ b/DIMS/ThermoRawFileParser.nf @@ -0,0 +1,19 @@ +process ConvertRawFile { + // Custom process to convert raw file to mzML format + tag {"DIMS ConvertRawFile ${file_id}"} + label 'ThermoRawFileParser_1_1_11' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + tuple(file_id, path(raw_file)) + + output: + tuple(file_id, path("${file_id}.mzML")) + + script: + + """ + source /hpc/dbg_mz/tools/mono/etc/profile + mono /hpc/dbg_mz/tools/ThermoRawFileParser_1.1.11/ThermoRawFileParser.exe -i=${raw_file} --output=./ + """ +} From 0d8f0f0a5e72f69685dde4eb2551b7291e13ce33 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Mon, 14 Aug 2023 13:55:37 +0200 Subject: [PATCH 07/73] added file for listing of raw files --- DIMS/Utils/RawFiles.nf | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 DIMS/Utils/RawFiles.nf diff --git a/DIMS/Utils/RawFiles.nf b/DIMS/Utils/RawFiles.nf new file mode 100644 index 0000000..8cc0efe --- /dev/null +++ b/DIMS/Utils/RawFiles.nf @@ -0,0 +1,12 @@ +def extractRawfilesFromDir(dir) { + // Original code from: https://github.com/SciLifeLab/Sarek - MIT License - Copyright (c) 2016 SciLifeLab + dir = dir.tokenize().collect{"$it/*.raw"} + Channel + .fromPath(dir, type:'file') + .ifEmpty { error "No raw files found in ${dir}." } + .map { rawfiles_path -> + def file_id = rawfiles_path.getSimpleName() + [file_id, rawfiles_path] + } +} + From 4c29eb8818fd41cd9da0d3061bf6878d1d6459e4 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Mon, 21 Aug 2023 16:17:05 +0200 Subject: [PATCH 08/73] new files added, existing files modified --- DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R | 24 +++ DIMS/AddOnFunctions/removeFromRepl.pat.R | 31 ++++ DIMS/AssignToBins.R | 31 ++-- DIMS/AssignToBins.nf | 3 +- DIMS/AverageTechReplicates.R | 48 +++--- DIMS/AverageTechReplicates.nf | 12 +- DIMS/GenerateBreaks.R | 22 ++- DIMS/HMDBparts.R | 181 +++++++++++----------- DIMS/HMDBparts.nf | 5 +- DIMS/HMDBparts_old.R | 126 +++++++++++++++ DIMS/MakeInit.R | 15 +- DIMS/PeakFinding.R | 100 ++++++------ DIMS/PeakFinding.nf | 15 +- DIMS/PeakGroupingIdentified.R | 2 +- DIMS/PeakGroupingIdentified.nf | 9 +- DIMS/SpectrumPeakFinding.R | 117 +++++++------- DIMS/SpectrumPeakFinding.nf | 3 +- 17 files changed, 464 insertions(+), 280 deletions(-) create mode 100644 DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R create mode 100644 DIMS/AddOnFunctions/removeFromRepl.pat.R create mode 100644 DIMS/HMDBparts_old.R diff --git a/DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R b/DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R new file mode 100644 index 0000000..c00aa92 --- /dev/null +++ b/DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R @@ -0,0 +1,24 @@ +### fit Gaussian estimate mean and integrate to obtain intensity +findPeaks.Gauss.HPC <- function(plist, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, plot, thresh, width, height) { + sampname <- colnames(plist)[1] + + range <- as.vector(plist) + names(range) <- rownames(plist) + + values <- list("mean"=NULL, "area"=NULL, "nr"=NULL, "min"=NULL, "max"=NULL, "qual"=NULL, "spikes"=0) + + values <- searchMZRange(range, values, int.factor, scale, resol, outdir, sampname, scanmode, plot, width, height, thresh) + + outlist.persample <- NULL + outlist.persample <- cbind("samplenr"=values$nr, "mzmed.pkt"=values$mean, "fq"=values$qual, "mzmin.pkt"=values$min, "mzmax.pkt"=values$max, "height.pkt"=values$area) + + index <- which(outlist.persample[ ,"height.pkt"]==0) + if (length(index) > 0) { + outlist.persample <- outlist.persample[-index,] + } + + # save(outlist.persample, file=paste(outdir, paste(sampname, "_", scanmode, ".RData", sep=""), sep="/")) + save(outlist.persample, file=paste("./", sampname, "_", scanmode, ".RData", sep="")) + + cat(paste("There were", values$spikes, "spikes!")) +} diff --git a/DIMS/AddOnFunctions/removeFromRepl.pat.R b/DIMS/AddOnFunctions/removeFromRepl.pat.R new file mode 100644 index 0000000..ad5b1c8 --- /dev/null +++ b/DIMS/AddOnFunctions/removeFromRepl.pat.R @@ -0,0 +1,31 @@ +removeFromRepl.pat <- function(bad_samples, repl_pattern, nr_replicates) { + + tmp = repl_pattern + + removeFromGroup=NULL + + for (i in 1:length(tmp)){ + tmp2 = repl_pattern[[i]] + + remove=NULL + + for (j in 1:length(tmp2)){ + if (tmp2[j] %in% bad_samples){ + #cat(tmp2[j]) + #cat(paste("remove",tmp2[j])) + #cat(paste("remove i",i)) + #cat(paste("remove j",j)) + remove = c(remove, j) + } + } + + if (length(remove)==nr_replicates) removeFromGroup=c(removeFromGroup,i) + if (!is.null(remove)) repl_pattern[[i]]=repl_pattern[[i]][-remove] + } + + if (length(removeFromGroup)!=0) { + repl_pattern=repl_pattern[-removeFromGroup] + } + + return(list("pattern"=repl_pattern)) +} diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index a218c4d..c6fdaba 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -1,8 +1,6 @@ #!/usr/bin/Rscript ## adapted from 2-DIMS.R -#.libPaths(new = "/hpc/local/CentOS7/dbg_mz/R_libs/3.6.2") - # load required packages suppressPackageStartupMessages(library("xcms")) @@ -20,10 +18,7 @@ print(breaks_filepath) print(resol) sampname <- sub('\\..*$', '', basename(filepath)) -#cat(paste0("\n", sampname)) -# sampname <- "AssignToBins" -#suppressPackageStartupMessages(library("Cairo")) options(digits=16) ### process one sample at a time and find peaks FOR BOTH SCAN MODES! # @@ -32,11 +27,11 @@ scale=2 # Initial value used to estimate scaling parameter width=1024 height=768 -# Aggregate +# Initiate trimLeft=NULL trimRight=NULL -breaks.fwhm=NULL -breaks.fwhm.avg=NULL +breaks_fwhm=NULL +breaks_fwhm_avg=NULL bins=NULL pos_results=NULL neg_results=NULL @@ -44,11 +39,11 @@ neg_results=NULL # read in the data for 1 sample raw_data <- suppressMessages(xcmsRaw(filepath)) -# load breaks.fwhm +# load breaks_fwhm load(breaks_filepath) # Create empty placeholders for later use -bins <- rep(0, length(breaks.fwhm) - 1) +bins <- rep(0, length(breaks_fwhm) - 1) pos_bins <- bins neg_bins <- bins @@ -70,8 +65,8 @@ pos_raw_data_matrix <- raw_data_matrix[pos_index, ] neg_raw_data_matrix <- raw_data_matrix[neg_index, ] # Get index for binning intensity values -bin_indices_pos <- cut(pos_raw_data_matrix[ ,"mz"], breaks.fwhm, include.lowest=TRUE, right=TRUE, labels=FALSE) -bin_indices_neg <- cut(neg_raw_data_matrix[ ,"mz"], breaks.fwhm, include.lowest=TRUE, right=TRUE, labels=FALSE) +bin_indices_pos <- cut(pos_raw_data_matrix[ ,"mz"], breaks_fwhm, include.lowest=TRUE, right=TRUE, labels=FALSE) +bin_indices_neg <- cut(neg_raw_data_matrix[ ,"mz"], breaks_fwhm, include.lowest=TRUE, right=TRUE, labels=FALSE) # Get the list of intensity values for each bin, and add the # intensity values which are in the same bin @@ -109,19 +104,19 @@ neg_results_transpose = t(neg_results) rownames(pos_results_transpose) = sampname rownames(neg_results_transpose) = sampname -# delete the last value of breaks.fwhm.avg to match dimensions of pos_results and neg_results -breaks.fwhm.avg.minus1 <- breaks.fwhm.avg[-length(breaks.fwhm.avg)] +# delete the last value of breaks_fwhm_avg to match dimensions of pos_results and neg_results +breaks_fwhm_avg_minus1 <- breaks_fwhm_avg[-length(breaks_fwhm_avg)] # Format as string and show precision of float to 5 digits -breaks.fwhm.avg.minus1 <- sprintf("%.5f", breaks.fwhm.avg.minus1) +breaks_fwhm_avg_minus1 <- sprintf("%.5f", breaks_fwhm_avg_minus1) # Use this as the column names -colnames(pos_results_transpose) <- breaks.fwhm.avg.minus1 -colnames(neg_results_transpose) <- breaks.fwhm.avg.minus1 +colnames(pos_results_transpose) <- breaks_fwhm_avg_minus1 +colnames(neg_results_transpose) <- breaks_fwhm_avg_minus1 # transpose back pos_results_final <- t(pos_results_transpose) neg_results_final <- t(neg_results_transpose) -pklist <- list("pos"=pos_results_final, "neg"=neg_results_final, "breaksFwhm"=breaks.fwhm) +pklist <- list("pos"=pos_results_final, "neg"=neg_results_final, "breaksFwhm"=breaks_fwhm) save(pklist, file=paste("./", sampname, ".RData", sep="")) diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf index 0bb7855..c7103cd 100644 --- a/DIMS/AssignToBins.nf +++ b/DIMS/AssignToBins.nf @@ -5,14 +5,13 @@ process AssignToBins { input: tuple(file_id, path(mzML_filename), path(breaks_file)) - val(resolution) output: path("${file_id}.RData") script: """ - Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_filename $breaks_file $resolution + Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_filename $breaks_file $params.resolution """ } diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index 3bfc484..4303a99 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -1,8 +1,8 @@ #!/usr/bin/Rscript # adapted from 3-AverageTechReplicates.R -# load required packages -# none +# load required functions +# source(paste(scripts_dir, "AddOnFunctions/removeFromRepl.patR", sep="")) # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) @@ -13,15 +13,15 @@ nr_replicates <- as.numeric(cmd_args[2]) thresh2remove <- 2000 dimsThresh <- 100 -removeFromRepl.pat <- function(bad_samples, repl.pattern, nr_replicates) { +removeFromRepl.pat <- function(bad_samples, repl_pattern, nr_replicates) { # bad_samples=remove_pos - tmp = repl.pattern + tmp = repl_pattern removeFromGroup=NULL for (i in 1:length(tmp)){ - tmp2 = repl.pattern[[i]] + tmp2 = repl_pattern[[i]] remove=NULL @@ -36,33 +36,33 @@ removeFromRepl.pat <- function(bad_samples, repl.pattern, nr_replicates) { } if (length(remove)==nr_replicates) removeFromGroup=c(removeFromGroup,i) - if (!is.null(remove)) repl.pattern[[i]]=repl.pattern[[i]][-remove] + if (!is.null(remove)) repl_pattern[[i]]=repl_pattern[[i]][-remove] } if (length(removeFromGroup)!=0) { - repl.pattern=repl.pattern[-removeFromGroup] + repl_pattern=repl_pattern[-removeFromGroup] } - return(list("pattern"=repl.pattern)) + return(list("pattern"=repl_pattern)) } -# get repl.pattern +# get repl_pattern load("./init.RData") remove_neg=NULL remove_pos=NULL cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n") -for (i in 1:length(repl.pattern)) { +for (i in 1:length(repl_pattern)) { techRepsArray.pos = NULL techRepsArray.neg = NULL - tech_reps = as.vector(unlist(repl.pattern[i])) + tech_reps = as.vector(unlist(repl_pattern[i])) sum_neg=0 sum_pos=0 n_pos=0 n_neg=0 - cat("\n\nNow sample ", i, " from replication pattern with length ", length(repl.pattern)) + cat("\n\nNow sample ", i, " from replication pattern with length ", length(repl_pattern)) for (j in 1:length(tech_reps)) { load(paste("./", tech_reps[j], ".RData", sep="")) cat("\n\nParsing", tech_reps[j]) @@ -96,24 +96,24 @@ for (i in 1:length(repl.pattern)) { if (n_neg != 0){ sum_neg[,1] <- sum_neg[,1]/n_neg - colnames(sum_neg) <- names(repl.pattern)[i] - save(sum_neg, file=paste("./", names(repl.pattern)[i], "_neg.RData", sep="")) + colnames(sum_neg) <- names(repl_pattern)[i] + save(sum_neg, file=paste("./", names(repl_pattern)[i], "_neg_avg.RData", sep="")) } if (n_pos != 0) { sum_pos[,1] <- sum_pos[,1]/n_pos - colnames(sum_pos) <- names(repl.pattern)[i] - save(sum_pos, file=paste("./", names(repl.pattern)[i], "_pos.RData", sep="")) + colnames(sum_pos) <- names(repl_pattern)[i] + save(sum_pos, file=paste("./", names(repl_pattern)[i], "_pos_avg.RData", sep="")) } } -retVal <- removeFromRepl.pat(remove_neg, repl.pattern, nr_replicates) -repl.pattern.filtered <- retVal$pattern -save(repl.pattern.filtered, file="./repl_pattern_negative.RData") -write.table(remove_neg, file="./miss_infusions_neg.txt", row.names=FALSE, col.names=FALSE ,sep= "\t") +retVal <- removeFromRepl.pat(remove_neg, repl_pattern, nr_replicates) +repl_pattern_filtered <- retVal$pattern +save(repl_pattern_filtered, file="./negative_repl_pattern.RData") +write.table(remove_neg, file="./miss_infusions_negative.txt", row.names=FALSE, col.names=FALSE ,sep= "\t") -retVal <- removeFromRepl.pat(remove_pos, repl.pattern, nr_replicates) -repl.pattern.filtered <- retVal$pattern -save(repl.pattern.filtered, file="./repl_pattern_positive.RData") -write.table(remove_pos, file="./miss_infusions_pos.txt", row.names=FALSE, col.names=FALSE ,sep= "\t") +retVal <- removeFromRepl.pat(remove_pos, repl_pattern, nr_replicates) +repl_pattern_filtered <- retVal$pattern +save(repl_pattern_filtered, file="./positive_repl_pattern.RData") +write.table(remove_pos, file="./miss_infusions_positive.txt", row.names=FALSE, col.names=FALSE ,sep= "\t") diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf index aa3b99d..26f9211 100644 --- a/DIMS/AverageTechReplicates.nf +++ b/DIMS/AverageTechReplicates.nf @@ -6,18 +6,16 @@ process AverageTechReplicates { input: path(RData_file) path(init_filepath) - val(nr_replicates) output: - path '*_*.RData', emit: binned - path 'repl.pattern.negative.RData' - path 'repl.pattern.positive.RData' - path 'miss_infusions_neg.txt' - path 'miss_infusions_pos.txt' + path '*_repl_pattern.RData', emit: patterns + path '*_avg.RData', emit: binned + path 'miss_infusions_negative.txt' + path 'miss_infusions_positive.txt' script: """ - Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_filepath $nr_replicates + Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_filepath $params.nr_replicates """ } diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index 22b80c9..285da0d 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -2,8 +2,6 @@ ## adapted from 1-generateBreaksFwhm.HPC.R ## #!/usr/bin/Rscript -# .libPaths(new = "/hpc/local/CentOS7/dbg_mz/R_libs/3.2.2") - # load required package suppressPackageStartupMessages(library("xcms")) @@ -19,8 +17,8 @@ resol <- as.numeric(cmd_args[4]) # 140000 # initialize trimLeft = NULL trimRight = NULL -breaks.fwhm = NULL -breaks.fwhm.avg = NULL +breaks_fwhm = NULL +breaks_fwhm_avg = NULL bins = NULL posRes = NULL negRes = NULL @@ -28,7 +26,7 @@ negRes = NULL # read in mzML file raw_data <- suppressMessages(xcmsRaw(filepath)) -# trim scans at the start and end +# trim (remove) scans at the start and end trimLeft = round(raw_data@scantime[length(raw_data@scantime)*trim]) trimRight = round(raw_data@scantime[length(raw_data@scantime)*(1-trim)]) @@ -44,16 +42,16 @@ segment = seq(from=lowMZ, to=highMZ, length.out=nsegment+1) for (i in 1:nsegment) { startsegm <- segment[i] endsegm <- segment[i+1] - resol.mz <- resol*(1/sqrt(2)^(log2(startsegm/200))) - fwhmsegm <- startsegm/resol.mz - breaks.fwhm <- c(breaks.fwhm, seq(from=(startsegm + fwhmsegm), to=endsegm, by=0.2*fwhmsegm)) + resol_mz <- resol*(1/sqrt(2)^(log2(startsegm/200))) + fwhmsegm <- startsegm/resol_mz + breaks_fwhm <- c(breaks_fwhm, seq(from=(startsegm + fwhmsegm), to=endsegm, by=0.2*fwhmsegm)) # average the m/z instead of start value range = seq(from=(startsegm + fwhmsegm), to=endsegm, by=0.2*fwhmsegm) deltaMZ = range[2] - range[1] - breaks.fwhm.avg <- c(breaks.fwhm.avg, range + 0.5*deltaMZ) + breaks_fwhm_avg <- c(breaks_fwhm_avg, range + 0.5*deltaMZ) } -save(breaks.fwhm, breaks.fwhm.avg, trimLeft, trimRight, file=paste(outdir, "breaks.fwhm.RData", sep="/")) -# temporary fix for breaks file: -# save(breaks.fwhm, breaks.fwhm.avg, trimLeft, trimRight, file=paste(outdir/../, "breaks.fwhm.RData", sep="/")) +# remove one of these: +# save(breaks_fwhm, breaks_fwhm_avg, trimLeft, trimRight, file=paste(outdir, "breaks.fwhm.RData", sep="/")) +save(breaks_fwhm, breaks_fwhm_avg, trimLeft, trimRight, file="./breaks.fwhm.RData") diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R index 10fdc37..6fb1816 100644 --- a/DIMS/HMDBparts.R +++ b/DIMS/HMDBparts.R @@ -1,7 +1,5 @@ #!/usr/bin/Rscript - -# load required packages -# none +# adapted from hmdb_parts.R # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) @@ -11,116 +9,123 @@ for (arg in cmd_args) cat(" ", arg, "\n") # scanmode <- cmd_args[2] db_path <- cmd_args[1] # location of HMDB db file breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData -standard_run <- "no" # cmd_args[5] # "yes" +standard_run <- cmd_args[3] # "yes" # Cut up entire HMDB into small parts based on the new binning/breaks -# load(paste(outdir, "breaks.fwhm.RData", sep = "/")) load(breaks_filepath) -#outdir_hmdb <- paste(outdir, "hmdb_part", sep = "/") -#dir.create(outdir_hmdb, showWarnings = FALSE) -# New: in case of a standard run (m/z 69-606) use external HMDB parts -min_mz <- round(breaks.fwhm[1]) -max_mz <- round(breaks.fwhm[length(breaks.fwhm)]) +# In case of a standard run (m/z 69-606) use external HMDB parts +min_mz <- round(breaks_fwhm[1]) +max_mz <- round(breaks_fwhm[length(breaks_fwhm)]) # test if standard mz range is used if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 600 & max_mz < 610) { # skip generating HMDB parts use_external_HMDB <- TRUE save(use_external_HMDB, file="./using_external_HMDB_parts.RData") hmdb_parts_dir <- cmd_args[6] # "/hpc/dbg_mz/production/DIMS/hmdb_preparts/" - hmdb_parts <- list.files(hmdb_parts_dir, pattern=scanmode) # only positive or negative files + # hmdb_parts <- list.files(hmdb_parts_dir, pattern=scanmode) # only positive or negative files + hmdb_parts <- list.files(hmdb_parts_dir, pattern=hmdb) # all files containing hmdb in file name # save(HMDBstukken, file=paste(outdir, "HMDBstukken.RData", sep="/")) for (hmdb_file in hmdb_parts) { - file.copy(paste(hmdb_parts_dir, hmdb_file, sep="/"), outdir_hmdb, recursive = TRUE) + # file.copy(paste(hmdb_parts_dir, hmdb_file, sep="/"), outdir_hmdb, recursive = TRUE) + file.copy(paste(hmdb_parts_dir, hmdb_file, sep="/"), "./", recursive = TRUE) } } else { # generate HMDB parts in case of non-standard mz range use_external_HMDB <- FALSE - save(use_external_HMDB, file="not_using_external_HMDB_parts.RData") + save(use_external_HMDB, file="./not_using_external_HMDB_parts.RData") # db <- cmd_args[3] load(db_path) ppm <- as.numeric(cmd_args[4]) - if (scanmode=="negative"){ - label = "MNeg" - HMDB_add_iso=HMDB_add_iso.Neg - } else { - label = "Mpos" - HMDB_add_iso=HMDB_add_iso.Pos - } - # filter mass range meassured!!! - HMDB_add_iso = HMDB_add_iso[which(HMDB_add_iso[,label]>=breaks.fwhm[1] & HMDB_add_iso[,label]<=breaks.fwhm[length(breaks.fwhm)]),] - - # sort on mass - outlist = HMDB_add_iso[order(as.numeric(HMDB_add_iso[,label])),] - - n=dim(outlist)[1] - sub=20000 # max rows per file - end=0 - min_1_last=sub - check=0 - outlist_part=NULL - - - if (n < sub) { - outlist_part <- outlist - save(outlist_part, file = paste(outdir_hmdb, paste0(scanmode, "_hmdb.1.RData"), sep = "/")) - } else { - - if (n >= sub & (floor(n/sub) - 1) >= 2){ - for (i in 2:floor(n/sub) - 1){ - start <- -(sub - 1) + i*sub - end <- i*sub - - if (i > 1){ - outlist_i = outlist[c(start:end),] - - n_moved = 0 - + scanmodes <- c("positive", "negative") + + for (scanmode in scanmodes) { + if (scanmode == "negative") { + column_label <- "MNeg" + HMDB_add_iso <- HMDB_add_iso.Neg + } else if (scanmode == "positive") { + column_label <- "Mpos" + HMDB_add_iso <- HMDB_add_iso.Pos + } + + # filter mass range meassured!!! + HMDB_add_iso = HMDB_add_iso[which(HMDB_add_iso[ ,column_label] >= breaks_fwhm[1] & + HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] + + # sort on mass + outlist = HMDB_add_iso[order(as.numeric(HMDB_add_iso[,column_label])),] + + n=dim(outlist)[1] + sub=20000 # max rows per file + end=0 + min_1_last=sub + check=0 + outlist_part=NULL + + + if (n < sub) { + outlist_part <- outlist + save(outlist_part, file = paste0("./", scanmode, "_hmdb.1.RData")) + } else { + + if (n >= sub & (floor(n/sub) - 1) >= 2){ + for (i in 2:floor(n/sub) - 1){ + start <- -(sub - 1) + i*sub + end <- i*sub + + if (i > 1){ + outlist_i = outlist[c(start:end),] + + n_moved = 0 + + # Calculate 3ppm and replace border, avoid cut within peakgroup! + while ((as.numeric(outlist_i[1,column_label]) - as.numeric(outlist_part[min_1_last,column_label]))*1e+06/as.numeric(outlist_i[1,column_label]) < ppm) { + outlist_part <- rbind(outlist_part, outlist_i[1,]) + outlist_i <- outlist_i[-1,] + n_moved <- n_moved + 1 + } + + # message(paste("Process", i-1,":", dim(outlist_part)[1])) + save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i-1, "RData", sep="."), sep="")) + check <- check + dim(outlist_part)[1] + + outlist_part <- outlist_i + min_1_last <- dim(outlist_part)[1] + + } else { + outlist_part <- outlist[c(start:end),] + } + } + } + + start <- end + 1 + end <- n + outlist_i <- outlist[c(start:end),] + n_moved <- 0 + + if (!is.null(outlist_part)) { # Calculate 3ppm and replace border, avoid cut within peakgroup! - while ((as.numeric(outlist_i[1,label]) - as.numeric(outlist_part[min_1_last,label]))*1e+06/as.numeric(outlist_i[1,label]) < ppm) { - outlist_part <- rbind(outlist_part, outlist_i[1,]) - outlist_i <- outlist_i[-1,] - n_moved <- n_moved + 1 + while ((as.numeric(outlist_i[1,column_label]) - as.numeric(outlist_part[min_1_last,column_label]))*1e+06/as.numeric(outlist_i[1,column_label]) < ppm) { + outlist_part = rbind(outlist_part, outlist_i[1,]) + outlist_i = outlist_i[-1,] + n_moved = n_moved + 1 } - - # message(paste("Process", i-1,":", dim(outlist_part)[1])) - save(outlist_part, file = paste(outdir_hmdb, paste(scanmode, paste("hmdb",i-1,"RData", sep="."), sep="_"), sep = "/")) + + # message(paste("Process", i+1-1,":", dim(outlist_part)[1])) + save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i, "RData", sep = "."), sep = "")) check <- check + dim(outlist_part)[1] - - outlist_part <- outlist_i - min_1_last <- dim(outlist_part)[1] - - } else { - outlist_part <- outlist[c(start:end),] } + + outlist_part <- outlist_i + # message(paste("Process", i+2-1,":", dim(outlist_part)[1])) + save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i + 1, "RData", sep="."), sep="")) + check <- check + dim(outlist_part)[1] + cat("\n", "Check", check == dim(outlist)[1]) + } } - - start <- end + 1 - end <- n - outlist_i <- outlist[c(start:end),] - n_moved <- 0 - - if (!is.null(outlist_part)) { - # Calculate 3ppm and replace border, avoid cut within peakgroup! - while ((as.numeric(outlist_i[1,label]) - as.numeric(outlist_part[min_1_last,label]))*1e+06/as.numeric(outlist_i[1,label]) < ppm) { - outlist_part = rbind(outlist_part, outlist_i[1,]) - outlist_i = outlist_i[-1,] - n_moved = n_moved + 1 - } - - # message(paste("Process", i+1-1,":", dim(outlist_part)[1])) - save(outlist_part, file = paste(scanmode, paste("hmdb", i, "RData", sep = "."), sep = "_")) - check <- check + dim(outlist_part)[1] - } - - outlist_part <- outlist_i - # message(paste("Process", i+2-1,":", dim(outlist_part)[1])) - save(outlist_part, file = paste(scanmode, paste("hmdb", i + 1, "RData", sep="."), sep="_")) - check <- check + dim(outlist_part)[1] - cat("\n", "Check", check == dim(outlist)[1]) - - } + } + diff --git a/DIMS/HMDBparts.nf b/DIMS/HMDBparts.nf index 8ed9130..7d58bfe 100644 --- a/DIMS/HMDBparts.nf +++ b/DIMS/HMDBparts.nf @@ -6,8 +6,11 @@ process HMDBparts { shell = ['/bin/bash', '-euo', 'pipefail'] input: + // tuple(path(hmdb_db_file), path(breaks_file)) path(hmdb_db_file) path(breaks_file) + // val(standard_run) + // val(ppm) output: path('*.RData') @@ -15,6 +18,6 @@ process HMDBparts { script: """ - Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file + Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file $params.standard_run $params.ppm """ } diff --git a/DIMS/HMDBparts_old.R b/DIMS/HMDBparts_old.R new file mode 100644 index 0000000..10fdc37 --- /dev/null +++ b/DIMS/HMDBparts_old.R @@ -0,0 +1,126 @@ +#!/usr/bin/Rscript + +# load required packages +# none + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) cat(" ", arg, "\n") + +# outdir <- cmd_args[1] +# scanmode <- cmd_args[2] +db_path <- cmd_args[1] # location of HMDB db file +breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData +standard_run <- "no" # cmd_args[5] # "yes" + +# Cut up entire HMDB into small parts based on the new binning/breaks + +# load(paste(outdir, "breaks.fwhm.RData", sep = "/")) +load(breaks_filepath) +#outdir_hmdb <- paste(outdir, "hmdb_part", sep = "/") +#dir.create(outdir_hmdb, showWarnings = FALSE) + +# New: in case of a standard run (m/z 69-606) use external HMDB parts +min_mz <- round(breaks.fwhm[1]) +max_mz <- round(breaks.fwhm[length(breaks.fwhm)]) +# test if standard mz range is used +if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 600 & max_mz < 610) { + # skip generating HMDB parts + use_external_HMDB <- TRUE + save(use_external_HMDB, file="./using_external_HMDB_parts.RData") + hmdb_parts_dir <- cmd_args[6] # "/hpc/dbg_mz/production/DIMS/hmdb_preparts/" + hmdb_parts <- list.files(hmdb_parts_dir, pattern=scanmode) # only positive or negative files + # save(HMDBstukken, file=paste(outdir, "HMDBstukken.RData", sep="/")) + for (hmdb_file in hmdb_parts) { + file.copy(paste(hmdb_parts_dir, hmdb_file, sep="/"), outdir_hmdb, recursive = TRUE) + } +} else { + # generate HMDB parts in case of non-standard mz range + use_external_HMDB <- FALSE + save(use_external_HMDB, file="not_using_external_HMDB_parts.RData") + # db <- cmd_args[3] + load(db_path) + ppm <- as.numeric(cmd_args[4]) + if (scanmode=="negative"){ + label = "MNeg" + HMDB_add_iso=HMDB_add_iso.Neg + } else { + label = "Mpos" + HMDB_add_iso=HMDB_add_iso.Pos + } + + # filter mass range meassured!!! + HMDB_add_iso = HMDB_add_iso[which(HMDB_add_iso[,label]>=breaks.fwhm[1] & HMDB_add_iso[,label]<=breaks.fwhm[length(breaks.fwhm)]),] + + # sort on mass + outlist = HMDB_add_iso[order(as.numeric(HMDB_add_iso[,label])),] + + n=dim(outlist)[1] + sub=20000 # max rows per file + end=0 + min_1_last=sub + check=0 + outlist_part=NULL + + + if (n < sub) { + outlist_part <- outlist + save(outlist_part, file = paste(outdir_hmdb, paste0(scanmode, "_hmdb.1.RData"), sep = "/")) + } else { + + if (n >= sub & (floor(n/sub) - 1) >= 2){ + for (i in 2:floor(n/sub) - 1){ + start <- -(sub - 1) + i*sub + end <- i*sub + + if (i > 1){ + outlist_i = outlist[c(start:end),] + + n_moved = 0 + + # Calculate 3ppm and replace border, avoid cut within peakgroup! + while ((as.numeric(outlist_i[1,label]) - as.numeric(outlist_part[min_1_last,label]))*1e+06/as.numeric(outlist_i[1,label]) < ppm) { + outlist_part <- rbind(outlist_part, outlist_i[1,]) + outlist_i <- outlist_i[-1,] + n_moved <- n_moved + 1 + } + + # message(paste("Process", i-1,":", dim(outlist_part)[1])) + save(outlist_part, file = paste(outdir_hmdb, paste(scanmode, paste("hmdb",i-1,"RData", sep="."), sep="_"), sep = "/")) + check <- check + dim(outlist_part)[1] + + outlist_part <- outlist_i + min_1_last <- dim(outlist_part)[1] + + } else { + outlist_part <- outlist[c(start:end),] + } + } + } + + start <- end + 1 + end <- n + outlist_i <- outlist[c(start:end),] + n_moved <- 0 + + if (!is.null(outlist_part)) { + # Calculate 3ppm and replace border, avoid cut within peakgroup! + while ((as.numeric(outlist_i[1,label]) - as.numeric(outlist_part[min_1_last,label]))*1e+06/as.numeric(outlist_i[1,label]) < ppm) { + outlist_part = rbind(outlist_part, outlist_i[1,]) + outlist_i = outlist_i[-1,] + n_moved = n_moved + 1 + } + + # message(paste("Process", i+1-1,":", dim(outlist_part)[1])) + save(outlist_part, file = paste(scanmode, paste("hmdb", i, "RData", sep = "."), sep = "_")) + check <- check + dim(outlist_part)[1] + } + + outlist_part <- outlist_i + # message(paste("Process", i+2-1,":", dim(outlist_part)[1])) + save(outlist_part, file = paste(scanmode, paste("hmdb", i + 1, "RData", sep="."), sep="_")) + check <- check + dim(outlist_part)[1] + cat("\n", "Check", check == dim(outlist)[1]) + + } +} diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R index 2657b66..2f2a25a 100644 --- a/DIMS/MakeInit.R +++ b/DIMS/MakeInit.R @@ -1,10 +1,6 @@ #!/usr/bin/env Rscript ## adapted from makeInit in old pipeline -# used for when init.RData has to be created manually -# arg1 : path to sampleNames.txt or whatever the name of the samplesheet txt file may be -# arg2 : amount of technical replicates (usually 3) - args <- commandArgs(trailingOnly=TRUE) sample_sheet <- read.csv(args[1], sep="\t") nr_replicates <- as.numeric(args[2]) @@ -14,21 +10,20 @@ nr_sampgrps <- length(sampleNames)/nr_replicates groupNames <- trimws(as.vector(unlist(sample_sheet[2]))) groupNames <- gsub('[^-.[:alnum:]]', '_', groupNames) groupNamesUnique <- unique(groupNames) -#groupNamesNotUnique <- groupNames[duplicated(groupNames)] -repl.pattern <- c() +repl_pattern <- c() for (sampgrp in 1:nr_sampgrps) { tmp <- c() for (repl in nr_replicates:1) { index <- ((sampgrp*nr_replicates) - repl) + 1 tmp <- c(tmp, sampleNames[index]) } - repl.pattern <- c(repl.pattern, list(tmp)) + repl_pattern <- c(repl_pattern, list(tmp)) } -names(repl.pattern) <- groupNamesUnique +names(repl_pattern) <- groupNamesUnique # just to preview -head(repl.pattern) +head(repl_pattern) -save(repl.pattern, file="init.RData") +save(repl_pattern, file="./init.RData") diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R index 4820ef5..a739257 100644 --- a/DIMS/PeakFinding.R +++ b/DIMS/PeakFinding.R @@ -1,68 +1,72 @@ #!/usr/bin/Rscript - -# load required packages -# none +# adapted from 4-peakFinding.R # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) for (arg in cmd_args) cat(" ", arg, "\n", sep="") filepath <- cmd_args[1] -resol <- as.numeric(cmd_args[2]) -scripts <- cmd_args[3] -breaks_filepath <- cmd_args[4] # location of breaks.fwhm.RData +breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData +resol <- as.numeric(cmd_args[3]) +scripts_dir <- cmd_args[4] thresh <- 2000 # if file extension is .txt, do nothing. -if (grepl(".txt$", filepath) { stop("not run on txt file") } +if (grepl(".txt$", filepath)) { stop("not run on txt file") } -# create output folder -dir.create(paste(outdir, "4-specpks", sep="/"),showWarnings = F) +# for debugging: +print(filepath) +print(breaks_filepath) +print(resol) +print(scripts_dir) +print(paste(scripts_dir, "AddOnFunctions/", sep="")) # load in function scripts -source(paste(scripts, "AddOnFunctions/sourceDir.R", sep="/")) -sourceDir(paste(scripts, "AddOnFunctions", sep="/")) +# source(paste(scripts_dir, "AddOnFunctions/sourceDir.R", sep="/")) +# the sourceDir function no longer works +# sourceDir(paste(scripts_dir, "AddOnFunctions", sep="/")) +source(paste(scripts_dir, "AddOnFunctions/findPeaks.Gauss.HPC.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/searchMZRange.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/generateGaussian.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/fitGaussian.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/fitGaussianInit.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/getFwhm.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/getSD.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/optimizeGauss.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/fit1Peak.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/fit2peaks.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/fit3peaks.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/fit4peaks.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/fitG.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/fit2G.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/fit3G.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/fit4G.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/getArea.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/getFitQuality.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/checkOverlap.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/sumCurves.R", sep="")) +source(paste(scripts_dir, "AddOnFunctions/isWithinXppm.R", sep="")) -# load(paste(outdir, "breaks.fwhm.RData", sep="/")) load(breaks_filepath) -load(filepath) -if (grepl("_pos", filepath)) { scanmode = "positive" } else - if (grepl("_neg", filepath)) { scanmode = "negative" } - -options(digits=16) -int.factor=1*10^5 # Number of x used to calc area under Gaussian (is not analytic) -scale=2 # Initial value used to estimate scaling parameter -width=1024 -height=768 +# load(filepath) -### fit Gaussian estimate mean and integrate to obtain intensity -findPeaks.Gauss.HPC <- function(plist, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, plot, thresh, width, height) { - sampname <- colnames(plist)[1] - - range = as.vector(plist) - names(range) = rownames(plist) - - values = list("mean"=NULL,"area"=NULL,"nr"=NULL,"min"=NULL,"max"=NULL,"qual"=NULL,"spikes"=0) - - values = searchMZRange(range,values,int.factor,scale,resol,outdir,sampname,scanmode,plot,width,height,thresh) - - outlist.persample=NULL - outlist.persample=cbind("samplenr"=values$nr, "mzmed.pkt"=values$mean, "fq"=values$qual, "mzmin.pkt"=values$min, "mzmax.pkt"=values$max, "height.pkt"=values$area) - - index=which(outlist.persample[,"height.pkt"]==0) - if (length(index)>0) { - outlist.persample=outlist.persample[-index,] - } - - save(outlist.persample, file=paste(outdir, "4-specpks", paste(sampname, "_", scanmode, ".RData", sep=""), sep="/")) - - cat(paste("There were", values$spikes, "spikes!")) -} +# for some reason, the repl_pattern_positive.RData and repl_pattern_negative.RData files are also read in. +if (!grepl("repl_pattern", filepath)) { + # Load output of AverageTechReplicates for a sample + sample_avgtechrepl <- get(load(filepath)) + if (grepl("_pos", filepath)) { scanmode = "positive" } else + if (grepl("_neg", filepath)) { scanmode = "negative" } + # for debugging: + print(filepath) + print(scanmode) -if (scanmode == "negative") { - findPeaks.Gauss.HPC(sum_neg, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) -} else { - findPeaks.Gauss.HPC(sum_pos, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) + # Initialize + options(digits = 16) + int.factor <- 1*10^5 # Number of x used to calc area under Gaussian (is not analytic) + scale <- 2 # Initial value used to estimate scaling parameter + width <- 1024 + height <- 768 + findPeaks.Gauss.HPC(sample_avgtechrepl, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) } diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf index fb86239..343c4c1 100644 --- a/DIMS/PeakFinding.nf +++ b/DIMS/PeakFinding.nf @@ -4,20 +4,15 @@ process PeakFinding { shell = ['/bin/bash', '-euo', 'pipefail'] input: - path(RData_file) - val(resolution) - path(scripts) - path(breaks_file) + // path(RData_file) + // path(breaks_file) + tuple(path(RData_file), path(breaks_file)) output: - path '*_neg.RData' - path '*_pos.RData' - path 'repl.pattern.*.RData', emit: pattern - path 'miss_infusions_neg.txt' - path 'miss_infusions_pos.txt' + path '*tive.RData' script: """ - Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $RData_file $resolution $scripts $breaks_file + Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $RData_file $breaks_file $params.resolution $params.scripts_dir """ } diff --git a/DIMS/PeakGroupingIdentified.R b/DIMS/PeakGroupingIdentified.R index 7c8d998..e6bf3fc 100644 --- a/DIMS/PeakGroupingIdentified.R +++ b/DIMS/PeakGroupingIdentified.R @@ -7,9 +7,9 @@ cmd_args <- commandArgs(trailingOnly = TRUE) for (arg in cmd_args) cat(" ", arg, "\n", sep="") -SpecPeaks_file <- cmd_args[1] #outdir <- cmd_args[2] #scanmode <- cmd_args[3] +SpecPeaks_file <- cmd_args[1] HMDB_part_file <- cmd_args[2] pattern_file <- cmd_args[3] resol <- as.numeric(cmd_args[4]) diff --git a/DIMS/PeakGroupingIdentified.nf b/DIMS/PeakGroupingIdentified.nf index 8100780..2181cae 100644 --- a/DIMS/PeakGroupingIdentified.nf +++ b/DIMS/PeakGroupingIdentified.nf @@ -4,15 +4,16 @@ process PeakGroupingIdentified { shell = ['/bin/bash', '-euo', 'pipefail'] input: + path(SpectrumPeak_file) path(HMDBpart_file) - val(resolution) + path(pattern_file) output: - path 'negative.RData' - path 'positive.RData' + path '*_negative.RData' + path '*_positive.RData' script: """ - Rscript ${baseDir}/CustomModules/DIMS/PeakGroupingIdentified.R $HMDBpart_file $resolution $ppm + Rscript ${baseDir}/CustomModules/DIMS/PeakGroupingIdentified.R $SpectrumPeak_file $HMDBpart_file $pattern_file $params.resolution $params.ppm """ } diff --git a/DIMS/SpectrumPeakFinding.R b/DIMS/SpectrumPeakFinding.R index f85d9c5..175579f 100644 --- a/DIMS/SpectrumPeakFinding.R +++ b/DIMS/SpectrumPeakFinding.R @@ -1,77 +1,86 @@ #!/usr/bin/Rscript +# adapted from 5-collectSamples.R # load required packages # none # define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) cat(" ", arg, "\n") +#cmd_args <- commandArgs(trailingOnly = TRUE) +#for (arg in cmd_args) cat(" ", arg, "\n") # outdir <- cmd_args[1] # scanmode <- cmd_args[2] scanmodes <- c("positive", "negative") -# Check if all jobs terminated correct! +# Check whether all jobs terminated correct! notRun = NULL -# new: run for each scanmode -for (scanmode in scanmodes) { - load(paste0("./repl.pattern.", scanmode, ".RData")) - groupNames = names(repl.pattern.filtered) - - indir <- "./" - object.files = list.files(indir, full.names=TRUE, pattern="*.RData") +print("one") +# collect spectrum peaks for each scanmode +for (scanmode in scanmodes) { + print(scanmode) + # load peak lists of all biological samples + input_dir <- getwd() # "./" + peaklist_files = list.files(input_dir, full.names=TRUE, pattern=paste("*_", scanmode, ".RData",sep="")) + + # remove any repl_pattern files from the list + #if (sum(grepl("repl_pattern", peaklist_files)i > 0) { + # peaklist_files <- peaklist_files[-grep("repl_pattern", peaklist_files)] } + print(peaklist_files) + + # get sample names + load(paste0("./", scanmode, "_repl_pattern", ".RData")) + groupNames = names(repl_pattern_filtered) for (i in 1:length(groupNames)) { - group <- paste0(indir, "/", paste0(paste(groupNames[i], scanmode, sep = "_"), ".RData")) - if (!(group %in% object.files)) { + group <- paste0(input_dir, "/", paste0(paste(groupNames[i], scanmode, sep = "_"), ".RData")) + if (!(group %in% peaklist_files)) { notRun = c(notRun, group) } } - - #if (is.null(notRun)){ - cat("\nCollecting samples!") - - # negative - filepath <- "./") - files <- list.files(filepath, recursive=TRUE, full.names=TRUE, pattern=paste("*_", scanmode, ".RData",sep="")) - - outlist.tot=NULL - for (i in 1:length(files)) { - - cat("\n", files[i]) - load(files[i]) - - if (is.null(outlist.persample) || (dim(outlist.persample)[1]==0)){ - tmp=strsplit(files[i], "/")[[1]] - fname = tmp[length(tmp)] - #fname = strsplit(files[i], "/")[[1]][8] - fname = strsplit(fname, ".RData")[[1]][1] - fname = substr(fname, 13, nchar(fname)) - - if (i == 1) { outlist.tot <- c(fname, rep("-1",5)) } else { outlist.tot <- rbind(outlist.tot, c(fname, rep("-1",5)))} - } else { - if (i == 1) { outlist.tot <- outlist.persample } else { outlist.tot <- rbind(outlist.tot, outlist.persample)} + print("two") + cat("\nCollecting samples!") + + outlist.tot=NULL + for (i in 1:length(peaklist_files)) { + + cat("\n", peaklist_files[i]) + load(peaklist_files[i]) + if (is.null(outlist.persample) || (dim(outlist.persample)[1]==0)) { + tmp=strsplit(peaklist_files[i], "/")[[1]] + fname = tmp[length(tmp)] + #fname = strsplit(peaklist_files[i], "/")[[1]][8] + fname = strsplit(fname, ".RData")[[1]][1] + fname = substr(fname, 13, nchar(fname)) + if (i == 1) { + outlist.tot <- c(fname, rep("-1",5)) + } else { + outlist.tot <- rbind(outlist.tot, c(fname, rep("-1",5))) } - } - - # remove negative values - index=which(outlist.tot[,"height.pkt"]<=0) - if (length(index)>0) outlist.tot = outlist.tot[-index,] - index=which(outlist.tot[,"mzmed.pkt"]<=0) - if (length(index)>0) outlist.tot = outlist.tot[-index,] - - # outdir_specpks <- paste(outdir, "5-specpks_all", sep = "/") - # dir.create(outdir_specpks, showWarnings = F) - # save(outlist.tot, file = paste(outdir_specpks, paste(scanmode, "RData", sep = "."), sep = "/")) - save(outlist.tot, file = paste(scanmode, "RData", sep = ".") - - - if (!is.null(notRun)){ - for (i in 1:length(notRun)){ - message(paste(notRun[i], "was not generated")) + } else { + if (i == 1) { + outlist.tot <- outlist.persample + } else { + outlist.tot <- rbind(outlist.tot, outlist.persample) } } - + } + print("three") + # remove negative values + index=which(outlist.tot[,"height.pkt"]<=0) + if (length(index)>0) outlist.tot = outlist.tot[-index,] + index=which(outlist.tot[,"mzmed.pkt"]<=0) + if (length(index)>0) outlist.tot = outlist.tot[-index,] + + # outdir_specpks <- paste(outdir, "5-specpks_all", sep = "/") + # dir.create(outdir_specpks, showWarnings = F) + # save(outlist.tot, file = paste(outdir_specpks, paste(scanmode, "RData", sep = "."), sep = "/")) + save(outlist.tot, file = paste("./", scanmode, ".RData", sep = "")) + + if (!is.null(notRun)){ + for (i in 1:length(notRun)){ + message(paste(notRun[i], "was not generated")) + } + } } diff --git a/DIMS/SpectrumPeakFinding.nf b/DIMS/SpectrumPeakFinding.nf index 66e04a8..6a7c73a 100644 --- a/DIMS/SpectrumPeakFinding.nf +++ b/DIMS/SpectrumPeakFinding.nf @@ -5,6 +5,7 @@ process SpectrumPeakFinding { input: path(RData_file) + path(replication_pattern) output: path 'negative.RData' @@ -12,6 +13,6 @@ process SpectrumPeakFinding { script: """ - Rscript ${baseDir}/CustomModules/DIMS/SpectrumPeakFinding.R $RData_file + Rscript ${baseDir}/CustomModules/DIMS/SpectrumPeakFinding.R $replication_pattern """ } From 6b6c5316cf273d32a11ae42031adb878f4700098 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 25 Aug 2023 17:16:17 +0200 Subject: [PATCH 09/73] code cleaned up --- DIMS/AddOnFunctions/replaceZeros.R | 64 ++++++++------------- DIMS/AssignToBins.R | 57 +++++++++---------- DIMS/AverageTechReplicates.R | 73 +++++++++++------------- DIMS/GenerateBreaks.R | 33 ++++++----- DIMS/HMDBparts.R | 36 ++++-------- DIMS/MakeInit.R | 12 ++-- DIMS/PeakFinding.R | 91 ++++++++++++------------------ DIMS/SpectrumPeakFinding.R | 36 +++--------- DIMS/SpectrumPeakFinding.nf | 4 +- 9 files changed, 163 insertions(+), 243 deletions(-) diff --git a/DIMS/AddOnFunctions/replaceZeros.R b/DIMS/AddOnFunctions/replaceZeros.R index 4fafc9a..234d961 100644 --- a/DIMS/AddOnFunctions/replaceZeros.R +++ b/DIMS/AddOnFunctions/replaceZeros.R @@ -1,4 +1,4 @@ -replaceZeros <- function(file,scanmode,resol,outdir,thresh,scriptDir,ppm){ +replaceZeros <- function(outpgrlist, repl_pattern, scanmode, resol, outdir, thresh, ppm) { # file="./results/grouping_rest/negative_1.RData" # scanmode= "negative" # scriptDir="./scripts" @@ -6,49 +6,43 @@ replaceZeros <- function(file,scanmode,resol,outdir,thresh,scriptDir,ppm){ # thresh=2000 # outdir="./results" - control_label="C" + # control_label="C" - source(paste(scriptDir, "AddOnFunctions/sourceDir.R", sep="/")) - sourceDir(paste(scriptDir, "AddOnFunctions", sep="/")) + # source(paste(scriptDir, "AddOnFunctions/sourceDir.R", sep="/")) + # sourceDir(paste(scriptDir, "AddOnFunctions", sep="/")) - dir.create(paste(outdir, "9-samplePeaksFilled", sep="/"), showWarnings = FALSE) + # dir.create(paste(outdir, "9-samplePeaksFilled", sep="/"), showWarnings = FALSE) # int.factor=1*10^5 # Number of x used to calc area under Gaussian (is not analytic) # scale=2 # Initial value used to estimate scaling parameter # width=1024 # height=768 - # message(paste("file", file)) - # message(paste("scanmode", scanmode)) - # message(paste("resol", resol)) - # message(paste("outdir", outdir)) - # message(paste("thresh", thresh)) - # message(paste("scriptDir", scriptDir)) - - load(paste0(outdir, "/repl.pattern.",scanmode, ".RData")) - - name = as.vector(unlist(strsplit(file, "/", fixed=TRUE))) - name = name[length(name)] + # load(paste0(outdir, "/repl.pattern.",scanmode, ".RData")) + + # batch_number = strsplit(basename(HMDB_part_file), ".",fixed = TRUE)[[1]][2] + # name = as.vector(unlist(strsplit(file, "/", fixed=TRUE))) + # name = name[length(name)] # message(paste("File name: ", name)) # load samplePeaks # load outpgrlist - load(file) + # load(file) - # ################################################################################# # # filter on at least signal in two control samples # int.cols = grep(control_label, colnames(outpgrlist),fixed = TRUE) # # barplot(as.numeric(outpgrlist[753, int.cols])) # keep = NULL # keep = apply(outpgrlist, 1, function(x) if (length(which(as.numeric(x[int.cols]) > 0)) > 1) keep=c(keep,TRUE) else keep=c(keep,FALSE)) # outpgrlist = outpgrlist[keep,] - # ################################################################################# - ################################################################################ - # For now only replace zeros + # replace zeros if (!is.null(outpgrlist)) { - for (i in 1:length(names(repl.pattern.filtered))){ - samplePeaks=outpgrlist[,names(repl.pattern.filtered)[i]] + print(dim(outpgrlist)) + print(colnames(outpgrlist)) + for (i in 1:length(names(repl_pattern))){ + print(names(repl_pattern)[i]) + samplePeaks=outpgrlist[,names(repl_pattern)[i]] index=which(samplePeaks<=0) if (!length(index)){ next @@ -56,16 +50,13 @@ replaceZeros <- function(file,scanmode,resol,outdir,thresh,scriptDir,ppm){ for (j in 1:length(index)){ area = generateGaussian(outpgrlist[index[j],"mzmed.pgrp"],thresh,resol,FALSE,scanmode,int.factor=1*10^5,1,1)$area # for testing purposes, add a fixed random seed - set.seed(123) - outpgrlist[index[j], names(repl.pattern.filtered)[i]] = rnorm(n=1, mean=area, sd=0.25*area) + # set.seed(123) + outpgrlist[index[j], names(repl_pattern)[i]] = rnorm(n=1, mean=area, sd=0.25*area) } } - ################################################################################ - - #################### identification ######################################################### - # load(paste(scriptDir, "../db/HMDB_add_iso_corrNaCl.RData", sep="/")) # E:\Metabolomics\LargeDataBase\Apr25_2016 + # Identification # Add average column outpgrlist = cbind(outpgrlist, "avg.int"=apply(outpgrlist[, 7:(ncol(outpgrlist)-4)], 1, mean)) @@ -84,25 +75,16 @@ replaceZeros <- function(file,scanmode,resol,outdir,thresh,scriptDir,ppm){ # HMDB_add_iso=HMDB_add_iso.Pos } - # # Identification using large database - # final.outlist.idpat = iden.code(outpgrlist, HMDB_add_iso, ppm=2, label) - # message(paste(sum(final.outlist.idpat[ , "assi_HMDB"] != ""), "assigned peakgroups")) - # message(paste(sum(final.outlist.idpat[ , "iso_HMDB"] != ""), "assigned isomeres")) - # Identify noise peaks noise.MZ <- read.table(file="/hpc/dbg_mz/tools/db/TheoreticalMZ_NegPos_incNaCl.txt", sep="\t", header=TRUE, quote = "") noise.MZ <- noise.MZ[(noise.MZ[ , label] != 0), 1:4] - - # Replace "Negative" by "negative" in ident.hires.noise final.outlist.idpat2 = ident.hires.noise.HPC(outpgrlist, allAdducts, scanmode=label2, noise.MZ, look4=look4.add2, resol=resol, slope=0, incpt=0, ppm.fixed=ppm, ppm.iso.fixed=ppm) - # message(paste(sum(final.outlist.idpat2[ , "assi"] != ""), "assigned noise peaks")) tmp <- final.outlist.idpat2[ , c("assi", "theormz")] colnames(tmp) <- c("assi_noise", "theormz_noise") final.outlist.idpat3 <- cbind(outpgrlist, tmp) - ############################################################################################# - - # message(paste("File saved: ", paste(outdir, "/samplePeaksFilled/", name, sep=""))) - save(final.outlist.idpat3, file=paste(outdir, "/9-samplePeaksFilled/", name, sep="")) + + return(final.outlist.idpat3) + # save(final.outlist.idpat3, file=paste("./", name, sep="")) } } diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index c6fdaba..4e320d3 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -8,34 +8,31 @@ suppressPackageStartupMessages(library("xcms")) cmd_args <- commandArgs(trailingOnly = TRUE) for (arg in cmd_args) cat(" ", arg, "\n", sep="") -filepath <- cmd_args[1] # location of mzML file +filepath <- cmd_args[1] # location of mzML file breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData -resol <- as.numeric(cmd_args[3]) # 140000 -trim <- 0.1 -dimsThresh <- 100 -print(filepath) -print(breaks_filepath) -print(resol) +resol <- as.numeric(cmd_args[3]) # 140000 +trim <- 0.1 +dimsThresh <- 100 -sampname <- sub('\\..*$', '', basename(filepath)) +# get sample name +sample_name <- sub('\\..*$', '', basename(filepath)) options(digits=16) -### process one sample at a time and find peaks FOR BOTH SCAN MODES! # -int.factor=1*10^5 # Number of x used to calc area under Gaussian (is not analytic) -scale=2 # Initial value used to estimate scaling parameter -width=1024 -height=768 - -# Initiate -trimLeft=NULL -trimRight=NULL -breaks_fwhm=NULL -breaks_fwhm_avg=NULL -bins=NULL -pos_results=NULL -neg_results=NULL +# Initialize +int.factor <- 1*10^5 # Number of x used to calc area under Gaussian (is not analytic) +scale <- 2 # Initial value used to estimate scaling parameter +width <- 1024 +height <- 768 +trimLeft <- NULL +trimRight <- NULL +breaks_fwhm <- NULL +breaks_fwhm_avg <- NULL +bins <- NULL +pos_results <- NULL +neg_results <- NULL +### process one sample at a time and find peaks FOR BOTH SCAN MODES! # # read in the data for 1 sample raw_data <- suppressMessages(xcmsRaw(filepath)) @@ -93,16 +90,16 @@ if (nrow(neg_raw_data_matrix) > 0) { pos_bins[pos_bins < dimsThresh] <- 0 neg_bins[neg_bins < dimsThresh] <- 0 -pos_results = cbind(pos_results, pos_bins) -neg_results = cbind(neg_results, neg_bins) +pos_results <- cbind(pos_results, pos_bins) +neg_results <- cbind(neg_results, neg_bins) # transpose -pos_results_transpose = t(pos_results) -neg_results_transpose = t(neg_results) +pos_results_transpose <- t(pos_results) +neg_results_transpose <- t(neg_results) # Add file names as row names -rownames(pos_results_transpose) = sampname -rownames(neg_results_transpose) = sampname +rownames(pos_results_transpose) <- sample_name +rownames(neg_results_transpose) <- sample_name # delete the last value of breaks_fwhm_avg to match dimensions of pos_results and neg_results breaks_fwhm_avg_minus1 <- breaks_fwhm_avg[-length(breaks_fwhm_avg)] @@ -117,6 +114,6 @@ colnames(neg_results_transpose) <- breaks_fwhm_avg_minus1 pos_results_final <- t(pos_results_transpose) neg_results_final <- t(neg_results_transpose) -pklist <- list("pos"=pos_results_final, "neg"=neg_results_final, "breaksFwhm"=breaks_fwhm) +peak_list <- list("pos"=pos_results_final, "neg"=neg_results_final, "breaksFwhm"=breaks_fwhm) -save(pklist, file=paste("./", sampname, ".RData", sep="")) +save(peak_list, file=paste("./", sample_name, ".RData", sep="")) diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index 4303a99..b880f13 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -1,9 +1,6 @@ #!/usr/bin/Rscript # adapted from 3-AverageTechReplicates.R -# load required functions -# source(paste(scripts_dir, "AddOnFunctions/removeFromRepl.patR", sep="")) - # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) for (arg in cmd_args) cat(" ", arg, "\n", sep="") @@ -11,36 +8,32 @@ for (arg in cmd_args) cat(" ", arg, "\n", sep="") init_filepath <- cmd_args[1] nr_replicates <- as.numeric(cmd_args[2]) thresh2remove <- 2000 -dimsThresh <- 100 +dimsThresh <- 100 removeFromRepl.pat <- function(bad_samples, repl_pattern, nr_replicates) { # bad_samples=remove_pos - tmp = repl_pattern + tmp <- repl_pattern - removeFromGroup=NULL + removeFromGroup <- NULL for (i in 1:length(tmp)){ - tmp2 = repl_pattern[[i]] + tmp2 <- repl_pattern[[i]] - remove=NULL + remove <- NULL for (j in 1:length(tmp2)){ if (tmp2[j] %in% bad_samples){ - #cat(tmp2[j]) - #cat(paste("remove",tmp2[j])) - #cat(paste("remove i",i)) - #cat(paste("remove j",j)) remove = c(remove, j) } } - if (length(remove)==nr_replicates) removeFromGroup=c(removeFromGroup,i) - if (!is.null(remove)) repl_pattern[[i]]=repl_pattern[[i]][-remove] + if (length(remove) == nr_replicates) removeFromGroup <- c(removeFromGroup,i) + if (!is.null(remove)) repl_pattern[[i]] <- repl_pattern[[i]][-remove] } if (length(removeFromGroup)!=0) { - repl_pattern=repl_pattern[-removeFromGroup] + repl_pattern <- repl_pattern[-removeFromGroup] } return(list("pattern"=repl_pattern)) @@ -50,44 +43,44 @@ removeFromRepl.pat <- function(bad_samples, repl_pattern, nr_replicates) { # get repl_pattern load("./init.RData") -remove_neg=NULL -remove_pos=NULL +remove_neg <- NULL +remove_pos <- NULL cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n") for (i in 1:length(repl_pattern)) { - techRepsArray.pos = NULL - techRepsArray.neg = NULL + techRepsArray.pos <- NULL + techRepsArray.neg <- NULL - tech_reps = as.vector(unlist(repl_pattern[i])) - sum_neg=0 - sum_pos=0 - n_pos=0 - n_neg=0 + tech_reps <- as.vector(unlist(repl_pattern[i])) + sum_neg <- 0 + sum_pos <- 0 + n_pos <- 0 + n_neg <- 0 cat("\n\nNow sample ", i, " from replication pattern with length ", length(repl_pattern)) for (j in 1:length(tech_reps)) { load(paste("./", tech_reps[j], ".RData", sep="")) cat("\n\nParsing", tech_reps[j]) - cat("\n\tNegative pklist sum",sum(pklist$neg[,1])) - if (sum(pklist$neg[,1]) 68 & min_mz < 71 & max_mz > 600 & max_mz < 610) { # skip generating HMDB parts - use_external_HMDB <- TRUE - save(use_external_HMDB, file="./using_external_HMDB_parts.RData") hmdb_parts_dir <- cmd_args[6] # "/hpc/dbg_mz/production/DIMS/hmdb_preparts/" - # hmdb_parts <- list.files(hmdb_parts_dir, pattern=scanmode) # only positive or negative files hmdb_parts <- list.files(hmdb_parts_dir, pattern=hmdb) # all files containing hmdb in file name - # save(HMDBstukken, file=paste(outdir, "HMDBstukken.RData", sep="/")) for (hmdb_file in hmdb_parts) { - # file.copy(paste(hmdb_parts_dir, hmdb_file, sep="/"), outdir_hmdb, recursive = TRUE) file.copy(paste(hmdb_parts_dir, hmdb_file, sep="/"), "./", recursive = TRUE) } } else { # generate HMDB parts in case of non-standard mz range - use_external_HMDB <- FALSE - save(use_external_HMDB, file="./not_using_external_HMDB_parts.RData") - # db <- cmd_args[3] load(db_path) ppm <- as.numeric(cmd_args[4]) @@ -55,14 +44,14 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 600 & max_mz < HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] # sort on mass - outlist = HMDB_add_iso[order(as.numeric(HMDB_add_iso[,column_label])),] + outlist <- HMDB_add_iso[order(as.numeric(HMDB_add_iso[,column_label])),] - n=dim(outlist)[1] - sub=20000 # max rows per file - end=0 - min_1_last=sub - check=0 - outlist_part=NULL + n <- dim(outlist)[1] + sub <- 20000 # max rows per file + end <- 0 + min_1_last <- sub + check <- 0 + outlist_part <- NULL if (n < sub) { @@ -106,26 +95,23 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 600 & max_mz < n_moved <- 0 if (!is.null(outlist_part)) { - # Calculate 3ppm and replace border, avoid cut within peakgroup! + # Calculate ppm and replace border, avoid cut within peakgroup! while ((as.numeric(outlist_i[1,column_label]) - as.numeric(outlist_part[min_1_last,column_label]))*1e+06/as.numeric(outlist_i[1,column_label]) < ppm) { - outlist_part = rbind(outlist_part, outlist_i[1,]) - outlist_i = outlist_i[-1,] - n_moved = n_moved + 1 + outlist_part <- rbind(outlist_part, outlist_i[1,]) + outlist_i <- outlist_i[-1,] + n_moved <- n_moved + 1 } - # message(paste("Process", i+1-1,":", dim(outlist_part)[1])) save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i, "RData", sep = "."), sep = "")) check <- check + dim(outlist_part)[1] } outlist_part <- outlist_i - # message(paste("Process", i+2-1,":", dim(outlist_part)[1])) save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i + 1, "RData", sep="."), sep="")) check <- check + dim(outlist_part)[1] cat("\n", "Check", check == dim(outlist)[1]) } } - } diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R index 2f2a25a..3b125e3 100644 --- a/DIMS/MakeInit.R +++ b/DIMS/MakeInit.R @@ -1,14 +1,14 @@ #!/usr/bin/env Rscript ## adapted from makeInit in old pipeline -args <- commandArgs(trailingOnly=TRUE) -sample_sheet <- read.csv(args[1], sep="\t") +args <- commandArgs(trailingOnly=TRUE) +sample_sheet <- read.csv(args[1], sep="\t") nr_replicates <- as.numeric(args[2]) sampleNames <- trimws(as.vector(unlist(sample_sheet[1]))) nr_sampgrps <- length(sampleNames)/nr_replicates -groupNames <- trimws(as.vector(unlist(sample_sheet[2]))) -groupNames <- gsub('[^-.[:alnum:]]', '_', groupNames) +groupNames <- trimws(as.vector(unlist(sample_sheet[2]))) +groupNames <- gsub('[^-.[:alnum:]]', '_', groupNames) groupNamesUnique <- unique(groupNames) repl_pattern <- c() @@ -23,7 +23,7 @@ for (sampgrp in 1:nr_sampgrps) { names(repl_pattern) <- groupNamesUnique -# just to preview -head(repl_pattern) +# preview the replication pattern +print(head(repl_pattern)) save(repl_pattern, file="./init.RData") diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R index a739257..379a0fa 100644 --- a/DIMS/PeakFinding.R +++ b/DIMS/PeakFinding.R @@ -5,68 +5,49 @@ cmd_args <- commandArgs(trailingOnly = TRUE) for (arg in cmd_args) cat(" ", arg, "\n", sep="") -filepath <- cmd_args[1] +filepath <- cmd_args[1] breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData -resol <- as.numeric(cmd_args[3]) -scripts_dir <- cmd_args[4] +resol <- as.numeric(cmd_args[3]) +scripts_dir <- cmd_args[4] thresh <- 2000 -# if file extension is .txt, do nothing. -if (grepl(".txt$", filepath)) { stop("not run on txt file") } - -# for debugging: -print(filepath) -print(breaks_filepath) -print(resol) -print(scripts_dir) -print(paste(scripts_dir, "AddOnFunctions/", sep="")) - # load in function scripts -# source(paste(scripts_dir, "AddOnFunctions/sourceDir.R", sep="/")) -# the sourceDir function no longer works -# sourceDir(paste(scripts_dir, "AddOnFunctions", sep="/")) -source(paste(scripts_dir, "AddOnFunctions/findPeaks.Gauss.HPC.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/searchMZRange.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/generateGaussian.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/fitGaussian.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/fitGaussianInit.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/getFwhm.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/getSD.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/optimizeGauss.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/fit1Peak.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/fit2peaks.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/fit3peaks.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/fit4peaks.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/fitG.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/fit2G.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/fit3G.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/fit4G.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/getArea.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/getFitQuality.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/checkOverlap.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/sumCurves.R", sep="")) -source(paste(scripts_dir, "AddOnFunctions/isWithinXppm.R", sep="")) +source(paste0(scripts_dir, "AddOnFunctions/findPeaks.Gauss.HPC.R")) +source(paste0(scripts_dir, "AddOnFunctions/searchMZRange.R")) +source(paste0(scripts_dir, "AddOnFunctions/generateGaussian.R")) +source(paste0(scripts_dir, "AddOnFunctions/fitGaussian.R")) +source(paste0(scripts_dir, "AddOnFunctions/fitGaussianInit.R")) +source(paste0(scripts_dir, "AddOnFunctions/getFwhm.R")) +source(paste0(scripts_dir, "AddOnFunctions/getSD.R")) +source(paste0(scripts_dir, "AddOnFunctions/optimizeGauss.R")) +source(paste0(scripts_dir, "AddOnFunctions/fit1Peak.R")) +source(paste0(scripts_dir, "AddOnFunctions/fit2peaks.R")) +source(paste0(scripts_dir, "AddOnFunctions/fit3peaks.R")) +source(paste0(scripts_dir, "AddOnFunctions/fit4peaks.R")) +source(paste0(scripts_dir, "AddOnFunctions/fitG.R")) +source(paste0(scripts_dir, "AddOnFunctions/fit2G.R")) +source(paste0(scripts_dir, "AddOnFunctions/fit3G.R")) +source(paste0(scripts_dir, "AddOnFunctions/fit4G.R")) +source(paste0(scripts_dir, "AddOnFunctions/getArea.R")) +source(paste0(scripts_dir, "AddOnFunctions/getFitQuality.R")) +source(paste0(scripts_dir, "AddOnFunctions/checkOverlap.R")) +source(paste0(scripts_dir, "AddOnFunctions/sumCurves.R")) +source(paste0(scripts_dir, "AddOnFunctions/isWithinXppm.R")) load(breaks_filepath) -# load(filepath) -# for some reason, the repl_pattern_positive.RData and repl_pattern_negative.RData files are also read in. -if (!grepl("repl_pattern", filepath)) { - # Load output of AverageTechReplicates for a sample - sample_avgtechrepl <- get(load(filepath)) - if (grepl("_pos", filepath)) { scanmode = "positive" } else - if (grepl("_neg", filepath)) { scanmode = "negative" } +# Load output of AverageTechReplicates for a sample +sample_avgtechrepl <- get(load(filepath)) +if (grepl("_pos", filepath)) { scanmode = "positive" } else + if (grepl("_neg", filepath)) { scanmode = "negative" } - # for debugging: - print(filepath) - print(scanmode) +# Initialize +options(digits = 16) +int.factor <- 1*10^5 # Number of x used to calc area under Gaussian (is not analytic) +scale <- 2 # Initial value used to estimate scaling parameter +width <- 1024 +height <- 768 - # Initialize - options(digits = 16) - int.factor <- 1*10^5 # Number of x used to calc area under Gaussian (is not analytic) - scale <- 2 # Initial value used to estimate scaling parameter - width <- 1024 - height <- 768 - findPeaks.Gauss.HPC(sample_avgtechrepl, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) -} +# run the findPeaks function +findPeaks.Gauss.HPC(sample_avgtechrepl, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) diff --git a/DIMS/SpectrumPeakFinding.R b/DIMS/SpectrumPeakFinding.R index 175579f..9935b05 100644 --- a/DIMS/SpectrumPeakFinding.R +++ b/DIMS/SpectrumPeakFinding.R @@ -1,15 +1,7 @@ #!/usr/bin/Rscript # adapted from 5-collectSamples.R -# load required packages -# none - # define parameters -#cmd_args <- commandArgs(trailingOnly = TRUE) -#for (arg in cmd_args) cat(" ", arg, "\n") - -# outdir <- cmd_args[1] -# scanmode <- cmd_args[2] scanmodes <- c("positive", "negative") # Check whether all jobs terminated correct! @@ -24,11 +16,6 @@ for (scanmode in scanmodes) { input_dir <- getwd() # "./" peaklist_files = list.files(input_dir, full.names=TRUE, pattern=paste("*_", scanmode, ".RData",sep="")) - # remove any repl_pattern files from the list - #if (sum(grepl("repl_pattern", peaklist_files)i > 0) { - # peaklist_files <- peaklist_files[-grep("repl_pattern", peaklist_files)] } - print(peaklist_files) - # get sample names load(paste0("./", scanmode, "_repl_pattern", ".RData")) groupNames = names(repl_pattern_filtered) @@ -38,18 +25,16 @@ for (scanmode in scanmodes) { notRun = c(notRun, group) } } - print("two") - cat("\nCollecting samples!") + + cat("\nCollecting samples") outlist.tot=NULL for (i in 1:length(peaklist_files)) { - cat("\n", peaklist_files[i]) load(peaklist_files[i]) - if (is.null(outlist.persample) || (dim(outlist.persample)[1]==0)) { + if (is.null(outlist.persample) || (dim(outlist.persample)[1] == 0)) { tmp=strsplit(peaklist_files[i], "/")[[1]] fname = tmp[length(tmp)] - #fname = strsplit(peaklist_files[i], "/")[[1]][8] fname = strsplit(fname, ".RData")[[1]][1] fname = substr(fname, 13, nchar(fname)) if (i == 1) { @@ -65,17 +50,14 @@ for (scanmode in scanmodes) { } } } - print("three") + # remove negative values - index=which(outlist.tot[,"height.pkt"]<=0) - if (length(index)>0) outlist.tot = outlist.tot[-index,] - index=which(outlist.tot[,"mzmed.pkt"]<=0) - if (length(index)>0) outlist.tot = outlist.tot[-index,] + index <- which(outlist.tot[ ,"height.pkt"] <= 0) + if (length(index) > 0) outlist.tot <- outlist.tot[-index, ] + index <- which(outlist.tot[ ,"mzmed.pkt"] <= 0) + if (length(index) > 0) outlist.tot <- outlist.tot[-index, ] - # outdir_specpks <- paste(outdir, "5-specpks_all", sep = "/") - # dir.create(outdir_specpks, showWarnings = F) - # save(outlist.tot, file = paste(outdir_specpks, paste(scanmode, "RData", sep = "."), sep = "/")) - save(outlist.tot, file = paste("./", scanmode, ".RData", sep = "")) + save(outlist.tot, file = paste0("./SpectrumPeaks_", scanmode, ".RData")) if (!is.null(notRun)){ for (i in 1:length(notRun)){ diff --git a/DIMS/SpectrumPeakFinding.nf b/DIMS/SpectrumPeakFinding.nf index 6a7c73a..786131c 100644 --- a/DIMS/SpectrumPeakFinding.nf +++ b/DIMS/SpectrumPeakFinding.nf @@ -8,8 +8,8 @@ process SpectrumPeakFinding { path(replication_pattern) output: - path 'negative.RData' - path 'positive.RData' + path 'SpectrumPeaks_*.RData' + // path 'SpectrumPeaks_positive.RData' script: """ From 39e2cb77b49784cec0a2598d5dcd412da1e964a2 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 25 Aug 2023 17:17:30 +0200 Subject: [PATCH 10/73] added functionality --- DIMS/FillMissing.R | 58 ++++++++++++++ DIMS/FillMissing.nf | 17 ++++ DIMS/PeakGrouping.R | 186 +++++++++++++++++++++++++++++++++++++++++++ DIMS/PeakGrouping.nf | 19 +++++ 4 files changed, 280 insertions(+) create mode 100755 DIMS/FillMissing.R create mode 100644 DIMS/FillMissing.nf create mode 100644 DIMS/PeakGrouping.R create mode 100644 DIMS/PeakGrouping.nf diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R new file mode 100755 index 0000000..8c3c86c --- /dev/null +++ b/DIMS/FillMissing.R @@ -0,0 +1,58 @@ +#!/usr/bin/Rscript +# adapted from 9-runFillMissing.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) cat(" ", arg, "\n", sep="") + +# define parameters +peakgrouplist_file <- cmd_args[1] +#pattern_file <- cmd_args[2] +scripts_dir <- cmd_args[2] +#thresh <- as.numeric(cmd_args[4]) +#resol <- as.numeric(cmd_args[5]) +#ppm <- as.numeric(cmd_args[6]) +thresh <- cmd_args[3] +resol <- cmd_args[4] +ppm <- cmd_args[5] +# outdir <- cmd_args[2] +# scanmode <- cmd_args[3] +outdir <- "./" +if (grepl("_pos", peakgrouplist_file)) { scanmode = "positive" } else + if (grepl("_neg", peakgrouplist_file)) { scanmode = "negative" } + +# load in function scripts +source(paste0(scripts_dir, "AddOnFunctions/replaceZeros.R")) +source(paste0(scripts_dir, "AddOnFunctions/generateGaussian.R")) +source(paste0(scripts_dir, "AddOnFunctions/ident.hires.noise.HPC.R")) + +# for debugging: +print(peakgrouplist_file) +#print(pattern_file) +print(scripts_dir) +print(thresh) +print(resol) +print(ppm) + +outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) +print(outputfile_name) + +# get replication pattern for sample names +pattern_file <- paste0(scanmode, "_repl_pattern.RData") +repl_pattern <- get(load(pattern_file)) +print(repl_pattern[[1]]) + +# peakgrouplist <- get(load(peakgrouplist_file)) +load(peakgrouplist_file) +batch_number <- strsplit(basename(peakgrouplist_file), ".",fixed = TRUE)[[1]][1] + +print(dim(outpgrlist.identified)) +print(colnames(outpgrlist.identified)) +print(batch_number) + +# replace missing values (zeros) with random noise +peakgrouplist_filled <- replaceZeros(outpgrlist.identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) + +# save output +save(peakgrouplist_filled, file=paste0("./", outputfile_name) +save(peakgrouplist_filled, file=paste0("./", batch_number, scanmode, "identified_filled.RData") diff --git a/DIMS/FillMissing.nf b/DIMS/FillMissing.nf new file mode 100644 index 0000000..c88ea5c --- /dev/null +++ b/DIMS/FillMissing.nf @@ -0,0 +1,17 @@ +process FillMissing { + label 'FillMissing' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(GroupedList_file) + path(replication_pattern) + + output: + path('.RData') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/FillMissing.R $GroupedList_file $params.scripts_dir $params.thresh $params.resolution $params.ppm + """ +} diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R new file mode 100644 index 0000000..b5038e8 --- /dev/null +++ b/DIMS/PeakGrouping.R @@ -0,0 +1,186 @@ +#!/usr/bin/Rscript +# adapted from 6-peakGrouping.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) cat(" ", arg, "\n", sep="") + +HMDB_part_file <- cmd_args[1] +SpecPeaks_file <- cmd_args[2] +pattern_file <- cmd_args[3] +ppm <- as.numeric(cmd_args[4]) + +options(digits=16) + +# load part of the HMDB +HMDB_add_iso <- get(load(HMDB_part_file)) +# load(HMDB_part_file) +# HMDB_add_iso <- outlist_part + +# determine appropriate scanmode based on HMDB_part_file +if (grepl("negative", basename(HMDB_part_file))) { scanmode <- "negative" } else + if (grepl("positive", basename(HMDB_part_file))) { scanmode <- "positive" } + +# determine batch number of HMDB part file +batch_number = strsplit(basename(HMDB_part_file), ".",fixed = TRUE)[[1]][2] + +# load file with spectrum peaks +SpecPeaks_file <- paste0("SpectrumPeaks_", scanmode, ".RData") +load(SpecPeaks_file) +outlist.copy <- outlist.tot +rm(outlist.tot) + +# load replication pattern +# load(paste0("./", scanmode, "_repl_pattern", ".RData")) +pattern_file <- paste0(scanmode, "_repl_pattern.RData") +load(pattern_file) +# load("./breaks.fwhm.RData") + +# determine appropriate column name in HMDB part +if (scanmode=="negative") { column_label <- "MNeg" } else { column_label <- "Mpos" } + +# for debugging: +print(head(HMDB_add_iso)) +print(scanmode) +print(column_label) + +# Initialize +outpgrlist.identified <- NULL +outlist.grouped <- NULL + +# First find peak groups identified based on HMDB masses +while (dim(HMDB_add_iso)[1] > 0) { + index <- 1 + + # take one m/z value from the HMDB part and calculate mass tolerance + reference_mass <- as.numeric(HMDB_add_iso[index, column_label]) + mass_tolerance <- (reference_mass * ppm) / 10^6 + + # find the peaks in the dataset with corresponding m/z + mzmed <- as.numeric(outlist.copy[ ,"mzmed.pkt"]) + selp <- which((mzmed > (reference_mass - mass_tolerance)) & (mzmed < (reference_mass + mass_tolerance))) + tmplist <- outlist.copy[selp,,drop=FALSE] + outlist.grouped <- rbind(outlist.grouped, tmplist) + nrsamples <- length(selp) + if (nrsamples > 0) { + mzmed.pgrp <- mean(as.numeric(outlist.copy[selp, "mzmed.pkt"])) + mzmin.pgrp <- reference_mass - mass_tolerance + mzmax.pgrp <- reference_mass + mass_tolerance + + # determine fit quality fq + fq.worst.pgrp <- as.numeric(max(outlist.copy[selp, "fq"])) + fq.best.pgrp <- as.numeric(min(outlist.copy[selp, "fq"])) + + # set up object for intensities for all samples + ints.allsamps <- rep(0, length(names(repl.pattern.filtered))) + names(ints.allsamps) <- names(repl.pattern.filtered) + + # Check for each sample if multiple peaks exist, if so take the sum of the intensities + labels <- unique(tmplist[ ,"samplenr"]) + ints.allsamps[labels] <- as.vector(unlist(lapply(labels, function(x) { sum(as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"])) } ))) + + # Initialize + assi_HMDB <- iso_HMDB <- HMDB_code <- NA + tmplist.mass.iso <- tmplist.mass.adduct <- NULL + + # Identification: find all entries in HMDB part with mass within ppm range + mass.all <- as.numeric(HMDB_add_iso[ , column_label]) + index <- which((mass.all > (reference_mass - mass_tolerance)) & (mass.all < (reference_mass + mass_tolerance))) + tmplist.mass <- HMDB_add_iso[index,,drop=FALSE] + + if (dim(tmplist.mass)[1]>0) { + # find isotope entries + index.iso <- grep(" iso ", tmplist.mass[, "CompoundName"], fixed = TRUE) + if (length(index.iso) > 0){ + tmplist.mass.iso <- tmplist.mass[index.iso,,drop=FALSE] + tmplist.mass <- tmplist.mass[-index.iso,,drop=FALSE] + } + + if (dim(tmplist.mass)[1] > 0) { + # find adduct entries + index.adduct <- grep(" [M", tmplist.mass[, "CompoundName"], fixed = TRUE) + if (length(index.adduct) > 0) { + tmplist.mass.adduct <- tmplist.mass[index.adduct,,drop=FALSE] + tmplist.mass <- tmplist.mass[-index.adduct,,drop=FALSE] + } + } + + # Compose a list compounds, adducts or isotopes with corresponding m/z + if (dim(tmplist.mass)[1]>0) { + + # metabolites + assi_HMDB <- as.character(paste(as.character(tmplist.mass[, "CompoundName"]), collapse = ";")) + HMDB_code <- as.character(paste(as.character(rownames(tmplist.mass)), collapse = ";")) + theormz_HMDB <- as.numeric(tmplist.mass[1, column_label]) + + # adducts of metabolites + if (!is.null(tmplist.mass.adduct)) { + if (dim(tmplist.mass.adduct)[1] > 0) { + if (is.na(assi_HMDB)){ + assi_HMDB <- as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")) + HMDB_code <- as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")) + } else { + assi_HMDB <- paste(assi_HMDB, as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")), sep = ";") + HMDB_code <- paste(HMDB_code, as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")), sep = ";") + } + } + } + + # isotopes of metabolites + if (!is.null(tmplist.mass.iso)) { + if (dim(tmplist.mass.iso)[1]>0) { + iso_HMDB <- as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";")) + } + } + + # if no metabolites have the correct m/z, look for adducts and isotopes only + } else if (!is.null(tmplist.mass.adduct)) { + + theormz_HMDB <- as.numeric(tmplist.mass.adduct[1, column_label]) + + # adducts of metabolites + if (!is.null(tmplist.mass.adduct)) { + if (dim(tmplist.mass.adduct)[1] > 0) { + if (is.na(assi_HMDB)) { + assi_HMDB <- as.character(paste(as.character(tmplist.mass.adduct[ , "CompoundName"]), collapse = ";")) + HMDB_code <- as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")) + } else { + assi_HMDB <- paste(assi_HMDB, as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")), sep = ";") + HMDB_code <- paste(HMDB_code, as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")), sep = ";") + } + } + } + + # isotopes of metabolites + if (!is.null(tmplist.mass.iso)) { + if (dim(tmplist.mass.iso)[1]>0) { + iso_HMDB <- as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";")) + } + } + + # if no metabolites or adducts can be found, only look for isotopes + } else if (!is.null(tmplist.mass.iso)) { + + if (dim(tmplist.mass.iso)[1]>0) { + theormz_HMDB <- as.numeric(tmplist.mass.iso[1,column_label]) + iso_HMDB <- as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";")) + } + } + + } + + # combine all information + outpgrlist.identified <- rbind(outpgrlist.identified, cbind(data.frame(mzmed.pgrp, "fq.best"=fq.best.pgrp, "fq.worst"=fq.worst.pgrp, nrsamples, mzmin.pgrp, mzmax.pgrp), + t(as.matrix(ints.allsamps)), + data.frame(assi_HMDB, iso_HMDB, HMDB_code, theormz_HMDB))) + } + + # remove index metabolite from HMDB part and continue while loop + HMDB_add_iso <- HMDB_add_iso[-index,] + +} + +# save peak list corresponding to masses in HMDB part +# save(outlist.grouped, file=paste0(batch_number, "_", scanmode, "_all.RData")) +# save peak group list, identified part +save(outpgrlist.identified, file=paste0(batch_number, "_", scanmode, "_identified.RData")) diff --git a/DIMS/PeakGrouping.nf b/DIMS/PeakGrouping.nf new file mode 100644 index 0000000..be9d542 --- /dev/null +++ b/DIMS/PeakGrouping.nf @@ -0,0 +1,19 @@ +process PeakGrouping { + label 'PeakGrouping' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(HMDBpart_file) + path(SpectrumPeak_file) + path(pattern_file) + + output: + // path '*_all.RData', emit: peaklist_all + path '*_identified.RData', emit: grouped_identified + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/PeakGrouping.R $HMDBpart_file $SpectrumPeak_file $pattern_file $params.ppm + """ +} From ab5c4574e7de7df394af9aedd3fece156b17d4a3 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 1 Sep 2023 17:10:46 +0200 Subject: [PATCH 11/73] modified files for Nextflow --- DIMS/FillMissing.R | 44 +++----- DIMS/FillMissing.nf | 2 +- DIMS/PeakGrouping.R | 22 ++-- DIMS/PeakGrouping.nf | 4 +- DIMS/PeakGroupingIdentified.R | 180 --------------------------------- DIMS/PeakGroupingIdentified.nf | 19 ---- 6 files changed, 33 insertions(+), 238 deletions(-) delete mode 100644 DIMS/PeakGroupingIdentified.R delete mode 100644 DIMS/PeakGroupingIdentified.nf diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R index 8c3c86c..127c557 100755 --- a/DIMS/FillMissing.R +++ b/DIMS/FillMissing.R @@ -7,52 +7,36 @@ for (arg in cmd_args) cat(" ", arg, "\n", sep="") # define parameters peakgrouplist_file <- cmd_args[1] -#pattern_file <- cmd_args[2] scripts_dir <- cmd_args[2] -#thresh <- as.numeric(cmd_args[4]) -#resol <- as.numeric(cmd_args[5]) -#ppm <- as.numeric(cmd_args[6]) -thresh <- cmd_args[3] -resol <- cmd_args[4] -ppm <- cmd_args[5] -# outdir <- cmd_args[2] -# scanmode <- cmd_args[3] +thresh <- as.numeric(cmd_args[3]) +resol <- as.numeric(cmd_args[4]) +ppm <- as.numeric(cmd_args[5]) outdir <- "./" + if (grepl("_pos", peakgrouplist_file)) { scanmode = "positive" } else if (grepl("_neg", peakgrouplist_file)) { scanmode = "negative" } # load in function scripts source(paste0(scripts_dir, "AddOnFunctions/replaceZeros.R")) source(paste0(scripts_dir, "AddOnFunctions/generateGaussian.R")) +source(paste0(scripts_dir, "AddOnFunctions/getFwhm.R")) +source(paste0(scripts_dir, "AddOnFunctions/getSD.R")) +source(paste0(scripts_dir, "AddOnFunctions/getArea.R")) +source(paste0(scripts_dir, "AddOnFunctions/optimizeGauss.R")) source(paste0(scripts_dir, "AddOnFunctions/ident.hires.noise.HPC.R")) - -# for debugging: -print(peakgrouplist_file) -#print(pattern_file) -print(scripts_dir) -print(thresh) -print(resol) -print(ppm) - -outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) -print(outputfile_name) +source(paste0(scripts_dir, "AddOnFunctions/elementInfo.R")) +source(paste0(scripts_dir, "AddOnFunctions/globalAssignments.HPC.R")) # get replication pattern for sample names pattern_file <- paste0(scanmode, "_repl_pattern.RData") repl_pattern <- get(load(pattern_file)) -print(repl_pattern[[1]]) -# peakgrouplist <- get(load(peakgrouplist_file)) +# load peak group list and determine output file name load(peakgrouplist_file) -batch_number <- strsplit(basename(peakgrouplist_file), ".",fixed = TRUE)[[1]][1] - -print(dim(outpgrlist.identified)) -print(colnames(outpgrlist.identified)) -print(batch_number) +outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) # replace missing values (zeros) with random noise peakgrouplist_filled <- replaceZeros(outpgrlist.identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) -# save output -save(peakgrouplist_filled, file=paste0("./", outputfile_name) -save(peakgrouplist_filled, file=paste0("./", batch_number, scanmode, "identified_filled.RData") +# save output +save(peakgrouplist_filled, file=paste0("./", outputfile_name)) diff --git a/DIMS/FillMissing.nf b/DIMS/FillMissing.nf index c88ea5c..4749b87 100644 --- a/DIMS/FillMissing.nf +++ b/DIMS/FillMissing.nf @@ -8,7 +8,7 @@ process FillMissing { path(replication_pattern) output: - path('.RData') + path('*_filled.RData') script: """ diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R index b5038e8..756c0a7 100644 --- a/DIMS/PeakGrouping.R +++ b/DIMS/PeakGrouping.R @@ -6,9 +6,9 @@ cmd_args <- commandArgs(trailingOnly = TRUE) for (arg in cmd_args) cat(" ", arg, "\n", sep="") HMDB_part_file <- cmd_args[1] -SpecPeaks_file <- cmd_args[2] -pattern_file <- cmd_args[3] -ppm <- as.numeric(cmd_args[4]) +# SpecPeaks_file <- cmd_args[2] +# pattern_file <- cmd_args[3] +ppm <- as.numeric(cmd_args[2]) options(digits=16) @@ -30,6 +30,8 @@ load(SpecPeaks_file) outlist.copy <- outlist.tot rm(outlist.tot) +print(dim(outlist.copy)) + # load replication pattern # load(paste0("./", scanmode, "_repl_pattern", ".RData")) pattern_file <- paste0(scanmode, "_repl_pattern.RData") @@ -40,9 +42,11 @@ load(pattern_file) if (scanmode=="negative") { column_label <- "MNeg" } else { column_label <- "Mpos" } # for debugging: -print(head(HMDB_add_iso)) +print(dim(HMDB_add_iso)) print(scanmode) print(column_label) +print(ppm) +print(head(repl_pattern_filtered, 1)) # Initialize outpgrlist.identified <- NULL @@ -56,11 +60,14 @@ while (dim(HMDB_add_iso)[1] > 0) { reference_mass <- as.numeric(HMDB_add_iso[index, column_label]) mass_tolerance <- (reference_mass * ppm) / 10^6 + print(paste0("ref_mass ", reference_mass, " mtol ", mass_tolerance)) + # find the peaks in the dataset with corresponding m/z mzmed <- as.numeric(outlist.copy[ ,"mzmed.pkt"]) selp <- which((mzmed > (reference_mass - mass_tolerance)) & (mzmed < (reference_mass + mass_tolerance))) tmplist <- outlist.copy[selp,,drop=FALSE] outlist.grouped <- rbind(outlist.grouped, tmplist) + nrsamples <- length(selp) if (nrsamples > 0) { mzmed.pgrp <- mean(as.numeric(outlist.copy[selp, "mzmed.pkt"])) @@ -72,8 +79,8 @@ while (dim(HMDB_add_iso)[1] > 0) { fq.best.pgrp <- as.numeric(min(outlist.copy[selp, "fq"])) # set up object for intensities for all samples - ints.allsamps <- rep(0, length(names(repl.pattern.filtered))) - names(ints.allsamps) <- names(repl.pattern.filtered) + ints.allsamps <- rep(0, length(names(repl_pattern_filtered))) + names(ints.allsamps) <- names(repl_pattern_filtered) # Check for each sample if multiple peaks exist, if so take the sum of the intensities labels <- unique(tmplist[ ,"samplenr"]) @@ -180,6 +187,9 @@ while (dim(HMDB_add_iso)[1] > 0) { } + +print(head(outpgrlist.identified)) + # save peak list corresponding to masses in HMDB part # save(outlist.grouped, file=paste0(batch_number, "_", scanmode, "_all.RData")) # save peak group list, identified part diff --git a/DIMS/PeakGrouping.nf b/DIMS/PeakGrouping.nf index be9d542..89328fb 100644 --- a/DIMS/PeakGrouping.nf +++ b/DIMS/PeakGrouping.nf @@ -5,7 +5,7 @@ process PeakGrouping { input: path(HMDBpart_file) - path(SpectrumPeak_file) + path(SpectrumPeak_file) // input files need to be linked, but called within R script path(pattern_file) output: @@ -14,6 +14,6 @@ process PeakGrouping { script: """ - Rscript ${baseDir}/CustomModules/DIMS/PeakGrouping.R $HMDBpart_file $SpectrumPeak_file $pattern_file $params.ppm + Rscript ${baseDir}/CustomModules/DIMS/PeakGrouping.R $HMDBpart_file $params.ppm """ } diff --git a/DIMS/PeakGroupingIdentified.R b/DIMS/PeakGroupingIdentified.R deleted file mode 100644 index e6bf3fc..0000000 --- a/DIMS/PeakGroupingIdentified.R +++ /dev/null @@ -1,180 +0,0 @@ -#!/usr/bin/Rscript - -# load required packages -# none - -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) cat(" ", arg, "\n", sep="") - -#outdir <- cmd_args[2] -#scanmode <- cmd_args[3] -SpecPeaks_file <- cmd_args[1] -HMDB_part_file <- cmd_args[2] -pattern_file <- cmd_args[3] -resol <- as.numeric(cmd_args[4]) -ppm <- as.numeric(cmd_args[5]) - -options(digits=16) - -cat(paste("File to group:", fileIn)) - - -options(digits=16) -load(HMDB_part_file) # outlist_part(HMDB_add_iso) -HMDB_add_iso = outlist_part - -# Windows and Unix-like -batch = strsplit(fileIn, "/",fixed = TRUE)[[1]] -batch = batch[length(batch)] -batch = strsplit(batch, ".",fixed = TRUE)[[1]][2] - -load(SpecPeaks_file) -outlist.copy = outlist.tot -rm(outlist.tot) - -load(pattern_file) -load("./breaks.fwhm.RData") - -outpgrlist.identified = NULL - -if (scanmode=="negative"){ - label = "MNeg" -} else { - label = "Mpos" -} - -outlist.grouped <- NULL - -# First group on HMDB masses -while (dim(HMDB_add_iso)[1] > 0) { - index = 1 - # message(HMDB_add_iso[index,"CompoundName"]) - - mass = as.numeric(HMDB_add_iso[index,label]) - mtol = (mass*ppm)/10^6 - - mzmed = as.numeric(outlist.copy[,"mzmed.pkt"]) - selp = which((mzmed > (mass - mtol)) & (mzmed < (mass + mtol))) - tmplist = outlist.copy[selp,,drop=FALSE] - outlist.grouped <- rbind(outlist.grouped, tmplist) - - nrsamples = length(selp) - if (nrsamples > 0) { - # message(paste("Bingo ",n)) - - mzmed.pgrp = mean(as.numeric(outlist.copy[selp, "mzmed.pkt"])) - # mzmed.pgrp = median(as.numeric(outlist.copy[selp, "mzmed.pkt"])) - mzmin.pgrp = mass - mtol - mzmax.pgrp = mass + mtol - - fq.worst.pgrp = as.numeric(max(outlist.copy[selp, "fq"])) - fq.best.pgrp = as.numeric(min(outlist.copy[selp, "fq"])) - ints.allsamps = rep(0, length(names(repl.pattern.filtered))) - names(ints.allsamps) = names(repl.pattern.filtered) # same order as sample list!!! - - # # Check for each sample if multiple peaks exists, if so take the sum! - labels=unique(tmplist[,"samplenr"]) - ints.allsamps[labels] = as.vector(unlist(lapply(labels, function(x) {sum(as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"]))}))) - # ints.allsamps[labels] = as.vector(unlist(lapply(labels, function(x) {as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"])}))) - - # Identification - # assi_HMDB = iso_HMDB = HMDB_code = "" - assi_HMDB = iso_HMDB = HMDB_code = NA - tmplist.mass.iso = tmplist.mass.adduct = NULL - # index = which(HMDB_add_iso[,label]==mass) - - # Consider groups off 4ppm - mass.all = as.numeric(HMDB_add_iso[,label]) - index = which((mass.all > (mass - mtol)) & (mass.all < (mass + mtol))) - # index = which(mass.all < (mass + 2*mtol)) - tmplist.mass = HMDB_add_iso[index,,drop=FALSE] - - if (dim(tmplist.mass)[1]>0) { - - index.iso = grep(" iso ", tmplist.mass[, "CompoundName"], fixed = TRUE) - if (length(index.iso)>0){ - tmplist.mass.iso = tmplist.mass[index.iso,,drop=FALSE] - tmplist.mass = tmplist.mass[-index.iso,,drop=FALSE] - } - - if (dim(tmplist.mass)[1]>0) { - index.adduct = grep(" [M", tmplist.mass[, "CompoundName"], fixed = TRUE) - if (length(index.adduct)>0){ - tmplist.mass.adduct = tmplist.mass[index.adduct,,drop=FALSE] - tmplist.mass = tmplist.mass[-index.adduct,,drop=FALSE] - } - } - - # First compouds without adducts or isotopes - if (dim(tmplist.mass)[1]>0) { - - # pure compounds - assi_HMDB = as.character(paste(as.character(tmplist.mass[, "CompoundName"]), collapse = ";")) - HMDB_code = as.character(paste(as.character(rownames(tmplist.mass)), collapse = ";")) - theormz_HMDB = as.numeric(tmplist.mass[1,label]) - - # adducts - if (!is.null(tmplist.mass.adduct)) { - if (dim(tmplist.mass.adduct)[1]>0) { - if (is.na(assi_HMDB)){ - assi_HMDB = as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")) - HMDB_code = as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")) - } else { - assi_HMDB = paste(assi_HMDB, as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")), sep = ";") - HMDB_code = paste(HMDB_code, as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")), sep = ";") - }}} - - # isotopes - if (!is.null(tmplist.mass.iso)) { - if (dim(tmplist.mass.iso)[1]>0) { - iso_HMDB = as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";")) - }} - - # No pure compounts - } else if (!is.null(tmplist.mass.adduct)) { - - theormz_HMDB = as.numeric(tmplist.mass.adduct[1,label]) - - # adducts - if (!is.null(tmplist.mass.adduct)) { - if (dim(tmplist.mass.adduct)[1]>0) { - if (is.na(assi_HMDB)){ - assi_HMDB = as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")) - HMDB_code = as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")) - } else { - assi_HMDB = paste(assi_HMDB, as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")), sep = ";") - HMDB_code = paste(HMDB_code, as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")), sep = ";") - }}} - - # isotopes - if (!is.null(tmplist.mass.iso)) { - if (dim(tmplist.mass.iso)[1]>0) { - iso_HMDB = as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";")) - }} - - # only isotopes - } else if (!is.null(tmplist.mass.iso)) { - - if (dim(tmplist.mass.iso)[1]>0) { - theormz_HMDB = as.numeric(tmplist.mass.iso[1,label]) - iso_HMDB = as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";")) - } - } - - } - - outpgrlist.identified = rbind(outpgrlist.identified, cbind(data.frame(mzmed.pgrp, "fq.best"=fq.best.pgrp, "fq.worst"=fq.worst.pgrp, nrsamples, mzmin.pgrp, mzmax.pgrp), - t(as.matrix(ints.allsamps)), - data.frame(assi_HMDB, iso_HMDB, HMDB_code, theormz_HMDB))) - } - - HMDB_add_iso = HMDB_add_iso[-index,] - -} - -#dir.create(paste(outdir, "6-grouping_hmdb", sep="/"), showWarnings = FALSE) -save(outpgrlist.identified, file=paste(paste(outdir, "6-grouping_hmdb", sep="/"), paste(paste(batch, scanmode, sep="_"), "RData", sep="."), sep="/")) - -#dir.create(paste(outdir, "6-grouping_hmdb_done", sep="/"), showWarnings = FALSE) -save(outlist.grouped, file=paste(paste(outdir, "6-grouping_hmdb_grouped", sep="/"), paste(paste(batch, scanmode, sep="_"), "RData", sep="."), sep="/")) diff --git a/DIMS/PeakGroupingIdentified.nf b/DIMS/PeakGroupingIdentified.nf deleted file mode 100644 index 2181cae..0000000 --- a/DIMS/PeakGroupingIdentified.nf +++ /dev/null @@ -1,19 +0,0 @@ -process PeakGroupingIdentified { - label 'PeakGroupingIdentified' - container = 'docker://umcugenbioinf/dims:1.3' - shell = ['/bin/bash', '-euo', 'pipefail'] - - input: - path(SpectrumPeak_file) - path(HMDBpart_file) - path(pattern_file) - - output: - path '*_negative.RData' - path '*_positive.RData' - - script: - """ - Rscript ${baseDir}/CustomModules/DIMS/PeakGroupingIdentified.R $SpectrumPeak_file $HMDBpart_file $pattern_file $params.resolution $params.ppm - """ -} From 6d54b1405d843cfc9a5a92f0523d641b5250b041 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 1 Sep 2023 17:12:30 +0200 Subject: [PATCH 12/73] extra files for Nextflow pipeline --- DIMS/CollectFilled.R | 83 ++++++ DIMS/CollectFilled.nf | 17 ++ DIMS/CollectSumAdducts.R | 25 ++ DIMS/CollectSumAdducts.nf | 17 ++ DIMS/GenerateExcel.R | 579 ++++++++++++++++++++++++++++++++++++++ DIMS/GenerateExcel.nf | 18 ++ DIMS/HMDBparts_main.R | 64 +++++ DIMS/HMDBparts_main.nf | 20 ++ DIMS/SumAdducts.R | 122 ++++++++ DIMS/SumAdducts.nf | 18 ++ 10 files changed, 963 insertions(+) create mode 100755 DIMS/CollectFilled.R create mode 100644 DIMS/CollectFilled.nf create mode 100755 DIMS/CollectSumAdducts.R create mode 100644 DIMS/CollectSumAdducts.nf create mode 100644 DIMS/GenerateExcel.R create mode 100644 DIMS/GenerateExcel.nf create mode 100644 DIMS/HMDBparts_main.R create mode 100644 DIMS/HMDBparts_main.nf create mode 100755 DIMS/SumAdducts.R create mode 100644 DIMS/SumAdducts.nf diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R new file mode 100755 index 0000000..62be430 --- /dev/null +++ b/DIMS/CollectFilled.R @@ -0,0 +1,83 @@ +#!/usr/bin/Rscript +## adapted from 10-collectSamplesFilled.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) cat(" ", arg, "\n") + +scripts_dir <- cmd_args[1] +ppm <- as.numeric(cmd_args[2]) +z_score <- as.numeric(cmd_args[3]) + +source(paste0(scripts_dir, "AddOnFunctions/mergeDuplicatedRows.R")) +source(paste0(scripts_dir, "AddOnFunctions/statistics_z.R")) +# source(paste0(scripts_dir, "AddOnFunctions/normalization_2.1.R")) + +# for each scan mode, collect all filled peak group lists +scanmodes <- c("positive", "negative") + +for (scanmode in scanmodes) { + # get list of files + filled_files <- list.files("./", full.names=TRUE, pattern=scanmode) + # load files and combine into one object + outlist.tot <- NULL + for (i in 1:length(filled_files)) { + load(filled_files[i]) + print(filled_files[i]) + outlist.tot <- rbind(outlist.tot, peakgrouplist_filled) + } + + # remove duplicates; peak groups with exactly the same m/z + outlist.tot <- mergeDuplicatedRows(outlist.tot) + + # sort on mass + outlist.tot <- outlist.tot[order(outlist.tot[ ,"mzmed.pgrp"]),] + + # load replication pattern + pattern_file <- paste0(scanmode, "_repl_pattern.RData") + repl_pattern <- get(load(pattern_file)) + + # Normalization: not done. + # if (normalization != "disabled") { + # outlist.tot = normalization_2.1(outlist.tot, fileName, names(repl.pattern.filtered), on=normalization, assi_label="assi_HMDB") + # } + + if (z_score == 1) { + outlist.stats <- statistics_z(outlist.tot, sortCol=NULL, adducts=FALSE) + nr.removed.samples <- length(which(repl_pattern[] == "character(0)")) + order.index.int <- order(colnames(outlist.stats)[8:(length(repl_pattern)-nr.removed.samples+7)]) + outlist.stats.more <- cbind(outlist.stats[,1:7], + outlist.stats[,(length(repl_pattern)-nr.removed.samples+8):(length(repl_pattern)-nr.removed.samples+8+6)], + outlist.stats[,8:(length(repl_pattern)-nr.removed.samples+7)][order.index.int], + outlist.stats[,(length(repl_pattern)-nr.removed.samples+5+10):ncol(outlist.stats)]) + + tmp.index <- grep("_Zscore", colnames(outlist.stats.more), fixed = TRUE) + tmp.index.order <- order(colnames(outlist.stats.more[,tmp.index])) + tmp <- outlist.stats.more[,tmp.index[tmp.index.order]] + outlist.stats.more <- outlist.stats.more[,-tmp.index] + outlist.stats.more <- cbind(outlist.stats.more,tmp) + outlist.tot <- outlist.stats.more + } + + # filter identified compounds + index.1 <- which((outlist.tot[,"assi_HMDB"]!="") & (!is.na(outlist.tot[,"assi_HMDB"]))) + index.2 <- which((outlist.tot[,"iso_HMDB"]!="") & (!is.na(outlist.tot[,"iso_HMDB"]))) + index <- union(index.1,index.2) + outlist.ident <- outlist.tot[index,] + outlist.not.ident <- outlist.tot[-index,] + + if (z_score == 1) { + outlist.ident$ppmdev <- as.numeric(outlist.ident$ppmdev) + outlist.ident <- outlist.ident[which(outlist.ident["ppmdev"] >= -ppm & outlist.ident["ppmdev"] <= ppm),] + } + # NAs in theormz_noise + outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] <- 0 + outlist.ident$theormz_noise <- as.numeric(outlist.ident$theormz_noise) + outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] <- 0 + outlist.ident$theormz_noise <- as.numeric(outlist.ident$theormz_noise) + + # save output for identified peak groups (not.identified later) + # save(outlist.not.ident, file="./outlist_not_identified_", scanmode, ".RData") + save(outlist.ident, file=paste0("./outlist_identified_", scanmode, ".RData")) + +} diff --git a/DIMS/CollectFilled.nf b/DIMS/CollectFilled.nf new file mode 100644 index 0000000..c28516c --- /dev/null +++ b/DIMS/CollectFilled.nf @@ -0,0 +1,17 @@ +process CollectFilled { + label 'CollectFilled' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(filled_files) + path(replication_pattern) + + output: + path('outlist*.RData') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/CollectFilled.R $params.scripts_dir $params.ppm $params.zscore + """ +} diff --git a/DIMS/CollectSumAdducts.R b/DIMS/CollectSumAdducts.R new file mode 100755 index 0000000..ae89cbc --- /dev/null +++ b/DIMS/CollectSumAdducts.R @@ -0,0 +1,25 @@ +#!/usr/bin/Rscript +## adapted from 12-collectSamplesAdded.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) cat(" ", arg, "\n", sep="") + +# outdir <- cmd_args[1] +# scanmode <- cmd_args[2] +scanmode <- c("positive", "negative") + +for (scanmode in scanmodes) { + # collect all AdductSums part files for each scanmode + adductsum_part_files <- list.files("./", pattern = scanmode) + + outlist.tot <- NULL + for (i in 1:length(adductsum_part_files)) { + load(adductsum_part_files[i]) + outlist.tot <- rbind(outlist.tot, adductsum) + } + + # save output file + save(outlist.tot, file="/AdductSums_", scanmode, ".RData") +} + diff --git a/DIMS/CollectSumAdducts.nf b/DIMS/CollectSumAdducts.nf new file mode 100644 index 0000000..a72455c --- /dev/null +++ b/DIMS/CollectSumAdducts.nf @@ -0,0 +1,17 @@ +process CollectSumAdducts { + tag {"DIMS CollectSumAdducts"} + label 'CollectSumAdducts' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(collect_file) // input files need to be linked, but called within R script + + output: + path('AdductSums_*.RData') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/CollectSumAdducts.R + """ +} diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R new file mode 100644 index 0000000..7958f95 --- /dev/null +++ b/DIMS/GenerateExcel.R @@ -0,0 +1,579 @@ +#!/usr/bin/Rscript +## adapted from 13-excelExport.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) cat(" ", arg, "\n", sep = "") + +# outdir <- cmd_args[1] #"/Users/nunen/Documents/Metab/test_set" +init_filepath <- cmd_args[1] +project <- cmd_args[2] #"test" +dims_matrix <- cmd_args[3] #"DBS" +hmdb <- cmd_args[4] #"/hpc/dbg_mz/tools/db/HMDB_with_info_relevance_IS_C5OH.RData" +z_score <- as.numeric(cmd_args[5]) + +# load required packages +library("ggplot2") +library("reshape2") +library("openxlsx") +library("loder") +suppressMessages(library("dplyr")) +suppressMessages(library("stringr")) + +# Initialise and load data +outdir <- "../../../" +plot <- TRUE +export <- TRUE +control_label <- "C" +case_label <- "P" +imagesize_multiplier <- 2 +# init <- paste0(outdir, "/logs/init.RData") + +# load information on samples +load(init_filepath) +# load the HMDB file with info on biological relevance of metabolites +load(hmdb) # load object rlvnc into global environment + +rundate <- Sys.Date() + +# create a directory for plots in project directory +plotdir <- paste0(outdir, "/plots/adducts") +dir.create(paste0(outdir, "/plots"), showWarnings = F) +dir.create(plotdir, showWarnings = F) + +options(digits=16) + +# setwd(outdir) + +# sum positive and negative adductsums + +# Load pos and neg adduct sums +load("/AdductSums_negative.RData") +outlist.neg.adducts.HMDB <- outlist.tot + +load("/AdductSums_positive.RData") +outlist.pos.adducts.HMDB <- outlist.tot +rm(outlist.tot) + +# Only continue with patients (columns) that are in both pos and neg, so patients that are in both +tmp <- intersect(colnames(outlist.neg.adducts.HMDB), colnames(outlist.pos.adducts.HMDB)) +outlist.neg.adducts.HMDB <- outlist.neg.adducts.HMDB[,tmp] +outlist.pos.adducts.HMDB <- outlist.pos.adducts.HMDB[,tmp] + +# Find indexes of neg hmdb code that are also found in pos and vice versa +index.neg <- which(rownames(outlist.neg.adducts.HMDB) %in% rownames(outlist.pos.adducts.HMDB)) +index.pos <- which(rownames(outlist.pos.adducts.HMDB) %in% rownames(outlist.neg.adducts.HMDB)) + +# Get number of columns +# Only continue with HMDB codes (rows) that were found in both pos and neg mode and remove last column (hmdb_name) +tmp.pos <- outlist.pos.adducts.HMDB[rownames(outlist.pos.adducts.HMDB)[index.pos], 1:(dim(outlist.pos.adducts.HMDB)[2]-1)] +tmp.hmdb_name.pos <- outlist.pos.adducts.HMDB[rownames(outlist.pos.adducts.HMDB)[index.pos], dim(outlist.pos.adducts.HMDB)[2]] +tmp.pos.left <- outlist.pos.adducts.HMDB[-index.pos,] + +tmp.neg <- outlist.neg.adducts.HMDB[rownames(outlist.pos.adducts.HMDB)[index.pos], 1:(dim(outlist.neg.adducts.HMDB)[2]-1)] +tmp.neg.left <- outlist.neg.adducts.HMDB[-index.neg,] + +# Combine positive and negative numbers and paste back HMDB column +tmp <- apply(tmp.pos, 2,as.numeric) + apply(tmp.neg, 2,as.numeric) +rownames(tmp) <- rownames(tmp.pos) +tmp <- cbind(tmp, "HMDB_name"=tmp.hmdb_name.pos) +outlist <- rbind(tmp, tmp.pos.left, tmp.neg.left) + +# Filter +peaksInList <- which(rownames(outlist) %in% rownames(rlvnc)) +outlist <- cbind(outlist[peaksInList,],as.data.frame(rlvnc[rownames(outlist)[peaksInList],])) +# filter out all irrelevant HMDB's. the tibble::rownames is needed for the older version of dplyr on the HPC (it will reindex the rownames) +outlist <- outlist %>% + tibble::rownames_to_column('rowname') %>% + filter(!grepl("Exogenous|Drug|exogenous", relevance)) %>% + tibble::column_to_rownames('rowname') + +# Add HMDB_code column with all the HMDB ID and sort on it +outlist <- cbind(outlist, "HMDB_code" = rownames(outlist)) +outlist <- outlist[order(outlist[,"HMDB_code"]),] + +# Create excel +filelist <- "AllPeakGroups" + +wb <- createWorkbook("SinglePatient") +addWorksheet(wb, filelist) + +# small function for rounding numbers to x digits for numeric values +round_df <- function(df, digits) { + # round all numeric variables + # df: data frame + # digits: number of digits to round + numeric_columns <- sapply(df, mode) == 'numeric' + df[numeric_columns] <- round(df[numeric_columns], digits) + df +} + +# Add Z-scores and create plots +if (z_score == 1) { + # Statistics: Z-score + outlist <- cbind(plots = NA, outlist) + #outlist <- as.data.frame(outlist) + + startcol <- dim(outlist)[2] + 3 + + # Get columns with control intensities + control_col_ids <- grep(control_label, colnames(outlist), fixed = TRUE) + control_columns <- outlist[, control_col_ids] + + # Get columns with patient intensities + patient_col_ids <- grep(case_label, colnames(outlist), fixed = TRUE) + patient_columns <- outlist[, patient_col_ids] + + intensity_col_ids <- c(control_col_ids, patient_col_ids) + + # set intensities of 0 to NA? + outlist[,intensity_col_ids][outlist[,intensity_col_ids] == 0] <- NA + + # calculate mean and sd for Control group + outlist$avg.ctrls <- apply(control_columns, 1, function(x) mean(as.numeric(x),na.rm = TRUE)) + outlist$sd.ctrls <- apply(control_columns, 1, function(x) sd(as.numeric(x),na.rm = TRUE)) + + # Make and add columns with zscores + cnames.z <- NULL + for (i in intensity_col_ids) { + cname <- colnames(outlist)[i] + cnames.z <- c(cnames.z, paste(cname, "Zscore", sep="_")) + zscores.1col <- (as.numeric(as.vector(unlist(outlist[ , i]))) - outlist$avg.ctrls) / outlist$sd.ctrls + outlist <- cbind(outlist, zscores.1col) + } + colnames(outlist)[startcol:ncol(outlist)] <- cnames.z + + patient_ids <- unique(as.vector(unlist(lapply(strsplit(colnames(patient_columns), ".", fixed = TRUE), function(x) x[1])))) + patient_ids <- patient_ids[order(nchar(patient_ids), patient_ids)] # sorts + + temp_png <- NULL + + # Iterate over every row, make boxplot, insert into excel, and make Zscore for every patient + for (p in 1:nrow(outlist)) { + # box plot + hmdb_name <- rownames(outlist[p,]) + + intensities <- list(as.numeric(as.vector(unlist(control_columns[p,])))) + labels <- c("C", patient_ids) + + for (i in 1:length(patient_ids)) { + id <- patient_ids[i] + # get all intensities that start with ex. P18. (so P18.1, P18.2, but not x_P18.1 and not P180.1) + p.int <- as.numeric(as.vector(unlist(outlist[p, names(patient_columns[1,])[startsWith(names(patient_columns[1,]), paste0(id, "."))]]))) + intensities[[i+1]] <- p.int + } + + intensities <- setNames(intensities, labels) + + plot_width <- length(labels) * 16 + 90 + + plot.new() + if (export) { + png(filename = paste0(plotdir, "/", hmdb_name, "_box.png"), + width = plot_width, + height = 280) + } + par(oma=c(2,0,0,0)) + boxplot(intensities, + col=c("green", rep("red", length(intensities)-1)), + names.arg = labels, + las=2, + main = hmdb_name) + dev.off() + + file_png <- paste0(plotdir, "/", hmdb_name, "_box.png") + if (is.null(temp_png)) { + temp_png <- readPng(file_png) + img_dim <- dim(temp_png)[c(1,2)] + cell_dim <- img_dim * imagesize_multiplier + setColWidths(wb, filelist, cols = 1, widths = cell_dim[2]/20) + } + + insertImage(wb, + filelist, + file_png, + startRow = p + 1, + startCol = 1, + height = cell_dim[1], + width = cell_dim[2], + units = "px") + + if (p %% 100 == 0) { + cat("at row: ", p, "\n") + } + } + + setRowHeights(wb, filelist, rows = c(1:nrow(outlist) + 1), heights = cell_dim[1]/4) + setColWidths(wb, filelist, cols = c(2:ncol(outlist)), widths = 20) +} else { + setRowHeights(wb, filelist, rows = c(1:nrow(outlist)), heights = 18) + setColWidths(wb, filelist, cols = c(1:ncol(outlist)), widths = 20) +} +writeData(wb, sheet = 1, outlist, startCol = 1) +xlsx_name <- paste0(outdir, "/", project, ".xlsx") +saveWorkbook(wb, + xlsx_name, + overwrite = TRUE) +cat(xlsx_name) +rm(wb) + +write.table(outlist, file = paste(outdir, "allpgrps_stats.txt", sep = "/")) + +# INTERNE STANDAARDEN +IS <- outlist[grep("Internal standard", outlist[,"relevance"], fixed = TRUE),] +IS_codes <- rownames(IS) +cat(IS_codes,"\n") + +# if all data from one samplename (for example P195.1) is filtered out in 3-averageTechReplicates because of too little data (threshold parameter) the init.RData (repl.pattern) will contain more sample_names then the peak data (IS), +# thus this data needs to be removed first, before the retrieval of the summed adducts. Write sample_names to a log file, to let user know that this sample_name contained no data. +sample_names_nodata <- setdiff(names(repl.pattern),names(IS)) +if (!is.null(sample_names_nodata)) { + write.table(sample_names_nodata, file = paste(outdir, "sample_names_nodata.txt", sep = "/"), row.names = FALSE, col.names = FALSE, quote = FALSE) + cat(sample_names_nodata,"\n") + for (sample_name in sample_names_nodata) { + repl.pattern[[sample_name]] <- NULL + }} + +# Retrieve IS summed adducts +IS_summed <- IS[c(names(repl.pattern), "HMDB_code")] +IS_summed$HMDB.name <- IS$name +IS_summed <- reshape2::melt(IS_summed, id.vars=c('HMDB_code','HMDB.name')) +colnames(IS_summed) <- c('HMDB.code','HMDB.name','Sample','Intensity') +IS_summed$Intensity <- as.numeric(IS_summed$Intensity) +IS_summed$Matrix <- dims_matrix +IS_summed$Rundate <- rundate +IS_summed$Project <- project +IS_summed$Intensity <- as.numeric(as.character(IS_summed$Intensity)) + +# Retrieve IS positive mode +IS_pos <- as.data.frame(subset(outlist.pos.adducts.HMDB,rownames(outlist.pos.adducts.HMDB) %in% IS_codes)) +IS_pos$HMDB_name <- IS[match(row.names(IS_pos),IS$HMDB_code,nomatch=NA),'name'] +IS_pos$HMDB.code <- row.names(IS_pos) +IS_pos <- reshape2::melt(IS_pos, id.vars=c('HMDB.code','HMDB_name')) +colnames(IS_pos) <- c('HMDB.code','HMDB.name','Sample','Intensity') +IS_pos$Matrix <- dims_matrix +IS_pos$Rundate <- rundate +IS_pos$Project <- project +IS_pos$Intensity <- as.numeric(as.character(IS_pos$Intensity)) + +# Retrieve IS negative mode +IS_neg <- as.data.frame(subset(outlist.neg.adducts.HMDB,rownames(outlist.neg.adducts.HMDB) %in% IS_codes)) +IS_neg$HMDB_name <- IS[match(row.names(IS_neg),IS$HMDB_code,nomatch=NA),'name'] +IS_neg$HMDB.code <- row.names(IS_neg) +IS_neg <- reshape2::melt(IS_neg, id.vars=c('HMDB.code','HMDB_name')) +colnames(IS_neg) <- c('HMDB.code','HMDB.name','Sample','Intensity') +IS_neg$Matrix <- dims_matrix +IS_neg$Rundate <- rundate +IS_neg$Project <- project +IS_neg$Intensity <- as.numeric(as.character(IS_neg$Intensity)) + +# Save results +save(IS_pos,IS_neg,IS_summed, file = paste0(outdir, "/", project, '_IS_results.RData')) + +# number of samples, for plotting length and width +sample_count <- length(repl.pattern) + +# change the order of the x-axis summed plots to a natural sorted one +Sample_naturalorder <- unique(as.character(IS_summed$Sample)) +Sample_naturalorder <- str_sort(Sample_naturalorder, numeric = TRUE) +IS_summed$Sample_level <- factor(IS_summed$Sample, levels = c(Sample_naturalorder)) +IS_pos$Sample_level <- factor(IS_pos$Sample, levels = c(Sample_naturalorder)) +IS_neg$Sample_level <- factor(IS_neg$Sample, levels = c(Sample_naturalorder)) + +########## +##### bar plots with all IS +########## + +# function for ggplot theme +# theme for all IS bar plots +theme_IS_bar <- function(myPlot) { + myPlot + + scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) + + theme( + legend.position='none', + axis.text.x=element_text(angle = 90, hjust = 1, vjust = 0.5, size=6), + axis.text.y=element_text(size=6)) +} + +# ggplot functions +IS_neg_bar_plot <- ggplot(IS_neg, aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Neg)") + + geom_bar(aes(fill=HMDB.name),stat='identity') + + labs(x='',y='Intensity') + + facet_wrap(~HMDB.name, scales='free_y') + +IS_pos_bar_plot <- ggplot(IS_pos, aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Pos)") + + geom_bar(aes(fill=HMDB.name),stat='identity') + + labs(x='',y='Intensity') + + facet_wrap(~HMDB.name, scales='free_y') + +IS_sum_bar_plot <- ggplot(IS_summed, aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Summed)") + + geom_bar(aes(fill=HMDB.name),stat='identity') + + labs(x='',y='Intensity') + + facet_wrap(~HMDB.name, scales='free_y') + +# add theme to ggplot functions +IS_neg_bar_plot <- theme_IS_bar(IS_neg_bar_plot) +IS_pos_bar_plot <- theme_IS_bar(IS_pos_bar_plot) +IS_sum_bar_plot <- theme_IS_bar(IS_sum_bar_plot) + +# save plots to disk +plot_width <- 9 + 0.35 * sample_count +ggsave(paste0(outdir, "/plots/IS_bar_all_neg.png"), plot=IS_neg_bar_plot, height=plot_width/2.5, width=plot_width, units="in") +ggsave(paste0(outdir, "/plots/IS_bar_all_pos.png"), plot=IS_pos_bar_plot, height=plot_width/2.5, width=plot_width, units="in") +ggsave(paste0(outdir, "/plots/IS_bar_all_sum.png"), plot=IS_sum_bar_plot, height=plot_width/2.5, width=plot_width, units="in") + +########## +##### Line plots with all IS +########## + +# function for ggplot theme +# add smaller legend in the "all IS line plots", otherwise out-of-range when more than 13 IS lines +theme_IS_line <- function(myPlot) { + myPlot + + guides( + shape = guide_legend(override.aes = list(size = 0.5)), + color = guide_legend(override.aes = list(size = 0.5))) + + theme( + legend.title = element_text(size = 8), + legend.text = element_text(size = 6), + legend.key.size = unit(0.7, "line"), + axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8)) +} + +# ggplot functions +IS_neg_line_plot <- ggplot(IS_neg, aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Neg)") + + geom_point(aes(col=HMDB.name)) + + geom_line(aes(col=HMDB.name, group=HMDB.name)) + + labs(x='',y='Intensity') + +IS_pos_line_plot <- ggplot(IS_pos, aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Pos)") + + geom_point(aes(col = HMDB.name)) + + geom_line(aes(col = HMDB.name, group = HMDB.name)) + + labs(x = '', y = 'Intensity') + +IS_sum_line_plot <- ggplot(IS_summed, aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Sum)") + + geom_point(aes(col = HMDB.name)) + + geom_line(aes(col = HMDB.name, group = HMDB.name)) + + labs(x = '', y = 'Intensity') + +# add theme to ggplot functions +IS_sum_line_plot <- theme_IS_line(IS_sum_line_plot) +IS_neg_line_plot <- theme_IS_line(IS_neg_line_plot) +IS_pos_line_plot <- theme_IS_line(IS_pos_line_plot) + +# save plots to disk +plot_width <- 8 + 0.2 * sample_count +ggsave(paste0(outdir,"/plots/IS_line_all_neg.png"), plot = IS_neg_line_plot, height = plot_width/2.5, width = plot_width, units = "in") +ggsave(paste0(outdir,"/plots/IS_line_all_pos.png"), plot = IS_pos_line_plot, height = plot_width/2.5, width = plot_width, units = "in") +ggsave(paste0(outdir,"/plots/IS_line_all_sum.png"), plot = IS_sum_line_plot, height = plot_width/2.5, width = plot_width, units = "in") + +########## +##### bar plots with a selection of IS +########## +IS_neg_selection <- c('2H2-Ornithine (IS)', '2H3-Glutamate (IS)', '2H2-Citrulline (IS)', '2H4_13C5-Arginine (IS)', '13C6-Tyrosine (IS)') +IS_pos_selection <- c('2H4-Alanine (IS)', '13C6-Phenylalanine (IS)', '2H4_13C5-Arginine (IS)', '2H3-Propionylcarnitine (IS)', '2H9-Isovalerylcarnitine (IS)') +IS_sum_selection <- c('2H8-Valine (IS)', '2H3-Leucine (IS)', '2H3-Glutamate (IS)', '2H4_13C5-Arginine (IS)', '13C6-Tyrosine (IS)') + +# add minimal intensity lines based on matrix (DBS or Plasma) and machine mode (neg, pos, sum) +if (dims_matrix == "DBS"){ + hline.data.neg <- + data.frame(z = c(15000, 200000, 130000, 18000, 50000), + HMDB.name = IS_neg_selection) + hline.data.pos <- + data.frame(z = c(150000, 3300000, 1750000, 150000, 270000), + HMDB.name = IS_pos_selection) + hline.data.sum <- + data.frame(z = c(1300000, 2500000, 500000, 1800000, 1400000), + HMDB.name = IS_sum_selection) +} else if (dims_matrix == "Plasma"){ + hline.data.neg <- + data.frame(z = c(6500, 100000, 75000, 7500, 25000), + HMDB.name = IS_neg_selection) + hline.data.pos <- + data.frame(z = c(85000, 1000000, 425000, 70000, 180000), + HMDB.name = IS_pos_selection) + hline.data.sum <- + data.frame(z = c(700000, 1250000, 150000, 425000, 300000), + HMDB.name = IS_sum_selection) +} + +# function for ggplot theme +# see bar plots with all IS + +# ggplot functions +IS_neg_selection_barplot <- ggplot(subset(IS_neg, HMDB.name %in% IS_neg_selection), aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Neg)") + + geom_bar(aes(fill=HMDB.name),stat='identity') + + labs(x='',y='Intensity') + + facet_wrap(~HMDB.name, scales='free', ncol = 2) + + if(exists("hline.data.neg")){geom_hline(aes(yintercept = z), subset(hline.data.neg, HMDB.name %in% IS_neg$HMDB.name))} #subset, if some IS have no data, no empty plots will be generated with a line) + +IS_pos_selection_barplot <- ggplot(subset(IS_pos, HMDB.name %in% IS_pos_selection), aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Pos)") + + geom_bar(aes(fill=HMDB.name),stat='identity') + + labs(x='',y='Intensity') + + facet_wrap(~HMDB.name, scales='free', ncol = 2) + + if(exists("hline.data.pos")){geom_hline(aes(yintercept = z), subset(hline.data.pos, HMDB.name %in% IS_pos$HMDB.name))} + +IS_sum_selection_barplot <- ggplot(subset(IS_summed, HMDB.name %in% IS_sum_selection), aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Sum)") + + geom_bar(aes(fill=HMDB.name),stat='identity') + + labs(x='',y='Intensity') + + facet_wrap(~HMDB.name, scales='free', ncol = 2) + + if(exists("hline.data.sum")){geom_hline(aes(yintercept = z), subset(hline.data.sum, HMDB.name %in% IS_summed$HMDB.name))} + +# add theme to ggplot functions +IS_neg_selection_barplot <- theme_IS_bar(IS_neg_selection_barplot) +IS_pos_selection_barplot <- theme_IS_bar(IS_pos_selection_barplot) +IS_sum_selection_barplot <- theme_IS_bar(IS_sum_selection_barplot) + +# save plots to disk +plot_width <- 9 + 0.35 * sample_count +ggsave(paste0(outdir, "/plots/IS_bar_select_neg.png"), plot = IS_neg_selection_barplot, height = plot_width/2.0, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_bar_select_pos.png"), plot = IS_pos_selection_barplot, height = plot_width/2.0, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_bar_select_sum.png"), plot = IS_sum_selection_barplot, height = plot_width/2.0, width = plot_width, units = "in") + +########## +##### line plots with a selection of IS +########## + +# function for ggplot theme +# see line plots with all IS + +# ggplot functions +IS_neg_selection_lineplot <- ggplot(subset(IS_neg, HMDB.name %in% IS_neg_selection), aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Neg)") + + geom_point(aes(col=HMDB.name)) + + geom_line(aes(col=HMDB.name, group=HMDB.name)) + + labs(x='',y='Intensity') + +IS_pos_selection_lineplot <- ggplot(subset(IS_pos, HMDB.name %in% IS_pos_selection), aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Pos)") + + geom_point(aes(col=HMDB.name)) + + geom_line(aes(col=HMDB.name, group=HMDB.name)) + + labs(x='',y='Intensity') + +IS_sum_selection_lineplot <- ggplot(subset(IS_summed, HMDB.name %in% IS_sum_selection), aes(Sample_level, Intensity)) + + ggtitle("Interne Standaard (Sum)") + + geom_point(aes(col=HMDB.name)) + + geom_line(aes(col=HMDB.name, group=HMDB.name)) + + labs(x='',y='Intensity') + +# add theme to ggplot functions +IS_neg_selection_lineplot <- theme_IS_line(IS_neg_selection_lineplot) +IS_pos_selection_lineplot <- theme_IS_line(IS_pos_selection_lineplot) +IS_sum_selection_lineplot <- theme_IS_line(IS_sum_selection_lineplot) + +# save plots to disk +plot_width <- 8 + 0.2 * sample_count +ggsave(paste0(outdir, "/plots/IS_line_select_neg.png"), plot = IS_neg_selection_lineplot, height = plot_width/2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_line_select_pos.png"), plot = IS_pos_selection_lineplot, height = plot_width/2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_line_select_sum.png"), plot = IS_sum_selection_lineplot, height = plot_width/2.5, width = plot_width, units = "in") + + +### POSITIVE CONTROLS CHECK +# these positive controls need to be in the samplesheet, in order to make the Pos_Contr.RData file +# Positive control samples all have the format P1002.x, P1003.x and P1005.x (where x is a number) + +column_list <- colnames(outlist) +patterns <- c("^(P1002\\.)[[:digit:]]+_", "^(P1003\\.)[[:digit:]]+_", "^(P1005\\.)[[:digit:]]+_") +positive_controls_index <- grepl(pattern=paste(patterns, collapse="|"), column_list) +positivecontrol_list <- column_list[positive_controls_index] + +if (z_score == 1) { + # find if one or more positive control samples are missing + pos_contr_warning <- c() + # any() grep because you get a vector of FALSE's and TRUE's. only one grep match is needed for each positive control + if (any(grep("^(P1002\\.)[[:digit:]]+_", positivecontrol_list)) && + any(grep("^(P1003\\.)[[:digit:]]+_", positivecontrol_list)) && + any(grep("^(P1005\\.)[[:digit:]]+_", positivecontrol_list))){ + cat("All three positive controls are present") + } else { + pos_contr_warning <- paste0(c("positive controls list is not complete. Only ", positivecontrol_list, " is/are present"), collapse=" ") + } + # you need all positive control samples, thus starting the script only if all are available + if (length(pos_contr_warning) == 0) { + ### POSITIVE CONTROLS + # make positive control excel with specific HMDB_codes in combination with specific control samples + PA_sample_name <- positivecontrol_list[grepl("P1002", positivecontrol_list)] #P1001.x_Zscore + PKU_sample_name <- positivecontrol_list[grepl("P1003", positivecontrol_list)] #P1003.x_Zscore + LPI_sample_name <- positivecontrol_list[grepl("P1005", positivecontrol_list)] #P1005.x_Zscore + + PA_codes <- c('HMDB00824', 'HMDB00783', 'HMDB00123') + PKU_codes <- c('HMDB00159') + LPI_codes <- c('HMDB00904', 'HMDB00641', 'HMDB00182') + + PA_data <- outlist[PA_codes, c('HMDB_code','name', PA_sample_name)] + PA_data <- reshape2::melt(PA_data, id.vars = c('HMDB_code','name')) + colnames(PA_data) <- c('HMDB.code','HMDB.name','Sample','Zscore') + + PKU_data <- outlist[PKU_codes, c('HMDB_code','name', PKU_sample_name)] + PKU_data <- reshape2::melt(PKU_data, id.vars = c('HMDB_code','name')) + colnames(PKU_data) <- c('HMDB.code','HMDB.name','Sample','Zscore') + + LPI_data <- outlist[LPI_codes, c('HMDB_code','name', LPI_sample_name)] + LPI_data <- reshape2::melt(LPI_data, id.vars = c('HMDB_code','name')) + colnames(LPI_data) <- c('HMDB.code','HMDB.name','Sample','Zscore') + + Pos_Contr <- rbind(PA_data, PKU_data, LPI_data) + Pos_Contr$Zscore <- as.numeric(Pos_Contr$Zscore) + # extra information added to excel for future reference. made in beginning of this script + Pos_Contr$Matrix <- dims_matrix + Pos_Contr$Rundate <- rundate + Pos_Contr$Project <- project + + #Save results + save(Pos_Contr,file = paste0(outdir, "/", project, '_Pos_Contr.RData')) + Pos_Contr$Zscore <- round_df(Pos_Contr$Zscore, 2) # asked by Lab to round the number to 2 digits + write.xlsx(Pos_Contr, file = paste0(outdir, "/", project, '_Pos_Contr.xlsx'), sheetName = "Sheet1", col.names = TRUE, row.names = TRUE, append = FALSE) + + } else { + write.table(pos_contr_warning, file = paste(outdir, "positive_controls_warning.txt", sep = "/"), row.names = FALSE, col.names = FALSE, quote = FALSE) + }} + +### MISSING M/Z CHECK +# check the outlist_identified_(negative/positive).RData files for missing m/z values and mention in the results mail + +# Load the outlist_identified files + remove the loaded files +load(paste0(outdir,"/outlist_identified_negative.RData")) +outlist.ident.neg <- outlist.ident +load(paste0(outdir,"/outlist_identified_positive.RData")) +outlist.ident.pos <- outlist.ident +rm(outlist.ident) +rm(outlist.not.ident) + +# check for missing m/z in negative and positive mode +mode <- c("Negative", "Positive") +index <- 1 +results_ident <- c() #empty results list +outlist_ident_list <- list(outlist.ident.neg, outlist.ident.pos) +for(outlist.ident in outlist_ident_list){ + current_mode <- mode[index] + # retrieve all unique m/z values in whole numbers and check if all are available + mz_values <- as.numeric(unique(format(outlist.ident$mzmed.pgrp, digits=0))) + mz_range = seq(70, 599, by=1) #change accordingly to the machine m/z range. default = 70-600 + mz_missing = c() + for (mz in mz_range){ + if (!mz %in% mz_values) { + mz_missing <- c(mz_missing, mz) + } } + y <- mz_missing + # check if m/z are missing and make an .txt file with information + group_ident <- cumsum(c(1, abs(y[-length(y)] - y[-1]) > 1)) + if(length(group_ident) > 1){ + results_ident <- c(results_ident, paste0("Missing m/z values ", current_mode, " mode")) + results_ident <- c(results_ident, by(y, group_ident, identity)) + } else { + results_ident <- c(results_ident, paste0(current_mode, " mode did not have missing mz values")) + } + index <- index + 1 # change to new mode in for loop +} +lapply(results_ident, write, file=paste(outdir, "missing_mz_warning.txt", sep = "/"), append=TRUE, ncolumns=1000) + +cat("Ready excelExport.R") diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf new file mode 100644 index 0000000..326c47e --- /dev/null +++ b/DIMS/GenerateExcel.nf @@ -0,0 +1,18 @@ +process GenerateExcel { + tag {"DIMS GenerateExcel"} + label 'GenerateExcel' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(collect_file) // input files need to be linked, but called within R script + path(init_filepath) + + output: + path('AdductSums_*.RData') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/GenerateExcel.R $init_filepath $params.analysis_id $params.matrix $params.relevance_file $params.zscore + """ +} diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R new file mode 100644 index 0000000..09503a7 --- /dev/null +++ b/DIMS/HMDBparts_main.R @@ -0,0 +1,64 @@ +#!/usr/bin/Rscript +## adapted from hmdb_part_adductSums.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) cat(" ", arg, "\n") + +db_path <- cmd_args[1] # location of HMDB db file +breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData + +load(db_path) +load(breaks_filepath) + +# Cut up HMDB minus adducts minus isotopes into small parts + +# load(paste(outdir, "breaks.fwhm.RData", sep = "/")) +# outdir <- paste(outdir, "hmdb_part_adductSums", sep = "/") +# dir.create(outdir, showWarnings = FALSE) + +scanmodes <- c("positive", "negative") + +for (scanmode in scanmodes) { + if (scanmode == "negative") { + column_label <- "MNeg" + HMDB_add_iso <- HMDB_add_iso.Neg + } else if (scanmode == "positive") { + column_label <- "Mpos" + HMDB_add_iso <- HMDB_add_iso.Pos + } + + # filter mass range meassured NB: remove the last comma?! + outlist <- HMDB_add_iso[which(HMDB_add_iso[ ,column_label] >= breaks_fwhm[1] & HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] + + # remove adducts and isotopes, put internal standard at the beginning + outlist_IS <- outlist[grep("IS", outlist[ , "CompoundName"], fixed=TRUE), ] + outlist <- outlist[grep("HMDB", rownames(outlist), fixed=TRUE), ] + outlist <- outlist[-grep("_", rownames(outlist), fixed=TRUE), ] + outlist <- rbind(outlist_IS, outlist) + # sort on m/z value + outlist <- outlist[order(outlist[ ,column_label]), ] + + n <- dim(outlist)[1] + # size of hmdb parts in lines: + sub <- 2000 + end <- 0 + check <- 0 + + # generate hmdb parts + if (n >= sub & (floor(n/sub)) >= 2) { + for (i in 1:floor(n/sub)){ + start <- -(sub-1) + i*sub + end <- i*sub + outlist_part <- outlist[c(start:end), ] + save(outlist_part, file=paste0(scanmode, "_hmdb.", i, ".RData")) + } + } +} + +# finish last hmdb part +start = end + 1 +end = n + +outlist_part = outlist[c(start:end),] +save(outlist_part, file=paste0(scanmode, "_hmdb.", i+1, ".RData")) diff --git a/DIMS/HMDBparts_main.nf b/DIMS/HMDBparts_main.nf new file mode 100644 index 0000000..72fedfc --- /dev/null +++ b/DIMS/HMDBparts_main.nf @@ -0,0 +1,20 @@ +process HMDBparts_main { + // Custom process to cut HMDB db into parts for main entry only, no adducts, no isotopes + tag {"DIMS HMDBparts_main"} + label 'HMDBparts_main' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(hmdb_db_file) + path(breaks_file) + + output: + path('*.RData') + + script: + + """ + Rscript ${baseDir}/CustomModules/DIMS/HMDBparts_main.R $hmdb_db_file $breaks_file + """ +} diff --git a/DIMS/SumAdducts.R b/DIMS/SumAdducts.R new file mode 100755 index 0000000..8b08408 --- /dev/null +++ b/DIMS/SumAdducts.R @@ -0,0 +1,122 @@ +#!/usr/bin/Rscript +## adapted from 11-runSumAdducts.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) cat(" ", arg, "\n", sep="") + +# collect_file <- cmd_args[1] +hmdbpart_main_file <- cmd_args[1] +scripts_dir <- cmd_args[2] +z_score <- as.numeric(cmd_args[3]) +# outdir <- cmd_args[2] +# scanmode <- cmd_args[3] +# adducts <- cmd_args[4] + +# for debugging: +print(hmdbpart_main_file) +# NB: scripts_dir not used yet, but function SumAdducts needs to be placed in AddOnFunctions folder +print(scripts_dir) +print(z_score) + +if (grepl("positive_hmdb", hmdbpart_main_file)) { + scanmode <- "positive" + # for the adduct sum: include adducts M+Na (1) and M+K (2) + adducts = c(1,2) +} else { + if (grepl("negative_hmdb", hmdbpart_main_file)) { + scanmode <- "negative" + # for the adduct sum: include adduct M+Cl (1) + adducts <- c(1) + } +} + +# load input files +collect_file <- paste0("outlist_identified_", scanmode, ".RData") +load(collect_file) +repl_file <- paste0(scanmode, "_repl_pattern.RData") +load(repl_file) +outlist_part <- get(load(hmdbpart_main_file)) + +# adducts=as.vector(unlist(strsplit(adducts, ",",fixed = TRUE))) + +# load(paste(outdir, "/outlist_identified_", scanmode, ".RData", sep="")) + +# Local and on HPC +#batch = strsplit(file, "/",fixed = TRUE)[[1]] +#batch = batch[length(batch)] +#batch = strsplit(batch, ".",fixed = TRUE)[[1]][2] +batch_number = strsplit(basename(hmdbpart_main_file), ".",fixed = TRUE)[[1]][2] +print(batch_number) + +outlist.tot <- unique(outlist.ident) + +sumAdducts <- function(peaklist, theor_MZ, grpnames.long, adducts, batch_number, scanmode, outdir, z_score){ + #theor_MZ = outlist_part + #grpnames.long = names(repl.pattern.filtered) + #peaklist = outlist.ident + #adducts = c(1) #for neg or c(1,2) for pos + #batch <- 300 + #outdir <- "/Users/nunen/Documents/Metab/processed/zebrafish" + #scanmode <- "negative" + #z_score <- 0 + + hmdb_codes <- rownames(theor_MZ) + hmdb_names <- theor_MZ[,1, drop=FALSE] + hmdb_names[] <- lapply(hmdb_names, as.character) + + # remove isotopes!!! + index <- grep("HMDB", hmdb_codes, fixed=TRUE) + hmdb_codes <- hmdb_codes[index] + hmdb_names <- hmdb_names[index,] + index = grep("_", rownames(hmdb_codes), fixed=TRUE) + if (length(index) > 0) hmdb_codes <- hmdb_codes[-index] + if (length(index) > 0) hmdb_names <- hmdb_names[-index] + + # negative + names <- NULL + adductsum <- NULL + names_long <- NULL + + if (length(hmdb_codes) != 0) { + + for(i in 1:length(hmdb_codes)){ + #compound="HMDB00045" + compound <- hmdb_codes[i] + compound_plus <- c(compound,paste(compound, adducts, sep = "_")) + + # x=peaklist$HMDB_code[1] + metab <- unlist(lapply(peaklist$HMDB_code, function(x) {(length(intersect(unlist(strsplit(as.vector(x),";")),compound_plus))>0)})) + # peaklist[metab, "assi.hmdb"] + # which(metab==TRUE) + + total <- c() + + # peaklist[metab, c("mzmed.pgrp", "HMDB_code", "C34.1")] + # ints=peaklist[metab, c(7:(length(grpnames.long)+6))] + if (z_score == 1) { + ints <- peaklist[metab, c(15:(length(grpnames.long)+14))] + } else { + ints <- peaklist[metab, c(7:(length(grpnames.long)+6))] + } + total <- apply(ints, 2, sum) + + if (sum(total)!=0) { + #message(i) + names <- c(names, compound) + adductsum <- rbind(adductsum,total) + names_long <- c(names_long, hmdb_names[i]) + } + } + + if (!is.null(adductsum)){ + rownames(adductsum) <- names + adductsum <- cbind(adductsum, "HMDB_name"=names_long) + save(adductsum, file = paste(scanmode, "_", batch_number, "_SummedAdducts.RData", sep="")) + } + } +} + + +sumAdducts(outlist.tot, outlist_part, names(repl_pattern_filtered), adducts, batch_number, scanmode, outdir, z_score) + diff --git a/DIMS/SumAdducts.nf b/DIMS/SumAdducts.nf new file mode 100644 index 0000000..6a7be46 --- /dev/null +++ b/DIMS/SumAdducts.nf @@ -0,0 +1,18 @@ +process SumAdducts { + label 'SumAdducts' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(collect_file) // input files need to be linked, but called within R script + path(replication_pattern) + path(HMDBpart_main_file) + + output: + path('*_SummedAdducts.RData') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/SumAdducts.R $HMDBpart_main_file $params.scripts_dir $params.zscore + """ +} From 03edd3295abca1740371b45ffb3f8dd5ed32ad74 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 20 Oct 2023 17:19:14 +0200 Subject: [PATCH 13/73] modified files and added new files --- DIMS/AssignToBins.R | 1 - DIMS/AssignToBins.nf | 3 +- DIMS/AverageTechReplicates.R | 5 - DIMS/AverageTechReplicates.nf | 9 +- DIMS/CollectFilled.R | 22 +- DIMS/CollectFilled.nf | 6 +- DIMS/CollectSumAdducts.R | 7 +- DIMS/FillMissing.R | 16 +- DIMS/FillMissing.nf | 3 +- DIMS/GenerateBreaks.R | 1 - DIMS/GenerateBreaks.nf | 5 +- DIMS/GenerateExcel.R | 135 ++++++----- DIMS/GenerateExcel.nf | 4 +- DIMS/GenerateViolinPlots.R | 381 +++++++++++++++++++++++++++++++ DIMS/GenerateViolinPlots.nf | 17 ++ DIMS/HMDBparts.R | 18 +- DIMS/HMDBparts.nf | 7 +- DIMS/HMDBparts_main.R | 15 +- DIMS/HMDBparts_main.nf | 2 +- DIMS/MakeInit.nf | 5 +- DIMS/PeakFinding.R | 1 - DIMS/PeakFinding.nf | 3 +- DIMS/PeakGrouping.R | 28 +-- DIMS/PeakGrouping.nf | 3 +- DIMS/SpectrumPeakFinding.R | 3 - DIMS/SpectrumPeakFinding.nf | 8 +- DIMS/SumAdducts.R | 46 +--- DIMS/SumAdducts.nf | 3 +- DIMS/ThermoRawFileParser.nf | 6 +- DIMS/UnidentifiedCalcZscores.R | 67 ++++++ DIMS/UnidentifiedCalcZscores.nf | 18 ++ DIMS/UnidentifiedCollectPeaks.R | 102 +++++++++ DIMS/UnidentifiedCollectPeaks.nf | 20 ++ DIMS/UnidentifiedFillMissing.R | 50 ++++ DIMS/UnidentifiedFillMissing.nf | 18 ++ DIMS/UnidentifiedPeakGrouping.R | 93 ++++++++ DIMS/UnidentifiedPeakGrouping.nf | 19 ++ 37 files changed, 954 insertions(+), 196 deletions(-) create mode 100644 DIMS/GenerateViolinPlots.R create mode 100644 DIMS/GenerateViolinPlots.nf create mode 100755 DIMS/UnidentifiedCalcZscores.R create mode 100644 DIMS/UnidentifiedCalcZscores.nf create mode 100755 DIMS/UnidentifiedCollectPeaks.R create mode 100644 DIMS/UnidentifiedCollectPeaks.nf create mode 100755 DIMS/UnidentifiedFillMissing.R create mode 100644 DIMS/UnidentifiedFillMissing.nf create mode 100755 DIMS/UnidentifiedPeakGrouping.R create mode 100644 DIMS/UnidentifiedPeakGrouping.nf diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index 4e320d3..ae9c8b1 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -6,7 +6,6 @@ suppressPackageStartupMessages(library("xcms")) # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) cat(" ", arg, "\n", sep="") filepath <- cmd_args[1] # location of mzML file breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf index c7103cd..fbbb48e 100644 --- a/DIMS/AssignToBins.nf +++ b/DIMS/AssignToBins.nf @@ -1,10 +1,11 @@ process AssignToBins { + tag "DIMS AssignToBins ${file_id}" label 'AssignToBins' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] input: - tuple(file_id, path(mzML_filename), path(breaks_file)) + tuple val(file_id), path(mzML_filename) , path(breaks_file) output: path("${file_id}.RData") diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index b880f13..4001937 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -3,7 +3,6 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) cat(" ", arg, "\n", sep="") init_filepath <- cmd_args[1] nr_replicates <- as.numeric(cmd_args[2]) @@ -12,16 +11,12 @@ dimsThresh <- 100 removeFromRepl.pat <- function(bad_samples, repl_pattern, nr_replicates) { # bad_samples=remove_pos - tmp <- repl_pattern - removeFromGroup <- NULL for (i in 1:length(tmp)){ tmp2 <- repl_pattern[[i]] - remove <- NULL - for (j in 1:length(tmp2)){ if (tmp2[j] %in% bad_samples){ remove = c(remove, j) diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf index 26f9211..33567a3 100644 --- a/DIMS/AverageTechReplicates.nf +++ b/DIMS/AverageTechReplicates.nf @@ -1,4 +1,5 @@ process AverageTechReplicates { + tag "DIMS AverageTechReplicates" label 'AverageTechReplicates' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] @@ -8,10 +9,10 @@ process AverageTechReplicates { path(init_filepath) output: - path '*_repl_pattern.RData', emit: patterns - path '*_avg.RData', emit: binned - path 'miss_infusions_negative.txt' - path 'miss_infusions_positive.txt' + path('*_repl_pattern.RData'), emit: patterns + path('*_avg.RData'), emit: binned + path('miss_infusions_negative.txt') + path('miss_infusions_positive.txt') script: """ diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R index 62be430..e927a65 100755 --- a/DIMS/CollectFilled.R +++ b/DIMS/CollectFilled.R @@ -3,7 +3,6 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) cat(" ", arg, "\n") scripts_dir <- cmd_args[1] ppm <- as.numeric(cmd_args[2]) @@ -23,7 +22,6 @@ for (scanmode in scanmodes) { outlist.tot <- NULL for (i in 1:length(filled_files)) { load(filled_files[i]) - print(filled_files[i]) outlist.tot <- rbind(outlist.tot, peakgrouplist_filled) } @@ -44,7 +42,7 @@ for (scanmode in scanmodes) { if (z_score == 1) { outlist.stats <- statistics_z(outlist.tot, sortCol=NULL, adducts=FALSE) - nr.removed.samples <- length(which(repl_pattern[] == "character(0)")) + nr.removed.samples <- length(which(repl_pattern[]=="character(0)")) order.index.int <- order(colnames(outlist.stats)[8:(length(repl_pattern)-nr.removed.samples+7)]) outlist.stats.more <- cbind(outlist.stats[,1:7], outlist.stats[,(length(repl_pattern)-nr.removed.samples+8):(length(repl_pattern)-nr.removed.samples+8+6)], @@ -59,25 +57,23 @@ for (scanmode in scanmodes) { outlist.tot <- outlist.stats.more } - # filter identified compounds - index.1 <- which((outlist.tot[,"assi_HMDB"]!="") & (!is.na(outlist.tot[,"assi_HMDB"]))) - index.2 <- which((outlist.tot[,"iso_HMDB"]!="") & (!is.na(outlist.tot[,"iso_HMDB"]))) - index <- union(index.1,index.2) - outlist.ident <- outlist.tot[index,] - outlist.not.ident <- outlist.tot[-index,] + outlist.ident <- outlist.tot if (z_score == 1) { outlist.ident$ppmdev <- as.numeric(outlist.ident$ppmdev) outlist.ident <- outlist.ident[which(outlist.ident["ppmdev"] >= -ppm & outlist.ident["ppmdev"] <= ppm),] } - # NAs in theormz_noise + # take care of NAs in theormz_noise outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] <- 0 outlist.ident$theormz_noise <- as.numeric(outlist.ident$theormz_noise) outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] <- 0 outlist.ident$theormz_noise <- as.numeric(outlist.ident$theormz_noise) - # save output for identified peak groups (not.identified later) - # save(outlist.not.ident, file="./outlist_not_identified_", scanmode, ".RData") - save(outlist.ident, file=paste0("./outlist_identified_", scanmode, ".RData")) + # Extra output in Excel-readable format: + remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp") + remove_colindex <- which(colnames(outlist.ident) %in% remove_columns) + outlist.ident <- outlist.ident[ , -remove_colindex] + write.table(outlist.ident, file=paste0("outlist_identified_", scanmode, ".txt"), sep="\t", row.names = FALSE) + save(outlist.ident, file=paste0("outlist_identified_", scanmode, ".RData")) } diff --git a/DIMS/CollectFilled.nf b/DIMS/CollectFilled.nf index c28516c..a7819f4 100644 --- a/DIMS/CollectFilled.nf +++ b/DIMS/CollectFilled.nf @@ -1,14 +1,16 @@ process CollectFilled { + tag "DIMS CollectFilled" label 'CollectFilled' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] input: path(filled_files) - path(replication_pattern) + path(replication_pattern) // input files need to be linked, but called within R script output: - path('outlist*.RData') + path('outlist*.txt') + path('outlist*.RData'), emit: filled_pgrlist script: """ diff --git a/DIMS/CollectSumAdducts.R b/DIMS/CollectSumAdducts.R index ae89cbc..e935eb8 100755 --- a/DIMS/CollectSumAdducts.R +++ b/DIMS/CollectSumAdducts.R @@ -3,11 +3,8 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) cat(" ", arg, "\n", sep="") -# outdir <- cmd_args[1] -# scanmode <- cmd_args[2] -scanmode <- c("positive", "negative") +scanmodes <- c("positive", "negative") for (scanmode in scanmodes) { # collect all AdductSums part files for each scanmode @@ -20,6 +17,6 @@ for (scanmode in scanmodes) { } # save output file - save(outlist.tot, file="/AdductSums_", scanmode, ".RData") + save(outlist.tot, file=paste0("AdductSums_", scanmode, ".RData")) } diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R index 127c557..a2e29ff 100755 --- a/DIMS/FillMissing.R +++ b/DIMS/FillMissing.R @@ -13,9 +13,14 @@ resol <- as.numeric(cmd_args[4]) ppm <- as.numeric(cmd_args[5]) outdir <- "./" +print(peakgrouplist_file) + if (grepl("_pos", peakgrouplist_file)) { scanmode = "positive" } else if (grepl("_neg", peakgrouplist_file)) { scanmode = "negative" } +print(scanmode) +print(scripts_dir) + # load in function scripts source(paste0(scripts_dir, "AddOnFunctions/replaceZeros.R")) source(paste0(scripts_dir, "AddOnFunctions/generateGaussian.R")) @@ -31,12 +36,19 @@ source(paste0(scripts_dir, "AddOnFunctions/globalAssignments.HPC.R")) pattern_file <- paste0(scanmode, "_repl_pattern.RData") repl_pattern <- get(load(pattern_file)) +print(head(repl_pattern)) + # load peak group list and determine output file name -load(peakgrouplist_file) +outpgrlist_identified <- get(load(peakgrouplist_file)) + +print(head(outpgrlist_identified)) + outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) +print(outputfile_name) + # replace missing values (zeros) with random noise -peakgrouplist_filled <- replaceZeros(outpgrlist.identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) +peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) # save output save(peakgrouplist_filled, file=paste0("./", outputfile_name)) diff --git a/DIMS/FillMissing.nf b/DIMS/FillMissing.nf index 4749b87..3a1f61b 100644 --- a/DIMS/FillMissing.nf +++ b/DIMS/FillMissing.nf @@ -1,11 +1,12 @@ process FillMissing { + tag "DIMS FillMissing ${GroupedList_file}" label 'FillMissing' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] input: path(GroupedList_file) - path(replication_pattern) + path(replication_pattern) // input files need to be linked, but called within R script output: path('*_filled.RData') diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index 38161fb..7aeb8c4 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -7,7 +7,6 @@ suppressPackageStartupMessages(library("xcms")) # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) cat(" ", arg, "\n", sep="") filepath <- cmd_args[1] # 1 of the mzML files outdir <- cmd_args[2] diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf index f60d5cc..593014b 100644 --- a/DIMS/GenerateBreaks.nf +++ b/DIMS/GenerateBreaks.nf @@ -1,14 +1,15 @@ process GenerateBreaks { + tag "DIMS GenerateBreaks" label 'GenerateBreaks' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] input: - tuple(file_id, path(mzML_file)) + tuple val(file_id), path(mzML_file) output: - path 'breaks.fwhm.RData' + path('breaks.fwhm.RData') script: """ diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index 7958f95..a84c9d2 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -3,7 +3,6 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) cat(" ", arg, "\n", sep = "") # outdir <- cmd_args[1] #"/Users/nunen/Documents/Metab/test_set" init_filepath <- cmd_args[1] @@ -21,13 +20,15 @@ suppressMessages(library("dplyr")) suppressMessages(library("stringr")) # Initialise and load data -outdir <- "../../../" plot <- TRUE export <- TRUE control_label <- "C" case_label <- "P" imagesize_multiplier <- 2 -# init <- paste0(outdir, "/logs/init.RData") +# setting outdir to export files to the project directory +outdir <- "../../../" +# percentage of outliers to remove from calculation of robust scaler +perc <- 5 # load information on samples load(init_filepath) @@ -43,15 +44,11 @@ dir.create(plotdir, showWarnings = F) options(digits=16) -# setwd(outdir) - # sum positive and negative adductsums - # Load pos and neg adduct sums -load("/AdductSums_negative.RData") +load("AdductSums_negative.RData") outlist.neg.adducts.HMDB <- outlist.tot - -load("/AdductSums_positive.RData") +load("AdductSums_positive.RData") outlist.pos.adducts.HMDB <- outlist.tot rm(outlist.tot) @@ -113,7 +110,6 @@ if (z_score == 1) { # Statistics: Z-score outlist <- cbind(plots = NA, outlist) #outlist <- as.data.frame(outlist) - startcol <- dim(outlist)[2] + 3 # Get columns with control intensities @@ -123,12 +119,14 @@ if (z_score == 1) { # Get columns with patient intensities patient_col_ids <- grep(case_label, colnames(outlist), fixed = TRUE) patient_columns <- outlist[, patient_col_ids] - intensity_col_ids <- c(control_col_ids, patient_col_ids) # set intensities of 0 to NA? outlist[,intensity_col_ids][outlist[,intensity_col_ids] == 0] <- NA - + + # Extra output: save outlist as it is and use it to calculate robust scaler + outlist.noZ <- outlist + # calculate mean and sd for Control group outlist$avg.ctrls <- apply(control_columns, 1, function(x) mean(as.numeric(x),na.rm = TRUE)) outlist$sd.ctrls <- apply(control_columns, 1, function(x) sd(as.numeric(x),na.rm = TRUE)) @@ -142,13 +140,47 @@ if (z_score == 1) { outlist <- cbind(outlist, zscores.1col) } colnames(outlist)[startcol:ncol(outlist)] <- cnames.z - + + # Extra output: calculate robust scaler (Zscores minus outliers in Controls) + # calculate mean and sd for Control group without outliers + outlist.noZ$avg.ctrls <- 0 + outlist.noZ$sd.ctrls <- 0 + + robust_scaler <- function(control_intensities, control_col_ids, perc=5) { + nr_toremove <- ceiling(length(control_col_ids)*perc/100) + sorted_control_intensities <- sort(as.numeric(control_intensities)) + trimmed_control_intensities <- sorted_control_intensities[(nr_toremove+1):(length(sorted_control_intensities)-nr_toremove)] + return(trimmed_control_intensities) + } + + # only calculate robust Z-scores if there are enough Controls + if (length(control_col_ids) > 10) { + for (metabolite_index in 1:nrow(outlist)) { + outlist.noZ$avg.ctrls[metabolite_index] <- mean(robust_scaler(outlist.noZ[metabolite_index, control_col_ids], control_col_ids, perc)) + outlist.noZ$sd.ctrls[metabolite_index] <- sd(robust_scaler(outlist.noZ[metabolite_index, control_col_ids], control_col_ids, perc)) + } + } + + # Make and add columns with robust zscores + cnames.robust <- gsub("_Zscore", "_RobustZscore", cnames.z) + for (i in intensity_col_ids) { + zscores.1col <- (as.numeric(as.vector(unlist(outlist.noZ[ , i]))) - outlist.noZ$avg.ctrls) / outlist.noZ$sd.ctrls + outlist.noZ <- cbind(outlist.noZ, zscores.1col) + } + colnames(outlist.noZ)[startcol:ncol(outlist.noZ)] <- cnames.robust + + # Extra output: metabolites filtered on relevance + save(outlist, file=paste0("AdductSums_filtered_Zscores.RData")) + write.table(outlist, file=paste0("AdductSums_filtered_Zscores.txt"), sep="\t", row.names = FALSE) + # Extra output: filtered metabolites with robust scaled Zscores + write.table(outlist.noZ, file=paste0("AdductSums_filtered_robustZ.txt"), sep="\t", row.names = FALSE) + + # get the IDs of the patients patient_ids <- unique(as.vector(unlist(lapply(strsplit(colnames(patient_columns), ".", fixed = TRUE), function(x) x[1])))) patient_ids <- patient_ids[order(nchar(patient_ids), patient_ids)] # sorts - temp_png <- NULL - # Iterate over every row, make boxplot, insert into excel, and make Zscore for every patient + temp_png <- NULL for (p in 1:nrow(outlist)) { # box plot hmdb_name <- rownames(outlist[p,]) @@ -224,18 +256,18 @@ IS <- outlist[grep("Internal standard", outlist[,"relevance"], fixed = TRUE),] IS_codes <- rownames(IS) cat(IS_codes,"\n") -# if all data from one samplename (for example P195.1) is filtered out in 3-averageTechReplicates because of too little data (threshold parameter) the init.RData (repl.pattern) will contain more sample_names then the peak data (IS), +# if all data from one samplename (for example P195.1) is filtered out in 3-averageTechReplicates because of too little data (threshold parameter) the init.RData (repl_pattern) will contain more sample_names then the peak data (IS), # thus this data needs to be removed first, before the retrieval of the summed adducts. Write sample_names to a log file, to let user know that this sample_name contained no data. -sample_names_nodata <- setdiff(names(repl.pattern),names(IS)) +sample_names_nodata <- setdiff(names(repl_pattern),names(IS)) if (!is.null(sample_names_nodata)) { write.table(sample_names_nodata, file = paste(outdir, "sample_names_nodata.txt", sep = "/"), row.names = FALSE, col.names = FALSE, quote = FALSE) cat(sample_names_nodata,"\n") for (sample_name in sample_names_nodata) { - repl.pattern[[sample_name]] <- NULL + repl_pattern[[sample_name]] <- NULL }} # Retrieve IS summed adducts -IS_summed <- IS[c(names(repl.pattern), "HMDB_code")] +IS_summed <- IS[c(names(repl_pattern), "HMDB_code")] IS_summed$HMDB.name <- IS$name IS_summed <- reshape2::melt(IS_summed, id.vars=c('HMDB_code','HMDB.name')) colnames(IS_summed) <- c('HMDB.code','HMDB.name','Sample','Intensity') @@ -268,10 +300,10 @@ IS_neg$Project <- project IS_neg$Intensity <- as.numeric(as.character(IS_neg$Intensity)) # Save results -save(IS_pos,IS_neg,IS_summed, file = paste0(outdir, "/", project, '_IS_results.RData')) +save(IS_pos, IS_neg, IS_summed, file = paste0(outdir, "/", project, '_IS_results.RData')) # number of samples, for plotting length and width -sample_count <- length(repl.pattern) +sample_count <- length(repl_pattern) # change the order of the x-axis summed plots to a natural sorted one Sample_naturalorder <- unique(as.character(IS_summed$Sample)) @@ -280,9 +312,7 @@ IS_summed$Sample_level <- factor(IS_summed$Sample, levels = c(Sample_naturalorde IS_pos$Sample_level <- factor(IS_pos$Sample, levels = c(Sample_naturalorder)) IS_neg$Sample_level <- factor(IS_neg$Sample, levels = c(Sample_naturalorder)) -########## -##### bar plots with all IS -########## +## bar plots with all IS # function for ggplot theme # theme for all IS bar plots @@ -290,28 +320,28 @@ theme_IS_bar <- function(myPlot) { myPlot + scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) + theme( - legend.position='none', - axis.text.x=element_text(angle = 90, hjust = 1, vjust = 0.5, size=6), - axis.text.y=element_text(size=6)) + legend.position = 'none', + axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size=6), + axis.text.y = element_text(size=6)) } # ggplot functions IS_neg_bar_plot <- ggplot(IS_neg, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Neg)") + - geom_bar(aes(fill=HMDB.name),stat='identity') + - labs(x='',y='Intensity') + + geom_bar(aes(fill=HMDB.name), stat='identity') + + labs(x='', y='Intensity') + facet_wrap(~HMDB.name, scales='free_y') IS_pos_bar_plot <- ggplot(IS_pos, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Pos)") + - geom_bar(aes(fill=HMDB.name),stat='identity') + - labs(x='',y='Intensity') + + geom_bar(aes(fill=HMDB.name), stat='identity') + + labs(x='', y='Intensity') + facet_wrap(~HMDB.name, scales='free_y') IS_sum_bar_plot <- ggplot(IS_summed, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Summed)") + - geom_bar(aes(fill=HMDB.name),stat='identity') + - labs(x='',y='Intensity') + + geom_bar(aes(fill=HMDB.name), stat='identity') + + labs(x='', y='Intensity') + facet_wrap(~HMDB.name, scales='free_y') # add theme to ggplot functions @@ -325,9 +355,7 @@ ggsave(paste0(outdir, "/plots/IS_bar_all_neg.png"), plot=IS_neg_bar_plot, height ggsave(paste0(outdir, "/plots/IS_bar_all_pos.png"), plot=IS_pos_bar_plot, height=plot_width/2.5, width=plot_width, units="in") ggsave(paste0(outdir, "/plots/IS_bar_all_sum.png"), plot=IS_sum_bar_plot, height=plot_width/2.5, width=plot_width, units="in") -########## -##### Line plots with all IS -########## +## Line plots with all IS # function for ggplot theme # add smaller legend in the "all IS line plots", otherwise out-of-range when more than 13 IS lines @@ -348,7 +376,7 @@ IS_neg_line_plot <- ggplot(IS_neg, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Neg)") + geom_point(aes(col=HMDB.name)) + geom_line(aes(col=HMDB.name, group=HMDB.name)) + - labs(x='',y='Intensity') + labs(x='', y='Intensity') IS_pos_line_plot <- ggplot(IS_pos, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Pos)") + @@ -373,9 +401,7 @@ ggsave(paste0(outdir,"/plots/IS_line_all_neg.png"), plot = IS_neg_line_plot, hei ggsave(paste0(outdir,"/plots/IS_line_all_pos.png"), plot = IS_pos_line_plot, height = plot_width/2.5, width = plot_width, units = "in") ggsave(paste0(outdir,"/plots/IS_line_all_sum.png"), plot = IS_sum_line_plot, height = plot_width/2.5, width = plot_width, units = "in") -########## -##### bar plots with a selection of IS -########## +## bar plots with a selection of IS IS_neg_selection <- c('2H2-Ornithine (IS)', '2H3-Glutamate (IS)', '2H2-Citrulline (IS)', '2H4_13C5-Arginine (IS)', '13C6-Tyrosine (IS)') IS_pos_selection <- c('2H4-Alanine (IS)', '13C6-Phenylalanine (IS)', '2H4_13C5-Arginine (IS)', '2H3-Propionylcarnitine (IS)', '2H9-Isovalerylcarnitine (IS)') IS_sum_selection <- c('2H8-Valine (IS)', '2H3-Leucine (IS)', '2H3-Glutamate (IS)', '2H4_13C5-Arginine (IS)', '13C6-Tyrosine (IS)') @@ -409,22 +435,22 @@ if (dims_matrix == "DBS"){ # ggplot functions IS_neg_selection_barplot <- ggplot(subset(IS_neg, HMDB.name %in% IS_neg_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Neg)") + - geom_bar(aes(fill=HMDB.name),stat='identity') + - labs(x='',y='Intensity') + + geom_bar(aes(fill=HMDB.name), stat='identity') + + labs(x='', y='Intensity') + facet_wrap(~HMDB.name, scales='free', ncol = 2) + if(exists("hline.data.neg")){geom_hline(aes(yintercept = z), subset(hline.data.neg, HMDB.name %in% IS_neg$HMDB.name))} #subset, if some IS have no data, no empty plots will be generated with a line) IS_pos_selection_barplot <- ggplot(subset(IS_pos, HMDB.name %in% IS_pos_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Pos)") + - geom_bar(aes(fill=HMDB.name),stat='identity') + - labs(x='',y='Intensity') + + geom_bar(aes(fill=HMDB.name), stat='identity') + + labs(x='', y='Intensity') + facet_wrap(~HMDB.name, scales='free', ncol = 2) + if(exists("hline.data.pos")){geom_hline(aes(yintercept = z), subset(hline.data.pos, HMDB.name %in% IS_pos$HMDB.name))} IS_sum_selection_barplot <- ggplot(subset(IS_summed, HMDB.name %in% IS_sum_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Sum)") + - geom_bar(aes(fill=HMDB.name),stat='identity') + - labs(x='',y='Intensity') + + geom_bar(aes(fill=HMDB.name), stat='identity') + + labs(x='', y='Intensity') + facet_wrap(~HMDB.name, scales='free', ncol = 2) + if(exists("hline.data.sum")){geom_hline(aes(yintercept = z), subset(hline.data.sum, HMDB.name %in% IS_summed$HMDB.name))} @@ -439,9 +465,7 @@ ggsave(paste0(outdir, "/plots/IS_bar_select_neg.png"), plot = IS_neg_selection_b ggsave(paste0(outdir, "/plots/IS_bar_select_pos.png"), plot = IS_pos_selection_barplot, height = plot_width/2.0, width = plot_width, units = "in") ggsave(paste0(outdir, "/plots/IS_bar_select_sum.png"), plot = IS_sum_selection_barplot, height = plot_width/2.0, width = plot_width, units = "in") -########## -##### line plots with a selection of IS -########## +## line plots with a selection of IS # function for ggplot theme # see line plots with all IS @@ -451,19 +475,19 @@ IS_neg_selection_lineplot <- ggplot(subset(IS_neg, HMDB.name %in% IS_neg_selecti ggtitle("Interne Standaard (Neg)") + geom_point(aes(col=HMDB.name)) + geom_line(aes(col=HMDB.name, group=HMDB.name)) + - labs(x='',y='Intensity') + labs(x='', y='Intensity') IS_pos_selection_lineplot <- ggplot(subset(IS_pos, HMDB.name %in% IS_pos_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Pos)") + geom_point(aes(col=HMDB.name)) + geom_line(aes(col=HMDB.name, group=HMDB.name)) + - labs(x='',y='Intensity') + labs(x='', y='Intensity') IS_sum_selection_lineplot <- ggplot(subset(IS_summed, HMDB.name %in% IS_sum_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Sum)") + geom_point(aes(col=HMDB.name)) + geom_line(aes(col=HMDB.name, group=HMDB.name)) + - labs(x='',y='Intensity') + labs(x='', y='Intensity') # add theme to ggplot functions IS_neg_selection_lineplot <- theme_IS_line(IS_neg_selection_lineplot) @@ -539,15 +563,14 @@ if (z_score == 1) { ### MISSING M/Z CHECK # check the outlist_identified_(negative/positive).RData files for missing m/z values and mention in the results mail - +print("Nu in missing m/z check") # Load the outlist_identified files + remove the loaded files -load(paste0(outdir,"/outlist_identified_negative.RData")) +load(paste0(outdir, "RData/outlist_identified_negative.RData")) outlist.ident.neg <- outlist.ident -load(paste0(outdir,"/outlist_identified_positive.RData")) +load(paste0(outdir, "RData/outlist_identified_positive.RData")) outlist.ident.pos <- outlist.ident rm(outlist.ident) rm(outlist.not.ident) - # check for missing m/z in negative and positive mode mode <- c("Negative", "Positive") index <- 1 diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf index 326c47e..655ce0d 100644 --- a/DIMS/GenerateExcel.nf +++ b/DIMS/GenerateExcel.nf @@ -7,12 +7,14 @@ process GenerateExcel { input: path(collect_file) // input files need to be linked, but called within R script path(init_filepath) + val(analysis_id) + path(relevance_file) output: path('AdductSums_*.RData') script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateExcel.R $init_filepath $params.analysis_id $params.matrix $params.relevance_file $params.zscore + Rscript ${baseDir}/CustomModules/DIMS/GenerateExcel.R $init_filepath $analysis_id $params.matrix $relevance_file $params.zscore """ } diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R new file mode 100644 index 0000000..bf2abe8 --- /dev/null +++ b/DIMS/GenerateViolinPlots.R @@ -0,0 +1,381 @@ +# For untargeted metabolomics, this tool calculates probability scores for +# metabolic disorders. In addition, it provides visual support with violin plots +# of the DIMS measurements for the lab specialists. +# Input needed: +# 1. Excel file in which metabolites are listed with their intensities for +# controls (with C in samplename) and patients (with P in samplename) and their +# corresponding Z-scores. +# 2. All files from github: https://github.com/UMCUGenetics/dIEM + +library(dplyr) # tidytable is for other_isobaric.R (left_join) +library(reshape2) # used in prepare_data.R +library(data.table) # for function setDT +library(openxlsx) # for opening Excel file +library(ggplot2) # for plotting +#library(gridExtra) # for table top highest/lowest + +# define parameters - check after addition to run.sh +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) { + cat(" ", arg, "\n", sep = "") +} + +scripts_dir <- cmd_args[1] +run_name <- cmd_args[2] # run_name <- "Test_run_5" +z_score <- as.numeric(cmd_args[3]) # calculate Z-scores or not? z_score <- 1 + +# load functions +source(paste0(scripts_dir, "AddOnFunctions/same_samplename.R")) +source(paste0(scripts_dir, "AddOnFunctions/prepare_data.R")) +source(paste0(scripts_dir, "AddOnFunctions/prepare_data_perpage.R")) +source(paste0(scripts_dir, "AddOnFunctions/prepare_toplist.R")) +source(paste0(scripts_dir, "AddOnFunctions/violin_plots.R")) +source(paste0(scripts_dir, "AddOnFunctions/prepare_alarmvalues.R")) + +# The list of parameters can be shortened for HPC. Leave for now. +top_nr_IEM <- 5 # number of diseases that score highest in algorithm to plot +integer_list <- c(1:top_nr_IEM) # indices of top diseases +threshold_IEM <- 5 # probability score cut-off for plotting the top diseases +ratios_cutoff <- -5 # z-score cutoff of axis on the left for top diseases +nr_plots_perpage <- 20 # number of violin plots per page in PDF + +# Settings from config.R +# binary variable: run function, yes(1) or no(0). Can be removed at later stage +if (z_score == 1) { algorithm <- ratios <- violin <- 1 } # ? Or put if statement in run.sh? +# integer: are the sample names headers on row 1 or row 2 in the DIMS excel? (default 1) +header_row = 1 +# column name where the data starts (default B) +col_start <- "B" +zscore_cutoff <- 5 +xaxis_cutoff <- 20 + +# path to DIMS excel file +# path_DIMSfile = "/Users/ihoek3/Documents/dIEM/voorbeeld_PL_Diagn17_RUN10.xlsx" +path_DIMSfile <- paste0("./", run_name, ".xlsx") + +# path: output folder +output_dir <- paste0("./dIEM") +dir.create(output_dir, showWarnings = F) + +# folder in which all metabolite lists are (.txt) +path_metabolite_groups <- "/hpc/dbg_mz/tools/db/metabolite_groups" +### file for ratios step 3 +file_ratios_metabolites <- "/hpc/dbg_mz/tools/db/dIEM/Ratios_between_metabolites.csv" +### file for algorithm step 4 +file_expected_biomarkers_IEM <- "/hpc/dbg_mz/tools/db/dIEM/Expected_biomarkers_IEM.csv" + + +#### STEP 1: Preparation #### +# in: run_name, path_DIMSfile, header_row ||| out: output_dir, DIMS + +# Load the excel file. +dims_xls <- readWorkbook(xlsxFile = path_DIMSfile, sheet = 1, startRow = header_row) +if (exists("dims_xls")) { + cat(paste0("\nThe excel file is succesfully loaded:\n -> ",path_DIMSfile)) +} else { + cat(paste0("\n\n**** Error: Could not find an excel file. Please check if path to excel file is correct in config.R:\n -> ",path_DIMSfile,"\n")) +} + +#### STEP 2: Edit DIMS data ##### +# in: dims_xls ||| out: Data, nr_contr, nr_pat +# Input: the xlsx file that comes out of the pipeline with format: +# [plots] [C] [P] [summary columns] [C_Zscore] [P_Zscore] +# Output: "_CSV.csv" file that is suited for the algorithm in shiny. + +# Determine the number of Contols and Patients in column names: +nr_contr <- length(grep("C",names(dims_xls)))/2 # Number of control samples +nr_pat <- length(grep("P",names(dims_xls)))/2 # Number of patient samples +# total number of samples +nrsamples <- nr_contr + nr_pat +# check whether the number of intensity columns equals the number of Zscore columns +if (nr_contr + nr_pat != length(grep("_Zscore", names(dims_xls)))) { + cat("\n**** Error: there aren't as many intensities listed as Zscores") +} +cat(paste0("\n\n------------\n", nr_contr, " controls \n", nr_pat, " patients\n------------\n\n")) + +# Move the columns HMDB_code and HMDB_name to the beginning. +HMDB_info_cols <- c(which(colnames(dims_xls) == "HMDB_code"), which(colnames(dims_xls) == "HMDB_name")) +other_cols <- seq_along(1:ncol(dims_xls))[-HMDB_info_cols] +dims_xls_copy <- dims_xls[ , c(HMDB_info_cols, other_cols)] +# Remove the columns from 'name' to 'pathway' +from_col <- which(colnames(dims_xls_copy) == "name") +to_col <- which(colnames(dims_xls_copy) == "pathway") +dims_xls_copy <- dims_xls_copy[ , -c(from_col:to_col)] +# in case the excel had an empty "plots" column, remove it +if ("plots" %in% colnames(dims_xls_copy)) { + dims_xls_copy <- dims_xls_copy[ , -grep("plots", colnames(dims_xls_copy))] +} +# Rename columns +names(dims_xls_copy) <- gsub("avg.ctrls", "Mean_controls", names(dims_xls_copy)) +names(dims_xls_copy) <- gsub("sd.ctrls", "SD_controls", names(dims_xls_copy)) +names(dims_xls_copy) <- gsub("HMDB_code", "HMDB.code", names(dims_xls_copy)) +names(dims_xls_copy) <- gsub("HMDB_name", "HMDB.name", names(dims_xls_copy)) + +# intensity columns and mean and standard deviation of controls +numeric_cols <- c(3:ncol(dims_xls_copy)) +# make sure all values are numeric +dims_xls_copy[ , numeric_cols] <- sapply(dims_xls_copy[ , numeric_cols], as.numeric) + +if (exists("dims_xls_copy") & (length(dims_xls_copy) < length(dims_xls))) { + cat("\n### Step 2 # Edit dims data is done.\n") +} else { + cat("\n**** Error: Could not execute step 2 \n") +} + +#### STEP 3: Calculate ratios of intensities for metabolites #### +# in: ratios, file_ratios_metabolites, dims_xls_copy, nr_contr, nr_pat ||| out: Zscore (+file) +# This script loads the file with Ratios (file_ratios_metabolites) and calculates +# the ratios of the intensities of the given metabolites. It also calculates +# Zs-cores based on the avg and sd of the ratios of the controls. + +# Input: dataframe with intenstities and Zscores of controls and patients: +# [HMDB.code] [HMDB.name] [C] [P] [Mean_controls] [SD_controls] [C_Zscore] [P_Zscore] + +# Output: "_CSV.csv" file that is suited for the algorithm, with format: +# "_Ratios_CSV.csv" file, same file as above, but with ratio rows added. + +if (ratios == 1) { + cat(paste0("\nloading ratios file:\n -> ", file_ratios_metabolites, "\n")) + ratio_input <- read.csv(file_ratios_metabolites, sep=';', stringsAsFactors=FALSE) + + # Prepare empty data frame to fill with ratios + ratio_list <- setNames(data.frame(matrix( + ncol=ncol(dims_xls_copy), + nrow=nrow(ratio_input) + )), colnames(dims_xls_copy)) + + # put HMDB info into first two columns of ratio_list + ratio_list[ ,1:2] <- ratio_input[ ,1:2] + + # look for intensity columns (exclude Zscore columns) + control_cols <- grep("C", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) + patient_cols <- grep("P", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) + intensity_cols <- c(control_cols, patient_cols) + # calculate each of the ratios of intensities + for (ratio_index in 1:nrow(ratio_input)) { + ratio_numerator <- ratio_input[ratio_index, "HMDB_numerator"] + ratio_numerator <- strsplit(ratio_numerator, "plus")[[1]] + ratio_denominator <- ratio_input[ratio_index, "HMDB_denominator"] + ratio_denominator <- strsplit(ratio_denominator, "plus")[[1]] + # find these HMDB IDs in dataset. Could be a sum of multiple metabolites + sel_denominator <- sel_numerator <- c() + for (numerator_index in 1:length(ratio_numerator)) { + sel_numerator <- c(sel_numerator, which(dims_xls_copy[ , "HMDB.code"] == ratio_numerator[numerator_index])) + } + for (denominator_index in 1:length(ratio_denominator)) { + sel_denominator <- c(sel_denominator, which(dims_xls_copy[ , "HMDB.code"] == ratio_denominator[denominator_index])) + } + # calculate ratio + ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) / + apply(dims_xls_copy[sel_denominator, intensity_cols], 2, sum) + # calculate log of ratio + ratio_list[ratio_index, intensity_cols]<- log2(ratio_list[ratio_index, intensity_cols]) + } + + # Calculate means and SD's of the calculated ratios for Controls + ratio_list[ , "Mean_controls"] <- apply(ratio_list[ , control_cols], 1, mean) + ratio_list[ , "SD_controls"] <- apply(ratio_list[ , control_cols], 1, sd) + + # Calc z-scores with the means and SD's of Controls + zscore_cols <- grep("Zscore", colnames(ratio_list)) + for (sample_index in 1:length(zscore_cols)) { + zscore_col <- zscore_cols[sample_index] + # matching intensity column + int_col <- intensity_cols[sample_index] + # add test on column names + if (same_samplename(colnames(ratio_list)[int_col], colnames(ratio_list)[zscore_col])) { + # calculate Z-scores + ratio_list[ , zscore_col] <- (ratio_list[ , int_col] - ratio_list[ , "Mean_controls"]) / ratio_list[ , "SD_controls"] + } + } + + # Add rows of the ratio hmdb codes to the data of zscores from the pipeline. + dims_xls_ratios <- rbind(ratio_list, dims_xls_copy) + + # Edit the DIMS output Zscores of all patients in format: + # HMDB_code patientname1 patientname2 + names(dims_xls_ratios) <- gsub("HMDB.code","HMDB_code", names(dims_xls_ratios)) + names(dims_xls_ratios) <- gsub("HMDB.name", "HMDB_name", names(dims_xls_ratios)) + # remove the string "_Zscore" from column names; this causes problems in step 4 + # names(dims_xls_ratios) <- gsub("_Zscore", "", names(dims_xls_ratios)) + + # Select only the cols with zscores of the patients + zscore_patients <- dims_xls_ratios[ , c(1, 2, zscore_cols[grep("P", colnames(dims_xls_ratios)[zscore_cols])])] + # Select only the cols with zscores of the controls + zscore_controls <- dims_xls_ratios[ , c(1, 2, zscore_cols[grep("C", colnames(dims_xls_ratios)[zscore_cols])])] + +} + +#### STEP 4: Run the IEM algorithm ######### +# in: algorithm, file_expected_biomarkers_IEM, zscore_patients ||| out: prob_score (+file) +# algorithm taken from DOI: 10.3390/ijms21030979 + +if (algorithm == 1) { + # Load data + cat(paste0("\nloading expected file:\n -> ", file_expected_biomarkers_IEM, "\n")) + expected_biomarkers <- read.csv(file_expected_biomarkers_IEM, sep=';', stringsAsFactors=FALSE) + # modify column names + names(expected_biomarkers) <- gsub("HMDB.code", "HMDB_code", names(expected_biomarkers)) + names(expected_biomarkers) <- gsub("Metabolite", "HMDB_name", names(expected_biomarkers)) + + # prepare dataframe scaffold rank_patients + rank_patients <- zscore_patients + # Fill df rank_patients with the ranks for each patient + for (patient_index in 3:ncol(zscore_patients)) { + # number of positive zscores in patient + pos <- sum(zscore_patients[ , patient_index] > 0) + # sort the column on zscore; NB: this sorts the entire object, not just one column + rank_patients <- rank_patients[order(-rank_patients[patient_index]), ] + # Rank all positive zscores highest to lowest + rank_patients[1:pos, patient_index] <- as.numeric(ordered(-rank_patients[1:pos, patient_index])) + # Rank all negative zscores lowest to highest + rank_patients[(pos+1):nrow(rank_patients), patient_index] <- as.numeric(ordered(rank_patients[(pos+1):nrow(rank_patients), patient_index])) + } + # NB: Warning message: In xtfrm.data.frame(x) : cannot xtfrm data frames. Ignore for now. + + # Calculate metabolite score, using the dataframes with only values, and later add the cols without values (1&2). + expected_zscores <- merge(x=expected_biomarkers, y=zscore_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code")) + expected_zscores_original <- expected_zscores # necessary copy? + + # determine which columns contain Z-scores and which contain disease info + select_zscore_cols <- grep("_Zscore", colnames(expected_zscores)) + select_info_cols <- 1:(min(select_zscore_cols) -1) + # set some zscores to zero + select_incr_indisp <- which(expected_zscores$Change=="Increase" & expected_zscores$Dispensability=="Indispensable") + expected_zscores[select_incr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_incr_indisp, select_zscore_cols], function(x) ifelse (x <= 1.6 , 0, x)) + select_decr_indisp <- which(expected_zscores$Change=="Decrease" & expected_zscores$Dispensability=="Indispensable") + expected_zscores[select_decr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_decr_indisp, select_zscore_cols], function(x) ifelse (x >= -1.2 , 0, x)) + + # calculate rank score: + expected_ranks <- merge(x=expected_biomarkers, y=rank_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code")) + rank_scores <- expected_zscores[order(expected_zscores$HMDB_code), select_zscore_cols]/(expected_ranks[order(expected_ranks$HMDB_code), select_zscore_cols]*0.9) + # combine disease info with rank scores + expected_metabscore <- cbind(expected_ranks[order(expected_zscores$HMDB_code), select_info_cols], rank_scores) + + # multiply weight score and rank score + weight_score <- expected_zscores + weight_score[ , select_zscore_cols] <- expected_metabscore$Total_Weight * expected_metabscore[ , select_zscore_cols] + + # sort table on Disease and Absolute_Weight + weight_score <- weight_score[order(weight_score$Disease, weight_score$Absolute_Weight, decreasing = TRUE), ] + + # select columns to check duplicates + dup <- weight_score[ , c('Disease', 'M.z')] + uni <- weight_score[!duplicated(dup) | !duplicated(dup, fromLast=FALSE),] + + # calculate probability score + prob_score <- aggregate(uni[ , select_zscore_cols], uni["Disease"], sum) + + # list of all diseases that have at least one metabolite Zscore at 0 + for (patient_index in 2:ncol(prob_score)) { + patient_zscore_colname <- colnames(prob_score)[patient_index] + matching_colname_expected <- which(colnames(expected_zscores) == patient_zscore_colname) + # determine which Zscores are 0 for this patient + zscores_zero <- which(expected_zscores[ , matching_colname_expected] == 0) + # get Disease for these + disease_zero <- unique(expected_zscores[zscores_zero, "Disease"]) + # set the probability score of these diseases to 0 + prob_score[which(prob_score$Disease %in% disease_zero), patient_index]<- 0 + } + + # determine disease rank per patient + disease_rank <- prob_score + # rank diseases in decreasing order + disease_rank[2:ncol(disease_rank)] <- lapply(2:ncol(disease_rank), function(x) as.numeric(ordered(-disease_rank[1:nrow(disease_rank), x]))) + # modify column names, Zscores have now been converted to probability scores + colnames(prob_score) <- gsub("_Zscore","_prob_score", colnames(prob_score)) # redundant? + colnames(disease_rank) <- gsub("_Zscore","", colnames(disease_rank)) + + # Create conditional formatting for output excel sheet. Colors according to values. + wb <- createWorkbook() + addWorksheet(wb, "Probability Scores") + writeData(wb, "Probability Scores", prob_score) + conditionalFormatting(wb, "Probability Scores", cols = 2:ncol(prob_score), rows = 1:nrow(prob_score), type = "colourScale", style = c("white","#FFFDA2","red"), rule = c(1, 10, 100)) + saveWorkbook(wb, file = paste0(output_dir,"/algoritme_output_", run_name, ".xlsx"), overwrite = TRUE) + # check whether prob_score df exists and has expected dimensions. + if (exists("expected_biomarkers") & (length(disease_rank) == length(prob_score))) { + cat("\n### Step 4 # Running the IEM algorithm is done.\n\n") + } else { + cat("\n**** Error: Could not run IEM algorithm. Check if path to expected_biomarkers csv-file is correct. \n") + } + + rm(wb) +} + +#### STEP 5: Make violin plots ##### +# in: algorithm / zscore_patients, violin, nr_contr, nr_pat, Data, path_textfiles, zscore_cutoff, xaxis_cutoff, top_diseases, top_metab, output_dir ||| out: pdf file + +if (violin == 1) { # make violin plots + + # preparation + # isobarics_txt <- c() + zscore_patients_copy <- zscore_patients + # keep the original for testing purposes, remove later. + colnames(zscore_patients) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_patients)) # for robust scaler + colnames(zscore_patients) <- gsub("_Zscore", "", colnames(zscore_patients)) + colnames(zscore_controls) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_controls)) # for robust scaler + colnames(zscore_controls) <- gsub("_Zscore", "", colnames(zscore_controls)) + + # Make patient list for violin plots + patient_list <- names(zscore_patients)[-c(1,2)] + + # from table expected_biomarkers, choose selected columns + select_columns <- c("Disease", "HMDB_code", "HMDB_name") + select_col_nrs <- which(colnames(expected_biomarkers) %in% select_columns) + expected_biomarkers_select <- expected_biomarkers[ , select_col_nrs] + # remove duplicates + expected_biomarkers_select <- expected_biomarkers_select[!duplicated(expected_biomarkers_select[ , c(1,2)]), ] + + # first step: normal violin plots + # Find all text files in the given folder, which contain metabolite lists of which + # each file will be a page in the pdf with violin plots. + # Make a PDF file for each of the categories in metabolite_dirs + metabolite_dirs <- list.files(path=path_metabolite_groups, full.names=FALSE, recursive=FALSE) + for (metabolite_dir in metabolite_dirs) { + # create a directory for the output PDFs + pdf_dir <- paste(output_dir, metabolite_dir, sep="/") + dir.create(pdf_dir, showWarnings=FALSE) + cat("making plots in category:", metabolite_dir, "\n") + + # get a list of all metabolite files + metabolite_files <- list.files(path=paste(path_metabolite_groups, metabolite_dir, sep="/"), pattern="*.txt", full.names=FALSE, recursive=FALSE) + # put all metabolites into one list + metab_list_all <- list() + metab_list_names <- c() + cat("making plots from the input files:") + # open the text files and add each to a list of dataframes (metab_list_all) + for (file_index in seq_along(metabolite_files)) { + infile <- metabolite_files[file_index] + metab_list <- read.table(paste(path_metabolite_groups, metabolite_dir, infile, sep="/"), sep = "\t", header = TRUE, quote="") + # put into list of all lists + metab_list_all[[file_index]] <- metab_list + metab_list_names <- c(metab_list_names, strsplit(infile, ".txt")[[1]][1]) + cat(paste0("\n", infile)) + } + # include list of classes in metabolite list + names(metab_list_all) <- metab_list_names + + # prepare list of metabolites; max nr_plots_perpage on one page + metab_interest_sorted <- prepare_data(metab_list_all, zscore_patients) + metab_interest_controls <- prepare_data(metab_list_all, zscore_controls) + metab_perpage <- prepare_data_perpage(metab_interest_sorted, metab_interest_controls, nr_plots_perpage, nr_pat, nr_contr) + + # make violin plots per patient + for (pt_nr in 1:length(patient_list)) { + pt_name <- patient_list[pt_nr] + # for category Diagnostics, make list of metabolites that exceed alarm values for this patient + # for category Other, make list of top highest and lowest Z-scores for this patient + if (grepl("Diagnost", pdf_dir)) { + top_metab_pt <- prepare_alarmvalues(pt_name, metab_interest_sorted) + } else { + top_metab_pt <- prepare_toplist(pt_name, zscore_patients) + } + + # generate normal violin plots + violin_plots(pdf_dir, pt_name, metab_perpage, top_metab_pt) + + } # end for pt_nr + + } # end for metabolite_dir + +} # end if violin = 1 diff --git a/DIMS/GenerateViolinPlots.nf b/DIMS/GenerateViolinPlots.nf new file mode 100644 index 0000000..e580feb --- /dev/null +++ b/DIMS/GenerateViolinPlots.nf @@ -0,0 +1,17 @@ +process GenerateViolinPlots { + tag {"DIMS GenerateViolinPlots"} + label 'GenerateViolinPlots' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(excel_file) // input files need to be linked, but called within R script + + output: + path('*.pdf') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/GenerateViolinPlots.R $params.scripts_dir $params.analysis_id $params.zscore + """ +} diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R index 51490b6..06e0f2c 100644 --- a/DIMS/HMDBparts.R +++ b/DIMS/HMDBparts.R @@ -3,11 +3,11 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) cat(" ", arg, "\n") +# Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file $params.hmdb_parts_files $params.standard_run $params.ppm db_path <- cmd_args[1] # location of HMDB db file breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData -standard_run <- cmd_args[3] # "yes" +standard_run <- cmd_args[4] # "yes" # Cut up entire HMDB into small parts based on the new binning/breaks load(breaks_filepath) @@ -15,18 +15,20 @@ load(breaks_filepath) # In case of a standard run (m/z 69-606) use external HMDB parts min_mz <- round(breaks_fwhm[1]) max_mz <- round(breaks_fwhm[length(breaks_fwhm)]) + # test if standard mz range is used -if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 600 & max_mz < 610) { +if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < 610) { # skip generating HMDB parts - hmdb_parts_dir <- cmd_args[6] # "/hpc/dbg_mz/production/DIMS/hmdb_preparts/" - hmdb_parts <- list.files(hmdb_parts_dir, pattern=hmdb) # all files containing hmdb in file name + hmdb_parts_files <- cmd_args[3] + + hmdb_parts <- list.files(hmdb_parts_files, pattern="hmdb") # all files containing hmdb in file name for (hmdb_file in hmdb_parts) { - file.copy(paste(hmdb_parts_dir, hmdb_file, sep="/"), "./", recursive = TRUE) + file.copy(paste(hmdb_parts_files, hmdb_file, sep="/"), "./", recursive = TRUE) } } else { # generate HMDB parts in case of non-standard mz range load(db_path) - ppm <- as.numeric(cmd_args[4]) + ppm <- as.numeric(cmd_args[5]) scanmodes <- c("positive", "negative") @@ -109,7 +111,7 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 600 & max_mz < outlist_part <- outlist_i save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i + 1, "RData", sep="."), sep="")) check <- check + dim(outlist_part)[1] - cat("\n", "Check", check == dim(outlist)[1]) + # cat("\n", "Check", check == dim(outlist)[1]) } } diff --git a/DIMS/HMDBparts.nf b/DIMS/HMDBparts.nf index 7d58bfe..833b94d 100644 --- a/DIMS/HMDBparts.nf +++ b/DIMS/HMDBparts.nf @@ -1,16 +1,13 @@ process HMDBparts { - // Custom process to cut HMDB db into parts tag {"DIMS HMDBparts"} + // Custom process to cut HMDB db into parts label 'HMDBparts' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] input: - // tuple(path(hmdb_db_file), path(breaks_file)) path(hmdb_db_file) path(breaks_file) - // val(standard_run) - // val(ppm) output: path('*.RData') @@ -18,6 +15,6 @@ process HMDBparts { script: """ - Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file $params.standard_run $params.ppm + Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file $params.hmdb_parts_files $params.standard_run $params.ppm """ } diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R index 09503a7..941d45f 100644 --- a/DIMS/HMDBparts_main.R +++ b/DIMS/HMDBparts_main.R @@ -3,7 +3,6 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) cat(" ", arg, "\n") db_path <- cmd_args[1] # location of HMDB db file breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData @@ -13,10 +12,6 @@ load(breaks_filepath) # Cut up HMDB minus adducts minus isotopes into small parts -# load(paste(outdir, "breaks.fwhm.RData", sep = "/")) -# outdir <- paste(outdir, "hmdb_part_adductSums", sep = "/") -# dir.create(outdir, showWarnings = FALSE) - scanmodes <- c("positive", "negative") for (scanmode in scanmodes) { @@ -41,7 +36,7 @@ for (scanmode in scanmodes) { n <- dim(outlist)[1] # size of hmdb parts in lines: - sub <- 2000 + sub <- 1000 end <- 0 check <- 0 @@ -57,8 +52,8 @@ for (scanmode in scanmodes) { } # finish last hmdb part -start = end + 1 -end = n +start <- end + 1 +end <- n -outlist_part = outlist[c(start:end),] -save(outlist_part, file=paste0(scanmode, "_hmdb.", i+1, ".RData")) +outlist_part <- outlist[c(start:end),] +save(outlist_part, file = paste0(scanmode, "_hmdb.", i+1, ".RData")) diff --git a/DIMS/HMDBparts_main.nf b/DIMS/HMDBparts_main.nf index 72fedfc..3f9845b 100644 --- a/DIMS/HMDBparts_main.nf +++ b/DIMS/HMDBparts_main.nf @@ -1,6 +1,6 @@ process HMDBparts_main { + tag "DIMS HMDBparts_main" // Custom process to cut HMDB db into parts for main entry only, no adducts, no isotopes - tag {"DIMS HMDBparts_main"} label 'HMDBparts_main' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf index 4908047..48b1c9b 100644 --- a/DIMS/MakeInit.nf +++ b/DIMS/MakeInit.nf @@ -1,13 +1,14 @@ process MakeInit { + tag "DIMS MakeInit" label 'MakeInit' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] input: - tuple(path(samplesheet), val(nr_replicates)) + tuple path(samplesheet), val(nr_replicates) output: - path 'init.RData' + path('init.RData') script: """ diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R index 379a0fa..926cb77 100644 --- a/DIMS/PeakFinding.R +++ b/DIMS/PeakFinding.R @@ -3,7 +3,6 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) cat(" ", arg, "\n", sep="") filepath <- cmd_args[1] breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf index 343c4c1..1752a74 100644 --- a/DIMS/PeakFinding.nf +++ b/DIMS/PeakFinding.nf @@ -1,11 +1,10 @@ process PeakFinding { + tag "DIMS PeakFinding ${RData_file}" label 'PeakFinding' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] input: - // path(RData_file) - // path(breaks_file) tuple(path(RData_file), path(breaks_file)) output: diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R index 756c0a7..423fb69 100644 --- a/DIMS/PeakGrouping.R +++ b/DIMS/PeakGrouping.R @@ -3,19 +3,14 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) cat(" ", arg, "\n", sep="") HMDB_part_file <- cmd_args[1] -# SpecPeaks_file <- cmd_args[2] -# pattern_file <- cmd_args[3] ppm <- as.numeric(cmd_args[2]) options(digits=16) # load part of the HMDB HMDB_add_iso <- get(load(HMDB_part_file)) -# load(HMDB_part_file) -# HMDB_add_iso <- outlist_part # determine appropriate scanmode based on HMDB_part_file if (grepl("negative", basename(HMDB_part_file))) { scanmode <- "negative" } else @@ -30,27 +25,16 @@ load(SpecPeaks_file) outlist.copy <- outlist.tot rm(outlist.tot) -print(dim(outlist.copy)) - # load replication pattern -# load(paste0("./", scanmode, "_repl_pattern", ".RData")) pattern_file <- paste0(scanmode, "_repl_pattern.RData") load(pattern_file) -# load("./breaks.fwhm.RData") # determine appropriate column name in HMDB part if (scanmode=="negative") { column_label <- "MNeg" } else { column_label <- "Mpos" } -# for debugging: -print(dim(HMDB_add_iso)) -print(scanmode) -print(column_label) -print(ppm) -print(head(repl_pattern_filtered, 1)) - # Initialize outpgrlist.identified <- NULL -outlist.grouped <- NULL +list_of_peaks_used_in_peak_groups_identified <- NULL # First find peak groups identified based on HMDB masses while (dim(HMDB_add_iso)[1] > 0) { @@ -60,13 +44,11 @@ while (dim(HMDB_add_iso)[1] > 0) { reference_mass <- as.numeric(HMDB_add_iso[index, column_label]) mass_tolerance <- (reference_mass * ppm) / 10^6 - print(paste0("ref_mass ", reference_mass, " mtol ", mass_tolerance)) - # find the peaks in the dataset with corresponding m/z mzmed <- as.numeric(outlist.copy[ ,"mzmed.pkt"]) selp <- which((mzmed > (reference_mass - mass_tolerance)) & (mzmed < (reference_mass + mass_tolerance))) tmplist <- outlist.copy[selp,,drop=FALSE] - outlist.grouped <- rbind(outlist.grouped, tmplist) + list_of_peaks_used_in_peak_groups_identified <- rbind(list_of_peaks_used_in_peak_groups_identified, tmplist) nrsamples <- length(selp) if (nrsamples > 0) { @@ -188,9 +170,7 @@ while (dim(HMDB_add_iso)[1] > 0) { } -print(head(outpgrlist.identified)) - # save peak list corresponding to masses in HMDB part -# save(outlist.grouped, file=paste0(batch_number, "_", scanmode, "_all.RData")) +save(list_of_peaks_used_in_peak_groups_identified, file = paste0(batch_number, "_", scanmode, "_peaks_used.RData")) # save peak group list, identified part -save(outpgrlist.identified, file=paste0(batch_number, "_", scanmode, "_identified.RData")) +save(outpgrlist.identified, file = paste0(batch_number, "_", scanmode, "_identified.RData")) diff --git a/DIMS/PeakGrouping.nf b/DIMS/PeakGrouping.nf index 89328fb..8da1e76 100644 --- a/DIMS/PeakGrouping.nf +++ b/DIMS/PeakGrouping.nf @@ -1,4 +1,5 @@ process PeakGrouping { + tag "DIMS PeakGrouping ${HMDBpart_file}" label 'PeakGrouping' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] @@ -9,7 +10,7 @@ process PeakGrouping { path(pattern_file) output: - // path '*_all.RData', emit: peaklist_all + path '*_peaks_used.RData', emit: peaks_used path '*_identified.RData', emit: grouped_identified script: diff --git a/DIMS/SpectrumPeakFinding.R b/DIMS/SpectrumPeakFinding.R index 9935b05..a8f33f9 100644 --- a/DIMS/SpectrumPeakFinding.R +++ b/DIMS/SpectrumPeakFinding.R @@ -7,11 +7,8 @@ scanmodes <- c("positive", "negative") # Check whether all jobs terminated correct! notRun = NULL -print("one") - # collect spectrum peaks for each scanmode for (scanmode in scanmodes) { - print(scanmode) # load peak lists of all biological samples input_dir <- getwd() # "./" peaklist_files = list.files(input_dir, full.names=TRUE, pattern=paste("*_", scanmode, ".RData",sep="")) diff --git a/DIMS/SpectrumPeakFinding.nf b/DIMS/SpectrumPeakFinding.nf index 786131c..775cd2c 100644 --- a/DIMS/SpectrumPeakFinding.nf +++ b/DIMS/SpectrumPeakFinding.nf @@ -1,18 +1,18 @@ process SpectrumPeakFinding { + tag "DIMS SpectrumPeakFinding" label 'SpectrumPeakFinding' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] input: - path(RData_file) - path(replication_pattern) + path(RData_file) // input files need to be linked, but called within R script + path(replication_pattern) // input files need to be linked, but called within R script output: path 'SpectrumPeaks_*.RData' - // path 'SpectrumPeaks_positive.RData' script: """ - Rscript ${baseDir}/CustomModules/DIMS/SpectrumPeakFinding.R $replication_pattern + Rscript ${baseDir}/CustomModules/DIMS/SpectrumPeakFinding.R """ } diff --git a/DIMS/SumAdducts.R b/DIMS/SumAdducts.R index 8b08408..716b473 100755 --- a/DIMS/SumAdducts.R +++ b/DIMS/SumAdducts.R @@ -3,21 +3,12 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) cat(" ", arg, "\n", sep="") # collect_file <- cmd_args[1] hmdbpart_main_file <- cmd_args[1] scripts_dir <- cmd_args[2] z_score <- as.numeric(cmd_args[3]) -# outdir <- cmd_args[2] -# scanmode <- cmd_args[3] -# adducts <- cmd_args[4] - -# for debugging: -print(hmdbpart_main_file) # NB: scripts_dir not used yet, but function SumAdducts needs to be placed in AddOnFunctions folder -print(scripts_dir) -print(z_score) if (grepl("positive_hmdb", hmdbpart_main_file)) { scanmode <- "positive" @@ -38,29 +29,11 @@ repl_file <- paste0(scanmode, "_repl_pattern.RData") load(repl_file) outlist_part <- get(load(hmdbpart_main_file)) -# adducts=as.vector(unlist(strsplit(adducts, ",",fixed = TRUE))) - -# load(paste(outdir, "/outlist_identified_", scanmode, ".RData", sep="")) - -# Local and on HPC -#batch = strsplit(file, "/",fixed = TRUE)[[1]] -#batch = batch[length(batch)] -#batch = strsplit(batch, ".",fixed = TRUE)[[1]][2] -batch_number = strsplit(basename(hmdbpart_main_file), ".",fixed = TRUE)[[1]][2] -print(batch_number) +batch_number <- strsplit(basename(hmdbpart_main_file), ".",fixed = TRUE)[[1]][2] outlist.tot <- unique(outlist.ident) sumAdducts <- function(peaklist, theor_MZ, grpnames.long, adducts, batch_number, scanmode, outdir, z_score){ - #theor_MZ = outlist_part - #grpnames.long = names(repl.pattern.filtered) - #peaklist = outlist.ident - #adducts = c(1) #for neg or c(1,2) for pos - #batch <- 300 - #outdir <- "/Users/nunen/Documents/Metab/processed/zebrafish" - #scanmode <- "negative" - #z_score <- 0 - hmdb_codes <- rownames(theor_MZ) hmdb_names <- theor_MZ[,1, drop=FALSE] hmdb_names[] <- lapply(hmdb_names, as.character) @@ -81,28 +54,27 @@ sumAdducts <- function(peaklist, theor_MZ, grpnames.long, adducts, batch_number, if (length(hmdb_codes) != 0) { for(i in 1:length(hmdb_codes)){ - #compound="HMDB00045" compound <- hmdb_codes[i] compound_plus <- c(compound,paste(compound, adducts, sep = "_")) - # x=peaklist$HMDB_code[1] metab <- unlist(lapply(peaklist$HMDB_code, function(x) {(length(intersect(unlist(strsplit(as.vector(x),";")),compound_plus))>0)})) - # peaklist[metab, "assi.hmdb"] - # which(metab==TRUE) total <- c() - # peaklist[metab, c("mzmed.pgrp", "HMDB_code", "C34.1")] - # ints=peaklist[metab, c(7:(length(grpnames.long)+6))] + # get the intensities for selected metabolite. + # NB: column numbers in previous version of code are incorrect if (z_score == 1) { - ints <- peaklist[metab, c(15:(length(grpnames.long)+14))] + int_cols_C <- grep("C", colnames(peaklist)[1:which(colnames(peaklist) == "avg.ctrls")]) + int_cols_P <- grep("P", colnames(peaklist)[1:which(colnames(peaklist) == "avg.ctrls")]) + int_cols <- c(int_cols_C, int_cols_P) + ints <- peaklist[metab, int_cols] + # ints <- peaklist[metab, c(15:(length(grpnames.long)+14))] } else { ints <- peaklist[metab, c(7:(length(grpnames.long)+6))] } total <- apply(ints, 2, sum) - + if (sum(total)!=0) { - #message(i) names <- c(names, compound) adductsum <- rbind(adductsum,total) names_long <- c(names_long, hmdb_names[i]) diff --git a/DIMS/SumAdducts.nf b/DIMS/SumAdducts.nf index 6a7be46..531308e 100644 --- a/DIMS/SumAdducts.nf +++ b/DIMS/SumAdducts.nf @@ -1,11 +1,12 @@ process SumAdducts { + tag "DIMS SumAdducts ${collect_file}" label 'SumAdducts' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] input: path(collect_file) // input files need to be linked, but called within R script - path(replication_pattern) + path(replication_pattern) // input files need to be linked, but called within R script path(HMDBpart_main_file) output: diff --git a/DIMS/ThermoRawFileParser.nf b/DIMS/ThermoRawFileParser.nf index af86aa3..df91279 100644 --- a/DIMS/ThermoRawFileParser.nf +++ b/DIMS/ThermoRawFileParser.nf @@ -1,14 +1,14 @@ process ConvertRawFile { - // Custom process to convert raw file to mzML format tag {"DIMS ConvertRawFile ${file_id}"} + // Custom process to convert raw file to mzML format label 'ThermoRawFileParser_1_1_11' shell = ['/bin/bash', '-euo', 'pipefail'] input: - tuple(file_id, path(raw_file)) + tuple val(file_id), path(raw_file) output: - tuple(file_id, path("${file_id}.mzML")) + tuple val(file_id), path("${file_id}.mzML") script: diff --git a/DIMS/UnidentifiedCalcZscores.R b/DIMS/UnidentifiedCalcZscores.R new file mode 100755 index 0000000..525bfc0 --- /dev/null +++ b/DIMS/UnidentifiedCalcZscores.R @@ -0,0 +1,67 @@ +#!/usr/bin/Rscript +## adapted from 10-collectSamplesFilled.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) cat(" ", arg, "\n") + +scripts_dir <- cmd_args[1] +ppm <- as.numeric(cmd_args[2]) +z_score <- as.numeric(cmd_args[3]) + +source(paste0(scripts_dir, "AddOnFunctions/mergeDuplicatedRows.R")) +source(paste0(scripts_dir, "AddOnFunctions/statistics_z.R")) +# source(paste0(scripts_dir, "AddOnFunctions/normalization_2.1.R")) + +# for each scan mode, collect all filled peak group lists +scanmodes <- c("positive", "negative") + +for (scanmode in scanmodes) { + # get list of files + # filled_files <- list.files("./", full.names=TRUE, pattern=scanmode) + filled_file <- paste0("./PeakGroupList_", scanmode, "_Unidentified_filled.RData") + print(filled_file) + # load file + outlist.tot <- get(load(filled_file)) + + # remove duplicates; peak groups with exactly the same m/z + outlist.tot <- mergeDuplicatedRows(outlist.tot) + + # sort on mass + outlist.tot <- outlist.tot[order(outlist.tot[ ,"mzmed.pgrp"]),] + + # load replication pattern + pattern_file <- paste0(scanmode, "_repl_pattern.RData") + repl_pattern <- get(load(pattern_file)) + + # Normalization: not done. + # if (normalization != "disabled") { + # outlist.tot = normalization_2.1(outlist.tot, fileName, names(repl.pattern.filtered), on=normalization, assi_label="assi_HMDB") + # } + + if (z_score == 1) { + outlist.stats <- statistics_z(outlist.tot, sortCol=NULL, adducts=FALSE) + nr.removed.samples <- length(which(repl_pattern[]=="character(0)")) + order.index.int <- order(colnames(outlist.stats)[8:(length(repl_pattern)-nr.removed.samples+7)]) + outlist.stats.more <- cbind(outlist.stats[,1:7], + outlist.stats[,(length(repl_pattern)-nr.removed.samples+8):(length(repl_pattern)-nr.removed.samples+8+6)], + outlist.stats[,8:(length(repl_pattern)-nr.removed.samples+7)][order.index.int], + outlist.stats[,(length(repl_pattern)-nr.removed.samples+5+10):ncol(outlist.stats)]) + + tmp.index <- grep("_Zscore", colnames(outlist.stats.more), fixed = TRUE) + tmp.index.order <- order(colnames(outlist.stats.more[,tmp.index])) + tmp <- outlist.stats.more[,tmp.index[tmp.index.order]] + outlist.stats.more <- outlist.stats.more[,-tmp.index] + outlist.stats.more <- cbind(outlist.stats.more,tmp) + outlist.tot <- outlist.stats.more + } + + outlist.not.ident = outlist.tot + + # Extra output in Excel-readable format: + remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp") + remove_colindex <- which(colnames(outlist.not.ident) %in% remove_columns) + outlist.not.ident <- outlist.not.ident[ , -remove_colindex] + write.table(outlist.not.ident, file=paste0("unidentified_outlist_", scanmode, ".txt"), sep="\t", row.names = FALSE) + +} diff --git a/DIMS/UnidentifiedCalcZscores.nf b/DIMS/UnidentifiedCalcZscores.nf new file mode 100644 index 0000000..1687b76 --- /dev/null +++ b/DIMS/UnidentifiedCalcZscores.nf @@ -0,0 +1,18 @@ +process UnidentifiedCalcZscores { + tag "DIMS UnidentifiedCalcZscores" + label 'UnidentifiedCalcZscores' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(unidentified_filled_files) + path(replication_pattern) // input files need to be linked, but called within R script + + output: + path('unidentified_outlist*.txt') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedCalcZscores.R $params.scripts_dir $params.ppm $params.zscore + """ +} diff --git a/DIMS/UnidentifiedCollectPeaks.R b/DIMS/UnidentifiedCollectPeaks.R new file mode 100755 index 0000000..322a1c1 --- /dev/null +++ b/DIMS/UnidentifiedCollectPeaks.R @@ -0,0 +1,102 @@ +#!/usr/bin/Rscript +## adapted from 7-collectSamplesGroupedHMDB.R + +# load required packages +# none + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +ppm <- as.numeric(cmd_args[1]) +outdir <- "./" + +scanmodes <- c("positive", "negative") + +for (scanmode in scanmodes) { + # get list of all files that contain lists of peaks that were used in identified peak grouping + files <- list.files("./", pattern = paste(scanmode, "_peaks_used.RData", sep="")) + # load the list of all peaks + load(paste0("SpectrumPeaks_", scanmode, ".RData")) #outlist.tot + + # Make a list of indexes of peaks that have been identified, then remove these from the peaklist. + remove <- NULL + for (i in 1:length(files)) { + load(files[i]) # outlist.grouped, now called list_of_peaks_used_in_peak_groups_identified + remove <- c(remove, which(outlist.tot[ ,"mzmed.pkt"] %in% list_of_peaks_used_in_peak_groups_identified[i ,"mzmed.pkt"])) + } + outlist.rest <- outlist.tot[-remove, ] + + # sort on mass + outlist <- outlist.rest[order(as.numeric(outlist.rest[ ,"mzmed.pkt"])),] + + save(outlist, file=paste0(outdir, "/SpectrumPeaks_", scanmode, "_Unidentified.RData")) + + # cut the unidentified peak list in parts of part_len length for parallel processing + # NB: the while statement below gives an error message. Skip the cutting into parts. + # part_len <- 10000 + # nr_peaks <- dim(outlist)[1] + # end <- 0 + # min_1_last <- part_len + # check <- 0 + # outlist_i_min_1 <- NULL + # part_nr <- 0 + + # if (nr_peaks >= part_len & (floor(nr_peaks/part_len) - 1) >= 2) { + # for (part_nr in 2:floor(nr_peaks/part_len) -1 ) { + # start <- -(part_len-1) + part_nr*part_len + # end <- part_nr*part_len + # + # if (part_nr > 1) { + # outlist_i <- outlist[c(start:end),] + # + # n_moved <- 0 + + # # Calculate ppm and replace border, avoid cut within peakgroup! + # while ((as.numeric(outlist_i[1, "mzmed.pkt"]) - as.numeric(outlist_i_min_1[min_1_last, "mzmed.pkt"])) * 1e+06/as.numeric(outlist_i[1, "mzmed.pkt"]) < ppm) { + # outlist_i_min_1 <- rbind(outlist_i_min_1, outlist_i[1,]) + # outlist_i <- outlist_i[-1, ] + # n_moved <- n_moved + 1 + # } + + # # message(paste("Process", i-1,":", dim(outlist_i_min_1)[1])) + # save(outlist_i_min_1, file = paste(outdir, paste(scanmode, paste("outlist_i_min_1", part_nr-1, "RData", sep="."), sep="_"), sep="/")) + # check <- check + dim(outlist_i_min_1)[1] + + # outlist_i_min_1 <- outlist_i + # min_1_last <- dim(outlist_i_min_1)[1] + + # } else { + # outlist_i_min_1 = outlist[c(start:end), ] + # } + # } + #} + + #start <- end + 1 + #end <- nr_peaks + #outlist_i <- outlist[c(start:end), ] + #n_moved <- 0 + + #if(!is.null(outlist_i_min_1)){ + # # Calculate ppm and replace border, avoid cut within peakgroup! + # while ((as.numeric(outlist_i[1,"mzmed.pkt"]) - as.numeric(outlist_i_min_1[min_1_last,"mzmed.pkt"]))*1e+06/as.numeric(outlist_i[1,"mzmed.pkt"]) < ppm) { + # outlist_i_min_1 = rbind(outlist_i_min_1, outlist_i[1,]) + # outlist_i = outlist_i[-1,] + # n_moved = n_moved + 1 + # } + + # cat(paste("Process", i+1-1,":", dim(outlist_i_min_1)[1])) + # save(outlist_i_min_1, file=paste(outdir, paste(scanmode, paste("outlist_i_min_1",i,"RData", sep="."), sep="_"), sep="/")) + # check=check+dim(outlist_i_min_1)[1] + #} + + #outlist_i_min_1=outlist_i + #cat("Process", i+2-1,":", dim(outlist_i_min_1)[1], "\n") + #save(outlist_i_min_1, file=paste(outdir, paste(scanmode, paste("outlist_i_min_1",i+1,"RData", sep="."), sep="_"), sep="/")) + #} + + #check <- check + dim(outlist_i_min_1)[1] + #if (check==dim(outlist)[1]){ + # cat("Check is oke!\n") + #} else { + # cat("Check is failed!\n") +} diff --git a/DIMS/UnidentifiedCollectPeaks.nf b/DIMS/UnidentifiedCollectPeaks.nf new file mode 100644 index 0000000..475b0a4 --- /dev/null +++ b/DIMS/UnidentifiedCollectPeaks.nf @@ -0,0 +1,20 @@ +process UnidentifiedCollectPeaks { + tag "DIMS UnidentifiedCollectPeaks" + label 'UnidentifiedCollectPeaks' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(SpectrumPeaks_file) + path(PeakList_identified) + + output: + path('SpectrumPeaks_*_Unidentified.RData') + // path('SpectrumPeaks_negative_Unidentified.RData') + // path('SpectrumPeaks_positive_Unidentified.RData') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedCollectPeaks.R $SpectrumPeaks_file $params.ppm + """ +} diff --git a/DIMS/UnidentifiedFillMissing.R b/DIMS/UnidentifiedFillMissing.R new file mode 100755 index 0000000..edfc8a7 --- /dev/null +++ b/DIMS/UnidentifiedFillMissing.R @@ -0,0 +1,50 @@ +#!/usr/bin/Rscript +# adapted from 9-runFillMissing.R + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) +for (arg in cmd_args) cat(" ", arg, "\n", sep="") + +# define parameters +peakgrouplist_file1 <- cmd_args[1] +peakgrouplist_file2 <- cmd_args[2] +scripts_dir <- cmd_args[3] +thresh <- as.numeric(cmd_args[4]) +resol <- as.numeric(cmd_args[5]) +ppm <- as.numeric(cmd_args[6]) +outdir <- "./" + +print(scripts_dir) + +# load in function scripts +source(paste0(scripts_dir, "AddOnFunctions/replaceZeros.R")) +source(paste0(scripts_dir, "AddOnFunctions/generateGaussian.R")) +source(paste0(scripts_dir, "AddOnFunctions/getFwhm.R")) +source(paste0(scripts_dir, "AddOnFunctions/getSD.R")) +source(paste0(scripts_dir, "AddOnFunctions/getArea.R")) +source(paste0(scripts_dir, "AddOnFunctions/optimizeGauss.R")) +source(paste0(scripts_dir, "AddOnFunctions/ident.hires.noise.HPC.R")) +source(paste0(scripts_dir, "AddOnFunctions/elementInfo.R")) +source(paste0(scripts_dir, "AddOnFunctions/globalAssignments.HPC.R")) + +peakgrouplist_files <- c(peakgrouplist_file1, peakgrouplist_file2) +for (peakgrouplist_file in peakgrouplist_files) { + + if (grepl("_pos", peakgrouplist_file)) { scanmode = "positive" } else + if (grepl("_neg", peakgrouplist_file)) { scanmode = "negative" } + + # get replication pattern for sample names + pattern_file <- paste0(scanmode, "_repl_pattern.RData") + repl_pattern <- get(load(pattern_file)) + + # load peak group list and determine output file name + outpgrlist_identified <- get(load(peakgrouplist_file)) + + outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) + + # replace missing values (zeros) with random noise + peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) + + # save output + save(peakgrouplist_filled, file=paste0("./", outputfile_name)) +} diff --git a/DIMS/UnidentifiedFillMissing.nf b/DIMS/UnidentifiedFillMissing.nf new file mode 100644 index 0000000..50c9783 --- /dev/null +++ b/DIMS/UnidentifiedFillMissing.nf @@ -0,0 +1,18 @@ +process UnidentifiedFillMissing { + tag "DIMS UnidentifiedFillMissing ${GroupedList_file}" + label 'UnidentifiedFillMissing' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(GroupedList_file) + path(replication_pattern) // input files need to be linked, but called within R script + + output: + path('*_filled.RData') + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedFillMissing.R $GroupedList_file $params.scripts_dir $params.thresh $params.resolution $params.ppm + """ +} diff --git a/DIMS/UnidentifiedPeakGrouping.R b/DIMS/UnidentifiedPeakGrouping.R new file mode 100755 index 0000000..d7a3dee --- /dev/null +++ b/DIMS/UnidentifiedPeakGrouping.R @@ -0,0 +1,93 @@ +#!/usr/bin/Rscript +## adapted from 8-peakGrouping.rest.R + +# load required packages +# none + +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +resol <- as.numeric(cmd_args[1]) +ppm <- as.numeric(cmd_args[2]) +outdir <- "./" +options(digits=16) + +print(list.files(outdir, pattern="RData")) + +scanmodes <- c("positive", "negative") + +# function for grouping unidentified peaks +groupingRest <- function(outdir, unidentified_peaklist, scanmode, ppm) { + outlist.copy <- get(load(unidentified_peaklist)) + # batch = strsplit(unidentified_peaklist, ".",fixed = TRUE)[[1]][2] + load(paste0("./", scanmode, "_repl_pattern.RData")) + + outpgrlist <- NULL + + # Then group on highest peaks + range <- ppm*1e-06 + startcol <- 7 + + # while (max(as.numeric(outlist.copy[ , "height.pkt"])) > 0 ) { + while (dim(outlist.copy)[1] > 0) { + + sel <- which(as.numeric(outlist.copy[ , "height.pkt"]) == max(as.numeric(outlist.copy[ , "height.pkt"])))[1] + + # ppm range around max + mzref <- as.numeric(outlist.copy[sel, "mzmed.pkt"]) + pkmin <- -(range*mzref - mzref) + pkmax <- 2*mzref-pkmin + + selp <- as.numeric(outlist.copy[ , "mzmed.pkt"]) > pkmin & as.numeric(outlist.copy[ , "mzmed.pkt"]) < pkmax + tmplist <- outlist.copy[selp,,drop=FALSE] + + nrsamples <- length(unique(tmplist[,"samplenr"])) + if (nrsamples > 0) { + + mzmed.pgrp <- mean(as.numeric(outlist.copy[selp, "mzmed.pkt"])) + mzmin.pgrp <- -(range*mzmed.pgrp - mzmed.pgrp) + mzmax.pgrp <- 2*mzmed.pgrp - mzmin.pgrp + + selp <- as.numeric(outlist.copy[ , "mzmed.pkt"]) > mzmin.pgrp & as.numeric(outlist.copy[ , "mzmed.pkt"]) < mzmax.pgrp + tmplist <- outlist.copy[selp,,drop=FALSE] + + # remove used peaks!!! + tmp <- as.vector(which(tmplist[,"height.pkt"]==-1)) + if (length(tmp)>0) tmplist<-tmplist[-tmp,,drop=FALSE] + + nrsamples <- length(unique(tmplist[,"samplenr"])) + + fq.worst.pgrp <- as.numeric(max(outlist.copy[selp, "fq"])) + fq.best.pgrp <- as.numeric(min(outlist.copy[selp, "fq"])) + ints.allsamps <- rep(0, length(names(repl_pattern_filtered))) + names(ints.allsamps) <- names(repl_pattern_filtered) # same order as sample list!!! + + # Check for each sample if multiple peaks exists, if so take the sum! + labels <- unique(tmplist[,"samplenr"]) + ints.allsamps[labels] <- as.vector(unlist(lapply(labels, function(x) {sum(as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"]))}))) + + outpgrlist <- rbind(outpgrlist, c(mzmed.pgrp, fq.best.pgrp, fq.worst.pgrp, nrsamples, mzmin.pgrp, mzmax.pgrp, ints.allsamps,NA,NA,NA,NA)) + } + + outlist.copy <- outlist.copy[-which(selp==TRUE),,drop=FALSE] + } + + outpgrlist <- as.data.frame(outpgrlist) # ignore warnings of duplicate row names + colnames(outpgrlist)[1:6] <- c("mzmed.pgrp", "fq.best", "fq.worst", "nrsamples", "mzmin.pgrp", "mzmax.pgrp") + colnames(outpgrlist)[(length(repl_pattern_filtered)+7):ncol(outpgrlist)] <- c("assi_HMDB", "iso_HMDB", "HMDB_code", "theormz_HMDB") + + return(outpgrlist) + +} + + +for (scanmode in scanmodes) { + unidentified_peaklist <- paste0("./SpectrumPeaks_", scanmode, "_Unidentified.RData") + # generate peak group lists of the unidentified peaks + outpgrlist <- groupingRest(outdir, unidentified_peaklist, scanmode, ppm=ppm) + + # save output in RData format for further processing + save(outpgrlist, file=paste0("PeakGroupList_", scanmode, "_Unidentified.RData")) + write.table(outpgrlist, file=paste0("PeakGroupList_", scanmode, "_Unidentified.txt")) +} + diff --git a/DIMS/UnidentifiedPeakGrouping.nf b/DIMS/UnidentifiedPeakGrouping.nf new file mode 100644 index 0000000..00fc576 --- /dev/null +++ b/DIMS/UnidentifiedPeakGrouping.nf @@ -0,0 +1,19 @@ +process UnidentifiedPeakGrouping { + tag "DIMS UnidentifiedPeakGrouping ${UnidentifiedSpectrumPeaks_file}" + label 'UnidentifiedPeakGrouping' + container = 'docker://umcugenbioinf/dims:1.3' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + path(UnidentifiedSpectrumPeaks_file) // input files need to be linked, but called within R script + path(replication_pattern) // input files need to be linked, but called within R script + + output: + path('*_Unidentified.txt') + path('*_Unidentified.RData'), emit: grouped_unidentified + + script: + """ + Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedPeakGrouping.R $params.resolution $params.ppm + """ +} From 657a7cdca8d9db879352e7177efbc676f5b04ad5 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Thu, 21 Dec 2023 08:24:06 +0100 Subject: [PATCH 14/73] first working version of full NF pipeline --- DIMS/AddOnFunctions/check_same_samplename.R | 4 + DIMS/AddOnFunctions/create_violin_plots.R | 98 +++++++++++ DIMS/AddOnFunctions/prepare_alarmvalues.R | 32 ++++ DIMS/AddOnFunctions/prepare_data.R | 42 +++++ DIMS/AddOnFunctions/prepare_data_perpage.R | 47 +++++ DIMS/AddOnFunctions/prepare_toplist.R | 29 +++ DIMS/AssignToBins.R | 5 + DIMS/AssignToBins.nf | 3 +- DIMS/AverageTechReplicates.R | 61 ++++++- DIMS/AverageTechReplicates.nf | 7 +- DIMS/CollectSumAdducts.nf | 2 +- DIMS/GenerateExcel.R | 22 ++- DIMS/GenerateExcel.nf | 3 +- DIMS/GenerateViolinPlots.R | 184 ++++++++++++++------ DIMS/GenerateViolinPlots.nf | 8 +- DIMS/HMDBparts.nf | 2 +- DIMS/ThermoRawFileParser.nf | 2 +- 17 files changed, 476 insertions(+), 75 deletions(-) create mode 100644 DIMS/AddOnFunctions/check_same_samplename.R create mode 100644 DIMS/AddOnFunctions/create_violin_plots.R create mode 100644 DIMS/AddOnFunctions/prepare_alarmvalues.R create mode 100644 DIMS/AddOnFunctions/prepare_data.R create mode 100644 DIMS/AddOnFunctions/prepare_data_perpage.R create mode 100644 DIMS/AddOnFunctions/prepare_toplist.R diff --git a/DIMS/AddOnFunctions/check_same_samplename.R b/DIMS/AddOnFunctions/check_same_samplename.R new file mode 100644 index 0000000..16ae9cf --- /dev/null +++ b/DIMS/AddOnFunctions/check_same_samplename.R @@ -0,0 +1,4 @@ +# function to test whether intensity and Z-score columns match +check_same_samplename <- function(int_col_name, zscore_col_name) { + paste0(int_col_name, "_Zscore") == zscore_col_name +} diff --git a/DIMS/AddOnFunctions/create_violin_plots.R b/DIMS/AddOnFunctions/create_violin_plots.R new file mode 100644 index 0000000..35e6afb --- /dev/null +++ b/DIMS/AddOnFunctions/create_violin_plots.R @@ -0,0 +1,98 @@ +create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt=NULL) { + + # set parameters for plots + plot_height <- 9.6 + plot_width <- 6 + fontsize <- 1 + nr_plots_perpage <- 20 + circlesize <- 0.8 + colors_4plot <- c("#22E4AC", "#00B0F0", "#504FFF","#A704FD","#F36265","#DA0641") + # green blue blue/purple purple orange red + + # patient plots, create the PDF device + if (grepl("Diagnostics", pdf_dir)) { + prefix <- "Dx_" + } else if (grepl("IEM", pdf_dir)) { + prefix <- "IEM_" + } else { + prefix <- "R_" + } + + pdf(paste0(pdf_dir, "/", prefix, pt_name, ".pdf"), + onefile = TRUE, + width = plot_width, + height = plot_height) + + # page headers: + page_headers <- names(metab_perpage) + + # put table into PDF file, if not empty + if (!is.null(dim(top_metab_pt))) { + plot.new() + # get the names and numbers in the table aligned + table_theme <- ttheme_default(core = list(fg_params = list(hjust=0, x=0.05, fontsize=6)), + colhead = list(fg_params = list(fontsize=8, fontface="bold"))) + grid.table(top_metab_pt, theme = table_theme, rows = NULL) + # g <- tableGrob(top_metab_pt) + # grid.draw(g) + text(x=0.45, y=1.02, paste0("Top deviating metabolites for patient: ", pt_name), font=1, cex=1) + } + + # violin plots + for (page_index in 1:length(metab_perpage)) { + # extract list of metabolites to plot on a page + metab_list_2plot <- metab_perpage[[page_index]] + # cut off Z-scores higher than 20 or lower than -5 (for nicer plots) + metab_list_2plot$value[metab_list_2plot$value > 20] <- 20 + metab_list_2plot$value[metab_list_2plot$value < -5] <- -5 + # extract data for patient of interest (pt_name) + pt_list_2plot <- metab_list_2plot[which(metab_list_2plot$variable == pt_name), ] + # remove patient of interest (pt_name) from list; violins will be made up of controls and other patients + metab_list_2plot <- metab_list_2plot[-which(metab_list_2plot$variable == pt_name), ] + # subtitle per page + sub_perpage <- gsub("_", " ", page_headers[page_index]) + # for IEM plots, put subtitle on two lines + sub_perpage <- gsub("probability", "\nprobability", sub_perpage) + + # draw violin plot. shape=22 gives square for patient of interest + ggplot_object <- ggplot(metab_list_2plot, aes(x=value, y=HMDB_name)) + + theme(axis.text.y=element_text(size=rel(fontsize)), plot.caption = element_text(size=rel(fontsize))) + + xlim(-5, 20) + + geom_violin(scale="width") + + geom_point(data = pt_list_2plot, aes(color=value), size = 3.5*circlesize, shape=22, fill="white") + + scale_fill_gradientn(colors = colors_4plot, values = NULL, space = "Lab", na.value = "grey50", guide = "colourbar", aesthetics = "colour") + + # add labels. Use font Courier to get all the plots in the same location. + labs(x = "Z-scores", y = "Metabolites", subtitle = sub_perpage, color = "z-score") + + theme(axis.text.y = element_text(family = "Courier", size=6)) + + # do not show legend + theme(legend.position="none") + + # add title + ggtitle(label = paste0("Results for patient ", pt_name)) + + # labs(x = "Z-scores", y = "Metabolites", title = paste0("Results for patient ", pt_name), subtitle = sub_perpage, color = "z-score") + + # add vertical lines + geom_vline(xintercept = 2, col = "grey", lwd = 0.5, lty=2) + + geom_vline(xintercept = -2, col = "grey", lwd = 0.5, lty=2) + + suppressWarnings(print(ggplot_object)) + + } + + # add explanation of violin plots, version number etc. + # plot.new() + plot(NA, xlim=c(0,5), ylim=c(0,5), bty='n', xaxt='n', yaxt='n', xlab='', ylab='') + if (length(explanation) > 0) { + text(0.2, 5, explanation[1], pos=4, cex=0.8) + for (line_index in 2:length(explanation)) { + text_y_position <- 5 - (line_index*0.2) + text(-0.2, text_y_position, explanation[line_index], pos=4, cex=0.5) + } + # full_explanation <- paste(explanation[2:length(explanation)], sep=" \n") + # text(0.2, 4, full_explanation, pos=4, cex=0.6) + #explanation_grob=textGrob(apply(full_explanation, 2, paste, collapse="\n")) + #grid.arrange(explanation_grob) + } + + # close the PDF file + dev.off() + +} diff --git a/DIMS/AddOnFunctions/prepare_alarmvalues.R b/DIMS/AddOnFunctions/prepare_alarmvalues.R new file mode 100644 index 0000000..ff94fce --- /dev/null +++ b/DIMS/AddOnFunctions/prepare_alarmvalues.R @@ -0,0 +1,32 @@ +prepare_alarmvalues <- function(pt_name, metab_interest_sorted) { + # set parameters for table + high_zscore_cutoff <- 5 + low_zscore_cutoff <- -3 + + # make table of all metabolites + all_metab <- c() + for (page_nr in 1:length(metab_interest_sorted)) { + all_metab <- rbind(all_metab, metab_interest_sorted[[page_nr]]) + } + # extract data for patient of interest (pt_name) + pt_list <- all_metab[which(all_metab$variable==pt_name), ] + # remove column with patient name + pt_list <- pt_list[ , -2] + # round off Z-scores + pt_list$value <- round(as.numeric(pt_list$value), 2) + + # determine alarms for this patient: Z-score above 5 or below -3 + pt_list_high <- pt_list[pt_list$value > high_zscore_cutoff, ] + pt_list_low <- pt_list[pt_list$value < low_zscore_cutoff, ] + # add lines for increased, decreased + extra_line1 <- c("Increased", "") + extra_line2 <- c("Decreased", "") + # combine the two lists + top_metab_pt <- rbind(extra_line1, pt_list_high, extra_line2, pt_list_low) + # remove row names + rownames(top_metab_pt) <- NULL + # change column names for display + colnames(top_metab_pt) <- c("Metabolite", "Z-score") + + return(top_metab_pt) +} diff --git a/DIMS/AddOnFunctions/prepare_data.R b/DIMS/AddOnFunctions/prepare_data.R new file mode 100644 index 0000000..07a8805 --- /dev/null +++ b/DIMS/AddOnFunctions/prepare_data.R @@ -0,0 +1,42 @@ +prepare_data <- function(metab_list_all, zscore_patients_local) { + # remove "_Zscore" from column (patient) names + colnames(zscore_patients_local) <- gsub("_Zscore", "", colnames(zscore_patients_local)) + # put data into pages, max 20 violin plots per page in PDF + metab_interest_sorted <- list() + metab_category <- c() + for (metab_class_index in 1:length(metab_list_all)) { # "acyl_carnitines" "amino_acids" "crea_gua" + metab_class <- names(metab_list_all)[metab_class_index] + metab_list <- metab_list_all[[metab_class_index]] + if (ncol(metab_list) > 2) { + # third column are the alarm values, so reduce the data frame to 2 columns and save list + metab_list_alarm <- metab_list + metab_list <- metab_list[ , c(1,2)] + } + # make sure that all HMDB_names have 45 characters + for (metab_index in 1:length(metab_list$HMDB_name)) { + if (is.character(metab_list$HMDB_name[metab_index])) { + HMDB_name_separated <- strsplit(metab_list$HMDB_name[metab_index], "")[[1]] + } else { HMDB_name_separated <- "strspliterror" } + if (length(HMDB_name_separated) <= 45) { + HMDB_name_separated <- c(HMDB_name_separated, rep(" ", 45-length(HMDB_name_separated))) + } else { + HMDB_name_separated <- c(HMDB_name_separated[1:42], "...") + } + metab_list$HMDB_name[metab_index] <- paste0(HMDB_name_separated, collapse = "") + } + # find metabolites and ratios in data frame zscore_patients_local + metab_interest <- inner_join(metab_list, zscore_patients_local[-2], by = "HMDB_code") + # remove column "HMDB_code" + metab_interest <- metab_interest[ , -which(colnames(metab_interest) == "HMDB_code")] + # put the data frame in long format + metab_interest_melt <- reshape2::melt(metab_interest, id.vars = "HMDB_name") + # sort on metabolite names (HMDB_name) + sort_order <- order(metab_interest_melt$HMDB_name) + metab_interest_sorted[[metab_class_index]] <- metab_interest_melt[sort_order, ] + metab_category <- c(metab_category, metab_class) + } + names(metab_interest_sorted) <- metab_category + + return(metab_interest_sorted) + +} diff --git a/DIMS/AddOnFunctions/prepare_data_perpage.R b/DIMS/AddOnFunctions/prepare_data_perpage.R new file mode 100644 index 0000000..9f112f3 --- /dev/null +++ b/DIMS/AddOnFunctions/prepare_data_perpage.R @@ -0,0 +1,47 @@ +prepare_data_perpage <- function(metab_interest_sorted, metab_interest_contr, nr_plots_perpage, nr_pat=20, nr_contr=30) { + total_nr_pages <- 0 + metab_perpage <- list() + metab_category <- c() + for (metab_class_index in 1:length(metab_interest_sorted)) { # "acyl_carnitines" "amino_acids" "crea_gua" + # split list into pages, each page containing max nr_plots_perpage (20) compounds + metab_interest_perclass <- metab_interest_sorted[[metab_class_index]] + metab_class <- names(metab_interest_sorted)[metab_class_index] + # add controls + metab_interest_contr_perclass <- metab_interest_contr[[metab_class_index]] + # number of pages for this class + nr_pages <- ceiling(length(unique(metab_interest_perclass$HMDB_name)) / nr_plots_perpage) + for (page_nr in 1:nr_pages) { + total_nr_pages <- total_nr_pages + 1 + select_rows_start <- (nr_pat * nr_plots_perpage * (page_nr-1)) + 1 + select_rows_end <- nr_pat * nr_plots_perpage * page_nr + metab_onepage_pat <- metab_interest_perclass[select_rows_start:select_rows_end, ] + # same for controls + select_rows_start_contr <- (nr_contr * nr_plots_perpage * (page_nr-1)) + 1 + select_rows_end_contr <- nr_contr * nr_plots_perpage * page_nr + metab_onepage_pcontr <- metab_interest_contr_perclass[select_rows_start_contr:select_rows_end_contr, ] + # add controls + metab_onepage <- rbind(metab_onepage_pat, metab_onepage_pcontr) + # if a page has fewer than nr_plots_perpage plots, fill page with empty plots + NA_rows <- which(is.na(metab_onepage$HMDB_name)) + if (length(NA_rows) > 0) { + # repeat the patient and control variables + metab_onepage$variable[NA_rows] <- metab_onepage$variable[1:(nr_pat + nr_contr)] + # for HMDB name, substitute a number of spaces + for (row_nr in NA_rows) { + metab_onepage$HMDB_name[row_nr] <- paste0(rep("_", ceiling(row_nr/(nr_pat + nr_contr))), collapse = "") + } + metab_onepage$HMDB_name <- gsub("_", " ", metab_onepage$HMDB_name) + # leave the values at NA + } + # put data for one page into object with data for all pages + metab_perpage[[total_nr_pages]] <- metab_onepage + # create list of page headers + metab_category <- c(metab_category, paste(metab_class, page_nr, sep="_")) + } + } + # add page headers to list + names(metab_perpage) <- metab_category + + return(metab_perpage) + +} diff --git a/DIMS/AddOnFunctions/prepare_toplist.R b/DIMS/AddOnFunctions/prepare_toplist.R new file mode 100644 index 0000000..369f887 --- /dev/null +++ b/DIMS/AddOnFunctions/prepare_toplist.R @@ -0,0 +1,29 @@ +prepare_toplist <- function(pt_name, zscore_patients_copy) { + # set parameters for table + top_highest <- 20 + top_lowest <- 10 + + # extract data for patient of interest (pt_name) + pt_list <- zscore_patients_copy[ , c(1,2, which(colnames(zscore_patients_copy) == pt_name))] + # sort metabolites on Z-scores for this patient + pt_list_sort <- sort(pt_list[ , 3], index.return=TRUE) + pt_list_sorted <- pt_list[pt_list_sort$ix, ] + # determine top highest and lowest Z-scores for this patient + pt_list_sort <- sort(pt_list[ , 3], index.return=TRUE) + pt_list_low <- pt_list[pt_list_sort$ix[1:top_lowest], ] + pt_list_high <- pt_list[pt_list_sort$ix[length(pt_list_sort$ix):(length(pt_list_sort$ix)-top_highest+1)], ] + # round off Z-scores + pt_list_low[ , 3] <- round(as.numeric(pt_list_low[ , 3]), 2) + pt_list_high[ , 3] <- round(as.numeric(pt_list_high[ , 3]), 2) + # add lines for increased, decreased + extra_line1 <- c("Increased", "", "") + extra_line2 <- c("Decreased", "", "") + top_metab_pt <- rbind(extra_line1, pt_list_high, extra_line2, pt_list_low) + # remove row names + rownames(top_metab_pt) <- NULL + + # change column names for display + colnames(top_metab_pt) <- c("HMDB_ID", "Metabolite", "Z-score") + + return(top_metab_pt) +} diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index ae9c8b1..cfbe15c 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -35,6 +35,11 @@ neg_results <- NULL # read in the data for 1 sample raw_data <- suppressMessages(xcmsRaw(filepath)) +# generate TIC plots. Prepare txt files with data for plots +TIC_intensity_persample <- cbind(round(raw_data@scantime, 2), raw_data@tic) +colnames(TIC_intensity_persample) <- c("retentionTime", "TIC") +write.table(TIC_intensity_persample, file = paste0("./", sample_name, "_TIC.txt")) + # load breaks_fwhm load(breaks_filepath) diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf index fbbb48e..25b3891 100644 --- a/DIMS/AssignToBins.nf +++ b/DIMS/AssignToBins.nf @@ -8,7 +8,8 @@ process AssignToBins { tuple val(file_id), path(mzML_filename) , path(breaks_file) output: - path("${file_id}.RData") + path("${file_id}.RData"), emit: RData_files + path("${file_id}_TIC.txt"), emit: TIC_txt_files script: """ diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index 4001937..098651a 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -6,8 +6,20 @@ cmd_args <- commandArgs(trailingOnly = TRUE) init_filepath <- cmd_args[1] nr_replicates <- as.numeric(cmd_args[2]) -thresh2remove <- 2000 +thresh2remove <- 1000000000 dimsThresh <- 100 +run_name <- cmd_args[3] +dims_matrix <- cmd_args[4] + +print(run_name) +print(dims_matrix) + +if (dims_matrix == "DBS") { + thresh2remove <- 50000000 +} + +library("ggplot2") +library("gridExtra") removeFromRepl.pat <- function(bad_samples, repl_pattern, nr_replicates) { # bad_samples=remove_pos @@ -50,11 +62,8 @@ for (i in 1:length(repl_pattern)) { sum_pos <- 0 n_pos <- 0 n_neg <- 0 - cat("\n\nNow sample ", i, " from replication pattern with length ", length(repl_pattern)) for (j in 1:length(tech_reps)) { load(paste("./", tech_reps[j], ".RData", sep="")) - cat("\n\nParsing", tech_reps[j]) - cat("\n\tNegative peak_list sum", sum(peak_list$neg[,1])) if (sum(peak_list$neg[,1]) < thresh2remove){ cat(" ... Removed") @@ -104,4 +113,48 @@ repl_pattern_filtered <- retVal$pattern save(repl_pattern_filtered, file = "./positive_repl_pattern.RData") write.table(remove_pos, file = "./miss_infusions_positive.txt", row.names=FALSE, col.names=FALSE , sep= "\t") +# New: generate TIC plots +# get all txt files +TIC_files = list.files("./", full.names=TRUE, pattern="*TIC.txt") +all_samps <- sub('_TIC\\..*$', '', basename(TIC_files)) + +# determine maximum intensity +highest_tic_max <- 0 +for (file in TIC_files) { + tic <- read.table(file) + this_tic_max <- max(tic$TIC) + if (this_tic_max > highest_tic_max) { + highest_tic_max <- this_tic_max + max_sample <- sub('_TIC\\..*$', '', basename(file)) + } +} + +tic_plot_list <- list() +k = 0 +for (i in c(1:length(repl_pattern))) { # change after test-phase !!! + tech_reps <- as.vector(unlist(repl_pattern[i])) + sampleName <- names(repl_pattern)[i] + for (j in 1:length(tech_reps)) { + k = k + 1 + repl1_nr <- read.table(TIC_files[j]) + bad_color_pos <- tech_reps[j] %in% remove_pos[[1]] + bad_color_neg <- tech_reps[j] %in% remove_neg[[1]] + if (bad_color_neg & bad_color_pos) {plotcolor = '#F8766D'} else if (bad_color_pos) {plotcolor = "#ED8141"} else if (bad_color_neg) {plotcolor = "#BF80FF"} else {plotcolor = 'white'} + tic_plot <- ggplot(repl1_nr, aes(retentionTime, TIC)) + + geom_line(linewidth = 0.3) + + geom_hline(yintercept = highest_tic_max, col = "grey", linetype = 2, linewidth = 0.3) + + labs(x = 't (s)', y = 'TIC', title = paste0(tech_reps[j], " || ", sampleName)) + + theme(plot.background = element_rect(fill = plotcolor), axis.text = element_text(size = 4), axis.title = element_text(size = 4), plot.title = element_text(size = 6)) + tic_plot_list[[k]] <- tic_plot + } + +} +# create a layout matrix dependent on numer of replicates +layout <- matrix(1:(10 * nr_replicates), 10, nr_replicates, TRUE) +tic_plot_pdf <- marrangeGrob(grobs = tic_plot_list, + nrow = 10, + ncol = nr_replicates, + layout_matrix = layout, + top = quote(paste("TICs of run", run_name," \n colors: red = both modes misinjection, orange = pos mode misinjection, purple = neg mode misinjection \n ", g, "/", npages))) +ggsave(filename = paste0("./../../../", run_name, "_TICplots.pdf"), tic_plot_pdf, width = 21, height = 29.7, units = "cm") diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf index 33567a3..aa89477 100644 --- a/DIMS/AverageTechReplicates.nf +++ b/DIMS/AverageTechReplicates.nf @@ -5,8 +5,11 @@ process AverageTechReplicates { shell = ['/bin/bash', '-euo', 'pipefail'] input: - path(RData_file) + path(RData_file) // input files need to be linked, but called within R script + path(TIC_txt_files) // input files need to be linked, but called within R script path(init_filepath) + val(analysis_id) + val(matrix) output: path('*_repl_pattern.RData'), emit: patterns @@ -16,7 +19,7 @@ process AverageTechReplicates { script: """ - Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_filepath $params.nr_replicates + Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_filepath $params.nr_replicates $analysis_id $matrix """ } diff --git a/DIMS/CollectSumAdducts.nf b/DIMS/CollectSumAdducts.nf index a72455c..da7ab0c 100644 --- a/DIMS/CollectSumAdducts.nf +++ b/DIMS/CollectSumAdducts.nf @@ -1,5 +1,5 @@ process CollectSumAdducts { - tag {"DIMS CollectSumAdducts"} + tag "DIMS CollectSumAdducts" label 'CollectSumAdducts' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index a84c9d2..0fd394f 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -243,14 +243,12 @@ if (z_score == 1) { } writeData(wb, sheet = 1, outlist, startCol = 1) xlsx_name <- paste0(outdir, "/", project, ".xlsx") -saveWorkbook(wb, - xlsx_name, - overwrite = TRUE) +saveWorkbook(wb, xlsx_name, overwrite = TRUE) +# save a local copy in work directory as well +saveWorkbook(wb, paste0("./", project, ".xlsx"), overwrite = TRUE) cat(xlsx_name) rm(wb) -write.table(outlist, file = paste(outdir, "allpgrps_stats.txt", sep = "/")) - # INTERNE STANDAARDEN IS <- outlist[grep("Internal standard", outlist[,"relevance"], fixed = TRUE),] IS_codes <- rownames(IS) @@ -570,26 +568,26 @@ outlist.ident.neg <- outlist.ident load(paste0(outdir, "RData/outlist_identified_positive.RData")) outlist.ident.pos <- outlist.ident rm(outlist.ident) -rm(outlist.not.ident) # check for missing m/z in negative and positive mode mode <- c("Negative", "Positive") index <- 1 results_ident <- c() #empty results list outlist_ident_list <- list(outlist.ident.neg, outlist.ident.pos) -for(outlist.ident in outlist_ident_list){ +for (outlist.ident in outlist_ident_list) { current_mode <- mode[index] # retrieve all unique m/z values in whole numbers and check if all are available mz_values <- as.numeric(unique(format(outlist.ident$mzmed.pgrp, digits=0))) - mz_range = seq(70, 599, by=1) #change accordingly to the machine m/z range. default = 70-600 - mz_missing = c() - for (mz in mz_range){ + mz_range <- seq(70, 599, by=1) #change accordingly to the machine m/z range. default = 70-600 + mz_missing <- c() + for (mz in mz_range) { if (!mz %in% mz_values) { mz_missing <- c(mz_missing, mz) - } } + } + } y <- mz_missing # check if m/z are missing and make an .txt file with information group_ident <- cumsum(c(1, abs(y[-length(y)] - y[-1]) > 1)) - if(length(group_ident) > 1){ + if (length(group_ident) > 1) { results_ident <- c(results_ident, paste0("Missing m/z values ", current_mode, " mode")) results_ident <- c(results_ident, by(y, group_ident, identity)) } else { diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf index 655ce0d..9e861e2 100644 --- a/DIMS/GenerateExcel.nf +++ b/DIMS/GenerateExcel.nf @@ -1,5 +1,5 @@ process GenerateExcel { - tag {"DIMS GenerateExcel"} + tag "DIMS GenerateExcel" label 'GenerateExcel' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] @@ -12,6 +12,7 @@ process GenerateExcel { output: path('AdductSums_*.RData') + path('*.xlsx'), emit: excel_file script: """ diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R index bf2abe8..a8b819c 100644 --- a/DIMS/GenerateViolinPlots.R +++ b/DIMS/GenerateViolinPlots.R @@ -5,14 +5,15 @@ # 1. Excel file in which metabolites are listed with their intensities for # controls (with C in samplename) and patients (with P in samplename) and their # corresponding Z-scores. -# 2. All files from github: https://github.com/UMCUGenetics/dIEM +# 2. All files from github: https://github.com/UMCUGenetics/DIMS + +## adapted from 15-dIEM_violin.R library(dplyr) # tidytable is for other_isobaric.R (left_join) library(reshape2) # used in prepare_data.R -library(data.table) # for function setDT library(openxlsx) # for opening Excel file library(ggplot2) # for plotting -#library(gridExtra) # for table top highest/lowest +library(gridExtra) # for table top highest/lowest # define parameters - check after addition to run.sh cmd_args <- commandArgs(trailingOnly = TRUE) @@ -20,50 +21,56 @@ for (arg in cmd_args) { cat(" ", arg, "\n", sep = "") } -scripts_dir <- cmd_args[1] -run_name <- cmd_args[2] # run_name <- "Test_run_5" -z_score <- as.numeric(cmd_args[3]) # calculate Z-scores or not? z_score <- 1 +run_name <- cmd_args[1] +scripts_dir <- cmd_args[2] +z_score <- as.numeric(cmd_args[3]) # load functions -source(paste0(scripts_dir, "AddOnFunctions/same_samplename.R")) +source(paste0(scripts_dir, "AddOnFunctions/check_same_samplename.R")) source(paste0(scripts_dir, "AddOnFunctions/prepare_data.R")) source(paste0(scripts_dir, "AddOnFunctions/prepare_data_perpage.R")) source(paste0(scripts_dir, "AddOnFunctions/prepare_toplist.R")) -source(paste0(scripts_dir, "AddOnFunctions/violin_plots.R")) +source(paste0(scripts_dir, "AddOnFunctions/create_violin_plots.R")) source(paste0(scripts_dir, "AddOnFunctions/prepare_alarmvalues.R")) # The list of parameters can be shortened for HPC. Leave for now. -top_nr_IEM <- 5 # number of diseases that score highest in algorithm to plot -integer_list <- c(1:top_nr_IEM) # indices of top diseases -threshold_IEM <- 5 # probability score cut-off for plotting the top diseases -ratios_cutoff <- -5 # z-score cutoff of axis on the left for top diseases +top_nr_IEM <- 5 # number of diseases that score highest in algorithm to plot +threshold_IEM <- 5 # probability score cut-off for plotting the top diseases +ratios_cutoff <- -5 # z-score cutoff of axis on the left for top diseases nr_plots_perpage <- 20 # number of violin plots per page in PDF # Settings from config.R # binary variable: run function, yes(1) or no(0). Can be removed at later stage -if (z_score == 1) { algorithm <- ratios <- violin <- 1 } # ? Or put if statement in run.sh? +if (z_score == 1) { + algorithm <- ratios <- violin <- 1 +} # integer: are the sample names headers on row 1 or row 2 in the DIMS excel? (default 1) -header_row = 1 +header_row <- 1 # column name where the data starts (default B) -col_start <- "B" +col_start <- "B" zscore_cutoff <- 5 -xaxis_cutoff <- 20 +xaxis_cutoff <- 20 # path to DIMS excel file -# path_DIMSfile = "/Users/ihoek3/Documents/dIEM/voorbeeld_PL_Diagn17_RUN10.xlsx" -path_DIMSfile <- paste0("./", run_name, ".xlsx") +path_DIMSfile <- paste0("./", run_name, ".xlsx") -# path: output folder -output_dir <- paste0("./dIEM") +# path: output folder for dIEM and violin plots +output_dir <- paste0("../../../dIEM") dir.create(output_dir, showWarnings = F) +print(getwd()) +print(path_DIMSfile) # folder in which all metabolite lists are (.txt) path_metabolite_groups <- "/hpc/dbg_mz/tools/db/metabolite_groups" -### file for ratios step 3 +# file for ratios step 3 file_ratios_metabolites <- "/hpc/dbg_mz/tools/db/dIEM/Ratios_between_metabolites.csv" -### file for algorithm step 4 +# file for algorithm step 4 file_expected_biomarkers_IEM <- "/hpc/dbg_mz/tools/db/dIEM/Expected_biomarkers_IEM.csv" +# explanation: file with text to be included in violin plots +file_explanation <- "/hpc/dbg_mz/tools/Explanation_violin_plots.txt" +# copy list of isomers to project folder. +file.copy("/hpc/dbg_mz/tools/isomers.txt", output_dir) #### STEP 1: Preparation #### # in: run_name, path_DIMSfile, header_row ||| out: output_dir, DIMS @@ -84,7 +91,7 @@ if (exists("dims_xls")) { # Determine the number of Contols and Patients in column names: nr_contr <- length(grep("C",names(dims_xls)))/2 # Number of control samples -nr_pat <- length(grep("P",names(dims_xls)))/2 # Number of patient samples +nr_pat <- length(grep("P",names(dims_xls)))/2 # Number of patient samples # total number of samples nrsamples <- nr_contr + nr_pat # check whether the number of intensity columns equals the number of Zscore columns @@ -95,11 +102,11 @@ cat(paste0("\n\n------------\n", nr_contr, " controls \n", nr_pat, " patients\n- # Move the columns HMDB_code and HMDB_name to the beginning. HMDB_info_cols <- c(which(colnames(dims_xls) == "HMDB_code"), which(colnames(dims_xls) == "HMDB_name")) -other_cols <- seq_along(1:ncol(dims_xls))[-HMDB_info_cols] -dims_xls_copy <- dims_xls[ , c(HMDB_info_cols, other_cols)] +other_cols <- seq_along(1:ncol(dims_xls))[-HMDB_info_cols] +dims_xls_copy <- dims_xls[ , c(HMDB_info_cols, other_cols)] # Remove the columns from 'name' to 'pathway' -from_col <- which(colnames(dims_xls_copy) == "name") -to_col <- which(colnames(dims_xls_copy) == "pathway") +from_col <- which(colnames(dims_xls_copy) == "name") +to_col <- which(colnames(dims_xls_copy) == "pathway") dims_xls_copy <- dims_xls_copy[ , -c(from_col:to_col)] # in case the excel had an empty "plots" column, remove it if ("plots" %in% colnames(dims_xls_copy)) { @@ -143,13 +150,13 @@ if (ratios == 1) { ncol=ncol(dims_xls_copy), nrow=nrow(ratio_input) )), colnames(dims_xls_copy)) - + # put HMDB info into first two columns of ratio_list ratio_list[ ,1:2] <- ratio_input[ ,1:2] # look for intensity columns (exclude Zscore columns) - control_cols <- grep("C", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) - patient_cols <- grep("P", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) + control_cols <- grep("C", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) + patient_cols <- grep("P", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) intensity_cols <- c(control_cols, patient_cols) # calculate each of the ratios of intensities for (ratio_index in 1:nrow(ratio_input)) { @@ -163,15 +170,23 @@ if (ratios == 1) { sel_numerator <- c(sel_numerator, which(dims_xls_copy[ , "HMDB.code"] == ratio_numerator[numerator_index])) } for (denominator_index in 1:length(ratio_denominator)) { - sel_denominator <- c(sel_denominator, which(dims_xls_copy[ , "HMDB.code"] == ratio_denominator[denominator_index])) + # special case for sum of metabolites (dividing by one) + if (ratio_denominator[denominator_index] != "one") { + sel_denominator <- c(sel_denominator, which(dims_xls_copy[ , "HMDB.code"] == ratio_denominator[denominator_index])) + } } # calculate ratio - ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) / - apply(dims_xls_copy[sel_denominator, intensity_cols], 2, sum) + if (ratio_denominator[denominator_index] != "one") { + ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) / + apply(dims_xls_copy[sel_denominator, intensity_cols], 2, sum) + } else { + # special case for sum of metabolites (dividing by one) + ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) + } # calculate log of ratio ratio_list[ratio_index, intensity_cols]<- log2(ratio_list[ratio_index, intensity_cols]) } - + # Calculate means and SD's of the calculated ratios for Controls ratio_list[ , "Mean_controls"] <- apply(ratio_list[ , control_cols], 1, mean) ratio_list[ , "SD_controls"] <- apply(ratio_list[ , control_cols], 1, sd) @@ -182,13 +197,13 @@ if (ratios == 1) { zscore_col <- zscore_cols[sample_index] # matching intensity column int_col <- intensity_cols[sample_index] - # add test on column names - if (same_samplename(colnames(ratio_list)[int_col], colnames(ratio_list)[zscore_col])) { + # test on column names + if (check_same_samplename(colnames(ratio_list)[int_col], colnames(ratio_list)[zscore_col])) { # calculate Z-scores ratio_list[ , zscore_col] <- (ratio_list[ , int_col] - ratio_list[ , "Mean_controls"]) / ratio_list[ , "SD_controls"] } } - + # Add rows of the ratio hmdb codes to the data of zscores from the pipeline. dims_xls_ratios <- rbind(ratio_list, dims_xls_copy) @@ -196,9 +211,10 @@ if (ratios == 1) { # HMDB_code patientname1 patientname2 names(dims_xls_ratios) <- gsub("HMDB.code","HMDB_code", names(dims_xls_ratios)) names(dims_xls_ratios) <- gsub("HMDB.name", "HMDB_name", names(dims_xls_ratios)) - # remove the string "_Zscore" from column names; this causes problems in step 4 - # names(dims_xls_ratios) <- gsub("_Zscore", "", names(dims_xls_ratios)) - + + # for debugging: + write.table(dims_xls_ratios, file=paste0(output_dir, "/ratios.txt"), sep="\t") + # Select only the cols with zscores of the patients zscore_patients <- dims_xls_ratios[ , c(1, 2, zscore_cols[grep("P", colnames(dims_xls_ratios)[zscore_cols])])] # Select only the cols with zscores of the controls @@ -231,12 +247,11 @@ if (algorithm == 1) { # Rank all negative zscores lowest to highest rank_patients[(pos+1):nrow(rank_patients), patient_index] <- as.numeric(ordered(rank_patients[(pos+1):nrow(rank_patients), patient_index])) } - # NB: Warning message: In xtfrm.data.frame(x) : cannot xtfrm data frames. Ignore for now. - + # Calculate metabolite score, using the dataframes with only values, and later add the cols without values (1&2). expected_zscores <- merge(x=expected_biomarkers, y=zscore_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code")) expected_zscores_original <- expected_zscores # necessary copy? - + # determine which columns contain Z-scores and which contain disease info select_zscore_cols <- grep("_Zscore", colnames(expected_zscores)) select_info_cols <- 1:(min(select_zscore_cols) -1) @@ -251,18 +266,18 @@ if (algorithm == 1) { rank_scores <- expected_zscores[order(expected_zscores$HMDB_code), select_zscore_cols]/(expected_ranks[order(expected_ranks$HMDB_code), select_zscore_cols]*0.9) # combine disease info with rank scores expected_metabscore <- cbind(expected_ranks[order(expected_zscores$HMDB_code), select_info_cols], rank_scores) - + # multiply weight score and rank score weight_score <- expected_zscores weight_score[ , select_zscore_cols] <- expected_metabscore$Total_Weight * expected_metabscore[ , select_zscore_cols] - + # sort table on Disease and Absolute_Weight weight_score <- weight_score[order(weight_score$Disease, weight_score$Absolute_Weight, decreasing = TRUE), ] # select columns to check duplicates dup <- weight_score[ , c('Disease', 'M.z')] uni <- weight_score[!duplicated(dup) | !duplicated(dup, fromLast=FALSE),] - + # calculate probability score prob_score <- aggregate(uni[ , select_zscore_cols], uni["Disease"], sum) @@ -277,7 +292,7 @@ if (algorithm == 1) { # set the probability score of these diseases to 0 prob_score[which(prob_score$Disease %in% disease_zero), patient_index]<- 0 } - + # determine disease rank per patient disease_rank <- prob_score # rank diseases in decreasing order @@ -285,7 +300,7 @@ if (algorithm == 1) { # modify column names, Zscores have now been converted to probability scores colnames(prob_score) <- gsub("_Zscore","_prob_score", colnames(prob_score)) # redundant? colnames(disease_rank) <- gsub("_Zscore","", colnames(disease_rank)) - + # Create conditional formatting for output excel sheet. Colors according to values. wb <- createWorkbook() addWorksheet(wb, "Probability Scores") @@ -326,6 +341,12 @@ if (violin == 1) { # make violin plots # remove duplicates expected_biomarkers_select <- expected_biomarkers_select[!duplicated(expected_biomarkers_select[ , c(1,2)]), ] + # load file with explanatory information to be included in PDF. + explanation <- readLines(file_explanation) + + # for debugging: + #write.table(explanation, file=paste0(outdir, "explanation_read_in.txt"), sep="\t") + # first step: normal violin plots # Find all text files in the given folder, which contain metabolite lists of which # each file will be a page in the pdf with violin plots. @@ -367,15 +388,80 @@ if (violin == 1) { # make violin plots # for category Other, make list of top highest and lowest Z-scores for this patient if (grepl("Diagnost", pdf_dir)) { top_metab_pt <- prepare_alarmvalues(pt_name, metab_interest_sorted) + # save(top_metab_pt, file=paste0(outdir, "/start_15_prepare_alarmvalues.RData")) } else { top_metab_pt <- prepare_toplist(pt_name, zscore_patients) + # save(top_metab_pt, file=paste0(outdir, "/start_15_prepare_toplist.RData")) } # generate normal violin plots - violin_plots(pdf_dir, pt_name, metab_perpage, top_metab_pt) + create_violin_plots(pdf_dir, pt_name, metab_perpage, top_metab_pt) } # end for pt_nr } # end for metabolite_dir + # Second step: dIEM plots in separate directory + dIEM_plot_dir <- paste(output_dir, "dIEM_plots", sep="/") + dir.create(dIEM_plot_dir) + + # Select the metabolites that are associated with the top highest scoring IEM, for each patient + # disease_rank is from step 4: the dIEM algorithm. The lower the value, the more likely. + for (pt_nr in 1:length(patient_list)) { + pt_name <- patient_list[pt_nr] + # get top diseases for this patient + pt_colnr <- which(colnames(disease_rank) == pt_name) + pt_top_indices <- which(disease_rank[ , pt_colnr] <= top_nr_IEM) + pt_IEMs <- disease_rank[pt_top_indices, "Disease"] + pt_top_IEMs <- pt_prob_score_top_IEMs <- c() + for (single_IEM in pt_IEMs) { + # get the probability score + prob_score_IEM <- prob_score[which(prob_score$Disease == single_IEM), pt_colnr] + # use only diseases for which probability score is above threshold + if (prob_score_IEM >= threshold_IEM) { + pt_top_IEMs <- c(pt_top_IEMs, single_IEM) + pt_prob_score_top_IEMs <- c(pt_prob_score_top_IEMs, prob_score_IEM) + } + } + + # prepare data for plotting dIEM violin plots + # If prob_score_top_IEM is an empty list, don't make a plot + if (length(pt_top_IEMs) > 0) { + # Sorting from high to low, both prob_score_top_IEMs and pt_top_IEMs. + pt_prob_score_order <- order(-pt_prob_score_top_IEMs) + pt_prob_score_top_IEMs <- round(pt_prob_score_top_IEMs, 1) + pt_prob_score_top_IEM_sorted <- pt_prob_score_top_IEMs[pt_prob_score_order] + pt_top_IEM_sorted <- pt_top_IEMs[pt_prob_score_order] + # getting metabolites for each top_IEM disease exactly like in metab_list_all + metab_IEM_all <- list() + metab_IEM_names <- c() + for (single_IEM_index in 1:length(pt_top_IEM_sorted)) { + single_IEM <- pt_top_IEM_sorted[single_IEM_index] + single_prob_score <- pt_prob_score_top_IEM_sorted[single_IEM_index] + select_rows <- which(expected_biomarkers_select$Disease == single_IEM) + metab_list <- expected_biomarkers_select[select_rows, ] + metab_IEM_names <- c(metab_IEM_names, paste0(single_IEM, ", probability score ", single_prob_score)) + metab_list <- metab_list[ , -1] + metab_IEM_all[[single_IEM_index]] <- metab_list + } + # put all metabolites into one list + names(metab_IEM_all) <- metab_IEM_names + + # get Zscore information from zscore_patients_copy, similar to normal violin plots + metab_IEM_sorted <- prepare_data(metab_IEM_all, zscore_patients_copy) + metab_IEM_controls <- prepare_data(metab_IEM_all, zscore_controls) + # make sure every page has 20 metabolites + dIEM_metab_perpage <- prepare_data_perpage(metab_IEM_sorted, metab_IEM_controls, nr_plots_perpage, nr_pat) + + # generate dIEM violin plots + create_violin_plots(dIEM_plot_dir, pt_name, dIEM_metab_perpage, top_metab_pt) + + } else { + cat(paste0("\n\n**** This patient had no prob_scores higher than ", threshold_IEM,". + Therefore, this pdf was not made:\t ", pt_name ,"_IEM \n")) + } + + } # end for pt_nr + } # end if violin = 1 + diff --git a/DIMS/GenerateViolinPlots.nf b/DIMS/GenerateViolinPlots.nf index e580feb..f002341 100644 --- a/DIMS/GenerateViolinPlots.nf +++ b/DIMS/GenerateViolinPlots.nf @@ -1,17 +1,19 @@ process GenerateViolinPlots { - tag {"DIMS GenerateViolinPlots"} + tag "DIMS GenerateViolinPlots" label 'GenerateViolinPlots' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] input: path(excel_file) // input files need to be linked, but called within R script + val(analysis_id) output: - path('*.pdf') + // path('*.pdf') // pdf files are generated, but in different directory + path('*.xlsx') script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateViolinPlots.R $params.scripts_dir $params.analysis_id $params.zscore + Rscript ${baseDir}/CustomModules/DIMS/GenerateViolinPlots.R $analysis_id $params.scripts_dir $params.zscore """ } diff --git a/DIMS/HMDBparts.nf b/DIMS/HMDBparts.nf index 833b94d..50786af 100644 --- a/DIMS/HMDBparts.nf +++ b/DIMS/HMDBparts.nf @@ -1,5 +1,5 @@ process HMDBparts { - tag {"DIMS HMDBparts"} + tag "DIMS HMDBparts" // Custom process to cut HMDB db into parts label 'HMDBparts' container = 'docker://umcugenbioinf/dims:1.3' diff --git a/DIMS/ThermoRawFileParser.nf b/DIMS/ThermoRawFileParser.nf index df91279..17cee27 100644 --- a/DIMS/ThermoRawFileParser.nf +++ b/DIMS/ThermoRawFileParser.nf @@ -1,5 +1,5 @@ process ConvertRawFile { - tag {"DIMS ConvertRawFile ${file_id}"} + tag "DIMS ConvertRawFile ${file_id}" // Custom process to convert raw file to mzML format label 'ThermoRawFileParser_1_1_11' shell = ['/bin/bash', '-euo', 'pipefail'] From 1aa51e0efc846e87533ded407ed6931a080ad202 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Mon, 15 Jan 2024 17:15:49 +0100 Subject: [PATCH 15/73] added parameter for measurements with high m/z --- DIMS/GenerateBreaks.R | 2 +- DIMS/GenerateBreaks.nf | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index 7aeb8c4..8a3ca07 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -52,4 +52,4 @@ for (i in 1:nsegment) { # generate output file save(breaks_fwhm, breaks_fwhm_avg, trimLeft, trimRight, file="./breaks.fwhm.RData") - +write(highMZ, file="./highest_mz.txt") diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf index 593014b..8f37438 100644 --- a/DIMS/GenerateBreaks.nf +++ b/DIMS/GenerateBreaks.nf @@ -10,6 +10,7 @@ process GenerateBreaks { output: path('breaks.fwhm.RData') + path('highest_mz.txt'), emit: highest_mz script: """ From f973a23279162bc6a5ed80a4c58a35723ac83353 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Mon, 15 Jan 2024 17:20:45 +0100 Subject: [PATCH 16/73] added modification of threshold in case of high m/z --- DIMS/AverageTechReplicates.R | 21 +++++++++++++++++---- DIMS/AverageTechReplicates.nf | 1 + 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index 098651a..f84c6ba 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -10,12 +10,17 @@ thresh2remove <- 1000000000 dimsThresh <- 100 run_name <- cmd_args[3] dims_matrix <- cmd_args[4] +highest_mz <- as.numeric(cmd_args[5]) print(run_name) print(dims_matrix) +print(highest_mz) if (dims_matrix == "DBS") { - thresh2remove <- 50000000 + thresh2remove <- 50000000 +} +if (highest_mz > 700) { + thresh2remove <- 1000000 } library("ggplot2") @@ -133,17 +138,25 @@ tic_plot_list <- list() k = 0 for (i in c(1:length(repl_pattern))) { # change after test-phase !!! tech_reps <- as.vector(unlist(repl_pattern[i])) - sampleName <- names(repl_pattern)[i] + sample_name <- names(repl_pattern)[i] for (j in 1:length(tech_reps)) { k = k + 1 repl1_nr <- read.table(TIC_files[j]) bad_color_pos <- tech_reps[j] %in% remove_pos[[1]] bad_color_neg <- tech_reps[j] %in% remove_neg[[1]] - if (bad_color_neg & bad_color_pos) {plotcolor = '#F8766D'} else if (bad_color_pos) {plotcolor = "#ED8141"} else if (bad_color_neg) {plotcolor = "#BF80FF"} else {plotcolor = 'white'} + if (bad_color_neg & bad_color_pos) { + plotcolor = '#F8766D' + } else if (bad_color_pos) { + plotcolor = "#ED8141" + } else if (bad_color_neg) { + plotcolor = "#BF80FF" + } else { + plotcolor = 'white' + } tic_plot <- ggplot(repl1_nr, aes(retentionTime, TIC)) + geom_line(linewidth = 0.3) + geom_hline(yintercept = highest_tic_max, col = "grey", linetype = 2, linewidth = 0.3) + - labs(x = 't (s)', y = 'TIC', title = paste0(tech_reps[j], " || ", sampleName)) + + labs(x = 't (s)', y = 'TIC', title = paste0(tech_reps[j], " || ", sample_name)) + theme(plot.background = element_rect(fill = plotcolor), axis.text = element_text(size = 4), axis.title = element_text(size = 4), plot.title = element_text(size = 6)) tic_plot_list[[k]] <- tic_plot } diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf index aa89477..8775976 100644 --- a/DIMS/AverageTechReplicates.nf +++ b/DIMS/AverageTechReplicates.nf @@ -10,6 +10,7 @@ process AverageTechReplicates { path(init_filepath) val(analysis_id) val(matrix) + val(highest_mz) output: path('*_repl_pattern.RData'), emit: patterns From e41ea88d4d17b60a632aa45d339158edf7f87d85 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Tue, 20 Feb 2024 08:53:58 +0100 Subject: [PATCH 17/73] Add EditSummaryFileHappy --- Utils/EditSummaryFileHappy.nf | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 Utils/EditSummaryFileHappy.nf diff --git a/Utils/EditSummaryFileHappy.nf b/Utils/EditSummaryFileHappy.nf new file mode 100644 index 0000000..20693ee --- /dev/null +++ b/Utils/EditSummaryFileHappy.nf @@ -0,0 +1,26 @@ +process EditSummaryFileHappy { + tag {"EditSummaryFileHappy"} + label 'EditSummaryFileHappy' + shell = ['/bin/bash', '-euo', 'pipefail'] + + input: + tuple(val(meta), path(summary_csv)) + + output: + path("INDEL_PASS_summary.txt", includeInputs: false), emit: indel_pass_summary_txt + path("INDEL_ALL_summary.txt", includeInputs: false), emit: indel_all_summary_txt + path("SNP_PASS_summary.txt", includeInputs: false), emit: snp_pass_summary_txt + path("SNP_ALL_summary.txt", includeInputs: false), emit: snpp_all_summary_txt + + script: + """ + # Add samplenames as columns + sed '1s/$/,sample_query,sample_truth/; 2,$s/$/,${meta.query},${meta.truth}' ${summary_csv} > ${summary_csv}.tmp + + # Split file and add column + awk '{print $0 > $3_$4"_summary.txt"}' ${summary_csv}.tmp + + # Remove tmp files + rm *tmp + """ +} \ No newline at end of file From a70501c60cd7cfec638716f81a391e8e7e49bdd7 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Tue, 20 Feb 2024 11:32:43 +0100 Subject: [PATCH 18/73] refactor script in EditSummaryFileHappy --- Utils/EditSummaryFileHappy.nf | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/Utils/EditSummaryFileHappy.nf b/Utils/EditSummaryFileHappy.nf index 20693ee..d2219c2 100644 --- a/Utils/EditSummaryFileHappy.nf +++ b/Utils/EditSummaryFileHappy.nf @@ -4,23 +4,24 @@ process EditSummaryFileHappy { shell = ['/bin/bash', '-euo', 'pipefail'] input: + // meta should have the keys 'id', 'query' and 'truth' tuple(val(meta), path(summary_csv)) output: - path("INDEL_PASS_summary.txt", includeInputs: false), emit: indel_pass_summary_txt - path("INDEL_ALL_summary.txt", includeInputs: false), emit: indel_all_summary_txt - path("SNP_PASS_summary.txt", includeInputs: false), emit: snp_pass_summary_txt - path("SNP_ALL_summary.txt", includeInputs: false), emit: snpp_all_summary_txt + path("INDEL_PASS_summary.csv"), emit: indel_pass_summary_csv + path("INDEL_ALL_summary.csv"), emit: indel_all_summary_csv + path("SNP_PASS_summary.csv"), emit: snp_pass_summary_csv + path("SNP_ALL_summary.csv"), emit: snp_all_summary_csv script: """ - # Add samplenames as columns - sed '1s/$/,sample_query,sample_truth/; 2,$s/$/,${meta.query},${meta.truth}' ${summary_csv} > ${summary_csv}.tmp + # Add samplenames as columns (header and row values) at start of line + sed '1s/^/sample_query,sample_truth,/; 2,\$s/^/${meta.query},${meta.truth},/' ${summary_csv} > ${summary_csv}.tmp - # Split file and add column - awk '{print $0 > $3_$4"_summary.txt"}' ${summary_csv}.tmp + # Split file including header (first line) + awk -F',' 'FNR==1{hdr=\$0;next} {print hdr>\$3"_"\$4"_summary.csv"; print \$0>>\$3"_"\$4"_summary.csv"}' ${summary_csv}.tmp # Remove tmp files - rm *tmp + rm ${summary_csv}.tmp """ } \ No newline at end of file From c20d8f8d111a498c535904c0646f45c644cf4636 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Tue, 20 Feb 2024 12:52:18 +0100 Subject: [PATCH 19/73] add joined column of truth and query samples --- Utils/EditSummaryFileHappy.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Utils/EditSummaryFileHappy.nf b/Utils/EditSummaryFileHappy.nf index d2219c2..8ddb472 100644 --- a/Utils/EditSummaryFileHappy.nf +++ b/Utils/EditSummaryFileHappy.nf @@ -16,7 +16,7 @@ process EditSummaryFileHappy { script: """ # Add samplenames as columns (header and row values) at start of line - sed '1s/^/sample_query,sample_truth,/; 2,\$s/^/${meta.query},${meta.truth},/' ${summary_csv} > ${summary_csv}.tmp + sed '1s/^/samples,sample_truth,sample_query,/; 2,\$s/^/${meta.truth}_${meta.query},${meta.truth},${meta.query},/' ${summary_csv} > ${summary_csv}.tmp # Split file including header (first line) awk -F',' 'FNR==1{hdr=\$0;next} {print hdr>\$3"_"\$4"_summary.csv"; print \$0>>\$3"_"\$4"_summary.csv"}' ${summary_csv}.tmp From d8c0bc084c7f829223b451cf471e2084b7fea486 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Tue, 20 Feb 2024 12:52:28 +0100 Subject: [PATCH 20/73] add delim --- CheckQC/check_qc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py index a47b14a..8e9a5d1 100644 --- a/CheckQC/check_qc.py +++ b/CheckQC/check_qc.py @@ -177,7 +177,7 @@ def create_and_write_output(qc_output, output_path, output_prefix): def read_and_judge_metrics(qc, metrics): for qc_file in metrics: - qc_metric_raw = read_csv(qc_file, comment=qc.get("comment", None), delimiter="\t", quotechar='"') + qc_metric_raw = read_csv(qc_file, comment=qc.get("comment", None), delimiter=qc.get("sep", "\t"), quotechar='"') report_cols = get_columns_to_report(qc["report_cols"], qc_metric_raw.columns.to_list(), qc["qc_col"]) qc_metric_edit = add_and_rename_columns(qc_metric_raw, qc["title"], qc["qc_col"], qc["operator"], qc["threshold"]) failed_rows = get_failed_rows(qc_metric_edit, "qc_value", qc["operator"], qc["threshold"]) From 78c04e7762133b824f792d605eaada33643e85b7 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Tue, 20 Feb 2024 12:59:45 +0100 Subject: [PATCH 21/73] change column to Type and Filter --- Utils/EditSummaryFileHappy.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Utils/EditSummaryFileHappy.nf b/Utils/EditSummaryFileHappy.nf index 8ddb472..20f29fc 100644 --- a/Utils/EditSummaryFileHappy.nf +++ b/Utils/EditSummaryFileHappy.nf @@ -19,7 +19,7 @@ process EditSummaryFileHappy { sed '1s/^/samples,sample_truth,sample_query,/; 2,\$s/^/${meta.truth}_${meta.query},${meta.truth},${meta.query},/' ${summary_csv} > ${summary_csv}.tmp # Split file including header (first line) - awk -F',' 'FNR==1{hdr=\$0;next} {print hdr>\$3"_"\$4"_summary.csv"; print \$0>>\$3"_"\$4"_summary.csv"}' ${summary_csv}.tmp + awk -F',' 'FNR==1{hdr=\$0;next} {print hdr>\$4"_"\$5"_summary.csv"; print \$0>>\$4"_"\$5"_summary.csv"}' ${summary_csv}.tmp # Remove tmp files rm ${summary_csv}.tmp From eb55d6b09b48e2dc684b8216e901f61f83564b5b Mon Sep 17 00:00:00 2001 From: ellendejong Date: Tue, 20 Feb 2024 13:45:26 +0100 Subject: [PATCH 22/73] use meta.id as tag --- Utils/EditSummaryFileHappy.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Utils/EditSummaryFileHappy.nf b/Utils/EditSummaryFileHappy.nf index 20f29fc..83ef755 100644 --- a/Utils/EditSummaryFileHappy.nf +++ b/Utils/EditSummaryFileHappy.nf @@ -1,5 +1,5 @@ process EditSummaryFileHappy { - tag {"EditSummaryFileHappy"} + tag "$meta.id" label 'EditSummaryFileHappy' shell = ['/bin/bash', '-euo', 'pipefail'] From 240008c4d5239c586e65aa472500987fa389af8e Mon Sep 17 00:00:00 2001 From: ellendejong Date: Tue, 20 Feb 2024 14:00:31 +0100 Subject: [PATCH 23/73] add stageAs to CheckQC --- CheckQC/CheckQC.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CheckQC/CheckQC.nf b/CheckQC/CheckQC.nf index 19dbddd..2fe48db 100644 --- a/CheckQC/CheckQC.nf +++ b/CheckQC/CheckQC.nf @@ -6,7 +6,7 @@ process CheckQC { input: val(identifier) - path(input_files) + path(input_files, stageAs: "?/*") output: path("${identifier}_summary.csv", emit: qc_output) From 56093d452f5a3ae1031f3a159aaf8cb6565636cf Mon Sep 17 00:00:00 2001 From: ellendejong Date: Tue, 20 Feb 2024 14:24:30 +0100 Subject: [PATCH 24/73] typo --- CheckQC/check_qc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py index 8e9a5d1..db94a4b 100644 --- a/CheckQC/check_qc.py +++ b/CheckQC/check_qc.py @@ -177,7 +177,7 @@ def create_and_write_output(qc_output, output_path, output_prefix): def read_and_judge_metrics(qc, metrics): for qc_file in metrics: - qc_metric_raw = read_csv(qc_file, comment=qc.get("comment", None), delimiter=qc.get("sep", "\t"), quotechar='"') + qc_metric_raw = read_csv(qc_file, comment=qc.get("comment", None), delimiter=qc.get("delim", "\t"), quotechar='"') report_cols = get_columns_to_report(qc["report_cols"], qc_metric_raw.columns.to_list(), qc["qc_col"]) qc_metric_edit = add_and_rename_columns(qc_metric_raw, qc["title"], qc["qc_col"], qc["operator"], qc["threshold"]) failed_rows = get_failed_rows(qc_metric_edit, "qc_value", qc["operator"], qc["threshold"]) From 1036d5769fc04a024639b54a978ee4b560abe5ec Mon Sep 17 00:00:00 2001 From: mraves2 Date: Tue, 20 Feb 2024 17:01:32 +0100 Subject: [PATCH 25/73] used lintr and styler to clean up code --- DIMS/AssignToBins.R | 71 ++-- DIMS/AverageTechReplicates.R | 198 +++++------ DIMS/CollectFilled.R | 75 ++--- DIMS/FillMissing.R | 24 +- DIMS/GenerateBreaks.R | 46 ++- DIMS/GenerateExcel.R | 574 +++++++++++++++++--------------- DIMS/GenerateViolinPlots.R | 295 ++++++++-------- DIMS/HMDBparts.R | 72 ++-- DIMS/HMDBparts_main.R | 33 +- DIMS/MakeInit.R | 22 +- DIMS/PeakFinding.R | 16 +- DIMS/PeakGrouping.R | 244 +++++++------- DIMS/SpectrumPeakFinding.R | 71 ++-- DIMS/SumAdducts.R | 80 ++--- DIMS/UnidentifiedCalcZscores.R | 56 ++-- DIMS/UnidentifiedCollectPeaks.R | 92 +---- DIMS/UnidentifiedFillMissing.R | 26 +- DIMS/UnidentifiedPeakGrouping.R | 127 ++++--- 18 files changed, 1019 insertions(+), 1103 deletions(-) diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index cfbe15c..ed6984a 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -1,44 +1,43 @@ #!/usr/bin/Rscript ## adapted from 2-DIMS.R -# load required packages +# load required packages suppressPackageStartupMessages(library("xcms")) -# define parameters +# define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -filepath <- cmd_args[1] # location of mzML file -breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData -resol <- as.numeric(cmd_args[3]) # 140000 +filepath <- cmd_args[1] +breaks_filepath <- cmd_args[2] +resol <- as.numeric(cmd_args[3]) trim <- 0.1 -dimsThresh <- 100 +dims_thresh <- 100 # get sample name -sample_name <- sub('\\..*$', '', basename(filepath)) +sample_name <- sub("\\..*$", "", basename(filepath)) -options(digits=16) +options(digits = 16) # Initialize -int.factor <- 1*10^5 # Number of x used to calc area under Gaussian (is not analytic) +int_factor <- 1 * 10^5 # Number used to calculate area under Gaussian curve scale <- 2 # Initial value used to estimate scaling parameter width <- 1024 height <- 768 -trimLeft <- NULL -trimRight <- NULL +trim_left <- NULL +trim_right <- NULL breaks_fwhm <- NULL breaks_fwhm_avg <- NULL bins <- NULL pos_results <- NULL neg_results <- NULL -### process one sample at a time and find peaks FOR BOTH SCAN MODES! # # read in the data for 1 sample raw_data <- suppressMessages(xcmsRaw(filepath)) -# generate TIC plots. Prepare txt files with data for plots -TIC_intensity_persample <- cbind(round(raw_data@scantime, 2), raw_data@tic) -colnames(TIC_intensity_persample) <- c("retentionTime", "TIC") -write.table(TIC_intensity_persample, file = paste0("./", sample_name, "_TIC.txt")) +# for TIC plots: prepare txt files with data for plots +tic_intensity_persample <- cbind(round(raw_data@scantime, 2), raw_data@tic) +colnames(tic_intensity_persample) <- c("retention_time", "tic_intensity") +write.table(tic_intensity_persample, file = paste0("./", sample_name, "_TIC.txt")) # load breaks_fwhm load(breaks_filepath) @@ -54,45 +53,47 @@ raw_data_matrix <- rawMat(raw_data) # Get time values for positive and negative scans pos_times <- raw_data@scantime[raw_data@polarity == "positive"] neg_times <- raw_data@scantime[raw_data@polarity == "negative"] -# Select scans between trimLeft and trimRight -pos_times <- pos_times[pos_times > trimLeft & pos_times < trimRight] -neg_times <- neg_times[neg_times > trimLeft & neg_times < trimRight] +# Select scans between trim_left and trim_right +pos_times <- pos_times[pos_times > trim_left & pos_times < trim_right] +neg_times <- neg_times[neg_times > trim_left & neg_times < trim_right] # Generate an index with which to select values for each mode -pos_index <- which(raw_data_matrix[ ,"time"] %in% pos_times) -neg_index <- which(raw_data_matrix[ ,"time"] %in% neg_times) +pos_index <- which(raw_data_matrix[, "time"] %in% pos_times) +neg_index <- which(raw_data_matrix[, "time"] %in% neg_times) # Separate each mode into its own matrix pos_raw_data_matrix <- raw_data_matrix[pos_index, ] neg_raw_data_matrix <- raw_data_matrix[neg_index, ] # Get index for binning intensity values -bin_indices_pos <- cut(pos_raw_data_matrix[ ,"mz"], breaks_fwhm, include.lowest=TRUE, right=TRUE, labels=FALSE) -bin_indices_neg <- cut(neg_raw_data_matrix[ ,"mz"], breaks_fwhm, include.lowest=TRUE, right=TRUE, labels=FALSE) +bin_indices_pos <- cut(pos_raw_data_matrix[, "mz"], breaks_fwhm, + include.lowest = TRUE, right = TRUE, labels = FALSE) +bin_indices_neg <- cut(neg_raw_data_matrix[, "mz"], breaks_fwhm, + include.lowest = TRUE, right = TRUE, labels = FALSE) # Get the list of intensity values for each bin, and add the # intensity values which are in the same bin if (nrow(pos_raw_data_matrix) > 0) { # set NA in intensities to zero - pos_raw_data_matrix[is.na(pos_raw_data_matrix[,"intensity"]), "intensity"] <- 0 - # use only values above dimsThresh - pos_intensity_above_threshold <- pos_raw_data_matrix[which(pos_raw_data_matrix[ ,"intensity"] > dimsThresh), "intensity"] + pos_raw_data_matrix[is.na(pos_raw_data_matrix[, "intensity"]), "intensity"] <- 0 + # use only values above dims_thresh + pos_intensity_above_threshold <- pos_raw_data_matrix[which(pos_raw_data_matrix[, "intensity"] > dims_thresh), "intensity"] # aggregate intensities, calculate mean aggr_int_pos <- stats::aggregate(pos_intensity_above_threshold, list(bin_indices_pos), mean) - pos_bins[aggr_int_pos[ ,1]] <- aggr_int_pos[ ,2] + pos_bins[aggr_int_pos[, 1]] <- aggr_int_pos[, 2] } if (nrow(neg_raw_data_matrix) > 0) { # set NA in intensities to zero - neg_raw_data_matrix[is.na(neg_raw_data_matrix[,"intensity"]), "intensity"] <- 0 - # use only values above dimsThresh - neg_intensity_above_threshold <- neg_raw_data_matrix[which(neg_raw_data_matrix[ ,"intensity"] > dimsThresh), "intensity"] + neg_raw_data_matrix[is.na(neg_raw_data_matrix[, "intensity"]), "intensity"] <- 0 + # use only values above dims_thresh + neg_intensity_above_threshold <- neg_raw_data_matrix[which(neg_raw_data_matrix[, "intensity"] > dims_thresh), "intensity"] # aggregate intensities, calculate mean aggr_int_neg <- stats::aggregate(neg_intensity_above_threshold, list(bin_indices_neg), mean) - neg_bins[aggr_int_neg[ ,1]] <- aggr_int_neg[ ,2] + neg_bins[aggr_int_neg[, 1]] <- aggr_int_neg[, 2] } # Zero any values that are below the threshold -pos_bins[pos_bins < dimsThresh] <- 0 -neg_bins[neg_bins < dimsThresh] <- 0 +pos_bins[pos_bins < dims_thresh] <- 0 +neg_bins[neg_bins < dims_thresh] <- 0 pos_results <- cbind(pos_results, pos_bins) neg_results <- cbind(neg_results, neg_bins) @@ -118,6 +119,6 @@ colnames(neg_results_transpose) <- breaks_fwhm_avg_minus1 pos_results_final <- t(pos_results_transpose) neg_results_final <- t(neg_results_transpose) -peak_list <- list("pos"=pos_results_final, "neg"=neg_results_final, "breaksFwhm"=breaks_fwhm) +peak_list <- list("pos" = pos_results_final, "neg" = neg_results_final, "breaksFwhm" = breaks_fwhm) -save(peak_list, file=paste("./", sample_name, ".RData", sep="")) +save(peak_list, file = paste0("./", sample_name, ".RData")) diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index f84c6ba..4c3d49b 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -1,173 +1,173 @@ #!/usr/bin/Rscript # adapted from 3-AverageTechReplicates.R -# define parameters +# define parameters cmd_args <- commandArgs(trailingOnly = TRUE) init_filepath <- cmd_args[1] nr_replicates <- as.numeric(cmd_args[2]) thresh2remove <- 1000000000 -dimsThresh <- 100 +dims_thresh <- 100 run_name <- cmd_args[3] dims_matrix <- cmd_args[4] highest_mz <- as.numeric(cmd_args[5]) -print(run_name) -print(dims_matrix) -print(highest_mz) - +# lower the threshold below which a sample will be removed for DBS and for high m/z if (dims_matrix == "DBS") { - thresh2remove <- 50000000 + thresh2remove <- 50000000 } if (highest_mz > 700) { - thresh2remove <- 1000000 + thresh2remove <- 1000000 } library("ggplot2") library("gridExtra") -removeFromRepl.pat <- function(bad_samples, repl_pattern, nr_replicates) { - # bad_samples=remove_pos +remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { tmp <- repl_pattern - removeFromGroup <- NULL - - for (i in 1:length(tmp)){ - tmp2 <- repl_pattern[[i]] + remove_from_group <- NULL + for (sample_nr in 1:length(tmp)){ + tmp2 <- repl_pattern[[sample_nr]] remove <- NULL - for (j in 1:length(tmp2)){ - if (tmp2[j] %in% bad_samples){ - remove = c(remove, j) + for (file_name in 1:length(tmp2)) { + if (tmp2[file_name] %in% bad_samples) { + remove <- c(remove, file_name) } } - - if (length(remove) == nr_replicates) removeFromGroup <- c(removeFromGroup,i) - if (!is.null(remove)) repl_pattern[[i]] <- repl_pattern[[i]][-remove] + if (length(remove) == nr_replicates) { + remove_from_group <- c(remove_from_group, sample_nr) + } + if (!is.null(remove)) { + repl_pattern[[sample_nr]] <- repl_pattern[[sample_nr]][-remove] + } } - - if (length(removeFromGroup)!=0) { - repl_pattern <- repl_pattern[-removeFromGroup] + if (length(remove_from_group) != 0) { + repl_pattern <- repl_pattern[-remove_from_group] } - - return(list("pattern"=repl_pattern)) + return(list("pattern" = repl_pattern)) } - # get repl_pattern load("./init.RData") remove_neg <- NULL remove_pos <- NULL cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n") -for (i in 1:length(repl_pattern)) { - techRepsArray.pos <- NULL - techRepsArray.neg <- NULL - - tech_reps <- as.vector(unlist(repl_pattern[i])) +for (sample_nr in 1:length(repl_pattern)) { + tech_reps_array_pos <- NULL + tech_reps_array_neg <- NULL + tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) sum_neg <- 0 sum_pos <- 0 - n_pos <- 0 - n_neg <- 0 - for (j in 1:length(tech_reps)) { - load(paste("./", tech_reps[j], ".RData", sep="")) - cat("\n\tNegative peak_list sum", sum(peak_list$neg[,1])) - if (sum(peak_list$neg[,1]) < thresh2remove){ + nr_pos <- 0 + nr_neg <- 0 + for (file_name in 1:length(tech_reps)) { + load(paste("./", tech_reps[file_name], ".RData", sep = "")) + # negative scanmode + cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1])) + if (sum(peak_list$neg[, 1]) < thresh2remove) { cat(" ... Removed") - remove_neg <- c(remove_neg, tech_reps[j]) + remove_neg <- c(remove_neg, tech_reps[file_name]) } else { - n_neg <- n_neg + 1 + nr_neg <- nr_neg + 1 sum_neg <- sum_neg + peak_list$neg } - - techRepsArray.neg <- cbind(techRepsArray.neg, peak_list$neg) - - cat("\n\tPositive peak_list sum", sum(peak_list$pos[,1])) - if (sum(peak_list$pos[,1]) < thresh2remove){ + tech_reps_array_neg <- cbind(tech_reps_array_neg, peak_list$neg) + # positive scanmode + cat("\n\tPositive peak_list sum", sum(peak_list$pos[, 1])) + if (sum(peak_list$pos[, 1]) < thresh2remove) { cat(" ... Removed") - remove_pos <- c(remove_pos, tech_reps[j]) + remove_pos <- c(remove_pos, tech_reps[file_name]) } else { - n_pos <- n_pos + 1 + nr_pos <- nr_pos + 1 sum_pos <- sum_pos + peak_list$pos } - - techRepsArray.pos <- cbind(techRepsArray.pos, peak_list$pos) + tech_reps_array_pos <- cbind(tech_reps_array_pos, peak_list$pos) } - - # filter within bins on at least signal in more than one tech. rep.!!! - if (!is.null(dim(sum_pos))) sum_pos[apply(techRepsArray.pos,1,function(x) length(which(x>dimsThresh))==1),1]=0 - if (!is.null(dim(sum_neg))) sum_neg[apply(techRepsArray.neg,1,function(x) length(which(x>dimsThresh))==1),1]=0 - - if (n_neg != 0){ - sum_neg[,1] <- sum_neg[,1]/n_neg - colnames(sum_neg) <- names(repl_pattern)[i] - save(sum_neg, file=paste("./", names(repl_pattern)[i], "_neg_avg.RData", sep="")) + # save to file + if (nr_neg != 0) { + sum_neg[, 1] <- sum_neg[, 1] / nr_neg + colnames(sum_neg) <- names(repl_pattern)[sample_nr] + save(sum_neg, file = paste0("./", names(repl_pattern)[sample_nr], "_neg_avg.RData")) } - if (n_pos != 0) { - sum_pos[,1] <- sum_pos[,1]/n_pos - colnames(sum_pos) <- names(repl_pattern)[i] - save(sum_pos, file=paste("./", names(repl_pattern)[i], "_pos_avg.RData", sep="")) + if (nr_pos != 0) { + sum_pos[, 1] <- sum_pos[, 1] / nr_pos + colnames(sum_pos) <- names(repl_pattern)[sample_nr] + save(sum_pos, file = paste0("./", names(repl_pattern)[sample_nr], "_pos_avg.RData")) } } -retVal <- removeFromRepl.pat(remove_neg, repl_pattern, nr_replicates) -repl_pattern_filtered <- retVal$pattern +pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates) +repl_pattern_filtered <- pattern_list$pattern save(repl_pattern_filtered, file = "./negative_repl_pattern.RData") -write.table(remove_neg, file = "./miss_infusions_negative.txt", row.names=FALSE, col.names=FALSE , sep= "\t") +write.table(remove_neg, file = "./miss_infusions_negative.txt", + row.names = FALSE, col.names = FALSE, sep = "\t") -retVal <- removeFromRepl.pat(remove_pos, repl_pattern, nr_replicates) -repl_pattern_filtered <- retVal$pattern +pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates) +repl_pattern_filtered <- pattern_list$pattern save(repl_pattern_filtered, file = "./positive_repl_pattern.RData") -write.table(remove_pos, file = "./miss_infusions_positive.txt", row.names=FALSE, col.names=FALSE , sep= "\t") +write.table(remove_pos, file = "./miss_infusions_positive.txt", + row.names = FALSE, col.names = FALSE, sep = "\t") # New: generate TIC plots # get all txt files -TIC_files = list.files("./", full.names=TRUE, pattern="*TIC.txt") -all_samps <- sub('_TIC\\..*$', '', basename(TIC_files)) +tic_files <- list.files("./", full.names = TRUE, pattern = "*TIC.txt") +all_samps <- sub("_TIC\\..*$", "", basename(tic_files)) # determine maximum intensity highest_tic_max <- 0 -for (file in TIC_files) { +for (file in tic_files) { tic <- read.table(file) - this_tic_max <- max(tic$TIC) + this_tic_max <- max(tic$tic_intensity) if (this_tic_max > highest_tic_max) { highest_tic_max <- this_tic_max - max_sample <- sub('_TIC\\..*$', '', basename(file)) - } + max_sample <- sub("_TIC\\..*$", "", basename(file)) + } } tic_plot_list <- list() -k = 0 -for (i in c(1:length(repl_pattern))) { # change after test-phase !!! - tech_reps <- as.vector(unlist(repl_pattern[i])) - sample_name <- names(repl_pattern)[i] - for (j in 1:length(tech_reps)) { - k = k + 1 - repl1_nr <- read.table(TIC_files[j]) - bad_color_pos <- tech_reps[j] %in% remove_pos[[1]] - bad_color_neg <- tech_reps[j] %in% remove_neg[[1]] +plot_nr <- 0 +for (sample_nr in c(1:length(repl_pattern))) { + tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) + sample_name <- names(repl_pattern)[sample_nr] + for (file_name in 1:length(tech_reps)) { + plot_nr <- plot_nr + 1 + repl1_nr <- read.table(tic_files[file_name]) + bad_color_pos <- tech_reps[file_name] %in% remove_pos[[1]] + bad_color_neg <- tech_reps[file_name] %in% remove_neg[[1]] if (bad_color_neg & bad_color_pos) { - plotcolor = '#F8766D' + plot_color <- "#F8766D" } else if (bad_color_pos) { - plotcolor = "#ED8141" + plot_color <- "#ED8141" } else if (bad_color_neg) { - plotcolor = "#BF80FF" + plot_color <- "#BF80FF" } else { - plotcolor = 'white' + plot_color <- "white" } - tic_plot <- ggplot(repl1_nr, aes(retentionTime, TIC)) + + tic_plot <- ggplot(repl1_nr, aes(retention_time, tic_intensity)) + geom_line(linewidth = 0.3) + geom_hline(yintercept = highest_tic_max, col = "grey", linetype = 2, linewidth = 0.3) + - labs(x = 't (s)', y = 'TIC', title = paste0(tech_reps[j], " || ", sample_name)) + - theme(plot.background = element_rect(fill = plotcolor), axis.text = element_text(size = 4), axis.title = element_text(size = 4), plot.title = element_text(size = 6)) - tic_plot_list[[k]] <- tic_plot + labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[j], " || ", sample_name)) + + theme(plot.background = element_rect(fill = plot_color), + axis.text = element_text(size = 4), + axis.title = element_text(size = 4), + plot.title = element_text(size = 6)) + tic_plot_list[[plot_nr]] <- tic_plot } - } -# create a layout matrix dependent on numer of replicates +# create a layout matrix dependent on number of replicates layout <- matrix(1:(10 * nr_replicates), 10, nr_replicates, TRUE) - -tic_plot_pdf <- marrangeGrob(grobs = tic_plot_list, - nrow = 10, - ncol = nr_replicates, - layout_matrix = layout, - top = quote(paste("TICs of run", run_name," \n colors: red = both modes misinjection, orange = pos mode misinjection, purple = neg mode misinjection \n ", g, "/", npages))) -ggsave(filename = paste0("./../../../", run_name, "_TICplots.pdf"), tic_plot_pdf, width = 21, height = 29.7, units = "cm") +# put TIC plots in matrix +tic_plot_pdf <- marrangeGrob( + grobs = tic_plot_list, + nrow = 10, ncol = nr_replicates, + layout_matrix = layout, + top = quote(paste( + "TICs of run", run_name, + " \n colors: red = both modes misinjection, orange = pos mode misinjection, purple = neg mode misinjection \n ", + g, "/", npages + )) +) +# save to file +ggsave(filename = paste0("./../../../Bioinformatics/", run_name, "_TICplots.pdf"), + tic_plot_pdf, width = 21, height = 29.7, units = "cm") diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R index e927a65..9e97db2 100755 --- a/DIMS/CollectFilled.R +++ b/DIMS/CollectFilled.R @@ -1,7 +1,7 @@ #!/usr/bin/Rscript ## adapted from 10-collectSamplesFilled.R -# define parameters +# define parameters cmd_args <- commandArgs(trailingOnly = TRUE) scripts_dir <- cmd_args[1] @@ -10,70 +10,65 @@ z_score <- as.numeric(cmd_args[3]) source(paste0(scripts_dir, "AddOnFunctions/mergeDuplicatedRows.R")) source(paste0(scripts_dir, "AddOnFunctions/statistics_z.R")) -# source(paste0(scripts_dir, "AddOnFunctions/normalization_2.1.R")) # for each scan mode, collect all filled peak group lists scanmodes <- c("positive", "negative") for (scanmode in scanmodes) { # get list of files - filled_files <- list.files("./", full.names=TRUE, pattern=scanmode) + filled_files <- list.files("./", full.names = TRUE, pattern = scanmode) # load files and combine into one object - outlist.tot <- NULL - for (i in 1:length(filled_files)) { - load(filled_files[i]) - outlist.tot <- rbind(outlist.tot, peakgrouplist_filled) + outlist_total <- NULL + for (file_nr in 1:length(filled_files)) { + load(filled_files[file_nr]) + outlist_total <- rbind(outlist_total, peakgrouplist_filled) } # remove duplicates; peak groups with exactly the same m/z - outlist.tot <- mergeDuplicatedRows(outlist.tot) + outlist_total <- mergeDuplicatedRows(outlist_total) # sort on mass - outlist.tot <- outlist.tot[order(outlist.tot[ ,"mzmed.pgrp"]),] + outlist_total <- outlist_total[order(outlist_total[, "mzmed.pgrp"]), ] # load replication pattern pattern_file <- paste0(scanmode, "_repl_pattern.RData") repl_pattern <- get(load(pattern_file)) - # Normalization: not done. - # if (normalization != "disabled") { - # outlist.tot = normalization_2.1(outlist.tot, fileName, names(repl.pattern.filtered), on=normalization, assi_label="assi_HMDB") - # } - if (z_score == 1) { - outlist.stats <- statistics_z(outlist.tot, sortCol=NULL, adducts=FALSE) - nr.removed.samples <- length(which(repl_pattern[]=="character(0)")) - order.index.int <- order(colnames(outlist.stats)[8:(length(repl_pattern)-nr.removed.samples+7)]) - outlist.stats.more <- cbind(outlist.stats[,1:7], - outlist.stats[,(length(repl_pattern)-nr.removed.samples+8):(length(repl_pattern)-nr.removed.samples+8+6)], - outlist.stats[,8:(length(repl_pattern)-nr.removed.samples+7)][order.index.int], - outlist.stats[,(length(repl_pattern)-nr.removed.samples+5+10):ncol(outlist.stats)]) - - tmp.index <- grep("_Zscore", colnames(outlist.stats.more), fixed = TRUE) - tmp.index.order <- order(colnames(outlist.stats.more[,tmp.index])) - tmp <- outlist.stats.more[,tmp.index[tmp.index.order]] - outlist.stats.more <- outlist.stats.more[,-tmp.index] - outlist.stats.more <- cbind(outlist.stats.more,tmp) - outlist.tot <- outlist.stats.more + outlist_stats <- statistics_z(outlist_total, sortCol = NULL, adducts = FALSE) + nr_removed_samples <- length(which(repl_pattern[] == "character(0)")) + order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)]) + outlist_stats_more <- cbind( + outlist_stats[, 1:7], + outlist_stats[, (length(repl_pattern) - nr_removed_samples + 8):(length(repl_pattern) - nr_removed_samples + 8 + 6)], + outlist_stats[, 8:(length(repl_pattern) - nr_removed_samples + 7)][order_index_int], + outlist_stats[, (length(repl_pattern) - nr_removed_samples + 5 + 10):ncol(outlist_stats)] + ) + + tmp_index <- grep("_Zscore", colnames(outlist_stats.more), fixed = TRUE) + tmp_index_order <- order(colnames(outlist_stats.more[, tmp_index])) + tmp <- outlist_stats.more[, tmp_index[tmp_index_order]] + outlist_stats_more <- outlist_stats.more[, -tmp_index] + outlist_stats_more <- cbind(outlist_stats_more, tmp) + outlist_total <- outlist_stats_more } - outlist.ident <- outlist.tot + outlist_ident <- outlist_total if (z_score == 1) { - outlist.ident$ppmdev <- as.numeric(outlist.ident$ppmdev) - outlist.ident <- outlist.ident[which(outlist.ident["ppmdev"] >= -ppm & outlist.ident["ppmdev"] <= ppm),] + outlist_ident$ppmdev <- as.numeric(outlist_ident$ppmdev) + outlist_ident <- outlist_ident[which(outlist_ident["ppmdev"] >= -ppm & outlist_ident["ppmdev"] <= ppm), ] } # take care of NAs in theormz_noise - outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] <- 0 - outlist.ident$theormz_noise <- as.numeric(outlist.ident$theormz_noise) - outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] <- 0 - outlist.ident$theormz_noise <- as.numeric(outlist.ident$theormz_noise) + outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0 + outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise) + outlist_ident$theormz_noise[which(is.na(outlist_ident$theormz_noise))] <- 0 + outlist_ident$theormz_noise <- as.numeric(outlist_ident$theormz_noise) # Extra output in Excel-readable format: remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp") - remove_colindex <- which(colnames(outlist.ident) %in% remove_columns) - outlist.ident <- outlist.ident[ , -remove_colindex] - write.table(outlist.ident, file=paste0("outlist_identified_", scanmode, ".txt"), sep="\t", row.names = FALSE) - save(outlist.ident, file=paste0("outlist_identified_", scanmode, ".RData")) - + remove_colindex <- which(colnames(outlist_ident) %in% remove_columns) + outlist_ident <- outlist_ident[, -remove_colindex] + write.table(outlist_ident, file = paste0("outlist_identified_", scanmode, ".txt"), sep = "\t", row.names = FALSE) + save(outlist_ident, file = paste0("outlist_identified_", scanmode, ".RData")) } diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R index a2e29ff..0364ad2 100755 --- a/DIMS/FillMissing.R +++ b/DIMS/FillMissing.R @@ -3,9 +3,9 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) cat(" ", arg, "\n", sep="") +for (arg in cmd_args) cat(" ", arg, "\n", sep = "") -# define parameters +# define parameters peakgrouplist_file <- cmd_args[1] scripts_dir <- cmd_args[2] thresh <- as.numeric(cmd_args[3]) @@ -13,13 +13,11 @@ resol <- as.numeric(cmd_args[4]) ppm <- as.numeric(cmd_args[5]) outdir <- "./" -print(peakgrouplist_file) - -if (grepl("_pos", peakgrouplist_file)) { scanmode = "positive" } else - if (grepl("_neg", peakgrouplist_file)) { scanmode = "negative" } - -print(scanmode) -print(scripts_dir) +if (grepl("_pos", peakgrouplist_file)) { + scanmode <- "positive" +} else if (grepl("_neg", peakgrouplist_file)) { + scanmode <- "negative" +} # load in function scripts source(paste0(scripts_dir, "AddOnFunctions/replaceZeros.R")) @@ -41,14 +39,10 @@ print(head(repl_pattern)) # load peak group list and determine output file name outpgrlist_identified <- get(load(peakgrouplist_file)) -print(head(outpgrlist_identified)) - outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) -print(outputfile_name) - # replace missing values (zeros) with random noise peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) -# save output -save(peakgrouplist_filled, file=paste0("./", outputfile_name)) +# save output +save(peakgrouplist_filled, file = paste0("./", outputfile_name)) diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index 8a3ca07..fd60723 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -2,54 +2,52 @@ ## adapted from 1-generateBreaksFwhm.HPC.R ## #!/usr/bin/Rscript -# load required package +# load required package suppressPackageStartupMessages(library("xcms")) -# define parameters +# define parameters cmd_args <- commandArgs(trailingOnly = TRUE) filepath <- cmd_args[1] # 1 of the mzML files -outdir <- cmd_args[2] +outdir <- cmd_args[2] trim <- as.numeric(cmd_args[3]) # 0.1 resol <- as.numeric(cmd_args[4]) # 140000 # initialize -trimLeft <- NULL -trimRight <- NULL +trim_left <- NULL +trim_right <- NULL breaks_fwhm <- NULL breaks_fwhm_avg <- NULL bins <- NULL -posRes <- NULL -negRes <- NULL # read in mzML file raw_data <- suppressMessages(xcmsRaw(filepath)) # trim (remove) scans at the start and end -trimLeft <- round(raw_data@scantime[length(raw_data@scantime)*trim]) -trimRight <- round(raw_data@scantime[length(raw_data@scantime)*(1-trim)]) +trim_left <- round(raw_data@scantime[length(raw_data@scantime) * trim]) +trim_right <- round(raw_data@scantime[length(raw_data@scantime) * (1 - trim)]) # Mass range m/z -lowMZ <- raw_data@mzrange[1] -highMZ <- raw_data@mzrange[2] +low_mz <- raw_data@mzrange[1] +high_mz <- raw_data@mzrange[2] # determine number of segments (bins) -nsegment <- 2*(highMZ-lowMZ) -segment <- seq(from=lowMZ, to=highMZ, length.out=nsegment+1) +nr_segments <- 2 * (high_mz - low_mz) +segment <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1) # determine start and end of each bin. -for (i in 1:nsegment) { - startsegm <- segment[i] - endsegm <- segment[i+1] - resol_mz <- resol*(1/sqrt(2)^(log2(startsegm/200))) - fwhmsegm <- startsegm/resol_mz - breaks_fwhm <- c(breaks_fwhm, seq(from=(startsegm + fwhmsegm), to=endsegm, by=0.2*fwhmsegm)) +for (i in 1:nr_segments) { + start_segment <- segment[i] + end_segment <- segment[i+1] + resol_mz <- resol*(1 / sqrt(2) ^ (log2(start_segment / 200))) + fwhm_segment <- start_segment / resol_mz + breaks_fwhm <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)) # average the m/z instead of start value - range <- seq(from=(startsegm + fwhmsegm), to=endsegm, by=0.2*fwhmsegm) - deltaMZ <- range[2] - range[1] - breaks_fwhm_avg <- c(breaks_fwhm_avg, range + 0.5*deltaMZ) + range <- seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment) + delta_mz <- range[2] - range[1] + breaks_fwhm_avg <- c(breaks_fwhm_avg, range + 0.5 * delta_mz) } # generate output file -save(breaks_fwhm, breaks_fwhm_avg, trimLeft, trimRight, file="./breaks.fwhm.RData") -write(highMZ, file="./highest_mz.txt") +save(breaks_fwhm, breaks_fwhm_avg, trim_left, trim_right, file = "./breaks.fwhm.RData") +write(high_mz, file = "./highest_mz.txt") diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index 0fd394f..029498f 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -1,17 +1,16 @@ #!/usr/bin/Rscript ## adapted from 13-excelExport.R -# define parameters +# define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -# outdir <- cmd_args[1] #"/Users/nunen/Documents/Metab/test_set" init_filepath <- cmd_args[1] -project <- cmd_args[2] #"test" -dims_matrix <- cmd_args[3] #"DBS" -hmdb <- cmd_args[4] #"/hpc/dbg_mz/tools/db/HMDB_with_info_relevance_IS_C5OH.RData" -z_score <- as.numeric(cmd_args[5]) +project <- cmd_args[2] +dims_matrix <- cmd_args[3] +hmdb <- cmd_args[4] +z_score <- as.numeric(cmd_args[5]) -# load required packages +# load required packages library("ggplot2") library("reshape2") library("openxlsx") @@ -19,256 +18,264 @@ library("loder") suppressMessages(library("dplyr")) suppressMessages(library("stringr")) +round_df <- function(df, digits) { + #' function for rounding numbers to x digits for numeric values + #' + #' @param df dataframe + #' @param digits integer number of digits to round off to + #' + #' @return df + numeric_columns <- sapply(df, mode) == "numeric" + df[numeric_columns] <- round(df[numeric_columns], digits) + return(df) +} + +robust_scaler <- function(control_intensities, control_col_ids, perc = 5) { + #' calculate robust scaler: Z-score based on controls without outliers + #' + #' @param control_intensities matrix with intensities for control samples + #' @param control_col_ids vector with column names for control samples + #' @param perc float percentage of outliers which will be removed from controls + #' + #' @return trimmed_control_intensities + nr_toremove <- ceiling(length(control_col_ids) * perc / 100) + sorted_control_intensities <- sort(as.numeric(control_intensities)) + trimmed_control_intensities <- sorted_control_intensities[(nr_toremove + 1) : (length(sorted_control_intensities) - nr_toremove)] + return(trimmed_control_intensities) +} + # Initialise and load data -plot <- TRUE +plot <- TRUE export <- TRUE control_label <- "C" -case_label <- "P" +case_label <- "P" imagesize_multiplier <- 2 # setting outdir to export files to the project directory -outdir <- "../../../" +outdir <- "./" # percentage of outliers to remove from calculation of robust scaler perc <- 5 # load information on samples load(init_filepath) # load the HMDB file with info on biological relevance of metabolites -load(hmdb) # load object rlvnc into global environment +load(hmdb) +# get current date rundate <- Sys.Date() # create a directory for plots in project directory plotdir <- paste0(outdir, "/plots/adducts") -dir.create(paste0(outdir, "/plots"), showWarnings = F) -dir.create(plotdir, showWarnings = F) +dir.create(paste0(outdir, "/plots"), showWarnings = FALSE) +dir.create(plotdir, showWarnings = FALSE) -options(digits=16) +# set the number of digits for floats +options(digits = 16) -# sum positive and negative adductsums -# Load pos and neg adduct sums +# load positive and negative adduct sums load("AdductSums_negative.RData") -outlist.neg.adducts.HMDB <- outlist.tot +outlist_neg_adducts_hmdb <- outlist.tot load("AdductSums_positive.RData") -outlist.pos.adducts.HMDB <- outlist.tot +outlist_pos_adducts_hmdb <- outlist.tot rm(outlist.tot) # Only continue with patients (columns) that are in both pos and neg, so patients that are in both -tmp <- intersect(colnames(outlist.neg.adducts.HMDB), colnames(outlist.pos.adducts.HMDB)) -outlist.neg.adducts.HMDB <- outlist.neg.adducts.HMDB[,tmp] -outlist.pos.adducts.HMDB <- outlist.pos.adducts.HMDB[,tmp] +tmp <- intersect(colnames(outlist_neg_adducts_hmdb), colnames(outlist_pos_adducts_hmdb)) +outlist_neg_adducts_hmdb <- outlist_neg_adducts_hmdb[, tmp] +outlist_pos_adducts_hmdb <- outlist_pos_adducts_hmdb[, tmp] # Find indexes of neg hmdb code that are also found in pos and vice versa -index.neg <- which(rownames(outlist.neg.adducts.HMDB) %in% rownames(outlist.pos.adducts.HMDB)) -index.pos <- which(rownames(outlist.pos.adducts.HMDB) %in% rownames(outlist.neg.adducts.HMDB)) +index_neg <- which(rownames(outlist_neg_adducts_hmdb) %in% rownames(outlist_pos_adducts_hmdb)) +index_pos <- which(rownames(outlist_pos_adducts_hmdb) %in% rownames(outlist_neg_adducts_hmdb)) -# Get number of columns -# Only continue with HMDB codes (rows) that were found in both pos and neg mode and remove last column (hmdb_name) -tmp.pos <- outlist.pos.adducts.HMDB[rownames(outlist.pos.adducts.HMDB)[index.pos], 1:(dim(outlist.pos.adducts.HMDB)[2]-1)] -tmp.hmdb_name.pos <- outlist.pos.adducts.HMDB[rownames(outlist.pos.adducts.HMDB)[index.pos], dim(outlist.pos.adducts.HMDB)[2]] -tmp.pos.left <- outlist.pos.adducts.HMDB[-index.pos,] - -tmp.neg <- outlist.neg.adducts.HMDB[rownames(outlist.pos.adducts.HMDB)[index.pos], 1:(dim(outlist.neg.adducts.HMDB)[2]-1)] -tmp.neg.left <- outlist.neg.adducts.HMDB[-index.neg,] +# Only continue with HMDB codes (rows) that were found in both positive mode and remove last column (hmdb_name) +tmp_pos <- outlist_pos_adducts_hmdb[rownames(outlist_pos_adducts_hmdb)[index_pos], 1:(dim(outlist_pos_adducts_hmdb)[2] - 1)] +tmp_hmdb_name_pos <- outlist_pos_adducts_hmdb[rownames(outlist_pos_adducts_hmdb)[index_pos], dim(outlist_pos_adducts_hmdb)[2]] +tmp_pos_left <- outlist_pos_adducts_hmdb[-index_pos, ] +# same for negative mode +tmp_neg <- outlist_neg_adducts_hmdb[rownames(outlist_pos_adducts_hmdb)[index_pos], 1:(dim(outlist_neg_adducts_hmdb)[2] - 1)] +tmp_neg_left <- outlist_neg_adducts_hmdb[-index_neg, ] # Combine positive and negative numbers and paste back HMDB column -tmp <- apply(tmp.pos, 2,as.numeric) + apply(tmp.neg, 2,as.numeric) -rownames(tmp) <- rownames(tmp.pos) -tmp <- cbind(tmp, "HMDB_name"=tmp.hmdb_name.pos) -outlist <- rbind(tmp, tmp.pos.left, tmp.neg.left) - -# Filter -peaksInList <- which(rownames(outlist) %in% rownames(rlvnc)) -outlist <- cbind(outlist[peaksInList,],as.data.frame(rlvnc[rownames(outlist)[peaksInList],])) -# filter out all irrelevant HMDB's. the tibble::rownames is needed for the older version of dplyr on the HPC (it will reindex the rownames) -outlist <- outlist %>% - tibble::rownames_to_column('rowname') %>% +tmp <- apply(tmp_pos, 2, as.numeric) + apply(tmp_neg, 2, as.numeric) +rownames(tmp) <- rownames(tmp_pos) +tmp <- cbind(tmp, "HMDB_name" = tmp_hmdb_name_pos) +outlist <- rbind(tmp, tmp_pos_left, tmp_neg_left) + +# Filter for biological relevance +peaks_in_list <- which(rownames(outlist) %in% rownames(rlvnc)) +outlist <- cbind(outlist[peaks_in_list, ], as.data.frame(rlvnc[rownames(outlist)[peaks_in_list], ])) +# filter out all irrelevant HMDBs +outlist <- outlist %>% + tibble::rownames_to_column("rowname") %>% filter(!grepl("Exogenous|Drug|exogenous", relevance)) %>% - tibble::column_to_rownames('rowname') + tibble::column_to_rownames("rowname") # Add HMDB_code column with all the HMDB ID and sort on it outlist <- cbind(outlist, "HMDB_code" = rownames(outlist)) -outlist <- outlist[order(outlist[,"HMDB_code"]),] +outlist <- outlist[order(outlist[, "HMDB_code"]), ] # Create excel filelist <- "AllPeakGroups" - wb <- createWorkbook("SinglePatient") addWorksheet(wb, filelist) -# small function for rounding numbers to x digits for numeric values -round_df <- function(df, digits) { - # round all numeric variables - # df: data frame - # digits: number of digits to round - numeric_columns <- sapply(df, mode) == 'numeric' - df[numeric_columns] <- round(df[numeric_columns], digits) - df -} - # Add Z-scores and create plots if (z_score == 1) { - # Statistics: Z-score + # add a column for plots outlist <- cbind(plots = NA, outlist) - #outlist <- as.data.frame(outlist) - startcol <- dim(outlist)[2] + 3 - - # Get columns with control intensities + # two columns will be added for mean and stdev of controls; Z-scores start at ncol + 3 + startcol <- ncol(outlist) + 3 + + # Get columns with control intensities control_col_ids <- grep(control_label, colnames(outlist), fixed = TRUE) control_columns <- outlist[, control_col_ids] - - # Get columns with patient intensities + + # Get columns with patient intensities patient_col_ids <- grep(case_label, colnames(outlist), fixed = TRUE) patient_columns <- outlist[, patient_col_ids] intensity_col_ids <- c(control_col_ids, patient_col_ids) - - # set intensities of 0 to NA? - outlist[,intensity_col_ids][outlist[,intensity_col_ids] == 0] <- NA - # Extra output: save outlist as it is and use it to calculate robust scaler - outlist.noZ <- outlist + # if there are any intensities of 0 left, set them to NA for stats + outlist[, intensity_col_ids][outlist[, intensity_col_ids] == 0] <- NA + + # save outlist as it is and use it to calculate robust scaler + outlist_noZ <- outlist # calculate mean and sd for Control group - outlist$avg.ctrls <- apply(control_columns, 1, function(x) mean(as.numeric(x),na.rm = TRUE)) - outlist$sd.ctrls <- apply(control_columns, 1, function(x) sd(as.numeric(x),na.rm = TRUE)) - + outlist$avg.ctrls <- apply(control_columns, 1, function(x) mean(as.numeric(x), na.rm = TRUE)) + outlist$sd.ctrls <- apply(control_columns, 1, function(x) sd(as.numeric(x), na.rm = TRUE)) + # Make and add columns with zscores - cnames.z <- NULL + colnames_z <- NULL for (i in intensity_col_ids) { cname <- colnames(outlist)[i] - cnames.z <- c(cnames.z, paste(cname, "Zscore", sep="_")) - zscores.1col <- (as.numeric(as.vector(unlist(outlist[ , i]))) - outlist$avg.ctrls) / outlist$sd.ctrls - outlist <- cbind(outlist, zscores.1col) - } - colnames(outlist)[startcol:ncol(outlist)] <- cnames.z - - # Extra output: calculate robust scaler (Zscores minus outliers in Controls) - # calculate mean and sd for Control group without outliers - outlist.noZ$avg.ctrls <- 0 - outlist.noZ$sd.ctrls <- 0 - - robust_scaler <- function(control_intensities, control_col_ids, perc=5) { - nr_toremove <- ceiling(length(control_col_ids)*perc/100) - sorted_control_intensities <- sort(as.numeric(control_intensities)) - trimmed_control_intensities <- sorted_control_intensities[(nr_toremove+1):(length(sorted_control_intensities)-nr_toremove)] - return(trimmed_control_intensities) + colnames_z <- c(colnames_z, paste(cname, "Zscore", sep = "_")) + zscores_1col <- (as.numeric(as.vector(unlist(outlist[, i]))) - outlist$avg.ctrls) / outlist$sd.ctrls + outlist <- cbind(outlist, zscores_1col) } + colnames(outlist)[startcol:ncol(outlist)] <- colnames_z + + # calculate robust scaler (Zscores minus outliers in Controls) + outlist_noZ$avg.ctrls <- 0 + outlist_noZ$sd.ctrls <- 0 # only calculate robust Z-scores if there are enough Controls if (length(control_col_ids) > 10) { for (metabolite_index in 1:nrow(outlist)) { - outlist.noZ$avg.ctrls[metabolite_index] <- mean(robust_scaler(outlist.noZ[metabolite_index, control_col_ids], control_col_ids, perc)) - outlist.noZ$sd.ctrls[metabolite_index] <- sd(robust_scaler(outlist.noZ[metabolite_index, control_col_ids], control_col_ids, perc)) + outlist_noZ$avg.ctrls[metabolite_index] <- mean(robust_scaler(outlist_noZ[metabolite_index, control_col_ids], + control_col_ids, perc)) + outlist_noZ$sd.ctrls[metabolite_index] <- sd(robust_scaler(outlist_noZ[metabolite_index, control_col_ids], + control_col_ids, perc)) } } # Make and add columns with robust zscores - cnames.robust <- gsub("_Zscore", "_RobustZscore", cnames.z) + cnames_robust <- gsub("_Zscore", "_RobustZscore", colnames_z) for (i in intensity_col_ids) { - zscores.1col <- (as.numeric(as.vector(unlist(outlist.noZ[ , i]))) - outlist.noZ$avg.ctrls) / outlist.noZ$sd.ctrls - outlist.noZ <- cbind(outlist.noZ, zscores.1col) + zscores_1col <- (as.numeric(as.vector(unlist(outlist_noZ[, i]))) - outlist_noZ$avg.ctrls) / outlist_noZ$sd.ctrls + outlist_noZ <- cbind(outlist_noZ, zscores_1col) } - colnames(outlist.noZ)[startcol:ncol(outlist.noZ)] <- cnames.robust + colnames(outlist_noZ)[startcol:ncol(outlist_noZ)] <- cnames_robust - # Extra output: metabolites filtered on relevance - save(outlist, file=paste0("AdductSums_filtered_Zscores.RData")) - write.table(outlist, file=paste0("AdductSums_filtered_Zscores.txt"), sep="\t", row.names = FALSE) - # Extra output: filtered metabolites with robust scaled Zscores - write.table(outlist.noZ, file=paste0("AdductSums_filtered_robustZ.txt"), sep="\t", row.names = FALSE) + # output metabolites filtered on relevance into tab-separated file + write.table(outlist, file = paste0("AdductSums_filtered_Zscores.txt"), sep = "\t", row.names = FALSE) + # output filtered metabolites with robust scaled Zscores + write.table(outlist_noZ, file = paste0("AdductSums_filtered_robustZ.txt"), sep = "\t", row.names = FALSE) - # get the IDs of the patients + # get the IDs of the patients and sort patient_ids <- unique(as.vector(unlist(lapply(strsplit(colnames(patient_columns), ".", fixed = TRUE), function(x) x[1])))) - patient_ids <- patient_ids[order(nchar(patient_ids), patient_ids)] # sorts - - # Iterate over every row, make boxplot, insert into excel, and make Zscore for every patient + patient_ids <- patient_ids[order(nchar(patient_ids), patient_ids)] + + # for every row, make boxplot, insert into excel, and calculate Zscore for every patient temp_png <- NULL for (p in 1:nrow(outlist)) { - # box plot - hmdb_name <- rownames(outlist[p,]) - - intensities <- list(as.numeric(as.vector(unlist(control_columns[p,])))) + # get HMDB ID + hmdb_name <- rownames(outlist[p, ]) + # get intensities per metabolite for box plot for control samples + intensities <- list(as.numeric(as.vector(unlist(control_columns[p, ])))) labels <- c("C", patient_ids) - + # get intensities per metabolite for box plot for patient samples for (i in 1:length(patient_ids)) { id <- patient_ids[i] - # get all intensities that start with ex. P18. (so P18.1, P18.2, but not x_P18.1 and not P180.1) - p.int <- as.numeric(as.vector(unlist(outlist[p, names(patient_columns[1,])[startsWith(names(patient_columns[1,]), paste0(id, "."))]]))) - intensities[[i+1]] <- p.int + # combine all intensities that start with the same string for patients + patient_int <- as.numeric(as.vector(unlist(outlist[p, names(patient_columns[1, ])[startsWith(names(patient_columns[1, ]), paste0(id, "."))]]))) + intensities[[i + 1]] <- patient_int } - intensities <- setNames(intensities, labels) - + plot_width <- length(labels) * 16 + 90 - + plot.new() if (export) { - png(filename = paste0(plotdir, "/", hmdb_name, "_box.png"), - width = plot_width, + png(filename = paste0(plotdir, "/", hmdb_name, "_box.png"), + width = plot_width, height = 280) } - par(oma=c(2,0,0,0)) - boxplot(intensities, - col=c("green", rep("red", length(intensities)-1)), - names.arg = labels, - las=2, - main = hmdb_name) + # set margins + par(oma = c(2, 0, 0, 0)) + boxplot(intensities, + col = c("green", rep("red", length(intensities) - 1)), + names.arg = labels, + las = 2, + main = hmdb_name) dev.off() - + file_png <- paste0(plotdir, "/", hmdb_name, "_box.png") if (is.null(temp_png)) { temp_png <- readPng(file_png) - img_dim <- dim(temp_png)[c(1,2)] + img_dim <- dim(temp_png)[c(1, 2)] cell_dim <- img_dim * imagesize_multiplier - setColWidths(wb, filelist, cols = 1, widths = cell_dim[2]/20) + setColWidths(wb, filelist, cols = 1, widths = cell_dim[2] / 20) } - - insertImage(wb, - filelist, - file_png, - startRow = p + 1, - startCol = 1, - height = cell_dim[1], - width = cell_dim[2], + + insertImage(wb, + filelist, + file_png, + startRow = p + 1, + startCol = 1, + height = cell_dim[1], + width = cell_dim[2], units = "px") - + if (p %% 100 == 0) { cat("at row: ", p, "\n") } } - - setRowHeights(wb, filelist, rows = c(1:nrow(outlist) + 1), heights = cell_dim[1]/4) + + setRowHeights(wb, filelist, rows = c(1:nrow(outlist) + 1), heights = cell_dim[1] / 4) setColWidths(wb, filelist, cols = c(2:ncol(outlist)), widths = 20) } else { setRowHeights(wb, filelist, rows = c(1:nrow(outlist)), heights = 18) setColWidths(wb, filelist, cols = c(1:ncol(outlist)), widths = 20) } +# write Excel file writeData(wb, sheet = 1, outlist, startCol = 1) xlsx_name <- paste0(outdir, "/", project, ".xlsx") saveWorkbook(wb, xlsx_name, overwrite = TRUE) -# save a local copy in work directory as well -saveWorkbook(wb, paste0("./", project, ".xlsx"), overwrite = TRUE) cat(xlsx_name) rm(wb) -# INTERNE STANDAARDEN -IS <- outlist[grep("Internal standard", outlist[,"relevance"], fixed = TRUE),] +#### INTERNE STANDAARDEN #### +IS <- outlist[grep("Internal standard", outlist[, "relevance"], fixed = TRUE), ] IS_codes <- rownames(IS) -cat(IS_codes,"\n") +cat(IS_codes, "\n") -# if all data from one samplename (for example P195.1) is filtered out in 3-averageTechReplicates because of too little data (threshold parameter) the init.RData (repl_pattern) will contain more sample_names then the peak data (IS), +# if all data from one samplename (for example P195.1) is filtered out in 3-averageTechReplicates because of too little data (threshold parameter) the init.RData (repl_pattern) will contain more sample_names then the peak data (IS), # thus this data needs to be removed first, before the retrieval of the summed adducts. Write sample_names to a log file, to let user know that this sample_name contained no data. -sample_names_nodata <- setdiff(names(repl_pattern),names(IS)) +sample_names_nodata <- setdiff(names(repl_pattern), names(IS)) if (!is.null(sample_names_nodata)) { write.table(sample_names_nodata, file = paste(outdir, "sample_names_nodata.txt", sep = "/"), row.names = FALSE, col.names = FALSE, quote = FALSE) - cat(sample_names_nodata,"\n") + cat(sample_names_nodata, "\n") for (sample_name in sample_names_nodata) { repl_pattern[[sample_name]] <- NULL - }} + } +} # Retrieve IS summed adducts IS_summed <- IS[c(names(repl_pattern), "HMDB_code")] IS_summed$HMDB.name <- IS$name -IS_summed <- reshape2::melt(IS_summed, id.vars=c('HMDB_code','HMDB.name')) -colnames(IS_summed) <- c('HMDB.code','HMDB.name','Sample','Intensity') +IS_summed <- reshape2::melt(IS_summed, id.vars = c("HMDB_code", "HMDB.name")) +colnames(IS_summed) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity") IS_summed$Intensity <- as.numeric(IS_summed$Intensity) IS_summed$Matrix <- dims_matrix IS_summed$Rundate <- rundate @@ -276,29 +283,29 @@ IS_summed$Project <- project IS_summed$Intensity <- as.numeric(as.character(IS_summed$Intensity)) # Retrieve IS positive mode -IS_pos <- as.data.frame(subset(outlist.pos.adducts.HMDB,rownames(outlist.pos.adducts.HMDB) %in% IS_codes)) -IS_pos$HMDB_name <- IS[match(row.names(IS_pos),IS$HMDB_code,nomatch=NA),'name'] +IS_pos <- as.data.frame(subset(outlist_pos_adducts_hmdb, rownames(outlist_pos_adducts_hmdb) %in% IS_codes)) +IS_pos$HMDB_name <- IS[match(row.names(IS_pos), IS$HMDB_code, nomatch = NA), "name"] IS_pos$HMDB.code <- row.names(IS_pos) -IS_pos <- reshape2::melt(IS_pos, id.vars=c('HMDB.code','HMDB_name')) -colnames(IS_pos) <- c('HMDB.code','HMDB.name','Sample','Intensity') +IS_pos <- reshape2::melt(IS_pos, id.vars = c("HMDB.code", "HMDB_name")) +colnames(IS_pos) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity") IS_pos$Matrix <- dims_matrix IS_pos$Rundate <- rundate IS_pos$Project <- project IS_pos$Intensity <- as.numeric(as.character(IS_pos$Intensity)) # Retrieve IS negative mode -IS_neg <- as.data.frame(subset(outlist.neg.adducts.HMDB,rownames(outlist.neg.adducts.HMDB) %in% IS_codes)) -IS_neg$HMDB_name <- IS[match(row.names(IS_neg),IS$HMDB_code,nomatch=NA),'name'] +IS_neg <- as.data.frame(subset(outlist_neg_adducts_hmdb, rownames(outlist_neg_adducts_hmdb) %in% IS_codes)) +IS_neg$HMDB_name <- IS[match(row.names(IS_neg), IS$HMDB_code, nomatch = NA), "name"] IS_neg$HMDB.code <- row.names(IS_neg) -IS_neg <- reshape2::melt(IS_neg, id.vars=c('HMDB.code','HMDB_name')) -colnames(IS_neg) <- c('HMDB.code','HMDB.name','Sample','Intensity') +IS_neg <- reshape2::melt(IS_neg, id.vars = c("HMDB.code", "HMDB_name")) +colnames(IS_neg) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity") IS_neg$Matrix <- dims_matrix IS_neg$Rundate <- rundate IS_neg$Project <- project IS_neg$Intensity <- as.numeric(as.character(IS_neg$Intensity)) # Save results -save(IS_pos, IS_neg, IS_summed, file = paste0(outdir, "/", project, '_IS_results.RData')) +save(IS_pos, IS_neg, IS_summed, file = paste0(outdir, "/", project, "_IS_results.RData")) # number of samples, for plotting length and width sample_count <- length(repl_pattern) @@ -315,32 +322,33 @@ IS_neg$Sample_level <- factor(IS_neg$Sample, levels = c(Sample_naturalorder)) # function for ggplot theme # theme for all IS bar plots theme_IS_bar <- function(myPlot) { - myPlot + + myPlot + scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) + theme( - legend.position = 'none', - axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size=6), - axis.text.y = element_text(size=6)) + legend.position = "none", + axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6), + axis.text.y = element_text(size = 6) + ) } # ggplot functions IS_neg_bar_plot <- ggplot(IS_neg, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Neg)") + - geom_bar(aes(fill=HMDB.name), stat='identity') + - labs(x='', y='Intensity') + - facet_wrap(~HMDB.name, scales='free_y') + geom_bar(aes(fill = HMDB.name), stat = "identity") + + labs(x = "", y = "Intensity") + + facet_wrap(~HMDB.name, scales = "free_y") IS_pos_bar_plot <- ggplot(IS_pos, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Pos)") + - geom_bar(aes(fill=HMDB.name), stat='identity') + - labs(x='', y='Intensity') + - facet_wrap(~HMDB.name, scales='free_y') + geom_bar(aes(fill = HMDB.name), stat = "identity") + + labs(x = "", y = "Intensity") + + facet_wrap(~HMDB.name, scales = "free_y") IS_sum_bar_plot <- ggplot(IS_summed, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Summed)") + - geom_bar(aes(fill=HMDB.name), stat='identity') + - labs(x='', y='Intensity') + - facet_wrap(~HMDB.name, scales='free_y') + geom_bar(aes(fill = HMDB.name), stat = "identity") + + labs(x = "", y = "Intensity") + + facet_wrap(~HMDB.name, scales = "free_y") # add theme to ggplot functions IS_neg_bar_plot <- theme_IS_bar(IS_neg_bar_plot) @@ -349,44 +357,46 @@ IS_sum_bar_plot <- theme_IS_bar(IS_sum_bar_plot) # save plots to disk plot_width <- 9 + 0.35 * sample_count -ggsave(paste0(outdir, "/plots/IS_bar_all_neg.png"), plot=IS_neg_bar_plot, height=plot_width/2.5, width=plot_width, units="in") -ggsave(paste0(outdir, "/plots/IS_bar_all_pos.png"), plot=IS_pos_bar_plot, height=plot_width/2.5, width=plot_width, units="in") -ggsave(paste0(outdir, "/plots/IS_bar_all_sum.png"), plot=IS_sum_bar_plot, height=plot_width/2.5, width=plot_width, units="in") +ggsave(paste0(outdir, "/plots/IS_bar_all_neg.png"), plot = IS_neg_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_bar_all_pos.png"), plot = IS_pos_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_bar_all_sum.png"), plot = IS_sum_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in") ## Line plots with all IS # function for ggplot theme # add smaller legend in the "all IS line plots", otherwise out-of-range when more than 13 IS lines theme_IS_line <- function(myPlot) { - myPlot + + myPlot + guides( shape = guide_legend(override.aes = list(size = 0.5)), - color = guide_legend(override.aes = list(size = 0.5))) + + color = guide_legend(override.aes = list(size = 0.5)) + ) + theme( - legend.title = element_text(size = 8), - legend.text = element_text(size = 6), + legend.title = element_text(size = 8), + legend.text = element_text(size = 6), legend.key.size = unit(0.7, "line"), - axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8)) + axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8) + ) } # ggplot functions IS_neg_line_plot <- ggplot(IS_neg, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Neg)") + - geom_point(aes(col=HMDB.name)) + - geom_line(aes(col=HMDB.name, group=HMDB.name)) + - labs(x='', y='Intensity') + geom_point(aes(col = HMDB.name)) + + geom_line(aes(col = HMDB.name, group = HMDB.name)) + + labs(x = "", y = "Intensity") IS_pos_line_plot <- ggplot(IS_pos, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Pos)") + geom_point(aes(col = HMDB.name)) + geom_line(aes(col = HMDB.name, group = HMDB.name)) + - labs(x = '', y = 'Intensity') + labs(x = "", y = "Intensity") IS_sum_line_plot <- ggplot(IS_summed, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Sum)") + geom_point(aes(col = HMDB.name)) + geom_line(aes(col = HMDB.name, group = HMDB.name)) + - labs(x = '', y = 'Intensity') + labs(x = "", y = "Intensity") # add theme to ggplot functions IS_sum_line_plot <- theme_IS_line(IS_sum_line_plot) @@ -395,37 +405,49 @@ IS_pos_line_plot <- theme_IS_line(IS_pos_line_plot) # save plots to disk plot_width <- 8 + 0.2 * sample_count -ggsave(paste0(outdir,"/plots/IS_line_all_neg.png"), plot = IS_neg_line_plot, height = plot_width/2.5, width = plot_width, units = "in") -ggsave(paste0(outdir,"/plots/IS_line_all_pos.png"), plot = IS_pos_line_plot, height = plot_width/2.5, width = plot_width, units = "in") -ggsave(paste0(outdir,"/plots/IS_line_all_sum.png"), plot = IS_sum_line_plot, height = plot_width/2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_line_all_neg.png"), plot = IS_neg_line_plot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_line_all_pos.png"), plot = IS_pos_line_plot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_line_all_sum.png"), plot = IS_sum_line_plot, height = plot_width / 2.5, width = plot_width, units = "in") ## bar plots with a selection of IS -IS_neg_selection <- c('2H2-Ornithine (IS)', '2H3-Glutamate (IS)', '2H2-Citrulline (IS)', '2H4_13C5-Arginine (IS)', '13C6-Tyrosine (IS)') -IS_pos_selection <- c('2H4-Alanine (IS)', '13C6-Phenylalanine (IS)', '2H4_13C5-Arginine (IS)', '2H3-Propionylcarnitine (IS)', '2H9-Isovalerylcarnitine (IS)') -IS_sum_selection <- c('2H8-Valine (IS)', '2H3-Leucine (IS)', '2H3-Glutamate (IS)', '2H4_13C5-Arginine (IS)', '13C6-Tyrosine (IS)') +IS_neg_selection <- c("2H2-Ornithine (IS)", "2H3-Glutamate (IS)", "2H2-Citrulline (IS)", "2H4_13C5-Arginine (IS)", "13C6-Tyrosine (IS)") +IS_pos_selection <- c("2H4-Alanine (IS)", "13C6-Phenylalanine (IS)", "2H4_13C5-Arginine (IS)", "2H3-Propionylcarnitine (IS)", "2H9-Isovalerylcarnitine (IS)") +IS_sum_selection <- c("2H8-Valine (IS)", "2H3-Leucine (IS)", "2H3-Glutamate (IS)", "2H4_13C5-Arginine (IS)", "13C6-Tyrosine (IS)") # add minimal intensity lines based on matrix (DBS or Plasma) and machine mode (neg, pos, sum) -if (dims_matrix == "DBS"){ - hline.data.neg <- - data.frame(z = c(15000, 200000, 130000, 18000, 50000), - HMDB.name = IS_neg_selection) - hline.data.pos <- - data.frame(z = c(150000, 3300000, 1750000, 150000, 270000), - HMDB.name = IS_pos_selection) - hline.data.sum <- - data.frame(z = c(1300000, 2500000, 500000, 1800000, 1400000), - HMDB.name = IS_sum_selection) -} else if (dims_matrix == "Plasma"){ - hline.data.neg <- - data.frame(z = c(6500, 100000, 75000, 7500, 25000), - HMDB.name = IS_neg_selection) - hline.data.pos <- - data.frame(z = c(85000, 1000000, 425000, 70000, 180000), - HMDB.name = IS_pos_selection) - hline.data.sum <- - data.frame(z = c(700000, 1250000, 150000, 425000, 300000), - HMDB.name = IS_sum_selection) -} +if (dims_matrix == "DBS") { + hline.data.neg <- + data.frame( + z = c(15000, 200000, 130000, 18000, 50000), + HMDB.name = IS_neg_selection + ) + hline.data.pos <- + data.frame( + z = c(150000, 3300000, 1750000, 150000, 270000), + HMDB.name = IS_pos_selection + ) + hline.data.sum <- + data.frame( + z = c(1300000, 2500000, 500000, 1800000, 1400000), + HMDB.name = IS_sum_selection + ) +} else if (dims_matrix == "Plasma") { + hline.data.neg <- + data.frame( + z = c(6500, 100000, 75000, 7500, 25000), + HMDB.name = IS_neg_selection + ) + hline.data.pos <- + data.frame( + z = c(85000, 1000000, 425000, 70000, 180000), + HMDB.name = IS_pos_selection + ) + hline.data.sum <- + data.frame( + z = c(700000, 1250000, 150000, 425000, 300000), + HMDB.name = IS_sum_selection + ) +} # function for ggplot theme # see bar plots with all IS @@ -433,35 +455,41 @@ if (dims_matrix == "DBS"){ # ggplot functions IS_neg_selection_barplot <- ggplot(subset(IS_neg, HMDB.name %in% IS_neg_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Neg)") + - geom_bar(aes(fill=HMDB.name), stat='identity') + - labs(x='', y='Intensity') + - facet_wrap(~HMDB.name, scales='free', ncol = 2) + - if(exists("hline.data.neg")){geom_hline(aes(yintercept = z), subset(hline.data.neg, HMDB.name %in% IS_neg$HMDB.name))} #subset, if some IS have no data, no empty plots will be generated with a line) + geom_bar(aes(fill = HMDB.name), stat = "identity") + + labs(x = "", y = "Intensity") + + facet_wrap(~HMDB.name, scales = "free", ncol = 2) + + if (exists("hline.data.neg")) { + geom_hline(aes(yintercept = z), subset(hline.data.neg, HMDB.name %in% IS_neg$HMDB.name)) + } # subset, if some IS have no data, no empty plots will be generated with a line) IS_pos_selection_barplot <- ggplot(subset(IS_pos, HMDB.name %in% IS_pos_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Pos)") + - geom_bar(aes(fill=HMDB.name), stat='identity') + - labs(x='', y='Intensity') + - facet_wrap(~HMDB.name, scales='free', ncol = 2) + - if(exists("hline.data.pos")){geom_hline(aes(yintercept = z), subset(hline.data.pos, HMDB.name %in% IS_pos$HMDB.name))} + geom_bar(aes(fill = HMDB.name), stat = "identity") + + labs(x = "", y = "Intensity") + + facet_wrap(~HMDB.name, scales = "free", ncol = 2) + + if (exists("hline.data.pos")) { + geom_hline(aes(yintercept = z), subset(hline.data.pos, HMDB.name %in% IS_pos$HMDB.name)) + } IS_sum_selection_barplot <- ggplot(subset(IS_summed, HMDB.name %in% IS_sum_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Sum)") + - geom_bar(aes(fill=HMDB.name), stat='identity') + - labs(x='', y='Intensity') + - facet_wrap(~HMDB.name, scales='free', ncol = 2) + - if(exists("hline.data.sum")){geom_hline(aes(yintercept = z), subset(hline.data.sum, HMDB.name %in% IS_summed$HMDB.name))} + geom_bar(aes(fill = HMDB.name), stat = "identity") + + labs(x = "", y = "Intensity") + + facet_wrap(~HMDB.name, scales = "free", ncol = 2) + + if (exists("hline.data.sum")) { + geom_hline(aes(yintercept = z), subset(hline.data.sum, HMDB.name %in% IS_summed$HMDB.name)) + } # add theme to ggplot functions -IS_neg_selection_barplot <- theme_IS_bar(IS_neg_selection_barplot) +IS_neg_selection_barplot <- theme_IS_bar(IS_neg_selection_barplot) IS_pos_selection_barplot <- theme_IS_bar(IS_pos_selection_barplot) -IS_sum_selection_barplot <- theme_IS_bar(IS_sum_selection_barplot) +IS_sum_selection_barplot <- theme_IS_bar(IS_sum_selection_barplot) # save plots to disk plot_width <- 9 + 0.35 * sample_count -ggsave(paste0(outdir, "/plots/IS_bar_select_neg.png"), plot = IS_neg_selection_barplot, height = plot_width/2.0, width = plot_width, units = "in") -ggsave(paste0(outdir, "/plots/IS_bar_select_pos.png"), plot = IS_pos_selection_barplot, height = plot_width/2.0, width = plot_width, units = "in") -ggsave(paste0(outdir, "/plots/IS_bar_select_sum.png"), plot = IS_sum_selection_barplot, height = plot_width/2.0, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_bar_select_neg.png"), plot = IS_neg_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_bar_select_pos.png"), plot = IS_pos_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_bar_select_sum.png"), plot = IS_sum_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in") ## line plots with a selection of IS @@ -471,21 +499,21 @@ ggsave(paste0(outdir, "/plots/IS_bar_select_sum.png"), plot = IS_sum_selection_b # ggplot functions IS_neg_selection_lineplot <- ggplot(subset(IS_neg, HMDB.name %in% IS_neg_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Neg)") + - geom_point(aes(col=HMDB.name)) + - geom_line(aes(col=HMDB.name, group=HMDB.name)) + - labs(x='', y='Intensity') + geom_point(aes(col = HMDB.name)) + + geom_line(aes(col = HMDB.name, group = HMDB.name)) + + labs(x = "", y = "Intensity") IS_pos_selection_lineplot <- ggplot(subset(IS_pos, HMDB.name %in% IS_pos_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Pos)") + - geom_point(aes(col=HMDB.name)) + - geom_line(aes(col=HMDB.name, group=HMDB.name)) + - labs(x='', y='Intensity') + geom_point(aes(col = HMDB.name)) + + geom_line(aes(col = HMDB.name, group = HMDB.name)) + + labs(x = "", y = "Intensity") IS_sum_selection_lineplot <- ggplot(subset(IS_summed, HMDB.name %in% IS_sum_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Sum)") + - geom_point(aes(col=HMDB.name)) + - geom_line(aes(col=HMDB.name, group=HMDB.name)) + - labs(x='', y='Intensity') + geom_point(aes(col = HMDB.name)) + + geom_line(aes(col = HMDB.name, group = HMDB.name)) + + labs(x = "", y = "Intensity") # add theme to ggplot functions IS_neg_selection_lineplot <- theme_IS_line(IS_neg_selection_lineplot) @@ -494,9 +522,9 @@ IS_sum_selection_lineplot <- theme_IS_line(IS_sum_selection_lineplot) # save plots to disk plot_width <- 8 + 0.2 * sample_count -ggsave(paste0(outdir, "/plots/IS_line_select_neg.png"), plot = IS_neg_selection_lineplot, height = plot_width/2.5, width = plot_width, units = "in") -ggsave(paste0(outdir, "/plots/IS_line_select_pos.png"), plot = IS_pos_selection_lineplot, height = plot_width/2.5, width = plot_width, units = "in") -ggsave(paste0(outdir, "/plots/IS_line_select_sum.png"), plot = IS_sum_selection_lineplot, height = plot_width/2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_line_select_neg.png"), plot = IS_neg_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_line_select_pos.png"), plot = IS_pos_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_line_select_sum.png"), plot = IS_sum_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in") ### POSITIVE CONTROLS CHECK @@ -505,59 +533,59 @@ ggsave(paste0(outdir, "/plots/IS_line_select_sum.png"), plot = IS_sum_selection_ column_list <- colnames(outlist) patterns <- c("^(P1002\\.)[[:digit:]]+_", "^(P1003\\.)[[:digit:]]+_", "^(P1005\\.)[[:digit:]]+_") -positive_controls_index <- grepl(pattern=paste(patterns, collapse="|"), column_list) +positive_controls_index <- grepl(pattern = paste(patterns, collapse = "|"), column_list) positivecontrol_list <- column_list[positive_controls_index] if (z_score == 1) { # find if one or more positive control samples are missing pos_contr_warning <- c() # any() grep because you get a vector of FALSE's and TRUE's. only one grep match is needed for each positive control - if (any(grep("^(P1002\\.)[[:digit:]]+_", positivecontrol_list)) && - any(grep("^(P1003\\.)[[:digit:]]+_", positivecontrol_list)) && - any(grep("^(P1005\\.)[[:digit:]]+_", positivecontrol_list))){ + if (any(grep("^(P1002\\.)[[:digit:]]+_", positivecontrol_list)) && + any(grep("^(P1003\\.)[[:digit:]]+_", positivecontrol_list)) && + any(grep("^(P1005\\.)[[:digit:]]+_", positivecontrol_list))) { cat("All three positive controls are present") } else { - pos_contr_warning <- paste0(c("positive controls list is not complete. Only ", positivecontrol_list, " is/are present"), collapse=" ") + pos_contr_warning <- paste0(c("positive controls list is not complete. Only ", positivecontrol_list, " is/are present"), collapse = " ") } # you need all positive control samples, thus starting the script only if all are available if (length(pos_contr_warning) == 0) { ### POSITIVE CONTROLS # make positive control excel with specific HMDB_codes in combination with specific control samples - PA_sample_name <- positivecontrol_list[grepl("P1002", positivecontrol_list)] #P1001.x_Zscore - PKU_sample_name <- positivecontrol_list[grepl("P1003", positivecontrol_list)] #P1003.x_Zscore - LPI_sample_name <- positivecontrol_list[grepl("P1005", positivecontrol_list)] #P1005.x_Zscore - - PA_codes <- c('HMDB00824', 'HMDB00783', 'HMDB00123') - PKU_codes <- c('HMDB00159') - LPI_codes <- c('HMDB00904', 'HMDB00641', 'HMDB00182') - - PA_data <- outlist[PA_codes, c('HMDB_code','name', PA_sample_name)] - PA_data <- reshape2::melt(PA_data, id.vars = c('HMDB_code','name')) - colnames(PA_data) <- c('HMDB.code','HMDB.name','Sample','Zscore') - - PKU_data <- outlist[PKU_codes, c('HMDB_code','name', PKU_sample_name)] - PKU_data <- reshape2::melt(PKU_data, id.vars = c('HMDB_code','name')) - colnames(PKU_data) <- c('HMDB.code','HMDB.name','Sample','Zscore') - - LPI_data <- outlist[LPI_codes, c('HMDB_code','name', LPI_sample_name)] - LPI_data <- reshape2::melt(LPI_data, id.vars = c('HMDB_code','name')) - colnames(LPI_data) <- c('HMDB.code','HMDB.name','Sample','Zscore') - + PA_sample_name <- positivecontrol_list[grepl("P1002", positivecontrol_list)] # P1001.x_Zscore + PKU_sample_name <- positivecontrol_list[grepl("P1003", positivecontrol_list)] # P1003.x_Zscore + LPI_sample_name <- positivecontrol_list[grepl("P1005", positivecontrol_list)] # P1005.x_Zscore + + PA_codes <- c("HMDB00824", "HMDB00783", "HMDB00123") + PKU_codes <- c("HMDB00159") + LPI_codes <- c("HMDB00904", "HMDB00641", "HMDB00182") + + PA_data <- outlist[PA_codes, c("HMDB_code", "name", PA_sample_name)] + PA_data <- reshape2::melt(PA_data, id.vars = c("HMDB_code", "name")) + colnames(PA_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore") + + PKU_data <- outlist[PKU_codes, c("HMDB_code", "name", PKU_sample_name)] + PKU_data <- reshape2::melt(PKU_data, id.vars = c("HMDB_code", "name")) + colnames(PKU_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore") + + LPI_data <- outlist[LPI_codes, c("HMDB_code", "name", LPI_sample_name)] + LPI_data <- reshape2::melt(LPI_data, id.vars = c("HMDB_code", "name")) + colnames(LPI_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore") + Pos_Contr <- rbind(PA_data, PKU_data, LPI_data) Pos_Contr$Zscore <- as.numeric(Pos_Contr$Zscore) # extra information added to excel for future reference. made in beginning of this script Pos_Contr$Matrix <- dims_matrix Pos_Contr$Rundate <- rundate Pos_Contr$Project <- project - - #Save results - save(Pos_Contr,file = paste0(outdir, "/", project, '_Pos_Contr.RData')) + + # Save results + save(Pos_Contr, file = paste0(outdir, "/", project, "_Pos_Contr.RData")) Pos_Contr$Zscore <- round_df(Pos_Contr$Zscore, 2) # asked by Lab to round the number to 2 digits - write.xlsx(Pos_Contr, file = paste0(outdir, "/", project, '_Pos_Contr.xlsx'), sheetName = "Sheet1", col.names = TRUE, row.names = TRUE, append = FALSE) - + write.xlsx(Pos_Contr, file = paste0(outdir, "/", project, "_Pos_Contr.xlsx"), sheetName = "Sheet1", col.names = TRUE, row.names = TRUE, append = FALSE) } else { write.table(pos_contr_warning, file = paste(outdir, "positive_controls_warning.txt", sep = "/"), row.names = FALSE, col.names = FALSE, quote = FALSE) - }} + } +} ### MISSING M/Z CHECK # check the outlist_identified_(negative/positive).RData files for missing m/z values and mention in the results mail @@ -571,18 +599,18 @@ rm(outlist.ident) # check for missing m/z in negative and positive mode mode <- c("Negative", "Positive") index <- 1 -results_ident <- c() #empty results list +results_ident <- c() # empty results list outlist_ident_list <- list(outlist.ident.neg, outlist.ident.pos) for (outlist.ident in outlist_ident_list) { current_mode <- mode[index] # retrieve all unique m/z values in whole numbers and check if all are available - mz_values <- as.numeric(unique(format(outlist.ident$mzmed.pgrp, digits=0))) - mz_range <- seq(70, 599, by=1) #change accordingly to the machine m/z range. default = 70-600 + mz_values <- as.numeric(unique(format(outlist.ident$mzmed.pgrp, digits = 0))) + mz_range <- seq(70, 599, by = 1) # change accordingly to the machine m/z range. default = 70-600 mz_missing <- c() for (mz in mz_range) { if (!mz %in% mz_values) { mz_missing <- c(mz_missing, mz) - } + } } y <- mz_missing # check if m/z are missing and make an .txt file with information @@ -595,6 +623,6 @@ for (outlist.ident in outlist_ident_list) { } index <- index + 1 # change to new mode in for loop } -lapply(results_ident, write, file=paste(outdir, "missing_mz_warning.txt", sep = "/"), append=TRUE, ncolumns=1000) +lapply(results_ident, write, file = paste(outdir, "missing_mz_warning.txt", sep = "/"), append = TRUE, ncolumns = 1000) cat("Ready excelExport.R") diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R index a8b819c..896918f 100644 --- a/DIMS/GenerateViolinPlots.R +++ b/DIMS/GenerateViolinPlots.R @@ -1,10 +1,10 @@ -# For untargeted metabolomics, this tool calculates probability scores for -# metabolic disorders. In addition, it provides visual support with violin plots +# For untargeted metabolomics, this tool calculates probability scores for +# metabolic disorders. In addition, it provides visual support with violin plots # of the DIMS measurements for the lab specialists. -# Input needed: +# Input needed: # 1. Excel file in which metabolites are listed with their intensities for # controls (with C in samplename) and patients (with P in samplename) and their -# corresponding Z-scores. +# corresponding Z-scores. # 2. All files from github: https://github.com/UMCUGenetics/DIMS ## adapted from 15-dIEM_violin.R @@ -17,13 +17,10 @@ library(gridExtra) # for table top highest/lowest # define parameters - check after addition to run.sh cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) { - cat(" ", arg, "\n", sep = "") -} -run_name <- cmd_args[1] +run_name <- cmd_args[1] scripts_dir <- cmd_args[2] -z_score <- as.numeric(cmd_args[3]) +z_score <- as.numeric(cmd_args[3]) # load functions source(paste0(scripts_dir, "AddOnFunctions/check_same_samplename.R")) @@ -33,18 +30,20 @@ source(paste0(scripts_dir, "AddOnFunctions/prepare_toplist.R")) source(paste0(scripts_dir, "AddOnFunctions/create_violin_plots.R")) source(paste0(scripts_dir, "AddOnFunctions/prepare_alarmvalues.R")) -# The list of parameters can be shortened for HPC. Leave for now. -top_nr_IEM <- 5 # number of diseases that score highest in algorithm to plot -threshold_IEM <- 5 # probability score cut-off for plotting the top diseases -ratios_cutoff <- -5 # z-score cutoff of axis on the left for top diseases -nr_plots_perpage <- 20 # number of violin plots per page in PDF - -# Settings from config.R -# binary variable: run function, yes(1) or no(0). Can be removed at later stage -if (z_score == 1) { - algorithm <- ratios <- violin <- 1 -} -# integer: are the sample names headers on row 1 or row 2 in the DIMS excel? (default 1) +# number of diseases that score highest in algorithm to plot +top_nr_IEM <- 5 +# probability score cut-off for plotting the top diseases +threshold_IEM <- 5 +# z-score cutoff of axis on the left for top diseases +ratios_cutoff <- -5 +# number of violin plots per page in PDF +nr_plots_perpage <- 20 + +# binary variable: run function, yes(1) or no(0) +if (z_score == 1) { + algorithm <- ratios <- violin <- 1 +} +# are the sample names headers on row 1 or row 2 in the DIMS excel? (default 1) header_row <- 1 # column name where the data starts (default B) col_start <- "B" @@ -52,15 +51,12 @@ zscore_cutoff <- 5 xaxis_cutoff <- 20 # path to DIMS excel file -path_DIMSfile <- paste0("./", run_name, ".xlsx") +path_dims_file <- paste0("./", run_name, ".xlsx") -# path: output folder for dIEM and violin plots -output_dir <- paste0("../../../dIEM") -dir.create(output_dir, showWarnings = F) -print(getwd()) -print(path_DIMSfile) +# path: output folder for dIEM and violin plots +output_dir <- "./" -# folder in which all metabolite lists are (.txt) +# folder for all metabolite lists (.txt) path_metabolite_groups <- "/hpc/dbg_mz/tools/db/metabolite_groups" # file for ratios step 3 file_ratios_metabolites <- "/hpc/dbg_mz/tools/db/dIEM/Ratios_between_metabolites.csv" @@ -72,26 +68,26 @@ file_explanation <- "/hpc/dbg_mz/tools/Explanation_violin_plots.txt" # copy list of isomers to project folder. file.copy("/hpc/dbg_mz/tools/isomers.txt", output_dir) -#### STEP 1: Preparation #### -# in: run_name, path_DIMSfile, header_row ||| out: output_dir, DIMS +#### STEP 1: Preparation #### +# in: run_name, path_dims_file, header_row ||| out: output_dir, DIMS # Load the excel file. -dims_xls <- readWorkbook(xlsxFile = path_DIMSfile, sheet = 1, startRow = header_row) +dims_xls <- readWorkbook(xlsxFile = path_dims_file, sheet = 1, startRow = header_row) if (exists("dims_xls")) { - cat(paste0("\nThe excel file is succesfully loaded:\n -> ",path_DIMSfile)) + cat(paste0("\nThe excel file is succesfully loaded:\n -> ", path_dims_file)) } else { - cat(paste0("\n\n**** Error: Could not find an excel file. Please check if path to excel file is correct in config.R:\n -> ",path_DIMSfile,"\n")) + cat(paste0("\n\n**** Error: Could not find an Excel file. Please check location of file:\n -> ", path_dims_file, "\n")) } -#### STEP 2: Edit DIMS data ##### +#### STEP 2: Edit DIMS data ##### # in: dims_xls ||| out: Data, nr_contr, nr_pat # Input: the xlsx file that comes out of the pipeline with format: # [plots] [C] [P] [summary columns] [C_Zscore] [P_Zscore] # Output: "_CSV.csv" file that is suited for the algorithm in shiny. # Determine the number of Contols and Patients in column names: -nr_contr <- length(grep("C",names(dims_xls)))/2 # Number of control samples -nr_pat <- length(grep("P",names(dims_xls)))/2 # Number of patient samples +nr_contr <- length(grep("C", names(dims_xls))) / 2 +nr_pat <- length(grep("P", names(dims_xls))) / 2 # total number of samples nrsamples <- nr_contr + nr_pat # check whether the number of intensity columns equals the number of Zscore columns @@ -100,19 +96,19 @@ if (nr_contr + nr_pat != length(grep("_Zscore", names(dims_xls)))) { } cat(paste0("\n\n------------\n", nr_contr, " controls \n", nr_pat, " patients\n------------\n\n")) -# Move the columns HMDB_code and HMDB_name to the beginning. -HMDB_info_cols <- c(which(colnames(dims_xls) == "HMDB_code"), which(colnames(dims_xls) == "HMDB_name")) -other_cols <- seq_along(1:ncol(dims_xls))[-HMDB_info_cols] -dims_xls_copy <- dims_xls[ , c(HMDB_info_cols, other_cols)] +# Move the columns HMDB_code and HMDB_name to the beginning. +hmdb_info_cols <- c(which(colnames(dims_xls) == "HMDB_code"), which(colnames(dims_xls) == "HMDB_name")) +other_cols <- seq_along(1:ncol(dims_xls))[-hmdb_info_cols] +dims_xls_copy <- dims_xls[, c(hmdb_info_cols, other_cols)] # Remove the columns from 'name' to 'pathway' from_col <- which(colnames(dims_xls_copy) == "name") to_col <- which(colnames(dims_xls_copy) == "pathway") -dims_xls_copy <- dims_xls_copy[ , -c(from_col:to_col)] +dims_xls_copy <- dims_xls_copy[, -c(from_col:to_col)] # in case the excel had an empty "plots" column, remove it -if ("plots" %in% colnames(dims_xls_copy)) { - dims_xls_copy <- dims_xls_copy[ , -grep("plots", colnames(dims_xls_copy))] -} -# Rename columns +if ("plots" %in% colnames(dims_xls_copy)) { + dims_xls_copy <- dims_xls_copy[, -grep("plots", colnames(dims_xls_copy))] +} +# Rename columns names(dims_xls_copy) <- gsub("avg.ctrls", "Mean_controls", names(dims_xls_copy)) names(dims_xls_copy) <- gsub("sd.ctrls", "SD_controls", names(dims_xls_copy)) names(dims_xls_copy) <- gsub("HMDB_code", "HMDB.code", names(dims_xls_copy)) @@ -121,7 +117,7 @@ names(dims_xls_copy) <- gsub("HMDB_name", "HMDB.name", names(dims_xls_copy)) # intensity columns and mean and standard deviation of controls numeric_cols <- c(3:ncol(dims_xls_copy)) # make sure all values are numeric -dims_xls_copy[ , numeric_cols] <- sapply(dims_xls_copy[ , numeric_cols], as.numeric) +dims_xls_copy[, numeric_cols] <- sapply(dims_xls_copy[, numeric_cols], as.numeric) if (exists("dims_xls_copy") & (length(dims_xls_copy) < length(dims_xls))) { cat("\n### Step 2 # Edit dims data is done.\n") @@ -129,9 +125,9 @@ if (exists("dims_xls_copy") & (length(dims_xls_copy) < length(dims_xls))) { cat("\n**** Error: Could not execute step 2 \n") } -#### STEP 3: Calculate ratios of intensities for metabolites #### +#### STEP 3: Calculate ratios of intensities for metabolites #### # in: ratios, file_ratios_metabolites, dims_xls_copy, nr_contr, nr_pat ||| out: Zscore (+file) -# This script loads the file with Ratios (file_ratios_metabolites) and calculates +# This script loads the file with Ratios (file_ratios_metabolites) and calculates # the ratios of the intensities of the given metabolites. It also calculates # Zs-cores based on the avg and sd of the ratios of the controls. @@ -141,38 +137,38 @@ if (exists("dims_xls_copy") & (length(dims_xls_copy) < length(dims_xls))) { # Output: "_CSV.csv" file that is suited for the algorithm, with format: # "_Ratios_CSV.csv" file, same file as above, but with ratio rows added. -if (ratios == 1) { +if (ratios == 1) { cat(paste0("\nloading ratios file:\n -> ", file_ratios_metabolites, "\n")) - ratio_input <- read.csv(file_ratios_metabolites, sep=';', stringsAsFactors=FALSE) - + ratio_input <- read.csv(file_ratios_metabolites, sep = ";", stringsAsFactors = FALSE) + # Prepare empty data frame to fill with ratios ratio_list <- setNames(data.frame(matrix( - ncol=ncol(dims_xls_copy), - nrow=nrow(ratio_input) + ncol = ncol(dims_xls_copy), + nrow = nrow(ratio_input) )), colnames(dims_xls_copy)) - + # put HMDB info into first two columns of ratio_list - ratio_list[ ,1:2] <- ratio_input[ ,1:2] + ratio_list[, 1:2] <- ratio_input[, 1:2] # look for intensity columns (exclude Zscore columns) control_cols <- grep("C", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) patient_cols <- grep("P", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) intensity_cols <- c(control_cols, patient_cols) - # calculate each of the ratios of intensities + # calculate each of the ratios of intensities for (ratio_index in 1:nrow(ratio_input)) { - ratio_numerator <- ratio_input[ratio_index, "HMDB_numerator"] + ratio_numerator <- ratio_input[ratio_index, "HMDB_numerator"] ratio_numerator <- strsplit(ratio_numerator, "plus")[[1]] - ratio_denominator <- ratio_input[ratio_index, "HMDB_denominator"] + ratio_denominator <- ratio_input[ratio_index, "HMDB_denominator"] ratio_denominator <- strsplit(ratio_denominator, "plus")[[1]] # find these HMDB IDs in dataset. Could be a sum of multiple metabolites sel_denominator <- sel_numerator <- c() - for (numerator_index in 1:length(ratio_numerator)) { - sel_numerator <- c(sel_numerator, which(dims_xls_copy[ , "HMDB.code"] == ratio_numerator[numerator_index])) + for (numerator_index in 1:length(ratio_numerator)) { + sel_numerator <- c(sel_numerator, which(dims_xls_copy[, "HMDB.code"] == ratio_numerator[numerator_index])) } - for (denominator_index in 1:length(ratio_denominator)) { - # special case for sum of metabolites (dividing by one) + for (denominator_index in 1:length(ratio_denominator)) { + # special case for sum of metabolites (dividing by one) if (ratio_denominator[denominator_index] != "one") { - sel_denominator <- c(sel_denominator, which(dims_xls_copy[ , "HMDB.code"] == ratio_denominator[denominator_index])) + sel_denominator <- c(sel_denominator, which(dims_xls_copy[, "HMDB.code"] == ratio_denominator[denominator_index])) } } # calculate ratio @@ -184,182 +180,178 @@ if (ratios == 1) { ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) } # calculate log of ratio - ratio_list[ratio_index, intensity_cols]<- log2(ratio_list[ratio_index, intensity_cols]) + ratio_list[ratio_index, intensity_cols] <- log2(ratio_list[ratio_index, intensity_cols]) } - + # Calculate means and SD's of the calculated ratios for Controls - ratio_list[ , "Mean_controls"] <- apply(ratio_list[ , control_cols], 1, mean) - ratio_list[ , "SD_controls"] <- apply(ratio_list[ , control_cols], 1, sd) + ratio_list[, "Mean_controls"] <- apply(ratio_list[, control_cols], 1, mean) + ratio_list[, "SD_controls"] <- apply(ratio_list[, control_cols], 1, sd) # Calc z-scores with the means and SD's of Controls zscore_cols <- grep("Zscore", colnames(ratio_list)) for (sample_index in 1:length(zscore_cols)) { - zscore_col <- zscore_cols[sample_index] + zscore_col <- zscore_cols[sample_index] # matching intensity column int_col <- intensity_cols[sample_index] # test on column names if (check_same_samplename(colnames(ratio_list)[int_col], colnames(ratio_list)[zscore_col])) { # calculate Z-scores - ratio_list[ , zscore_col] <- (ratio_list[ , int_col] - ratio_list[ , "Mean_controls"]) / ratio_list[ , "SD_controls"] + ratio_list[, zscore_col] <- (ratio_list[, int_col] - ratio_list[, "Mean_controls"]) / ratio_list[, "SD_controls"] } } - + # Add rows of the ratio hmdb codes to the data of zscores from the pipeline. dims_xls_ratios <- rbind(ratio_list, dims_xls_copy) # Edit the DIMS output Zscores of all patients in format: # HMDB_code patientname1 patientname2 - names(dims_xls_ratios) <- gsub("HMDB.code","HMDB_code", names(dims_xls_ratios)) + names(dims_xls_ratios) <- gsub("HMDB.code", "HMDB_code", names(dims_xls_ratios)) names(dims_xls_ratios) <- gsub("HMDB.name", "HMDB_name", names(dims_xls_ratios)) # for debugging: - write.table(dims_xls_ratios, file=paste0(output_dir, "/ratios.txt"), sep="\t") + write.table(dims_xls_ratios, file = paste0(output_dir, "/ratios.txt"), sep = "\t") - # Select only the cols with zscores of the patients - zscore_patients <- dims_xls_ratios[ , c(1, 2, zscore_cols[grep("P", colnames(dims_xls_ratios)[zscore_cols])])] + # Select only the cols with zscores of the patients + zscore_patients <- dims_xls_ratios[, c(1, 2, zscore_cols[grep("P", colnames(dims_xls_ratios)[zscore_cols])])] # Select only the cols with zscores of the controls - zscore_controls <- dims_xls_ratios[ , c(1, 2, zscore_cols[grep("C", colnames(dims_xls_ratios)[zscore_cols])])] + zscore_controls <- dims_xls_ratios[, c(1, 2, zscore_cols[grep("C", colnames(dims_xls_ratios)[zscore_cols])])] } -#### STEP 4: Run the IEM algorithm ######### +#### STEP 4: Run the IEM algorithm ######### # in: algorithm, file_expected_biomarkers_IEM, zscore_patients ||| out: prob_score (+file) # algorithm taken from DOI: 10.3390/ijms21030979 if (algorithm == 1) { # Load data cat(paste0("\nloading expected file:\n -> ", file_expected_biomarkers_IEM, "\n")) - expected_biomarkers <- read.csv(file_expected_biomarkers_IEM, sep=';', stringsAsFactors=FALSE) + expected_biomarkers <- read.csv(file_expected_biomarkers_IEM, sep = ";", stringsAsFactors = FALSE) # modify column names - names(expected_biomarkers) <- gsub("HMDB.code", "HMDB_code", names(expected_biomarkers)) + names(expected_biomarkers) <- gsub("HMDB.code", "HMDB_code", names(expected_biomarkers)) names(expected_biomarkers) <- gsub("Metabolite", "HMDB_name", names(expected_biomarkers)) - + # prepare dataframe scaffold rank_patients rank_patients <- zscore_patients # Fill df rank_patients with the ranks for each patient for (patient_index in 3:ncol(zscore_patients)) { # number of positive zscores in patient - pos <- sum(zscore_patients[ , patient_index] > 0) + pos <- sum(zscore_patients[, patient_index] > 0) # sort the column on zscore; NB: this sorts the entire object, not just one column rank_patients <- rank_patients[order(-rank_patients[patient_index]), ] # Rank all positive zscores highest to lowest rank_patients[1:pos, patient_index] <- as.numeric(ordered(-rank_patients[1:pos, patient_index])) # Rank all negative zscores lowest to highest - rank_patients[(pos+1):nrow(rank_patients), patient_index] <- as.numeric(ordered(rank_patients[(pos+1):nrow(rank_patients), patient_index])) + rank_patients[(pos + 1):nrow(rank_patients), patient_index] <- as.numeric(ordered(rank_patients[(pos + 1):nrow(rank_patients), patient_index])) } - + # Calculate metabolite score, using the dataframes with only values, and later add the cols without values (1&2). - expected_zscores <- merge(x=expected_biomarkers, y=zscore_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code")) - expected_zscores_original <- expected_zscores # necessary copy? - + expected_zscores <- merge(x = expected_biomarkers, y = zscore_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code")) + expected_zscores_original <- expected_zscores + # determine which columns contain Z-scores and which contain disease info select_zscore_cols <- grep("_Zscore", colnames(expected_zscores)) - select_info_cols <- 1:(min(select_zscore_cols) -1) + select_info_cols <- 1:(min(select_zscore_cols) - 1) # set some zscores to zero - select_incr_indisp <- which(expected_zscores$Change=="Increase" & expected_zscores$Dispensability=="Indispensable") - expected_zscores[select_incr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_incr_indisp, select_zscore_cols], function(x) ifelse (x <= 1.6 , 0, x)) - select_decr_indisp <- which(expected_zscores$Change=="Decrease" & expected_zscores$Dispensability=="Indispensable") - expected_zscores[select_decr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_decr_indisp, select_zscore_cols], function(x) ifelse (x >= -1.2 , 0, x)) - - # calculate rank score: - expected_ranks <- merge(x=expected_biomarkers, y=rank_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code")) - rank_scores <- expected_zscores[order(expected_zscores$HMDB_code), select_zscore_cols]/(expected_ranks[order(expected_ranks$HMDB_code), select_zscore_cols]*0.9) + select_incr_indisp <- which(expected_zscores$Change == "Increase" & expected_zscores$Dispensability == "Indispensable") + expected_zscores[select_incr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_incr_indisp, select_zscore_cols], function(x) ifelse (x <= 1.6, 0, x)) + select_decr_indisp <- which(expected_zscores$Change == "Decrease" & expected_zscores$Dispensability == "Indispensable") + expected_zscores[select_decr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_decr_indisp, select_zscore_cols], function(x) ifelse (x >= -1.2, 0, x)) + + # calculate rank score: + expected_ranks <- merge(x = expected_biomarkers, y = rank_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code")) + rank_scores <- expected_zscores[order(expected_zscores$HMDB_code), select_zscore_cols] / (expected_ranks[order(expected_ranks$HMDB_code), select_zscore_cols] * 0.9) # combine disease info with rank scores expected_metabscore <- cbind(expected_ranks[order(expected_zscores$HMDB_code), select_info_cols], rank_scores) - + # multiply weight score and rank score weight_score <- expected_zscores - weight_score[ , select_zscore_cols] <- expected_metabscore$Total_Weight * expected_metabscore[ , select_zscore_cols] - + weight_score[, select_zscore_cols] <- expected_metabscore$Total_Weight * expected_metabscore[, select_zscore_cols] + # sort table on Disease and Absolute_Weight weight_score <- weight_score[order(weight_score$Disease, weight_score$Absolute_Weight, decreasing = TRUE), ] # select columns to check duplicates - dup <- weight_score[ , c('Disease', 'M.z')] - uni <- weight_score[!duplicated(dup) | !duplicated(dup, fromLast=FALSE),] - + dup <- weight_score[, c("Disease", "M.z")] + uni <- weight_score[!duplicated(dup) | !duplicated(dup, fromLast = FALSE), ] + # calculate probability score - prob_score <- aggregate(uni[ , select_zscore_cols], uni["Disease"], sum) + prob_score <- aggregate(uni[, select_zscore_cols], uni["Disease"], sum) # list of all diseases that have at least one metabolite Zscore at 0 for (patient_index in 2:ncol(prob_score)) { patient_zscore_colname <- colnames(prob_score)[patient_index] matching_colname_expected <- which(colnames(expected_zscores) == patient_zscore_colname) # determine which Zscores are 0 for this patient - zscores_zero <- which(expected_zscores[ , matching_colname_expected] == 0) - # get Disease for these + zscores_zero <- which(expected_zscores[, matching_colname_expected] == 0) + # get Disease for these disease_zero <- unique(expected_zscores[zscores_zero, "Disease"]) - # set the probability score of these diseases to 0 - prob_score[which(prob_score$Disease %in% disease_zero), patient_index]<- 0 + # set the probability score of these diseases to 0 + prob_score[which(prob_score$Disease %in% disease_zero), patient_index] <- 0 } - + # determine disease rank per patient - disease_rank <- prob_score + disease_rank <- prob_score # rank diseases in decreasing order disease_rank[2:ncol(disease_rank)] <- lapply(2:ncol(disease_rank), function(x) as.numeric(ordered(-disease_rank[1:nrow(disease_rank), x]))) # modify column names, Zscores have now been converted to probability scores - colnames(prob_score) <- gsub("_Zscore","_prob_score", colnames(prob_score)) # redundant? - colnames(disease_rank) <- gsub("_Zscore","", colnames(disease_rank)) - - # Create conditional formatting for output excel sheet. Colors according to values. + colnames(prob_score) <- gsub("_Zscore", "_prob_score", colnames(prob_score)) + colnames(disease_rank) <- gsub("_Zscore", "", colnames(disease_rank)) + + # Create conditional formatting for output Excel sheet. Colors according to values. wb <- createWorkbook() addWorksheet(wb, "Probability Scores") writeData(wb, "Probability Scores", prob_score) - conditionalFormatting(wb, "Probability Scores", cols = 2:ncol(prob_score), rows = 1:nrow(prob_score), type = "colourScale", style = c("white","#FFFDA2","red"), rule = c(1, 10, 100)) - saveWorkbook(wb, file = paste0(output_dir,"/algoritme_output_", run_name, ".xlsx"), overwrite = TRUE) + conditionalFormatting(wb, "Probability Scores", cols = 2:ncol(prob_score), rows = 1:nrow(prob_score), type = "colourScale", style = c("white", "#FFFDA2", "red"), rule = c(1, 10, 100)) + saveWorkbook(wb, file = paste0(output_dir, "/algoritme_output_", run_name, ".xlsx"), overwrite = TRUE) # check whether prob_score df exists and has expected dimensions. if (exists("expected_biomarkers") & (length(disease_rank) == length(prob_score))) { cat("\n### Step 4 # Running the IEM algorithm is done.\n\n") } else { cat("\n**** Error: Could not run IEM algorithm. Check if path to expected_biomarkers csv-file is correct. \n") } - + rm(wb) } -#### STEP 5: Make violin plots ##### +#### STEP 5: Make violin plots ##### # in: algorithm / zscore_patients, violin, nr_contr, nr_pat, Data, path_textfiles, zscore_cutoff, xaxis_cutoff, top_diseases, top_metab, output_dir ||| out: pdf file if (violin == 1) { # make violin plots - + # preparation - # isobarics_txt <- c() - zscore_patients_copy <- zscore_patients - # keep the original for testing purposes, remove later. - colnames(zscore_patients) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_patients)) # for robust scaler + zscore_patients_copy <- zscore_patients + # for robust scaler, rename Z-score columns + colnames(zscore_patients) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_patients)) + colnames(zscore_controls) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_controls)) colnames(zscore_patients) <- gsub("_Zscore", "", colnames(zscore_patients)) - colnames(zscore_controls) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_controls)) # for robust scaler colnames(zscore_controls) <- gsub("_Zscore", "", colnames(zscore_controls)) - + # Make patient list for violin plots - patient_list <- names(zscore_patients)[-c(1,2)] + patient_list <- names(zscore_patients)[-c(1, 2)] - # from table expected_biomarkers, choose selected columns + # from table expected_biomarkers, choose selected columns select_columns <- c("Disease", "HMDB_code", "HMDB_name") select_col_nrs <- which(colnames(expected_biomarkers) %in% select_columns) - expected_biomarkers_select <- expected_biomarkers[ , select_col_nrs] + expected_biomarkers_select <- expected_biomarkers[, select_col_nrs] # remove duplicates - expected_biomarkers_select <- expected_biomarkers_select[!duplicated(expected_biomarkers_select[ , c(1,2)]), ] - + expected_biomarkers_select <- expected_biomarkers_select[!duplicated(expected_biomarkers_select[, c(1, 2)]), ] + # load file with explanatory information to be included in PDF. explanation <- readLines(file_explanation) - - # for debugging: - #write.table(explanation, file=paste0(outdir, "explanation_read_in.txt"), sep="\t") # first step: normal violin plots # Find all text files in the given folder, which contain metabolite lists of which # each file will be a page in the pdf with violin plots. # Make a PDF file for each of the categories in metabolite_dirs - metabolite_dirs <- list.files(path=path_metabolite_groups, full.names=FALSE, recursive=FALSE) + metabolite_dirs <- list.files(path = path_metabolite_groups, full.names = FALSE, recursive = FALSE) for (metabolite_dir in metabolite_dirs) { # create a directory for the output PDFs - pdf_dir <- paste(output_dir, metabolite_dir, sep="/") - dir.create(pdf_dir, showWarnings=FALSE) + pdf_dir <- paste(output_dir, metabolite_dir, sep = "/") + dir.create(pdf_dir, showWarnings = FALSE) cat("making plots in category:", metabolite_dir, "\n") - + # get a list of all metabolite files - metabolite_files <- list.files(path=paste(path_metabolite_groups, metabolite_dir, sep="/"), pattern="*.txt", full.names=FALSE, recursive=FALSE) + metabolite_files <- list.files(path = paste(path_metabolite_groups, metabolite_dir, sep = "/"), pattern = "*.txt", full.names = FALSE, recursive = FALSE) # put all metabolites into one list metab_list_all <- list() metab_list_names <- c() @@ -367,20 +359,20 @@ if (violin == 1) { # make violin plots # open the text files and add each to a list of dataframes (metab_list_all) for (file_index in seq_along(metabolite_files)) { infile <- metabolite_files[file_index] - metab_list <- read.table(paste(path_metabolite_groups, metabolite_dir, infile, sep="/"), sep = "\t", header = TRUE, quote="") + metab_list <- read.table(paste(path_metabolite_groups, metabolite_dir, infile, sep = "/"), sep = "\t", header = TRUE, quote = "") # put into list of all lists metab_list_all[[file_index]] <- metab_list metab_list_names <- c(metab_list_names, strsplit(infile, ".txt")[[1]][1]) cat(paste0("\n", infile)) - } + } # include list of classes in metabolite list names(metab_list_all) <- metab_list_names - + # prepare list of metabolites; max nr_plots_perpage on one page metab_interest_sorted <- prepare_data(metab_list_all, zscore_patients) metab_interest_controls <- prepare_data(metab_list_all, zscore_controls) metab_perpage <- prepare_data_perpage(metab_interest_sorted, metab_interest_controls, nr_plots_perpage, nr_pat, nr_contr) - + # make violin plots per patient for (pt_nr in 1:length(patient_list)) { pt_name <- patient_list[pt_nr] @@ -388,21 +380,19 @@ if (violin == 1) { # make violin plots # for category Other, make list of top highest and lowest Z-scores for this patient if (grepl("Diagnost", pdf_dir)) { top_metab_pt <- prepare_alarmvalues(pt_name, metab_interest_sorted) - # save(top_metab_pt, file=paste0(outdir, "/start_15_prepare_alarmvalues.RData")) } else { top_metab_pt <- prepare_toplist(pt_name, zscore_patients) - # save(top_metab_pt, file=paste0(outdir, "/start_15_prepare_toplist.RData")) } # generate normal violin plots create_violin_plots(pdf_dir, pt_name, metab_perpage, top_metab_pt) - - } # end for pt_nr - } # end for metabolite_dir - + } + + } + # Second step: dIEM plots in separate directory - dIEM_plot_dir <- paste(output_dir, "dIEM_plots", sep="/") + dIEM_plot_dir <- paste(output_dir, "dIEM_plots", sep = "/") dir.create(dIEM_plot_dir) # Select the metabolites that are associated with the top highest scoring IEM, for each patient @@ -411,7 +401,7 @@ if (violin == 1) { # make violin plots pt_name <- patient_list[pt_nr] # get top diseases for this patient pt_colnr <- which(colnames(disease_rank) == pt_name) - pt_top_indices <- which(disease_rank[ , pt_colnr] <= top_nr_IEM) + pt_top_indices <- which(disease_rank[, pt_colnr] <= top_nr_IEM) pt_IEMs <- disease_rank[pt_top_indices, "Disease"] pt_top_IEMs <- pt_prob_score_top_IEMs <- c() for (single_IEM in pt_IEMs) { @@ -441,7 +431,7 @@ if (violin == 1) { # make violin plots select_rows <- which(expected_biomarkers_select$Disease == single_IEM) metab_list <- expected_biomarkers_select[select_rows, ] metab_IEM_names <- c(metab_IEM_names, paste0(single_IEM, ", probability score ", single_prob_score)) - metab_list <- metab_list[ , -1] + metab_list <- metab_list[, -1] metab_IEM_all[[single_IEM_index]] <- metab_list } # put all metabolites into one list @@ -457,11 +447,10 @@ if (violin == 1) { # make violin plots create_violin_plots(dIEM_plot_dir, pt_name, dIEM_metab_perpage, top_metab_pt) } else { - cat(paste0("\n\n**** This patient had no prob_scores higher than ", threshold_IEM,". - Therefore, this pdf was not made:\t ", pt_name ,"_IEM \n")) + cat(paste0("\n\n**** This patient had no prob_scores higher than ", threshold_IEM, ". + Therefore, this pdf was not made:\t ", pt_name, "_IEM \n")) } - } # end for pt_nr - -} # end if violin = 1 + } +} diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R index 06e0f2c..7441959 100644 --- a/DIMS/HMDBparts.R +++ b/DIMS/HMDBparts.R @@ -4,10 +4,9 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -# Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file $params.hmdb_parts_files $params.standard_run $params.ppm -db_path <- cmd_args[1] # location of HMDB db file -breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData -standard_run <- cmd_args[4] # "yes" +db_path <- cmd_args[1] +breaks_filepath <- cmd_args[2] +standard_run <- cmd_args[4] # Cut up entire HMDB into small parts based on the new binning/breaks load(breaks_filepath) @@ -19,11 +18,11 @@ max_mz <- round(breaks_fwhm[length(breaks_fwhm)]) # test if standard mz range is used if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < 610) { # skip generating HMDB parts - hmdb_parts_files <- cmd_args[3] - - hmdb_parts <- list.files(hmdb_parts_files, pattern="hmdb") # all files containing hmdb in file name + hmdb_parts_path <- cmd_args[3] + # find all files containing hmdb in file name + hmdb_parts <- list.files(hmdb_parts_path, pattern = "hmdb") for (hmdb_file in hmdb_parts) { - file.copy(paste(hmdb_parts_files, hmdb_file, sep="/"), "./", recursive = TRUE) + file.copy(paste(hmdb_parts_path, hmdb_file, sep = "/"), "./", recursive = TRUE) } } else { # generate HMDB parts in case of non-standard mz range @@ -31,7 +30,6 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < ppm <- as.numeric(cmd_args[5]) scanmodes <- c("positive", "negative") - for (scanmode in scanmodes) { if (scanmode == "negative") { column_label <- "MNeg" @@ -42,48 +40,49 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < } # filter mass range meassured!!! - HMDB_add_iso = HMDB_add_iso[which(HMDB_add_iso[ ,column_label] >= breaks_fwhm[1] & - HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] + HMDB_add_iso = HMDB_add_iso[which(HMDB_add_iso[ , column_label] >= breaks_fwhm[1] & + HMDB_add_iso[ , column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] # sort on mass - outlist <- HMDB_add_iso[order(as.numeric(HMDB_add_iso[,column_label])),] + outlist <- HMDB_add_iso[order(as.numeric(HMDB_add_iso[ , column_label])),] - n <- dim(outlist)[1] - sub <- 20000 # max rows per file + nr_rows <- dim(outlist)[1] + # maximum number of rows per file + sub <- 20000 end <- 0 - min_1_last <- sub + last_line <- sub check <- 0 outlist_part <- NULL - if (n < sub) { + if (nr_rows < sub) { outlist_part <- outlist save(outlist_part, file = paste0("./", scanmode, "_hmdb.1.RData")) } else { - if (n >= sub & (floor(n/sub) - 1) >= 2){ - for (i in 2:floor(n/sub) - 1){ - start <- -(sub - 1) + i*sub - end <- i*sub + if (nr_rows >= sub & (floor(nr_rows / sub) - 1) >= 2) { + for (i in 2:floor(nr_rows / sub) - 1) { + start <- -(sub - 1) + i * sub + end <- i * sub if (i > 1){ outlist_i = outlist[c(start:end),] - n_moved = 0 + nr_moved = 0 - # Calculate 3ppm and replace border, avoid cut within peakgroup! - while ((as.numeric(outlist_i[1,column_label]) - as.numeric(outlist_part[min_1_last,column_label]))*1e+06/as.numeric(outlist_i[1,column_label]) < ppm) { - outlist_part <- rbind(outlist_part, outlist_i[1,]) - outlist_i <- outlist_i[-1,] - n_moved <- n_moved + 1 + # Use ppm to replace border to avoid cut within peakgroup! + while ((as.numeric(outlist_i[1, column_label]) - as.numeric(outlist_part[last_line, column_label])) * 1e+06 / + as.numeric(outlist_i[1, column_label]) < ppm) { + outlist_part <- rbind(outlist_part, outlist_i[1, ]) + outlist_i <- outlist_i[-1, ] + nr_moved <- nr_moved + 1 } - # message(paste("Process", i-1,":", dim(outlist_part)[1])) - save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i-1, "RData", sep="."), sep="")) + save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i-1, "RData", sep = "."), sep = "")) check <- check + dim(outlist_part)[1] outlist_part <- outlist_i - min_1_last <- dim(outlist_part)[1] + last_line <- dim(outlist_part)[1] } else { outlist_part <- outlist[c(start:end),] @@ -92,27 +91,26 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < } start <- end + 1 - end <- n + end <- nr_rows outlist_i <- outlist[c(start:end),] - n_moved <- 0 + nr_moved <- 0 if (!is.null(outlist_part)) { # Calculate ppm and replace border, avoid cut within peakgroup! - while ((as.numeric(outlist_i[1,column_label]) - as.numeric(outlist_part[min_1_last,column_label]))*1e+06/as.numeric(outlist_i[1,column_label]) < ppm) { + while ((as.numeric(outlist_i[1, column_label]) - as.numeric(outlist_part[last_line, column_label])) * 1e+06 / + as.numeric(outlist_i[1, column_label]) < ppm) { outlist_part <- rbind(outlist_part, outlist_i[1,]) outlist_i <- outlist_i[-1,] - n_moved <- n_moved + 1 + nr_moved <- nr_moved + 1 } - save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i, "RData", sep = "."), sep = "")) + save(outlist_part, file = paste0("./", scanmode, "_hmdb_", i, ".RData")) check <- check + dim(outlist_part)[1] } outlist_part <- outlist_i - save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i + 1, "RData", sep="."), sep="")) + save(outlist_part, file = paste0("./", scanmode, "_hmdb_", i + 1, ".RData")) check <- check + dim(outlist_part)[1] - # cat("\n", "Check", check == dim(outlist)[1]) - } } } diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R index 941d45f..00cf5c9 100644 --- a/DIMS/HMDBparts_main.R +++ b/DIMS/HMDBparts_main.R @@ -4,16 +4,14 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -db_path <- cmd_args[1] # location of HMDB db file -breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData +db_path <- cmd_args[1] +breaks_filepath <- cmd_args[2] load(db_path) load(breaks_filepath) # Cut up HMDB minus adducts minus isotopes into small parts - scanmodes <- c("positive", "negative") - for (scanmode in scanmodes) { if (scanmode == "negative") { column_label <- "MNeg" @@ -23,37 +21,38 @@ for (scanmode in scanmodes) { HMDB_add_iso <- HMDB_add_iso.Pos } - # filter mass range meassured NB: remove the last comma?! - outlist <- HMDB_add_iso[which(HMDB_add_iso[ ,column_label] >= breaks_fwhm[1] & HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] + # filter mass range measured + outlist <- HMDB_add_iso[which(HMDB_add_iso[ , column_label] >= breaks_fwhm[1] & + HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] # remove adducts and isotopes, put internal standard at the beginning - outlist_IS <- outlist[grep("IS", outlist[ , "CompoundName"], fixed=TRUE), ] - outlist <- outlist[grep("HMDB", rownames(outlist), fixed=TRUE), ] - outlist <- outlist[-grep("_", rownames(outlist), fixed=TRUE), ] + outlist_IS <- outlist[grep("IS", outlist[ , "CompoundName"], fixed = TRUE), ] + outlist <- outlist[grep("HMDB", rownames(outlist), fixed = TRUE), ] + outlist <- outlist[-grep("_", rownames(outlist), fixed = TRUE), ] outlist <- rbind(outlist_IS, outlist) # sort on m/z value - outlist <- outlist[order(outlist[ ,column_label]), ] + outlist <- outlist[order(outlist[ , column_label]), ] - n <- dim(outlist)[1] + nr_rows <- dim(outlist)[1] # size of hmdb parts in lines: sub <- 1000 end <- 0 check <- 0 # generate hmdb parts - if (n >= sub & (floor(n/sub)) >= 2) { - for (i in 1:floor(n/sub)){ - start <- -(sub-1) + i*sub - end <- i*sub + if (nr_rows >= sub & (floor(nr_rows / sub)) >= 2) { + for (i in 1:floor(nr_rows / sub)) { + start <- -(sub-1) + i * sub + end <- i * sub outlist_part <- outlist[c(start:end), ] - save(outlist_part, file=paste0(scanmode, "_hmdb.", i, ".RData")) + save(outlist_part, file=paste0(scanmode, "_hmdb_main_", i, ".RData")) } } } # finish last hmdb part start <- end + 1 -end <- n +end <- nr_rows outlist_part <- outlist[c(start:end),] save(outlist_part, file = paste0(scanmode, "_hmdb.", i+1, ".RData")) diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R index 3b125e3..33794c5 100644 --- a/DIMS/MakeInit.R +++ b/DIMS/MakeInit.R @@ -1,29 +1,29 @@ #!/usr/bin/env Rscript ## adapted from makeInit in old pipeline -args <- commandArgs(trailingOnly=TRUE) -sample_sheet <- read.csv(args[1], sep="\t") +args <- commandArgs(trailingOnly = TRUE) +sample_sheet <- read.csv(args[1], sep = "\t") nr_replicates <- as.numeric(args[2]) -sampleNames <- trimws(as.vector(unlist(sample_sheet[1]))) -nr_sampgrps <- length(sampleNames)/nr_replicates -groupNames <- trimws(as.vector(unlist(sample_sheet[2]))) -groupNames <- gsub('[^-.[:alnum:]]', '_', groupNames) -groupNamesUnique <- unique(groupNames) +sample_names <- trimws(as.vector(unlist(sample_sheet[1]))) +nr_sampgrps <- length(sampleNames) / nr_replicates +group_names <- trimws(as.vector(unlist(sample_sheet[2]))) +group_names <- gsub("[^-.[:alnum:]]", "_", group_names) +group_names_unique <- unique(group_names) repl_pattern <- c() for (sampgrp in 1:nr_sampgrps) { tmp <- c() for (repl in nr_replicates:1) { - index <- ((sampgrp*nr_replicates) - repl) + 1 + index <- ((sampgrp * nr_replicates) - repl) + 1 tmp <- c(tmp, sampleNames[index]) } repl_pattern <- c(repl_pattern, list(tmp)) } -names(repl_pattern) <- groupNamesUnique +names(repl_pattern) <- group_names_unique # preview the replication pattern -print(head(repl_pattern)) +print(tail(repl_pattern)) -save(repl_pattern, file="./init.RData") +save(repl_pattern, file = "./init.RData") diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R index 926cb77..e978d4b 100644 --- a/DIMS/PeakFinding.R +++ b/DIMS/PeakFinding.R @@ -1,15 +1,14 @@ #!/usr/bin/Rscript # adapted from 4-peakFinding.R -# define parameters +# define parameters cmd_args <- commandArgs(trailingOnly = TRUE) filepath <- cmd_args[1] breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData resol <- as.numeric(cmd_args[3]) scripts_dir <- cmd_args[4] - -thresh <- 2000 +thresh <- 2000 # load in function scripts source(paste0(scripts_dir, "AddOnFunctions/findPeaks.Gauss.HPC.R")) @@ -38,15 +37,18 @@ load(breaks_filepath) # Load output of AverageTechReplicates for a sample sample_avgtechrepl <- get(load(filepath)) -if (grepl("_pos", filepath)) { scanmode = "positive" } else - if (grepl("_neg", filepath)) { scanmode = "negative" } +if (grepl("_pos", filepath)) { + scanmode <- "positive" +} else if (grepl("_neg", filepath)) { + scanmode <- "negative" +} # Initialize options(digits = 16) -int.factor <- 1*10^5 # Number of x used to calc area under Gaussian (is not analytic) +int_factor <- 1 * 10^5 # Number used to calculate area under Gaussian curve scale <- 2 # Initial value used to estimate scaling parameter width <- 1024 height <- 768 # run the findPeaks function -findPeaks.Gauss.HPC(sample_avgtechrepl, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) +findPeaks.Gauss.HPC(sample_avgtechrepl, breaks.fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R index 423fb69..d04b6a2 100644 --- a/DIMS/PeakGrouping.R +++ b/DIMS/PeakGrouping.R @@ -1,176 +1,186 @@ #!/usr/bin/Rscript # adapted from 6-peakGrouping.R -# define parameters +# define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -HMDB_part_file <- cmd_args[1] +hmdb_part_file <- cmd_args[1] ppm <- as.numeric(cmd_args[2]) -options(digits=16) +options(digits = 16) # load part of the HMDB -HMDB_add_iso <- get(load(HMDB_part_file)) +hmdb_add_iso <- get(load(hmdb_part_file)) -# determine appropriate scanmode based on HMDB_part_file -if (grepl("negative", basename(HMDB_part_file))) { scanmode <- "negative" } else - if (grepl("positive", basename(HMDB_part_file))) { scanmode <- "positive" } +# determine appropriate scanmode based on hmdb_part_file +if (grepl("negative", basename(hmdb_part_file))) { + scanmode <- "negative" +} else if (grepl("positive", basename(hmdb_part_file))) { + scanmode <- "positive" +} # determine batch number of HMDB part file -batch_number = strsplit(basename(HMDB_part_file), ".",fixed = TRUE)[[1]][2] +batch_number <- strsplit(basename(hmdb_part_file), ".", fixed = TRUE)[[1]][2] # load file with spectrum peaks -SpecPeaks_file <- paste0("SpectrumPeaks_", scanmode, ".RData") -load(SpecPeaks_file) -outlist.copy <- outlist.tot -rm(outlist.tot) +spec_peaks_file <- paste0("SpectrumPeaks_", scanmode, ".RData") +load(spec_peaks_file) +outlist_copy <- outlist_tot +rm(outlist_tot) # load replication pattern pattern_file <- paste0(scanmode, "_repl_pattern.RData") load(pattern_file) -# determine appropriate column name in HMDB part -if (scanmode=="negative") { column_label <- "MNeg" } else { column_label <- "Mpos" } +# determine appropriate column name in HMDB part +if (scanmode == "negative") { + column_label <- "MNeg" +} else { + column_label <- "Mpos" +} # Initialize -outpgrlist.identified <- NULL +peakgrouplist_identified <- NULL list_of_peaks_used_in_peak_groups_identified <- NULL # First find peak groups identified based on HMDB masses -while (dim(HMDB_add_iso)[1] > 0) { +while (dim(hmdb_add_iso)[1] > 0) { index <- 1 - + # take one m/z value from the HMDB part and calculate mass tolerance - reference_mass <- as.numeric(HMDB_add_iso[index, column_label]) + reference_mass <- as.numeric(hmdb_add_iso[index, column_label]) mass_tolerance <- (reference_mass * ppm) / 10^6 - + # find the peaks in the dataset with corresponding m/z - mzmed <- as.numeric(outlist.copy[ ,"mzmed.pkt"]) + mzmed <- as.numeric(outlist_copy[, "mzmed.pkt"]) selp <- which((mzmed > (reference_mass - mass_tolerance)) & (mzmed < (reference_mass + mass_tolerance))) - tmplist <- outlist.copy[selp,,drop=FALSE] + tmplist <- outlist_copy[selp, , drop = FALSE] list_of_peaks_used_in_peak_groups_identified <- rbind(list_of_peaks_used_in_peak_groups_identified, tmplist) nrsamples <- length(selp) if (nrsamples > 0) { - mzmed.pgrp <- mean(as.numeric(outlist.copy[selp, "mzmed.pkt"])) - mzmin.pgrp <- reference_mass - mass_tolerance - mzmax.pgrp <- reference_mass + mass_tolerance - + mzmed_pgrp <- mean(as.numeric(outlist_copy[selp, "mzmed.pkt"])) + mzmin_pgrp <- reference_mass - mass_tolerance + mzmax_pgrp <- reference_mass + mass_tolerance + # determine fit quality fq - fq.worst.pgrp <- as.numeric(max(outlist.copy[selp, "fq"])) - fq.best.pgrp <- as.numeric(min(outlist.copy[selp, "fq"])) - + fq_worst_pgrp <- as.numeric(max(outlist_copy[selp, "fq"])) + fq_best_pgrp <- as.numeric(min(outlist_copy[selp, "fq"])) + # set up object for intensities for all samples - ints.allsamps <- rep(0, length(names(repl_pattern_filtered))) - names(ints.allsamps) <- names(repl_pattern_filtered) - + ints_allsamps <- rep(0, length(names(repl_pattern_filtered))) + names(ints_allsamps) <- names(repl_pattern_filtered) + # Check for each sample if multiple peaks exist, if so take the sum of the intensities - labels <- unique(tmplist[ ,"samplenr"]) - ints.allsamps[labels] <- as.vector(unlist(lapply(labels, function(x) { sum(as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"])) } ))) - - # Initialize - assi_HMDB <- iso_HMDB <- HMDB_code <- NA - tmplist.mass.iso <- tmplist.mass.adduct <- NULL - + labels <- unique(tmplist[, "samplenr"]) + ints_allsamps[labels] <- as.vector(unlist(lapply(labels, function(x) { + sum(as.numeric(tmplist[which(tmplist[, "samplenr"] == x), "height.pkt"])) + }))) + + # Initialize + assi_hmdb <- iso_hmdb <- hmdb_code <- NA + tmplist_mass_iso <- tmplist_mass_adduct <- NULL + # Identification: find all entries in HMDB part with mass within ppm range - mass.all <- as.numeric(HMDB_add_iso[ , column_label]) - index <- which((mass.all > (reference_mass - mass_tolerance)) & (mass.all < (reference_mass + mass_tolerance))) - tmplist.mass <- HMDB_add_iso[index,,drop=FALSE] - - if (dim(tmplist.mass)[1]>0) { - # find isotope entries - index.iso <- grep(" iso ", tmplist.mass[, "CompoundName"], fixed = TRUE) - if (length(index.iso) > 0){ - tmplist.mass.iso <- tmplist.mass[index.iso,,drop=FALSE] - tmplist.mass <- tmplist.mass[-index.iso,,drop=FALSE] + mass_all <- as.numeric(hmdb_add_iso[, column_label]) + index <- which((mass_all > (reference_mass - mass_tolerance)) & (mass_all < (reference_mass + mass_tolerance))) + tmplist_mass <- hmdb_add_iso[index, , drop = FALSE] + + if (dim(tmplist_mass)[1] > 0) { + # find isotope entries + index_iso <- grep(" iso ", tmplist_mass[, "CompoundName"], fixed = TRUE) + if (length(index_iso) > 0) { + tmplist_mass_iso <- tmplist_mass[index_iso, , drop = FALSE] + tmplist_mass <- tmplist_mass[-index_iso, , drop = FALSE] } - - if (dim(tmplist.mass)[1] > 0) { - # find adduct entries - index.adduct <- grep(" [M", tmplist.mass[, "CompoundName"], fixed = TRUE) - if (length(index.adduct) > 0) { - tmplist.mass.adduct <- tmplist.mass[index.adduct,,drop=FALSE] - tmplist.mass <- tmplist.mass[-index.adduct,,drop=FALSE] - } - } - + + if (dim(tmplist_mass)[1] > 0) { + # find adduct entries + index_adduct <- grep(" [M", tmplist_mass[, "CompoundName"], fixed = TRUE) + if (length(index_adduct) > 0) { + tmplist_mass_adduct <- tmplist_mass[index_adduct, , drop = FALSE] + tmplist_mass <- tmplist_mass[-index_adduct, , drop = FALSE] + } + } + # Compose a list compounds, adducts or isotopes with corresponding m/z - if (dim(tmplist.mass)[1]>0) { - + if (dim(tmplist_mass)[1] > 0) { # metabolites - assi_HMDB <- as.character(paste(as.character(tmplist.mass[, "CompoundName"]), collapse = ";")) - HMDB_code <- as.character(paste(as.character(rownames(tmplist.mass)), collapse = ";")) - theormz_HMDB <- as.numeric(tmplist.mass[1, column_label]) - + assi_hmdb <- as.character(paste(as.character(tmplist_mass[, "CompoundName"]), collapse = ";")) + hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass)), collapse = ";")) + theormz_hmdb <- as.numeric(tmplist_mass[1, column_label]) + # adducts of metabolites - if (!is.null(tmplist.mass.adduct)) { - if (dim(tmplist.mass.adduct)[1] > 0) { - if (is.na(assi_HMDB)){ - assi_HMDB <- as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")) - HMDB_code <- as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")) + if (!is.null(tmplist_mass_adduct)) { + if (dim(tmplist_mass_adduct)[1] > 0) { + if (is.na(assi_hmdb)) { + assi_hmdb <- as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), collapse = ";")) + hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass_adduct)), collapse = ";")) } else { - assi_HMDB <- paste(assi_HMDB, as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")), sep = ";") - HMDB_code <- paste(HMDB_code, as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")), sep = ";") + assi_hmdb <- paste(assi_hmdb, + as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), collapse = ";")), sep = ";") + hmdb_code <- paste(hmdb_code, + as.character(paste(as.character(rownames(tmplist_mass_adduct)), collapse = ";")), sep = ";") } - } - } - + } + } + # isotopes of metabolites - if (!is.null(tmplist.mass.iso)) { - if (dim(tmplist.mass.iso)[1]>0) { - iso_HMDB <- as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";")) + if (!is.null(tmplist_mass_iso)) { + if (dim(tmplist_mass_iso)[1] > 0) { + iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), collapse = ";")) } - } - - # if no metabolites have the correct m/z, look for adducts and isotopes only - } else if (!is.null(tmplist.mass.adduct)) { - - theormz_HMDB <- as.numeric(tmplist.mass.adduct[1, column_label]) - + } + + # if no metabolites have the correct m/z, look for adducts and isotopes only + } else if (!is.null(tmplist_mass_adduct)) { + theormz_hmdb <- as.numeric(tmplist_mass_adduct[1, column_label]) + # adducts of metabolites - if (!is.null(tmplist.mass.adduct)) { - if (dim(tmplist.mass.adduct)[1] > 0) { - if (is.na(assi_HMDB)) { - assi_HMDB <- as.character(paste(as.character(tmplist.mass.adduct[ , "CompoundName"]), collapse = ";")) - HMDB_code <- as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")) + if (!is.null(tmplist_mass_adduct)) { + if (dim(tmplist_mass_adduct)[1] > 0) { + if (is.na(assi_hmdb)) { + assi_hmdb <- as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), collapse = ";")) + hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass_adduct)), collapse = ";")) } else { - assi_HMDB <- paste(assi_HMDB, as.character(paste(as.character(tmplist.mass.adduct[, "CompoundName"]), collapse = ";")), sep = ";") - HMDB_code <- paste(HMDB_code, as.character(paste(as.character(rownames(tmplist.mass.adduct)), collapse = ";")), sep = ";") + assi_hmdb <- paste(assi_hmdb, + as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), collapse = ";")), sep = ";") + hmdb_code <- paste(hmdb_code, + as.character(paste(as.character(rownames(tmplist_mass_adduct)), collapse = ";")), sep = ";") } - } - } - + } + } + # isotopes of metabolites - if (!is.null(tmplist.mass.iso)) { - if (dim(tmplist.mass.iso)[1]>0) { - iso_HMDB <- as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";")) + if (!is.null(tmplist_mass_iso)) { + if (dim(tmplist_mass_iso)[1] > 0) { + iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), collapse = ";")) } - } - - # if no metabolites or adducts can be found, only look for isotopes - } else if (!is.null(tmplist.mass.iso)) { - - if (dim(tmplist.mass.iso)[1]>0) { - theormz_HMDB <- as.numeric(tmplist.mass.iso[1,column_label]) - iso_HMDB <- as.character(paste(as.character(tmplist.mass.iso[, "CompoundName"]), collapse = ";")) } - } - + + # if no metabolites or adducts can be found, only look for isotopes + } else if (!is.null(tmplist_mass_iso)) { + if (dim(tmplist_mass_iso)[1] > 0) { + theormz_hmdb <- as.numeric(tmplist_mass_iso[1, column_label]) + iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), collapse = ";")) + } + } } - - # combine all information - outpgrlist.identified <- rbind(outpgrlist.identified, cbind(data.frame(mzmed.pgrp, "fq.best"=fq.best.pgrp, "fq.worst"=fq.worst.pgrp, nrsamples, mzmin.pgrp, mzmax.pgrp), - t(as.matrix(ints.allsamps)), - data.frame(assi_HMDB, iso_HMDB, HMDB_code, theormz_HMDB))) + + # combine all information + peakgrouplist_identified <- rbind(peakgrouplist_identified, cbind( + data.frame(mzmed_pgrp, "fq.best" = fq_best_pgrp, "fq.worst" = fq_worst_pgrp, nrsamples, mzmin_pgrp, mzmax_pgrp), + t(as.matrix(ints_allsamps)), + data.frame(assi_hmdb, iso_hmdb, hmdb_code, theormz_hmdb) + )) } - - # remove index metabolite from HMDB part and continue while loop - HMDB_add_iso <- HMDB_add_iso[-index,] - + + # remove index metabolite from HMDB part and continue while loop + hmdb_add_iso <- hmdb_add_iso[-index, ] } # save peak list corresponding to masses in HMDB part save(list_of_peaks_used_in_peak_groups_identified, file = paste0(batch_number, "_", scanmode, "_peaks_used.RData")) # save peak group list, identified part -save(outpgrlist.identified, file = paste0(batch_number, "_", scanmode, "_identified.RData")) +save(peakgrouplist_identified, file = paste0(batch_number, "_", scanmode, "_identified.RData")) diff --git a/DIMS/SpectrumPeakFinding.R b/DIMS/SpectrumPeakFinding.R index a8f33f9..a44ea06 100644 --- a/DIMS/SpectrumPeakFinding.R +++ b/DIMS/SpectrumPeakFinding.R @@ -1,65 +1,62 @@ #!/usr/bin/Rscript -# adapted from 5-collectSamples.R +# adapted from 5-collectSamples.R -# define parameters +# define parameters scanmodes <- c("positive", "negative") # Check whether all jobs terminated correct! -notRun = NULL +not_run <- NULL # collect spectrum peaks for each scanmode for (scanmode in scanmodes) { # load peak lists of all biological samples - input_dir <- getwd() # "./" - peaklist_files = list.files(input_dir, full.names=TRUE, pattern=paste("*_", scanmode, ".RData",sep="")) + peaklist_files <- list.files("/.", full.names = TRUE, pattern = paste0("*_", scanmode, ".RData")) # get sample names load(paste0("./", scanmode, "_repl_pattern", ".RData")) - groupNames = names(repl_pattern_filtered) - for (i in 1:length(groupNames)) { - group <- paste0(input_dir, "/", paste0(paste(groupNames[i], scanmode, sep = "_"), ".RData")) + group_names <- names(repl_pattern_filtered) + for (sample_nr in 1:length(group_names)) { + group <- paste0(input_dir, "/", paste0(paste(group_names[sample_nr], scanmode, sep = "_"), ".RData")) if (!(group %in% peaklist_files)) { - notRun = c(notRun, group) + not_run <- c(not_run, group) } } - - cat("\nCollecting samples") - outlist.tot=NULL - for (i in 1:length(peaklist_files)) { - cat("\n", peaklist_files[i]) - load(peaklist_files[i]) + # Collecting samples + outlist_total <- NULL + for (file_nr in 1:length(peaklist_files)) { + cat("\n", peaklist_files[file_nr]) + load(peaklist_files[file_nr]) if (is.null(outlist.persample) || (dim(outlist.persample)[1] == 0)) { - tmp=strsplit(peaklist_files[i], "/")[[1]] - fname = tmp[length(tmp)] - fname = strsplit(fname, ".RData")[[1]][1] - fname = substr(fname, 13, nchar(fname)) - if (i == 1) { - outlist.tot <- c(fname, rep("-1",5)) - } else { - outlist.tot <- rbind(outlist.tot, c(fname, rep("-1",5))) + tmp <- strsplit(peaklist_files[file_nr], "/")[[1]] + fname <- tmp[length(tmp)] + fname <- strsplit(fname, ".RData")[[1]][1] + fname <- substr(fname, 13, nchar(fname)) + if (file_nr == 1) { + outlist_total <- c(fname, rep("-1", 5)) + } else { + outlist_total <- rbind(outlist_total, c(fname, rep("-1", 5))) } } else { - if (i == 1) { - outlist.tot <- outlist.persample - } else { - outlist.tot <- rbind(outlist.tot, outlist.persample) + if (file_nr == 1) { + outlist_total <- outlist.persample + } else { + outlist_total <- rbind(outlist_total, outlist.persample) } } } - + # remove negative values - index <- which(outlist.tot[ ,"height.pkt"] <= 0) - if (length(index) > 0) outlist.tot <- outlist.tot[-index, ] - index <- which(outlist.tot[ ,"mzmed.pkt"] <= 0) - if (length(index) > 0) outlist.tot <- outlist.tot[-index, ] + index <- which(outlist_total[, "height.pkt"] <= 0) + if (length(index) > 0) outlist_total <- outlist_total[-index, ] + index <- which(outlist_total[, "mzmed.pkt"] <= 0) + if (length(index) > 0) outlist_total <- outlist_total[-index, ] - save(outlist.tot, file = paste0("./SpectrumPeaks_", scanmode, ".RData")) + save(outlist_total, file = paste0("./SpectrumPeaks_", scanmode, ".RData")) - if (!is.null(notRun)){ - for (i in 1:length(notRun)){ - message(paste(notRun[i], "was not generated")) + if (!is.null(not_run)) { + for (i in 1:length(not_run)) { + message(paste(not_run[i], "was not generated")) } } } - diff --git a/DIMS/SumAdducts.R b/DIMS/SumAdducts.R index 716b473..b4bd332 100755 --- a/DIMS/SumAdducts.R +++ b/DIMS/SumAdducts.R @@ -1,25 +1,22 @@ #!/usr/bin/Rscript ## adapted from 11-runSumAdducts.R -# define parameters +# define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -# collect_file <- cmd_args[1] hmdbpart_main_file <- cmd_args[1] scripts_dir <- cmd_args[2] z_score <- as.numeric(cmd_args[3]) # NB: scripts_dir not used yet, but function SumAdducts needs to be placed in AddOnFunctions folder -if (grepl("positive_hmdb", hmdbpart_main_file)) { - scanmode <- "positive" +if (grepl("positive_hmdb", hmdbpart_main_file)) { + scanmode <- "positive" # for the adduct sum: include adducts M+Na (1) and M+K (2) - adducts = c(1,2) -} else { - if (grepl("negative_hmdb", hmdbpart_main_file)) { - scanmode <- "negative" - # for the adduct sum: include adduct M+Cl (1) - adducts <- c(1) - } + adducts <- c(1, 2) +} else if (grepl("negative_hmdb", hmdbpart_main_file)) { + scanmode <- "negative" + # for the adduct sum: include adduct M+Cl (1) + adducts <- c(1) } # load input files @@ -29,66 +26,63 @@ repl_file <- paste0(scanmode, "_repl_pattern.RData") load(repl_file) outlist_part <- get(load(hmdbpart_main_file)) -batch_number <- strsplit(basename(hmdbpart_main_file), ".",fixed = TRUE)[[1]][2] +batch_number <- strsplit(basename(hmdbpart_main_file), ".", fixed = TRUE)[[1]][2] -outlist.tot <- unique(outlist.ident) +outlist_total <- unique(outlist_ident) -sumAdducts <- function(peaklist, theor_MZ, grpnames.long, adducts, batch_number, scanmode, outdir, z_score){ - hmdb_codes <- rownames(theor_MZ) - hmdb_names <- theor_MZ[,1, drop=FALSE] +sum_adducts <- function(peaklist, theor_mz, grpnames_long, adducts, batch_number, scanmode, outdir, z_score) { + hmdb_codes <- rownames(theor_mz) + hmdb_names <- theor_mz[, 1, drop = FALSE] hmdb_names[] <- lapply(hmdb_names, as.character) - + # remove isotopes!!! - index <- grep("HMDB", hmdb_codes, fixed=TRUE) + index <- grep("HMDB", hmdb_codes, fixed = TRUE) hmdb_codes <- hmdb_codes[index] - hmdb_names <- hmdb_names[index,] - index = grep("_", rownames(hmdb_codes), fixed=TRUE) + hmdb_names <- hmdb_names[index, ] + index <- grep("_", rownames(hmdb_codes), fixed = TRUE) if (length(index) > 0) hmdb_codes <- hmdb_codes[-index] if (length(index) > 0) hmdb_names <- hmdb_names[-index] - + # negative names <- NULL adductsum <- NULL names_long <- NULL - + if (length(hmdb_codes) != 0) { - - for(i in 1:length(hmdb_codes)){ + for (i in 1:length(hmdb_codes)) { compound <- hmdb_codes[i] - compound_plus <- c(compound,paste(compound, adducts, sep = "_")) - - metab <- unlist(lapply(peaklist$HMDB_code, function(x) {(length(intersect(unlist(strsplit(as.vector(x),";")),compound_plus))>0)})) - + compound_plus <- c(compound, paste(compound, adducts, sep = "_")) + + metab <- unlist(lapply(peaklist$HMDB_code, function(x) { + (length(intersect(unlist(strsplit(as.vector(x), ";")), compound_plus)) > 0) + })) + total <- c() - + # get the intensities for selected metabolite. - # NB: column numbers in previous version of code are incorrect if (z_score == 1) { int_cols_C <- grep("C", colnames(peaklist)[1:which(colnames(peaklist) == "avg.ctrls")]) int_cols_P <- grep("P", colnames(peaklist)[1:which(colnames(peaklist) == "avg.ctrls")]) int_cols <- c(int_cols_C, int_cols_P) ints <- peaklist[metab, int_cols] - # ints <- peaklist[metab, c(15:(length(grpnames.long)+14))] } else { - ints <- peaklist[metab, c(7:(length(grpnames.long)+6))] + ints <- peaklist[metab, c(7:(length(grpnames_long) + 6))] } total <- apply(ints, 2, sum) - - if (sum(total)!=0) { + + if (sum(total) != 0) { names <- c(names, compound) - adductsum <- rbind(adductsum,total) + adductsum <- rbind(adductsum, total) names_long <- c(names_long, hmdb_names[i]) } } - - if (!is.null(adductsum)){ + + if (!is.null(adductsum)) { rownames(adductsum) <- names - adductsum <- cbind(adductsum, "HMDB_name"=names_long) - save(adductsum, file = paste(scanmode, "_", batch_number, "_SummedAdducts.RData", sep="")) + adductsum <- cbind(adductsum, "HMDB_name" = names_long) + save(adductsum, file = paste(scanmode, "_", batch_number, "_SummedAdducts.RData", sep = "")) } - } + } } - -sumAdducts(outlist.tot, outlist_part, names(repl_pattern_filtered), adducts, batch_number, scanmode, outdir, z_score) - +sum_adducts(outlist_total, outlist_part, names(repl_pattern_filtered), adducts, batch_number, scanmode, outdir, z_score) diff --git a/DIMS/UnidentifiedCalcZscores.R b/DIMS/UnidentifiedCalcZscores.R index 525bfc0..68dbfe6 100755 --- a/DIMS/UnidentifiedCalcZscores.R +++ b/DIMS/UnidentifiedCalcZscores.R @@ -1,9 +1,8 @@ #!/usr/bin/Rscript ## adapted from 10-collectSamplesFilled.R -# define parameters +# define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) cat(" ", arg, "\n") scripts_dir <- cmd_args[1] ppm <- as.numeric(cmd_args[2]) @@ -11,57 +10,50 @@ z_score <- as.numeric(cmd_args[3]) source(paste0(scripts_dir, "AddOnFunctions/mergeDuplicatedRows.R")) source(paste0(scripts_dir, "AddOnFunctions/statistics_z.R")) -# source(paste0(scripts_dir, "AddOnFunctions/normalization_2.1.R")) # for each scan mode, collect all filled peak group lists scanmodes <- c("positive", "negative") for (scanmode in scanmodes) { # get list of files - # filled_files <- list.files("./", full.names=TRUE, pattern=scanmode) filled_file <- paste0("./PeakGroupList_", scanmode, "_Unidentified_filled.RData") - print(filled_file) - # load file - outlist.tot <- get(load(filled_file)) + # load file + outlist_total <- get(load(filled_file)) # remove duplicates; peak groups with exactly the same m/z - outlist.tot <- mergeDuplicatedRows(outlist.tot) + outlist_total <- mergeDuplicatedRows(outlist_total) # sort on mass - outlist.tot <- outlist.tot[order(outlist.tot[ ,"mzmed.pgrp"]),] + outlist_total <- outlist_total[order(outlist_total[, "mzmed.pgrp"]), ] # load replication pattern pattern_file <- paste0(scanmode, "_repl_pattern.RData") repl_pattern <- get(load(pattern_file)) - # Normalization: not done. - # if (normalization != "disabled") { - # outlist.tot = normalization_2.1(outlist.tot, fileName, names(repl.pattern.filtered), on=normalization, assi_label="assi_HMDB") - # } - if (z_score == 1) { - outlist.stats <- statistics_z(outlist.tot, sortCol=NULL, adducts=FALSE) - nr.removed.samples <- length(which(repl_pattern[]=="character(0)")) - order.index.int <- order(colnames(outlist.stats)[8:(length(repl_pattern)-nr.removed.samples+7)]) - outlist.stats.more <- cbind(outlist.stats[,1:7], - outlist.stats[,(length(repl_pattern)-nr.removed.samples+8):(length(repl_pattern)-nr.removed.samples+8+6)], - outlist.stats[,8:(length(repl_pattern)-nr.removed.samples+7)][order.index.int], - outlist.stats[,(length(repl_pattern)-nr.removed.samples+5+10):ncol(outlist.stats)]) - - tmp.index <- grep("_Zscore", colnames(outlist.stats.more), fixed = TRUE) - tmp.index.order <- order(colnames(outlist.stats.more[,tmp.index])) - tmp <- outlist.stats.more[,tmp.index[tmp.index.order]] - outlist.stats.more <- outlist.stats.more[,-tmp.index] - outlist.stats.more <- cbind(outlist.stats.more,tmp) - outlist.tot <- outlist.stats.more + # calculate Z-scores + outlist_stats <- statistics_z(outlist_total, sortCol = NULL, adducts = FALSE) + nr_removed_samples <- length(which(repl_pattern[] == "character(0)")) + order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)]) + outlist_stats_more <- cbind(outlist_stats[, 1:7], + outlist_stats[, (length(repl_pattern) - nr_removed_samples + 8):(length(repl_pattern) - nr_removed_samples + 8 + 6)], + outlist_stats[, 8:(length(repl_pattern) - nr_removed_samples + 7)][order_index_int], + outlist_stats[, (length(repl_pattern) - nr_removed_samples + 5 + 10):ncol(outlist_stats)]) + + tmp_index <- grep("_Zscore", colnames(outlist_stats_more), fixed = TRUE) + tmp_index_order <- order(colnames(outlist_stats_more[, tmp_index])) + tmp <- outlist_stats_more[, tmp_index[tmp_index_order]] + outlist_stats_more <- outlist_stats_more[, -tmp_index] + outlist_stats_more <- cbind(outlist_stats_more, tmp) + outlist_total <- outlist_stats_more } - outlist.not.ident = outlist.tot + outlist_not_ident <- outlist_total # Extra output in Excel-readable format: remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp") - remove_colindex <- which(colnames(outlist.not.ident) %in% remove_columns) - outlist.not.ident <- outlist.not.ident[ , -remove_colindex] - write.table(outlist.not.ident, file=paste0("unidentified_outlist_", scanmode, ".txt"), sep="\t", row.names = FALSE) + remove_colindex <- which(colnames(outlist_not_ident) %in% remove_columns) + outlist_not_ident <- outlist_not_ident[, -remove_colindex] + write.table(outlist_not_ident, file = paste0("unidentified_outlist_", scanmode, ".txt"), sep = "\t", row.names = FALSE) } diff --git a/DIMS/UnidentifiedCollectPeaks.R b/DIMS/UnidentifiedCollectPeaks.R index 322a1c1..5f1d5d7 100755 --- a/DIMS/UnidentifiedCollectPeaks.R +++ b/DIMS/UnidentifiedCollectPeaks.R @@ -1,102 +1,32 @@ #!/usr/bin/Rscript ## adapted from 7-collectSamplesGroupedHMDB.R -# load required packages -# none - -# define parameters +# define parameters cmd_args <- commandArgs(trailingOnly = TRUE) ppm <- as.numeric(cmd_args[1]) -outdir <- "./" +outdir <- "./" scanmodes <- c("positive", "negative") for (scanmode in scanmodes) { # get list of all files that contain lists of peaks that were used in identified peak grouping - files <- list.files("./", pattern = paste(scanmode, "_peaks_used.RData", sep="")) + files <- list.files("./", pattern = paste(scanmode, "_peaks_used.RData", sep = "")) # load the list of all peaks - load(paste0("SpectrumPeaks_", scanmode, ".RData")) #outlist.tot + load(paste0("SpectrumPeaks_", scanmode, ".RData")) # Make a list of indexes of peaks that have been identified, then remove these from the peaklist. remove <- NULL for (i in 1:length(files)) { - load(files[i]) # outlist.grouped, now called list_of_peaks_used_in_peak_groups_identified - remove <- c(remove, which(outlist.tot[ ,"mzmed.pkt"] %in% list_of_peaks_used_in_peak_groups_identified[i ,"mzmed.pkt"])) + # load list_of_peaks_used_in_peak_groups_identified + load(files[i]) + remove <- c(remove, which(outlist_total[, "mzmed.pkt"] %in% list_of_peaks_used_in_peak_groups_identified[i, "mzmed.pkt"])) } - outlist.rest <- outlist.tot[-remove, ] + outlist_rest <- outlist_total[-remove, ] # sort on mass - outlist <- outlist.rest[order(as.numeric(outlist.rest[ ,"mzmed.pkt"])),] - - save(outlist, file=paste0(outdir, "/SpectrumPeaks_", scanmode, "_Unidentified.RData")) - - # cut the unidentified peak list in parts of part_len length for parallel processing - # NB: the while statement below gives an error message. Skip the cutting into parts. - # part_len <- 10000 - # nr_peaks <- dim(outlist)[1] - # end <- 0 - # min_1_last <- part_len - # check <- 0 - # outlist_i_min_1 <- NULL - # part_nr <- 0 - - # if (nr_peaks >= part_len & (floor(nr_peaks/part_len) - 1) >= 2) { - # for (part_nr in 2:floor(nr_peaks/part_len) -1 ) { - # start <- -(part_len-1) + part_nr*part_len - # end <- part_nr*part_len - # - # if (part_nr > 1) { - # outlist_i <- outlist[c(start:end),] - # - # n_moved <- 0 - - # # Calculate ppm and replace border, avoid cut within peakgroup! - # while ((as.numeric(outlist_i[1, "mzmed.pkt"]) - as.numeric(outlist_i_min_1[min_1_last, "mzmed.pkt"])) * 1e+06/as.numeric(outlist_i[1, "mzmed.pkt"]) < ppm) { - # outlist_i_min_1 <- rbind(outlist_i_min_1, outlist_i[1,]) - # outlist_i <- outlist_i[-1, ] - # n_moved <- n_moved + 1 - # } - - # # message(paste("Process", i-1,":", dim(outlist_i_min_1)[1])) - # save(outlist_i_min_1, file = paste(outdir, paste(scanmode, paste("outlist_i_min_1", part_nr-1, "RData", sep="."), sep="_"), sep="/")) - # check <- check + dim(outlist_i_min_1)[1] - - # outlist_i_min_1 <- outlist_i - # min_1_last <- dim(outlist_i_min_1)[1] - - # } else { - # outlist_i_min_1 = outlist[c(start:end), ] - # } - # } - #} - - #start <- end + 1 - #end <- nr_peaks - #outlist_i <- outlist[c(start:end), ] - #n_moved <- 0 - - #if(!is.null(outlist_i_min_1)){ - # # Calculate ppm and replace border, avoid cut within peakgroup! - # while ((as.numeric(outlist_i[1,"mzmed.pkt"]) - as.numeric(outlist_i_min_1[min_1_last,"mzmed.pkt"]))*1e+06/as.numeric(outlist_i[1,"mzmed.pkt"]) < ppm) { - # outlist_i_min_1 = rbind(outlist_i_min_1, outlist_i[1,]) - # outlist_i = outlist_i[-1,] - # n_moved = n_moved + 1 - # } - - # cat(paste("Process", i+1-1,":", dim(outlist_i_min_1)[1])) - # save(outlist_i_min_1, file=paste(outdir, paste(scanmode, paste("outlist_i_min_1",i,"RData", sep="."), sep="_"), sep="/")) - # check=check+dim(outlist_i_min_1)[1] - #} - - #outlist_i_min_1=outlist_i - #cat("Process", i+2-1,":", dim(outlist_i_min_1)[1], "\n") - #save(outlist_i_min_1, file=paste(outdir, paste(scanmode, paste("outlist_i_min_1",i+1,"RData", sep="."), sep="_"), sep="/")) - #} + outlist <- outlist_rest[order(as.numeric(outlist_rest[, "mzmed.pkt"])), ] + # save output + save(outlist, file = paste0(outdir, "/SpectrumPeaks_", scanmode, "_Unidentified.RData")) - #check <- check + dim(outlist_i_min_1)[1] - #if (check==dim(outlist)[1]){ - # cat("Check is oke!\n") - #} else { - # cat("Check is failed!\n") } diff --git a/DIMS/UnidentifiedFillMissing.R b/DIMS/UnidentifiedFillMissing.R index edfc8a7..5416b15 100755 --- a/DIMS/UnidentifiedFillMissing.R +++ b/DIMS/UnidentifiedFillMissing.R @@ -3,18 +3,15 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) cat(" ", arg, "\n", sep="") -# define parameters +# define parameters peakgrouplist_file1 <- cmd_args[1] peakgrouplist_file2 <- cmd_args[2] -scripts_dir <- cmd_args[3] -thresh <- as.numeric(cmd_args[4]) -resol <- as.numeric(cmd_args[5]) -ppm <- as.numeric(cmd_args[6]) -outdir <- "./" - -print(scripts_dir) +scripts_dir <- cmd_args[3] +thresh <- as.numeric(cmd_args[4]) +resol <- as.numeric(cmd_args[5]) +ppm <- as.numeric(cmd_args[6]) +outdir <- "./" # load in function scripts source(paste0(scripts_dir, "AddOnFunctions/replaceZeros.R")) @@ -30,8 +27,11 @@ source(paste0(scripts_dir, "AddOnFunctions/globalAssignments.HPC.R")) peakgrouplist_files <- c(peakgrouplist_file1, peakgrouplist_file2) for (peakgrouplist_file in peakgrouplist_files) { - if (grepl("_pos", peakgrouplist_file)) { scanmode = "positive" } else - if (grepl("_neg", peakgrouplist_file)) { scanmode = "negative" } + if (grepl("_pos", peakgrouplist_file)) { + scanmode <- "positive" + } else if (grepl("_neg", peakgrouplist_file)) { + scanmode <- "negative" + } # get replication pattern for sample names pattern_file <- paste0(scanmode, "_repl_pattern.RData") @@ -45,6 +45,6 @@ for (peakgrouplist_file in peakgrouplist_files) { # replace missing values (zeros) with random noise peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) - # save output - save(peakgrouplist_filled, file=paste0("./", outputfile_name)) + # save output + save(peakgrouplist_filled, file = paste0("./", outputfile_name)) } diff --git a/DIMS/UnidentifiedPeakGrouping.R b/DIMS/UnidentifiedPeakGrouping.R index d7a3dee..73ac4c1 100755 --- a/DIMS/UnidentifiedPeakGrouping.R +++ b/DIMS/UnidentifiedPeakGrouping.R @@ -1,93 +1,82 @@ #!/usr/bin/Rscript ## adapted from 8-peakGrouping.rest.R -# load required packages -# none - -# define parameters +# define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -resol <- as.numeric(cmd_args[1]) -ppm <- as.numeric(cmd_args[2]) +resol <- as.numeric(cmd_args[1]) +ppm <- as.numeric(cmd_args[2]) outdir <- "./" -options(digits=16) +options(digits = 16) -print(list.files(outdir, pattern="RData")) +print(list.files(outdir, pattern = "RData")) scanmodes <- c("positive", "negative") # function for grouping unidentified peaks -groupingRest <- function(outdir, unidentified_peaklist, scanmode, ppm) { - outlist.copy <- get(load(unidentified_peaklist)) - # batch = strsplit(unidentified_peaklist, ".",fixed = TRUE)[[1]][2] +grouping_rest <- function(outdir, unidentified_peaklist, scanmode, ppm) { + outlist_copy <- get(load(unidentified_peaklist)) load(paste0("./", scanmode, "_repl_pattern.RData")) - outpgrlist <- NULL - - # Then group on highest peaks - range <- ppm*1e-06 - startcol <- 7 - - # while (max(as.numeric(outlist.copy[ , "height.pkt"])) > 0 ) { - while (dim(outlist.copy)[1] > 0) { - - sel <- which(as.numeric(outlist.copy[ , "height.pkt"]) == max(as.numeric(outlist.copy[ , "height.pkt"])))[1] - + + # group on highest peaks + range <- ppm * 1e-06 + + while (dim(outlist_copy)[1] > 0) { + sel <- which(as.numeric(outlist_copy[, "height.pkt"]) == max(as.numeric(outlist_copy[, "height.pkt"])))[1] + # ppm range around max - mzref <- as.numeric(outlist.copy[sel, "mzmed.pkt"]) - pkmin <- -(range*mzref - mzref) - pkmax <- 2*mzref-pkmin - - selp <- as.numeric(outlist.copy[ , "mzmed.pkt"]) > pkmin & as.numeric(outlist.copy[ , "mzmed.pkt"]) < pkmax - tmplist <- outlist.copy[selp,,drop=FALSE] - - nrsamples <- length(unique(tmplist[,"samplenr"])) + mzref <- as.numeric(outlist_copy[sel, "mzmed.pkt"]) + pkmin <- -(range * mzref - mzref) + pkmax <- 2 * mzref - pkmin + + selp <- as.numeric(outlist_copy[, "mzmed.pkt"]) > pkmin & as.numeric(outlist_copy[, "mzmed.pkt"]) < pkmax + tmplist <- outlist_copy[selp, , drop = FALSE] + + nrsamples <- length(unique(tmplist[, "samplenr"])) if (nrsamples > 0) { - - mzmed.pgrp <- mean(as.numeric(outlist.copy[selp, "mzmed.pkt"])) - mzmin.pgrp <- -(range*mzmed.pgrp - mzmed.pgrp) - mzmax.pgrp <- 2*mzmed.pgrp - mzmin.pgrp - - selp <- as.numeric(outlist.copy[ , "mzmed.pkt"]) > mzmin.pgrp & as.numeric(outlist.copy[ , "mzmed.pkt"]) < mzmax.pgrp - tmplist <- outlist.copy[selp,,drop=FALSE] - - # remove used peaks!!! - tmp <- as.vector(which(tmplist[,"height.pkt"]==-1)) - if (length(tmp)>0) tmplist<-tmplist[-tmp,,drop=FALSE] - - nrsamples <- length(unique(tmplist[,"samplenr"])) - - fq.worst.pgrp <- as.numeric(max(outlist.copy[selp, "fq"])) - fq.best.pgrp <- as.numeric(min(outlist.copy[selp, "fq"])) - ints.allsamps <- rep(0, length(names(repl_pattern_filtered))) - names(ints.allsamps) <- names(repl_pattern_filtered) # same order as sample list!!! - - # Check for each sample if multiple peaks exists, if so take the sum! - labels <- unique(tmplist[,"samplenr"]) - ints.allsamps[labels] <- as.vector(unlist(lapply(labels, function(x) {sum(as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"]))}))) - - outpgrlist <- rbind(outpgrlist, c(mzmed.pgrp, fq.best.pgrp, fq.worst.pgrp, nrsamples, mzmin.pgrp, mzmax.pgrp, ints.allsamps,NA,NA,NA,NA)) + mzmed_pgrp <- mean(as.numeric(outlist_copy[selp, "mzmed.pkt"])) + mzmin_pgrp <- -(range * mzmed_pgrp - mzmed_pgrp) + mzmax_pgrp <- 2 * mzmed_pgrp - mzmin_pgrp + # select peaks within mz range + selp <- as.numeric(outlist_copy[, "mzmed.pkt"]) > mzmin_pgrp & as.numeric(outlist_copy[, "mzmed.pkt"]) < mzmax_pgrp + tmplist <- outlist_copy[selp, , drop = FALSE] + + # remove used peaks + tmp <- as.vector(which(tmplist[, "height.pkt"] == -1)) + if (length(tmp) > 0) tmplist <- tmplist[-tmp, , drop = FALSE] + + nrsamples <- length(unique(tmplist[, "samplenr"])) + fq_worst_pgrp <- as.numeric(max(outlist_copy[selp, "fq"])) + fq_best_pgrp <- as.numeric(min(outlist_copy[selp, "fq"])) + ints_allsamps <- rep(0, length(names(repl_pattern_filtered))) + names(ints_allsamps) <- names(repl_pattern_filtered) + + # Check for each sample if multiple peaks exists, if so take the sum + labels <- unique(tmplist[, "samplenr"]) + ints_allsamps[labels] <- as.vector(unlist(lapply(labels, function(x) { + sum(as.numeric(tmplist[which(tmplist[, "samplenr"] == x), "height.pkt"])) + }))) + + outpgrlist <- rbind(outpgrlist, c(mzmed_pgrp, fq_best_pgrp, fq_worst_pgrp, nrsamples, mzmin_pgrp, mzmax_pgrp, ints_allsamps, NA, NA, NA, NA)) } - - outlist.copy <- outlist.copy[-which(selp==TRUE),,drop=FALSE] + + outlist_copy <- outlist_copy[-which(selp == TRUE), , drop = FALSE] } - - outpgrlist <- as.data.frame(outpgrlist) # ignore warnings of duplicate row names - colnames(outpgrlist)[1:6] <- c("mzmed.pgrp", "fq.best", "fq.worst", "nrsamples", "mzmin.pgrp", "mzmax.pgrp") - colnames(outpgrlist)[(length(repl_pattern_filtered)+7):ncol(outpgrlist)] <- c("assi_HMDB", "iso_HMDB", "HMDB_code", "theormz_HMDB") - - return(outpgrlist) - -} + outpgrlist <- as.data.frame(outpgrlist) + colnames(outpgrlist)[1:6] <- c("mzmed_pgrp", "fq.best", "fq.worst", "nrsamples", "mzmin_pgrp", "mzmax_pgrp") + colnames(outpgrlist)[(length(repl_pattern_filtered) + 7):ncol(outpgrlist)] <- c("assi_HMDB", "iso_HMDB", "HMDB_code", "theormz_HMDB") + + return(outpgrlist) +} for (scanmode in scanmodes) { - unidentified_peaklist <- paste0("./SpectrumPeaks_", scanmode, "_Unidentified.RData") # generate peak group lists of the unidentified peaks - outpgrlist <- groupingRest(outdir, unidentified_peaklist, scanmode, ppm=ppm) + unidentified_peaklist <- paste0("./SpectrumPeaks_", scanmode, "_Unidentified.RData") + outpgrlist <- grouping_rest(outdir, unidentified_peaklist, scanmode, ppm = ppm) + write.table(outpgrlist, file = paste0("PeakGroupList_", scanmode, "_Unidentified.txt")) - # save output in RData format for further processing + # save output in RData format for further processing save(outpgrlist, file=paste0("PeakGroupList_", scanmode, "_Unidentified.RData")) - write.table(outpgrlist, file=paste0("PeakGroupList_", scanmode, "_Unidentified.txt")) } - From 2996892753102eed865636d031799c88394ce130 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Wed, 21 Feb 2024 08:32:15 +0100 Subject: [PATCH 26/73] convert qc_value to float --- CheckQC/check_qc.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py index db94a4b..d9b5609 100644 --- a/CheckQC/check_qc.py +++ b/CheckQC/check_qc.py @@ -162,6 +162,12 @@ def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols): .loc[:, qc_metric_out.columns] ) ]) + # Try to convert column qc_value to float. + # If ValueError is raised, probably because column is a string, continue. + try: + qc_metric_out["qc_value"] = qc_metric_out["qc_value"].astype("float") + except ValueError: + pass # In case 'multiple sample qc check', # output could contain duplicate rows for individual samples used in multiple comparisons. return qc_metric_out.sort_values(by=["qc_check", "qc_status"]).drop_duplicates(keep="first") From 6177ce92b3bc5b7515b639a68812a43f98b5f127 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Wed, 21 Feb 2024 10:12:46 +0100 Subject: [PATCH 27/73] change input to qc settings metrics --- CheckQC/check_qc.py | 6 +++--- CheckQC/test_check_qc.py | 22 +++++++++++----------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py index d9b5609..a0b0e31 100644 --- a/CheckQC/check_qc.py +++ b/CheckQC/check_qc.py @@ -64,9 +64,9 @@ def check_allowed_operators(qc_operator): raise ValueError(f"Unsupported operator provided: {qc_operator}. Please select from: {operators}") -def check_required_keys_metrics(qc_settings): +def check_required_keys_metrics(qc_metrics): for req_key in ["filename", "qc_col", "threshold", "operator", "report_cols"]: - if any([req_key not in setting.keys() for setting in qc_settings["metrics"]]): + if any([req_key not in setting.keys() for setting in qc_metrics]): raise KeyError(f"Required key {req_key} not in all metrics settings.") @@ -219,7 +219,7 @@ def read_and_judge_metrics(qc, metrics): def check_qc(input_files, settings, output_path, output_prefix): # A single qc metric file can be used multiple times, by defining a metric section for each check in the qc settings. qc_settings = read_yaml(settings) - check_required_keys_metrics(qc_settings) + check_required_keys_metrics(qc_settings["metrics"]) duplicated_sample_file = [] for qc_metric_settings in qc_settings["metrics"]: check_allowed_operators(qc_metric_settings["operator"]) diff --git a/CheckQC/test_check_qc.py b/CheckQC/test_check_qc.py index d6ef6ae..1a54060 100644 --- a/CheckQC/test_check_qc.py +++ b/CheckQC/test_check_qc.py @@ -66,29 +66,29 @@ def test_not_existing_operator(self): class TestCheckRequiredKeysMetrics(): def test_required_keys_present(self): - qc_settings = {"metrics": [ + qc_metrics = [ {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake", "report_cols": "fake"}, - ]} - check_qc.check_required_keys_metrics(qc_settings) + ] + check_qc.check_required_keys_metrics(qc_metrics) assert True @pytest.mark.parametrize( - "incomplete_qc_settings", + "incomplete_qc_metrics", [ - {"metrics": [{"filename": "fakename"}]}, - {"metrics": [ + [{"filename": "fakename"}], + [ {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake", "report_cols": "fake"}, {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"}, # missing report_cols - ]}, - {"metrics": [ + ], + [ {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"}, # missing report_cols {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"}, # missing report_cols - ]} + ] ] ) - def test_missing_keys(self, incomplete_qc_settings): + def test_missing_keys(self, incomplete_qc_metrics): with pytest.raises(KeyError) as required_error: - check_qc.check_required_keys_metrics(incomplete_qc_settings) + check_qc.check_required_keys_metrics(incomplete_qc_metrics) error_val = str(required_error.value) assert "not in all metrics settings." in error_val assert error_val.split(" ")[2] in ["filename", "qc_col", "threshold", "operator", "report_cols"] From e162db029d6baa9e99082352639ba223d24c2cee Mon Sep 17 00:00:00 2001 From: ellendejong Date: Wed, 21 Feb 2024 10:12:57 +0100 Subject: [PATCH 28/73] Add docstrings checkqc --- CheckQC/check_qc.py | 155 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py index a0b0e31..37ea919 100644 --- a/CheckQC/check_qc.py +++ b/CheckQC/check_qc.py @@ -14,6 +14,18 @@ def non_empty_existing_path(file_or_dir): + """This function checks whether the provided file or dir exists and is not empty. + + Args: + file_or_dir (string): Input file or directory + + Raises: + FileNotFoundError: If input string file_or_dir is neither a file nor a dir. + OSError: If input is not a dir and file is empty. + + Returns: + string: Provided input file or directory. If dir, suffix '/' might be added. + """ input_path = Path(file_or_dir) if not input_path.is_file() and not input_path.is_dir(): raise FileNotFoundError(errno_ENOENT, os_strerror(errno_ENOENT), file_or_dir) @@ -25,6 +37,14 @@ def non_empty_existing_path(file_or_dir): def parse_arguments_and_check(args_in): + """Parses arguments and validates / checks format of input. + + Args: + args_in (list of strings): Commandline input arguments. + + Returns: + Namespace: Convert argument strings to objects and assign them as attributes of the namespace. + """ parser = argparse.ArgumentParser( description="Check and summarize sample quality using qc metrics and their thresholds." ) @@ -52,6 +72,17 @@ def parse_arguments_and_check(args_in): def read_yaml(yaml_file): + """Read input yaml + + Args: + yaml_file (string): String with path to yaml file + + Raises: + ValueError: If reading the file returns None (not recognized as YAML format). + + Returns: + Object: Content of the YAML file. + """ yaml_loaded = yaml.safe_load(open(yaml_file)) if not yaml_loaded: raise ValueError("Could not load YAML.") @@ -59,18 +90,43 @@ def read_yaml(yaml_file): def check_allowed_operators(qc_operator): + """Check if provided qc_operator is allowed. + + Args: + qc_operator (string): (Custom / math) operator + + Raises: + ValueError: If provided qc_operator is invalid / unsupported. + """ operators = ["<", "<=", ">", ">=", "==", "!=", "match"] if qc_operator not in operators: raise ValueError(f"Unsupported operator provided: {qc_operator}. Please select from: {operators}") def check_required_keys_metrics(qc_metrics): + """Check if all required settings are included in the qc_settings + + Args: + qc_metrics (list with dicts): qc settings per qc metric + + Raises: + KeyError: Required key is not provided for the qc metric settings + """ for req_key in ["filename", "qc_col", "threshold", "operator", "report_cols"]: if any([req_key not in setting.keys() for setting in qc_metrics]): raise KeyError(f"Required key {req_key} not in all metrics settings.") def select_metrics(filename, input_files): + """Using regular expression to match the qc metric filename with the input files + + Args: + filename (string): Filename of qc metric, could be regex. + input_files (list): All qc metrics input files. + + Returns: + list: Input files matching the given filename. + """ metrics = list(filter(re.compile(f".*{filename}").match, input_files)) if not metrics: warnings.warn(UserWarning(f"No input file provided with filename pattern {filename}")) @@ -79,6 +135,20 @@ def select_metrics(filename, input_files): def get_columns_to_report(qc_report_cols, qc_metric_cols, qc_col): + """Get valid columns to include in final output report + + Args: + qc_report_cols (list, string): column name(s) to include in report. + qc_metric_cols (list, string): column name(s) in qc metric. + qc_col (string): column name that contains the qc value/score. + + Raises: + TypeError: qc_report_cols is neither a string nor list. + ValueError: Provided column names in qc_report_cols does not exist in the qc metric. + + Returns: + list of strings: Valid column names to include in report. + """ not_existing_cols = list(set(qc_report_cols) - set(qc_metric_cols)) if qc_report_cols == "@all": qc_report_cols = qc_metric_cols @@ -94,6 +164,18 @@ def get_columns_to_report(qc_report_cols, qc_metric_cols, qc_col): def add_and_rename_columns(qc_metric, qc_title, qc_col, qc_operator, qc_threshold): + """Add and rename columns in qc_metric. + + Args: + qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values + qc_title (string): Title of the qc check + qc_col (string): qc column name with qc value/score + qc_operator (string): (Custom / math) operator + qc_threshold (string, int or float): qc threshold + + Returns: + Pandas DataFrame: DataFrame with qc metric. + """ qc_metric_assigned = qc_metric.assign( qc_title=qc_title.lower(), qc_status="PASS", @@ -105,6 +187,20 @@ def add_and_rename_columns(qc_metric, qc_title, qc_col, qc_operator, qc_threshol def get_failed_rows(qc_metric, qc_col, qc_operator, qc_threshold): + """Get rows that fail provided qc threshold + + Args: + qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values + qc_col (string): column name that contains the qc value/score. + qc_operator (string): (Custom / math) operator + qc_threshold (string, int or float): qc threshold + + Raises: + TypeError: If qc threshold is neither 'match', str, int or float. + + Returns: + object: The indexes of failed rows in qc_metric + """ # Select failed rows using qc_threshold regex pattern and qc_operator 'match' if qc_operator == "match" and isinstance(qc_threshold, str): return qc_metric[qc_col].str.match(qc_threshold) @@ -121,6 +217,18 @@ def get_failed_rows(qc_metric, qc_col, qc_operator, qc_threshold): def add_failed_samples_metric(qc_metric, failed_rows, report_cols, sample_cols): + """Failed samples are added to the output metric, and removed from qc_metric. + + Args: + qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values + failed_rows (object): Object with indexes of failed rows in qc_metric + report_cols (list): Valid column names (strings) to include in report. + sample_cols (list): Columnames (strings) of sample names. + + Returns: + qc_metric (DataFrame): DataFrame of qc metric without failed rows + qc_metric_out (DataFrame): DataFrame of qc metric to report with failed rows + """ qc_metric_out = DataFrame(columns=["sample", "qc_check", "qc_status", "qc_msg", "qc_value"]) failed_samples = [] if failed_rows.to_list(): @@ -152,6 +260,16 @@ def add_failed_samples_metric(qc_metric, failed_rows, report_cols, sample_cols): def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols): + """Passed samples are added to the output metric + + Args: + qc_metric (DataFrame): DataFrame of qc metric without failed rows + qc_metric_out (DataFrame): DataFrame of qc metric to report with failed rows + sample_cols (list): Columnames (strings) of sample names. + + Returns: + pandas DataFrame: Sorted qc metric for both passed and failed samples, without duplicates. + """ # Add passed samples to output for sample_col in sample_cols: qc_metric_out = concat([ @@ -174,6 +292,13 @@ def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols): def create_and_write_output(qc_output, output_path, output_prefix): + """Joined qc metrics is created and written to output file. + + Args: + qc_output (pandas DataFrame): Sorted judged qc metric for both passed and failed samples, without duplicates. + output_path (string): Relative or absolute path where output should be saved. + output_prefix (string): Output prefix for output file. + """ # Add qc_summary qc_output.insert(1, "qc_summary", "PASS") qc_output.loc[qc_output.isin(["FAIL"]).any(axis=1), "qc_summary"] = "FAIL" @@ -182,6 +307,15 @@ def create_and_write_output(qc_output, output_path, output_prefix): def read_and_judge_metrics(qc, metrics): + """Read and judge each single qc metric and join results. + + Args: + qc (dict): qc settings of the metric + metrics (list): List of input files specific for single qc metric + + Returns: + pandas DataFrame: Joined and judged qc metrics. + """ for qc_file in metrics: qc_metric_raw = read_csv(qc_file, comment=qc.get("comment", None), delimiter=qc.get("delim", "\t"), quotechar='"') report_cols = get_columns_to_report(qc["report_cols"], qc_metric_raw.columns.to_list(), qc["qc_col"]) @@ -217,6 +351,27 @@ def read_and_judge_metrics(qc, metrics): def check_qc(input_files, settings, output_path, output_prefix): + """ + Main function to judge input files on configured qc settings. + It creates a single results table, each row representing + sample (string): sample name + qc_summary: Summarized status of all qcs for sample (pass or fail) + qc columns (5 per each qc metric); + qc_check (string): QC check consiting of qc title, operator and threshold + qc_status (string): Status of performed qc check (pass or fail) + qc_msg (string): String with human readable message if sample failed qc check, empty if passed. + qc_value (string, int, float): qc value/score to check. + + Args: + input_files (list): All qc metrics input files. + settings (string): Path to yaml file + output_path (string): Relative or absolute path where output should be saved. + output_prefix (string): Output prefix for output file. + + Raises: + ValueError: No input files found to match any qc metric patterns defined in settings. + ValueError: Duplicated samples with different values found in some of the input files. + """ # A single qc metric file can be used multiple times, by defining a metric section for each check in the qc settings. qc_settings = read_yaml(settings) check_required_keys_metrics(qc_settings["metrics"]) From e5e3acb535283131496bf5c068fc35797b501fa8 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Wed, 21 Feb 2024 10:13:38 +0100 Subject: [PATCH 29/73] Split awk and rename output --- Utils/EditSummaryFileHappy.nf | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/Utils/EditSummaryFileHappy.nf b/Utils/EditSummaryFileHappy.nf index 83ef755..8050c46 100644 --- a/Utils/EditSummaryFileHappy.nf +++ b/Utils/EditSummaryFileHappy.nf @@ -8,10 +8,10 @@ process EditSummaryFileHappy { tuple(val(meta), path(summary_csv)) output: - path("INDEL_PASS_summary.csv"), emit: indel_pass_summary_csv - path("INDEL_ALL_summary.csv"), emit: indel_all_summary_csv - path("SNP_PASS_summary.csv"), emit: snp_pass_summary_csv - path("SNP_ALL_summary.csv"), emit: snp_all_summary_csv + path("${meta.truth}_${meta.query}_INDEL_PASS.csv"), emit: indel_pass_csv + path("${meta.truth}_${meta.query}_INDEL_ALL.csv"), emit: indel_all_csv + path("${meta.truth}_${meta.query}_SNP_PASS.csv"), emit: snp_pass_csv + path("${meta.truth}_${meta.query}_SNP_ALL.csv"), emit: snp_all_csv script: """ @@ -19,7 +19,10 @@ process EditSummaryFileHappy { sed '1s/^/samples,sample_truth,sample_query,/; 2,\$s/^/${meta.truth}_${meta.query},${meta.truth},${meta.query},/' ${summary_csv} > ${summary_csv}.tmp # Split file including header (first line) - awk -F',' 'FNR==1{hdr=\$0;next} {print hdr>\$4"_"\$5"_summary.csv"; print \$0>>\$4"_"\$5"_summary.csv"}' ${summary_csv}.tmp + awk -F',' 'FNR==1{hdr=\$0;next} { + print hdr>"${meta.truth}_${meta.query}_"\$4"_"\$5".csv"; + print \$0>>"${meta.truth}_${meta.query}_"\$4"_"\$5".csv" + }' ${summary_csv}.tmp # Remove tmp files rm ${summary_csv}.tmp From 7d03db3cb48fb0111a7c3e1a81d24f25ee047d6b Mon Sep 17 00:00:00 2001 From: ellendejong Date: Wed, 21 Feb 2024 11:51:48 +0100 Subject: [PATCH 30/73] Only add .* if string is provided --- CheckQC/check_qc.py | 7 +++++-- CheckQC/test_check_qc.py | 43 +++++++++++++++++++++++++++++++++++----- 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py index 37ea919..e6f2cc1 100644 --- a/CheckQC/check_qc.py +++ b/CheckQC/check_qc.py @@ -126,8 +126,11 @@ def select_metrics(filename, input_files): Returns: list: Input files matching the given filename. - """ - metrics = list(filter(re.compile(f".*{filename}").match, input_files)) + """ + # If filename is string, change into regex to match absolute and relative paths in input_files. + if filename.isalpha(): + filename=".*" + filename + metrics = list(filter(re.compile(f"{filename}").match, input_files)) if not metrics: warnings.warn(UserWarning(f"No input file provided with filename pattern {filename}")) return None diff --git a/CheckQC/test_check_qc.py b/CheckQC/test_check_qc.py index 1a54060..a757fac 100644 --- a/CheckQC/test_check_qc.py +++ b/CheckQC/test_check_qc.py @@ -95,12 +95,45 @@ def test_missing_keys(self, incomplete_qc_metrics): class TestSelectMetrics(): - @pytest.mark.parametrize("input_files,expected", [ - (["test1.txt", "test2.txt"], ["test1.txt", "test2.txt"]), # multi match - (["test1.txt", "fake2.txt"], ["test1.txt"]), # single match + @pytest.mark.parametrize("filename_or_regex,input_files,expected", [ + # multi match + ("test", ["test1.txt", "test2.txt"], ["test1.txt", "test2.txt"]), + # single match + ("test", ["test1.txt", "fake2.txt"], ["test1.txt"]), + # # match with relative path + ("test", ["./random/path/to/test1.txt"], ["./random/path/to/test1.txt"]), + # match with absolute path + ("test", ["/random/path/to/test1.txt"], ["/random/path/to/test1.txt"]), + # match regex: kinship file suffix + ( + ".*.kinship_check.out$", + ["240101_A00295_0001_AHWCFKDSX7_CREv4_1.kinship_check.out"], + ["240101_A00295_0001_AHWCFKDSX7_CREv4_1.kinship_check.out"] + ), + # match on word truth and SNP + ( + ".*truth.*SNP", + ['2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv', '12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'], + ['2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv'] + ), + # match on word truth and SNP + ( + ".*truth.*SNP", + ['U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv', 'U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'], + ['U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv'] + ), + # match if 'truth' is absent and contains 'SNP' + # ?: Match expression but do not capture it + # ?! Match if 'truth' is absent. + ( + "(?:(?!truth).)*SNP.*$", + ['2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv', '12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'], + ['12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'] + ), + ]) - def test_select_metric(self, input_files, expected): - metrics = check_qc.select_metrics("test", input_files) + def test_select_metric(self, filename_or_regex, input_files, expected): + metrics = check_qc.select_metrics(filename_or_regex, input_files) assert metrics == expected def test_no_match(self): From 277d756e733d5ffabd950be60c8b535327f5618a Mon Sep 17 00:00:00 2001 From: mraves2 Date: Thu, 22 Feb 2024 10:05:29 +0100 Subject: [PATCH 31/73] fixed output for TIC plots --- DIMS/AverageTechReplicates.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index 4c3d49b..f3117d0 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -169,5 +169,5 @@ tic_plot_pdf <- marrangeGrob( )) ) # save to file -ggsave(filename = paste0("./../../../Bioinformatics/", run_name, "_TICplots.pdf"), +ggsave(filename = paste0("./", run_name, "_TICplots.pdf"), tic_plot_pdf, width = 21, height = 29.7, units = "cm") From a38c3131c50875b8e245d0efdd0ecb5ea0949c44 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Mon, 26 Feb 2024 11:11:32 +0100 Subject: [PATCH 32/73] fixed code after cleaning with lintr --- DIMS/AverageTechReplicates.R | 19 ++++++++++--------- DIMS/AverageTechReplicates.nf | 5 +++-- DIMS/CollectFilled.R | 12 ++++++------ DIMS/CollectFilled.nf | 2 +- DIMS/FillMissing.R | 1 - DIMS/FillMissing.nf | 2 +- DIMS/GenerateBreaks.R | 16 ++++++++-------- DIMS/GenerateBreaks.nf | 4 ++-- DIMS/MakeInit.R | 14 +++++++------- DIMS/PeakGrouping.R | 9 +++++---- DIMS/PeakGrouping.nf | 4 ++-- DIMS/SpectrumPeakFinding.R | 4 ++-- DIMS/SumAdducts.R | 3 ++- DIMS/SumAdducts.nf | 4 ++-- 14 files changed, 51 insertions(+), 48 deletions(-) diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index 4c3d49b..d470d7d 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -4,13 +4,14 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -init_filepath <- cmd_args[1] -nr_replicates <- as.numeric(cmd_args[2]) -thresh2remove <- 1000000000 -dims_thresh <- 100 -run_name <- cmd_args[3] -dims_matrix <- cmd_args[4] -highest_mz <- as.numeric(cmd_args[5]) +init_filepath <- cmd_args[1] +nr_replicates <- as.numeric(cmd_args[2]) +thresh2remove <- 1000000000 +dims_thresh <- 100 +run_name <- cmd_args[3] +dims_matrix <- cmd_args[4] +highest_mz_file <- cmd_args[5] +highest_mz <- get(load(highest_mz_file)) # lower the threshold below which a sample will be removed for DBS and for high m/z if (dims_matrix == "DBS") { @@ -147,7 +148,7 @@ for (sample_nr in c(1:length(repl_pattern))) { tic_plot <- ggplot(repl1_nr, aes(retention_time, tic_intensity)) + geom_line(linewidth = 0.3) + geom_hline(yintercept = highest_tic_max, col = "grey", linetype = 2, linewidth = 0.3) + - labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[j], " || ", sample_name)) + + labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[file_name], " || ", sample_name)) + theme(plot.background = element_rect(fill = plot_color), axis.text = element_text(size = 4), axis.title = element_text(size = 4), @@ -169,5 +170,5 @@ tic_plot_pdf <- marrangeGrob( )) ) # save to file -ggsave(filename = paste0("./../../../Bioinformatics/", run_name, "_TICplots.pdf"), +ggsave(filename = paste0("./", run_name, "_TICplots.pdf"), tic_plot_pdf, width = 21, height = 29.7, units = "cm") diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf index 8775976..264d2f7 100644 --- a/DIMS/AverageTechReplicates.nf +++ b/DIMS/AverageTechReplicates.nf @@ -8,9 +8,10 @@ process AverageTechReplicates { path(RData_file) // input files need to be linked, but called within R script path(TIC_txt_files) // input files need to be linked, but called within R script path(init_filepath) + val(nr_replicates) val(analysis_id) val(matrix) - val(highest_mz) + path(highest_mz) output: path('*_repl_pattern.RData'), emit: patterns @@ -20,7 +21,7 @@ process AverageTechReplicates { script: """ - Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_filepath $params.nr_replicates $analysis_id $matrix + Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_filepath $params.nr_replicates $analysis_id $matrix $highest_mz """ } diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R index 9e97db2..ae18ad8 100755 --- a/DIMS/CollectFilled.R +++ b/DIMS/CollectFilled.R @@ -16,11 +16,11 @@ scanmodes <- c("positive", "negative") for (scanmode in scanmodes) { # get list of files - filled_files <- list.files("./", full.names = TRUE, pattern = scanmode) + filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, "_identified_filled")) # load files and combine into one object outlist_total <- NULL for (file_nr in 1:length(filled_files)) { - load(filled_files[file_nr]) + peakgrouplist_filled <- get(load(filled_files[file_nr])) outlist_total <- rbind(outlist_total, peakgrouplist_filled) } @@ -45,10 +45,10 @@ for (scanmode in scanmodes) { outlist_stats[, (length(repl_pattern) - nr_removed_samples + 5 + 10):ncol(outlist_stats)] ) - tmp_index <- grep("_Zscore", colnames(outlist_stats.more), fixed = TRUE) - tmp_index_order <- order(colnames(outlist_stats.more[, tmp_index])) - tmp <- outlist_stats.more[, tmp_index[tmp_index_order]] - outlist_stats_more <- outlist_stats.more[, -tmp_index] + tmp_index <- grep("_Zscore", colnames(outlist_stats_more), fixed = TRUE) + tmp_index_order <- order(colnames(outlist_stats_more[, tmp_index])) + tmp <- outlist_stats_more[, tmp_index[tmp_index_order]] + outlist_stats_more <- outlist_stats_more[, -tmp_index] outlist_stats_more <- cbind(outlist_stats_more, tmp) outlist_total <- outlist_stats_more } diff --git a/DIMS/CollectFilled.nf b/DIMS/CollectFilled.nf index a7819f4..4261410 100644 --- a/DIMS/CollectFilled.nf +++ b/DIMS/CollectFilled.nf @@ -6,7 +6,7 @@ process CollectFilled { input: path(filled_files) - path(replication_pattern) // input files need to be linked, but called within R script + each path(replication_pattern) // input files need to be linked, but called within R script output: path('outlist*.txt') diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R index 0364ad2..1c1540c 100755 --- a/DIMS/FillMissing.R +++ b/DIMS/FillMissing.R @@ -3,7 +3,6 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) cat(" ", arg, "\n", sep = "") # define parameters peakgrouplist_file <- cmd_args[1] diff --git a/DIMS/FillMissing.nf b/DIMS/FillMissing.nf index 3a1f61b..4f71b77 100644 --- a/DIMS/FillMissing.nf +++ b/DIMS/FillMissing.nf @@ -6,7 +6,7 @@ process FillMissing { input: path(GroupedList_file) - path(replication_pattern) // input files need to be linked, but called within R script + each path(replication_pattern) // input files need to be linked, but called within R script output: path('*_filled.RData') diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index fd60723..0de2426 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -8,17 +8,17 @@ suppressPackageStartupMessages(library("xcms")) # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -filepath <- cmd_args[1] # 1 of the mzML files +filepath <- cmd_args[1] outdir <- cmd_args[2] -trim <- as.numeric(cmd_args[3]) # 0.1 -resol <- as.numeric(cmd_args[4]) # 140000 +trim <- as.numeric(cmd_args[3]) +resol <- as.numeric(cmd_args[4]) # initialize trim_left <- NULL trim_right <- NULL -breaks_fwhm <- NULL -breaks_fwhm_avg <- NULL -bins <- NULL +breaks_fwhm <- NULL +breaks_fwhm_avg <- NULL +bins <- NULL # read in mzML file raw_data <- suppressMessages(xcmsRaw(filepath)) @@ -39,7 +39,7 @@ segment <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1) for (i in 1:nr_segments) { start_segment <- segment[i] end_segment <- segment[i+1] - resol_mz <- resol*(1 / sqrt(2) ^ (log2(start_segment / 200))) + resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200))) fwhm_segment <- start_segment / resol_mz breaks_fwhm <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)) # average the m/z instead of start value @@ -50,4 +50,4 @@ for (i in 1:nr_segments) { # generate output file save(breaks_fwhm, breaks_fwhm_avg, trim_left, trim_right, file = "./breaks.fwhm.RData") -write(high_mz, file = "./highest_mz.txt") +save(high_mz, file = "./highest_mz.RData") diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf index 8f37438..4e855f2 100644 --- a/DIMS/GenerateBreaks.nf +++ b/DIMS/GenerateBreaks.nf @@ -9,8 +9,8 @@ process GenerateBreaks { output: - path('breaks.fwhm.RData') - path('highest_mz.txt'), emit: highest_mz + path('breaks.fwhm.RData'), emit: breaks + path('highest_mz.RData'), emit: highest_mz script: """ diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R index 33794c5..7d8dd1d 100644 --- a/DIMS/MakeInit.R +++ b/DIMS/MakeInit.R @@ -5,18 +5,18 @@ args <- commandArgs(trailingOnly = TRUE) sample_sheet <- read.csv(args[1], sep = "\t") nr_replicates <- as.numeric(args[2]) -sample_names <- trimws(as.vector(unlist(sample_sheet[1]))) -nr_sampgrps <- length(sampleNames) / nr_replicates -group_names <- trimws(as.vector(unlist(sample_sheet[2]))) -group_names <- gsub("[^-.[:alnum:]]", "_", group_names) +sample_names <- trimws(as.vector(unlist(sample_sheet[1]))) +nr_sample_groups <- length(sample_names) / nr_replicates +group_names <- trimws(as.vector(unlist(sample_sheet[2]))) +group_names <- gsub("[^-.[:alnum:]]", "_", group_names) group_names_unique <- unique(group_names) repl_pattern <- c() -for (sampgrp in 1:nr_sampgrps) { +for (sample_group in 1:nr_sample_groups) { tmp <- c() for (repl in nr_replicates:1) { - index <- ((sampgrp * nr_replicates) - repl) + 1 - tmp <- c(tmp, sampleNames[index]) + index <- ((sample_group * nr_replicates) - repl) + 1 + tmp <- c(tmp, sample_names[index]) } repl_pattern <- c(repl_pattern, list(tmp)) } diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R index d04b6a2..b574206 100644 --- a/DIMS/PeakGrouping.R +++ b/DIMS/PeakGrouping.R @@ -25,8 +25,8 @@ batch_number <- strsplit(basename(hmdb_part_file), ".", fixed = TRUE)[[1]][2] # load file with spectrum peaks spec_peaks_file <- paste0("SpectrumPeaks_", scanmode, ".RData") load(spec_peaks_file) -outlist_copy <- outlist_tot -rm(outlist_tot) +outlist_copy <- outlist_total +rm(outlist_total) # load replication pattern pattern_file <- paste0(scanmode, "_repl_pattern.RData") @@ -169,9 +169,10 @@ while (dim(hmdb_add_iso)[1] > 0) { # combine all information peakgrouplist_identified <- rbind(peakgrouplist_identified, cbind( - data.frame(mzmed_pgrp, "fq.best" = fq_best_pgrp, "fq.worst" = fq_worst_pgrp, nrsamples, mzmin_pgrp, mzmax_pgrp), + data.frame("mzmed.pgrp" = mzmed_pgrp, "fq.best" = fq_best_pgrp, "fq.worst" = fq_worst_pgrp, nrsamples, + "mzmin.pgrp" = mzmin_pgrp, "mzmax.pgrp" = mzmax_pgrp), t(as.matrix(ints_allsamps)), - data.frame(assi_hmdb, iso_hmdb, hmdb_code, theormz_hmdb) + data.frame("assi_HMDB" = assi_hmdb, "iso_HMDB" = iso_hmdb, "HMDB_code" = hmdb_code, "theormz_HMDB" = theormz_hmdb) )) } diff --git a/DIMS/PeakGrouping.nf b/DIMS/PeakGrouping.nf index 8da1e76..5c48cde 100644 --- a/DIMS/PeakGrouping.nf +++ b/DIMS/PeakGrouping.nf @@ -6,8 +6,8 @@ process PeakGrouping { input: path(HMDBpart_file) - path(SpectrumPeak_file) // input files need to be linked, but called within R script - path(pattern_file) + each path(SpectrumPeak_file) // input files need to be linked, but called within R script + each path(pattern_file) // Execute the process for each element in the input collection (HMDBpart_file) output: path '*_peaks_used.RData', emit: peaks_used diff --git a/DIMS/SpectrumPeakFinding.R b/DIMS/SpectrumPeakFinding.R index a44ea06..5c77b12 100644 --- a/DIMS/SpectrumPeakFinding.R +++ b/DIMS/SpectrumPeakFinding.R @@ -10,13 +10,13 @@ not_run <- NULL # collect spectrum peaks for each scanmode for (scanmode in scanmodes) { # load peak lists of all biological samples - peaklist_files <- list.files("/.", full.names = TRUE, pattern = paste0("*_", scanmode, ".RData")) + peaklist_files <- list.files(pattern = paste0("_", scanmode, ".RData")) # get sample names load(paste0("./", scanmode, "_repl_pattern", ".RData")) group_names <- names(repl_pattern_filtered) for (sample_nr in 1:length(group_names)) { - group <- paste0(input_dir, "/", paste0(paste(group_names[sample_nr], scanmode, sep = "_"), ".RData")) + group <- paste0("./", group_names[sample_nr], "_", scanmode, ".RData") if (!(group %in% peaklist_files)) { not_run <- c(not_run, group) } diff --git a/DIMS/SumAdducts.R b/DIMS/SumAdducts.R index b4bd332..e9fa189 100755 --- a/DIMS/SumAdducts.R +++ b/DIMS/SumAdducts.R @@ -26,7 +26,8 @@ repl_file <- paste0(scanmode, "_repl_pattern.RData") load(repl_file) outlist_part <- get(load(hmdbpart_main_file)) -batch_number <- strsplit(basename(hmdbpart_main_file), ".", fixed = TRUE)[[1]][2] +# get the number from the file name +batch_number <- strsplit(strsplit(basename(hmdbpart_main_file), "main_", fixed = TRUE)[[1]][2], ".RData")[[1]] outlist_total <- unique(outlist_ident) diff --git a/DIMS/SumAdducts.nf b/DIMS/SumAdducts.nf index 531308e..66a404f 100644 --- a/DIMS/SumAdducts.nf +++ b/DIMS/SumAdducts.nf @@ -5,8 +5,8 @@ process SumAdducts { shell = ['/bin/bash', '-euo', 'pipefail'] input: - path(collect_file) // input files need to be linked, but called within R script - path(replication_pattern) // input files need to be linked, but called within R script + each path(collect_file) // input files need to be linked, but called within R script + each path(replication_pattern) // input files need to be linked, but called within R script path(HMDBpart_main_file) output: From 5631ab52966f3789917fc271145bb88132199fa4 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Thu, 29 Feb 2024 09:28:00 +0100 Subject: [PATCH 33/73] corrections after code clean-up with lintr --- DIMS/GenerateExcel.R | 32 ++++++++++++++++---------------- DIMS/GenerateExcel.nf | 3 ++- DIMS/HMDBparts_main.R | 4 ++-- DIMS/SumAdducts.R | 2 +- DIMS/UnidentifiedCollectPeaks.nf | 2 +- 5 files changed, 22 insertions(+), 21 deletions(-) diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index 029498f..3e55f7e 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -591,21 +591,22 @@ if (z_score == 1) { # check the outlist_identified_(negative/positive).RData files for missing m/z values and mention in the results mail print("Nu in missing m/z check") # Load the outlist_identified files + remove the loaded files -load(paste0(outdir, "RData/outlist_identified_negative.RData")) -outlist.ident.neg <- outlist.ident -load(paste0(outdir, "RData/outlist_identified_positive.RData")) -outlist.ident.pos <- outlist.ident -rm(outlist.ident) +load(paste0(outdir, "/outlist_identified_negative.RData")) +outlist_ident_neg <- outlist_ident +load(paste0(outdir, "/outlist_identified_positive.RData")) +outlist_ident_pos <- outlist_ident +rm(outlist_ident) # check for missing m/z in negative and positive mode -mode <- c("Negative", "Positive") +scanmode <- c("Negative", "Positive") index <- 1 -results_ident <- c() # empty results list -outlist_ident_list <- list(outlist.ident.neg, outlist.ident.pos) -for (outlist.ident in outlist_ident_list) { - current_mode <- mode[index] +results_ident <- c() +outlist_ident_list <- list(outlist_ident_neg, outlist_ident_pos) +for (outlist_ident in outlist_ident_list) { + current_mode <- scanmode[index] # retrieve all unique m/z values in whole numbers and check if all are available - mz_values <- as.numeric(unique(format(outlist.ident$mzmed.pgrp, digits = 0))) - mz_range <- seq(70, 599, by = 1) # change accordingly to the machine m/z range. default = 70-600 + mz_values <- as.numeric(unique(format(outlist_ident$mzmed.pgrp, digits = 0))) + # m/z range for a standard run = 70-600 + mz_range <- seq(70, 599, by = 1) mz_missing <- c() for (mz in mz_range) { if (!mz %in% mz_values) { @@ -621,8 +622,7 @@ for (outlist.ident in outlist_ident_list) { } else { results_ident <- c(results_ident, paste0(current_mode, " mode did not have missing mz values")) } - index <- index + 1 # change to new mode in for loop + # change to other scanmode + index <- index + 1 } -lapply(results_ident, write, file = paste(outdir, "missing_mz_warning.txt", sep = "/"), append = TRUE, ncolumns = 1000) - -cat("Ready excelExport.R") +lapply(results_ident, write, file = paste0(outdir, "/missing_mz_warning.txt"), append = TRUE, ncolumns = 1000) diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf index 9e861e2..f73b881 100644 --- a/DIMS/GenerateExcel.nf +++ b/DIMS/GenerateExcel.nf @@ -6,12 +6,13 @@ process GenerateExcel { input: path(collect_file) // input files need to be linked, but called within R script + path(identified_file) // input files need to be lined, but called within R script path(init_filepath) val(analysis_id) path(relevance_file) output: - path('AdductSums_*.RData') + path('AdductSums_*.txt') path('*.xlsx'), emit: excel_file script: diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R index 00cf5c9..ff468c7 100644 --- a/DIMS/HMDBparts_main.R +++ b/DIMS/HMDBparts_main.R @@ -45,7 +45,7 @@ for (scanmode in scanmodes) { start <- -(sub-1) + i * sub end <- i * sub outlist_part <- outlist[c(start:end), ] - save(outlist_part, file=paste0(scanmode, "_hmdb_main_", i, ".RData")) + save(outlist_part, file=paste0(scanmode, "_hmdb_main.", i, ".RData")) } } } @@ -55,4 +55,4 @@ start <- end + 1 end <- nr_rows outlist_part <- outlist[c(start:end),] -save(outlist_part, file = paste0(scanmode, "_hmdb.", i+1, ".RData")) +save(outlist_part, file = paste0(scanmode, "_hmdb_main.", i+1, ".RData")) diff --git a/DIMS/SumAdducts.R b/DIMS/SumAdducts.R index e9fa189..db74623 100755 --- a/DIMS/SumAdducts.R +++ b/DIMS/SumAdducts.R @@ -27,7 +27,7 @@ load(repl_file) outlist_part <- get(load(hmdbpart_main_file)) # get the number from the file name -batch_number <- strsplit(strsplit(basename(hmdbpart_main_file), "main_", fixed = TRUE)[[1]][2], ".RData")[[1]] +batch_number <- strsplit(basename(hmdbpart_main_file), ".", fixed = TRUE)[[1]][2] outlist_total <- unique(outlist_ident) diff --git a/DIMS/UnidentifiedCollectPeaks.nf b/DIMS/UnidentifiedCollectPeaks.nf index 475b0a4..bcd1264 100644 --- a/DIMS/UnidentifiedCollectPeaks.nf +++ b/DIMS/UnidentifiedCollectPeaks.nf @@ -5,7 +5,7 @@ process UnidentifiedCollectPeaks { shell = ['/bin/bash', '-euo', 'pipefail'] input: - path(SpectrumPeaks_file) + path(SpectrumPeaks_file) // input files need to be linked, but called within R script path(PeakList_identified) output: From 8f99bb80c8d89f355b0c2b31ff87fd359dec5042 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Tue, 26 Mar 2024 08:27:59 +0100 Subject: [PATCH 34/73] rename output checkqc --- CheckQC/CheckQC.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CheckQC/CheckQC.nf b/CheckQC/CheckQC.nf index 2fe48db..cb62014 100644 --- a/CheckQC/CheckQC.nf +++ b/CheckQC/CheckQC.nf @@ -9,10 +9,10 @@ process CheckQC { path(input_files, stageAs: "?/*") output: - path("${identifier}_summary.csv", emit: qc_output) + path("${identifier}_checkqc_summary.csv", emit: qc_output) script: """ - python ${baseDir}/CustomModules/CheckQC/check_qc.py ${params.qc_settings_path} '.' ${identifier} ${input_files} + python ${baseDir}/CustomModules/CheckQC/check_qc.py ${params.qc_settings_path} '.' ${identifier}_checkqc ${input_files} """ } From a662b5e54b2bccd24c3a6656c3d1496ef72b6735 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Tue, 26 Mar 2024 13:33:44 +0100 Subject: [PATCH 35/73] add versions yaml to VersionLog --- Utils/VersionLog.nf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Utils/VersionLog.nf b/Utils/VersionLog.nf index 4a295b1..bdc5f4d 100644 --- a/Utils/VersionLog.nf +++ b/Utils/VersionLog.nf @@ -10,6 +10,7 @@ process VersionLog { output: path('repository_version.log') + path("versions.yml", emit: versions) script: """ @@ -17,6 +18,8 @@ process VersionLog { do echo "\${git_dir}" >> repository_version.log git --git-dir=\${git_dir}/.git log --pretty=oneline --decorate -n 2 >> repository_version.log + described_tags=$(git describe --tags) + echo "\${git_dir}: \"${described_tags}\"" >> versions.yml done """ } \ No newline at end of file From 00ff0e6495a5b19a3cec1d803ca8b5a2fa952eba Mon Sep 17 00:00:00 2001 From: ellendejong Date: Tue, 26 Mar 2024 14:01:03 +0100 Subject: [PATCH 36/73] escape $ --- Utils/VersionLog.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Utils/VersionLog.nf b/Utils/VersionLog.nf index bdc5f4d..91bf377 100644 --- a/Utils/VersionLog.nf +++ b/Utils/VersionLog.nf @@ -18,7 +18,7 @@ process VersionLog { do echo "\${git_dir}" >> repository_version.log git --git-dir=\${git_dir}/.git log --pretty=oneline --decorate -n 2 >> repository_version.log - described_tags=$(git describe --tags) + described_tags=\$(git describe --tags) echo "\${git_dir}: \"${described_tags}\"" >> versions.yml done """ From 27f7ed550dd5612f1388985a6a72690c5f1c4f78 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Tue, 26 Mar 2024 14:02:58 +0100 Subject: [PATCH 37/73] escape $ --- Utils/VersionLog.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Utils/VersionLog.nf b/Utils/VersionLog.nf index 91bf377..70179c3 100644 --- a/Utils/VersionLog.nf +++ b/Utils/VersionLog.nf @@ -19,7 +19,7 @@ process VersionLog { echo "\${git_dir}" >> repository_version.log git --git-dir=\${git_dir}/.git log --pretty=oneline --decorate -n 2 >> repository_version.log described_tags=\$(git describe --tags) - echo "\${git_dir}: \"${described_tags}\"" >> versions.yml + echo "\${git_dir}: \"\${described_tags}\"" >> versions.yml done """ } \ No newline at end of file From 2cc7d389de5437d5412125c75d97087aeb6daa60 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Tue, 26 Mar 2024 14:17:52 +0100 Subject: [PATCH 38/73] add git dir to describe --- Utils/VersionLog.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Utils/VersionLog.nf b/Utils/VersionLog.nf index 70179c3..9d5c654 100644 --- a/Utils/VersionLog.nf +++ b/Utils/VersionLog.nf @@ -18,7 +18,7 @@ process VersionLog { do echo "\${git_dir}" >> repository_version.log git --git-dir=\${git_dir}/.git log --pretty=oneline --decorate -n 2 >> repository_version.log - described_tags=\$(git describe --tags) + described_tags=\$(git --git-dir=\${git_dir}/.git describe --tags) echo "\${git_dir}: \"\${described_tags}\"" >> versions.yml done """ From e7eabb3917df25b099ca1aaf97d050817d7c7695 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Wed, 27 Mar 2024 17:09:52 +0100 Subject: [PATCH 39/73] code review suggestions applied --- DIMS/AssignToBins.R | 53 +++-- DIMS/AssignToBins.nf | 8 +- DIMS/AverageTechReplicates.R | 103 ++++---- DIMS/AverageTechReplicates.nf | 17 +- DIMS/CollectFilled.R | 22 +- DIMS/CollectFilled.nf | 2 +- DIMS/CollectSumAdducts.R | 4 +- DIMS/CollectSumAdducts.nf | 2 +- DIMS/FillMissing.R | 36 ++- DIMS/FillMissing.nf | 8 +- DIMS/GenerateBreaks.R | 35 ++- DIMS/GenerateBreaks.nf | 2 +- DIMS/GenerateExcel.R | 397 ++++++++++++++++--------------- DIMS/GenerateExcel.nf | 10 +- DIMS/GenerateViolinPlots.R | 214 ++++++++++------- DIMS/GenerateViolinPlots.nf | 15 +- DIMS/HMDBparts.R | 115 +++++---- DIMS/HMDBparts.nf | 2 - DIMS/HMDBparts_main.R | 20 +- DIMS/HMDBparts_main.nf | 1 - DIMS/MakeInit.R | 17 +- DIMS/MakeInit.nf | 3 +- DIMS/PeakFinding.R | 82 ++++--- DIMS/PeakFinding.nf | 6 +- DIMS/PeakGrouping.R | 46 ++-- DIMS/PeakGrouping.nf | 10 +- DIMS/SpectrumPeakFinding.R | 6 +- DIMS/SpectrumPeakFinding.nf | 4 +- DIMS/SumAdducts.R | 47 ++-- DIMS/SumAdducts.nf | 6 +- DIMS/ThermoRawFileParser.nf | 5 +- DIMS/UnidentifiedCalcZscores.R | 18 +- DIMS/UnidentifiedCalcZscores.nf | 4 +- DIMS/UnidentifiedCollectPeaks.R | 11 +- DIMS/UnidentifiedCollectPeaks.nf | 8 +- DIMS/UnidentifiedFillMissing.R | 31 ++- DIMS/UnidentifiedFillMissing.nf | 8 +- DIMS/UnidentifiedPeakGrouping.R | 31 ++- DIMS/UnidentifiedPeakGrouping.nf | 6 +- 39 files changed, 751 insertions(+), 664 deletions(-) diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index ed6984a..7c7a95a 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -7,40 +7,31 @@ suppressPackageStartupMessages(library("xcms")) # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -filepath <- cmd_args[1] +mzml_filepath <- cmd_args[1] breaks_filepath <- cmd_args[2] -resol <- as.numeric(cmd_args[3]) -trim <- 0.1 -dims_thresh <- 100 +resol <- as.numeric(cmd_args[3]) +trim <- 0.1 +dims_thresh <- 100 + +# load breaks_fwhm +load(breaks_filepath) # get sample name -sample_name <- sub("\\..*$", "", basename(filepath)) +sample_name <- sub("\\..*$", "", basename(mzml_filepath)) options(digits = 16) # Initialize -int_factor <- 1 * 10^5 # Number used to calculate area under Gaussian curve -scale <- 2 # Initial value used to estimate scaling parameter -width <- 1024 -height <- 768 -trim_left <- NULL -trim_right <- NULL -breaks_fwhm <- NULL -breaks_fwhm_avg <- NULL -bins <- NULL -pos_results <- NULL -neg_results <- NULL +pos_results <- NULL +neg_results <- NULL # read in the data for 1 sample -raw_data <- suppressMessages(xcmsRaw(filepath)) +raw_data <- suppressMessages(xcmsRaw(mzml_filepath)) # for TIC plots: prepare txt files with data for plots tic_intensity_persample <- cbind(round(raw_data@scantime, 2), raw_data@tic) colnames(tic_intensity_persample) <- c("retention_time", "tic_intensity") -write.table(tic_intensity_persample, file = paste0("./", sample_name, "_TIC.txt")) - -# load breaks_fwhm -load(breaks_filepath) +write.table(tic_intensity_persample, file = paste0(sample_name, "_TIC.txt")) # Create empty placeholders for later use bins <- rep(0, length(breaks_fwhm) - 1) @@ -65,10 +56,20 @@ pos_raw_data_matrix <- raw_data_matrix[pos_index, ] neg_raw_data_matrix <- raw_data_matrix[neg_index, ] # Get index for binning intensity values -bin_indices_pos <- cut(pos_raw_data_matrix[, "mz"], breaks_fwhm, - include.lowest = TRUE, right = TRUE, labels = FALSE) -bin_indices_neg <- cut(neg_raw_data_matrix[, "mz"], breaks_fwhm, - include.lowest = TRUE, right = TRUE, labels = FALSE) +bin_indices_pos <- cut( + pos_raw_data_matrix[, "mz"], + breaks_fwhm, + include.lowest = TRUE, + right = TRUE, + labels = FALSE +) +bin_indices_neg <- cut( + neg_raw_data_matrix[, "mz"], + breaks_fwhm, + include.lowest = TRUE, + right = TRUE, + labels = FALSE +) # Get the list of intensity values for each bin, and add the # intensity values which are in the same bin @@ -121,4 +122,4 @@ neg_results_final <- t(neg_results_transpose) peak_list <- list("pos" = pos_results_final, "neg" = neg_results_final, "breaksFwhm" = breaks_fwhm) -save(peak_list, file = paste0("./", sample_name, ".RData")) +save(peak_list, file = paste0(sample_name, ".RData")) diff --git a/DIMS/AssignToBins.nf b/DIMS/AssignToBins.nf index 25b3891..d0a7098 100644 --- a/DIMS/AssignToBins.nf +++ b/DIMS/AssignToBins.nf @@ -5,15 +5,15 @@ process AssignToBins { shell = ['/bin/bash', '-euo', 'pipefail'] input: - tuple val(file_id), path(mzML_filename) , path(breaks_file) + tuple(val(file_id), path(mzML_file), path(breaks_file)) output: - path("${file_id}.RData"), emit: RData_files - path("${file_id}_TIC.txt"), emit: TIC_txt_files + path("${file_id}.RData"), emit: rdata_file + path("${file_id}_TIC.txt"), emit: tic_txt_file script: """ - Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_filename $breaks_file $params.resolution + Rscript ${baseDir}/CustomModules/DIMS/AssignToBins.R $mzML_file $breaks_file $params.resolution """ } diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index d470d7d..b243025 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -1,38 +1,31 @@ #!/usr/bin/Rscript # adapted from 3-AverageTechReplicates.R +# load packages +library("ggplot2") +library("gridExtra") + # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -init_filepath <- cmd_args[1] -nr_replicates <- as.numeric(cmd_args[2]) -thresh2remove <- 1000000000 -dims_thresh <- 100 -run_name <- cmd_args[3] -dims_matrix <- cmd_args[4] +init_file <- cmd_args[1] +nr_replicates <- as.numeric(cmd_args[2]) +run_name <- cmd_args[3] +dims_matrix <- cmd_args[4] highest_mz_file <- cmd_args[5] highest_mz <- get(load(highest_mz_file)) - -# lower the threshold below which a sample will be removed for DBS and for high m/z -if (dims_matrix == "DBS") { - thresh2remove <- 50000000 -} -if (highest_mz > 700) { - thresh2remove <- 1000000 -} - -library("ggplot2") -library("gridExtra") +thresh2remove <- 1000000000 +dims_thresh <- 100 remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { - tmp <- repl_pattern + # collect list of samples to remove from replication pattern remove_from_group <- NULL - for (sample_nr in 1:length(tmp)){ - tmp2 <- repl_pattern[[sample_nr]] + for (sample_nr in 1:length(repl_pattern)){ + repl_pattern_1sample <- repl_pattern[[sample_nr]] remove <- NULL - for (file_name in 1:length(tmp2)) { - if (tmp2[file_name] %in% bad_samples) { - remove <- c(remove, file_name) + for (file_nr in 1:length(repl_pattern_1sample)) { + if (repl_pattern_1sample[file_nr] %in% bad_samples) { + remove <- c(remove, file_nr) } } if (length(remove) == nr_replicates) { @@ -49,26 +42,35 @@ remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { } # get repl_pattern -load("./init.RData") +load(init_file) +# lower the threshold below which a sample will be removed for DBS and for high m/z +if (dims_matrix == "DBS") { + thresh2remove <- 50000000 +} +if (highest_mz > 700) { + thresh2remove <- 1000000 +} + +# remove technical replicates which are below the threshold remove_neg <- NULL remove_pos <- NULL cat("Pklist sum threshold to remove technical replicate:", thresh2remove, "\n") for (sample_nr in 1:length(repl_pattern)) { + tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) tech_reps_array_pos <- NULL tech_reps_array_neg <- NULL - tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) sum_neg <- 0 sum_pos <- 0 nr_pos <- 0 nr_neg <- 0 - for (file_name in 1:length(tech_reps)) { - load(paste("./", tech_reps[file_name], ".RData", sep = "")) + for (file_nr in 1:length(tech_reps)) { + load(paste(tech_reps[file_nr], ".RData", sep = "")) # negative scanmode cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1])) if (sum(peak_list$neg[, 1]) < thresh2remove) { cat(" ... Removed") - remove_neg <- c(remove_neg, tech_reps[file_name]) + remove_neg <- c(remove_neg, tech_reps[file_nr]) } else { nr_neg <- nr_neg + 1 sum_neg <- sum_neg + peak_list$neg @@ -78,7 +80,7 @@ for (sample_nr in 1:length(repl_pattern)) { cat("\n\tPositive peak_list sum", sum(peak_list$pos[, 1])) if (sum(peak_list$pos[, 1]) < thresh2remove) { cat(" ... Removed") - remove_pos <- c(remove_pos, tech_reps[file_name]) + remove_pos <- c(remove_pos, tech_reps[file_nr]) } else { nr_pos <- nr_pos + 1 sum_pos <- sum_pos + peak_list$pos @@ -89,28 +91,38 @@ for (sample_nr in 1:length(repl_pattern)) { if (nr_neg != 0) { sum_neg[, 1] <- sum_neg[, 1] / nr_neg colnames(sum_neg) <- names(repl_pattern)[sample_nr] - save(sum_neg, file = paste0("./", names(repl_pattern)[sample_nr], "_neg_avg.RData")) + save(sum_neg, file = paste0(names(repl_pattern)[sample_nr], "_neg_avg.RData")) } if (nr_pos != 0) { sum_pos[, 1] <- sum_pos[, 1] / nr_pos colnames(sum_pos) <- names(repl_pattern)[sample_nr] - save(sum_pos, file = paste0("./", names(repl_pattern)[sample_nr], "_pos_avg.RData")) + save(sum_pos, file = paste0(names(repl_pattern)[sample_nr], "_pos_avg.RData")) } } pattern_list <- remove_from_repl_pattern(remove_neg, repl_pattern, nr_replicates) repl_pattern_filtered <- pattern_list$pattern -save(repl_pattern_filtered, file = "./negative_repl_pattern.RData") -write.table(remove_neg, file = "./miss_infusions_negative.txt", - row.names = FALSE, col.names = FALSE, sep = "\t") +save(repl_pattern_filtered, file = "negative_repl_pattern.RData") +write.table( + remove_neg, + file = "miss_infusions_negative.txt", + row.names = FALSE, + col.names = FALSE, + sep = "\t" +) pattern_list <- remove_from_repl_pattern(remove_pos, repl_pattern, nr_replicates) repl_pattern_filtered <- pattern_list$pattern -save(repl_pattern_filtered, file = "./positive_repl_pattern.RData") -write.table(remove_pos, file = "./miss_infusions_positive.txt", - row.names = FALSE, col.names = FALSE, sep = "\t") +save(repl_pattern_filtered, file = "positive_repl_pattern.RData") +write.table( + remove_pos, + file = "miss_infusions_positive.txt", + row.names = FALSE, + col.names = FALSE, + sep = "\t" +) -# New: generate TIC plots +## generate TIC plots # get all txt files tic_files <- list.files("./", full.names = TRUE, pattern = "*TIC.txt") all_samps <- sub("_TIC\\..*$", "", basename(tic_files)) @@ -126,16 +138,17 @@ for (file in tic_files) { } } +# create a list with information for all TIC plots tic_plot_list <- list() plot_nr <- 0 for (sample_nr in c(1:length(repl_pattern))) { tech_reps <- as.vector(unlist(repl_pattern[sample_nr])) sample_name <- names(repl_pattern)[sample_nr] - for (file_name in 1:length(tech_reps)) { + for (file_nr in 1:length(tech_reps)) { plot_nr <- plot_nr + 1 - repl1_nr <- read.table(tic_files[file_name]) - bad_color_pos <- tech_reps[file_name] %in% remove_pos[[1]] - bad_color_neg <- tech_reps[file_name] %in% remove_neg[[1]] + repl1_nr <- read.table(tic_files[file_nr]) + bad_color_pos <- tech_reps[file_nr] %in% remove_pos[[1]] + bad_color_neg <- tech_reps[file_nr] %in% remove_neg[[1]] if (bad_color_neg & bad_color_pos) { plot_color <- "#F8766D" } else if (bad_color_pos) { @@ -148,7 +161,7 @@ for (sample_nr in c(1:length(repl_pattern))) { tic_plot <- ggplot(repl1_nr, aes(retention_time, tic_intensity)) + geom_line(linewidth = 0.3) + geom_hline(yintercept = highest_tic_max, col = "grey", linetype = 2, linewidth = 0.3) + - labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[file_name], " || ", sample_name)) + + labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[file_nr], " || ", sample_name)) + theme(plot.background = element_rect(fill = plot_color), axis.text = element_text(size = 4), axis.title = element_text(size = 4), @@ -156,6 +169,7 @@ for (sample_nr in c(1:length(repl_pattern))) { tic_plot_list[[plot_nr]] <- tic_plot } } + # create a layout matrix dependent on number of replicates layout <- matrix(1:(10 * nr_replicates), 10, nr_replicates, TRUE) # put TIC plots in matrix @@ -169,6 +183,7 @@ tic_plot_pdf <- marrangeGrob( g, "/", npages )) ) + # save to file -ggsave(filename = paste0("./", run_name, "_TICplots.pdf"), +ggsave(filename = paste0(run_name, "_TICplots.pdf"), tic_plot_pdf, width = 21, height = 29.7, units = "cm") diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf index 264d2f7..9fe9a98 100644 --- a/DIMS/AverageTechReplicates.nf +++ b/DIMS/AverageTechReplicates.nf @@ -5,23 +5,26 @@ process AverageTechReplicates { shell = ['/bin/bash', '-euo', 'pipefail'] input: - path(RData_file) // input files need to be linked, but called within R script - path(TIC_txt_files) // input files need to be linked, but called within R script - path(init_filepath) + path(rdata_file) + path(tic_txt_files) + path(init_file) val(nr_replicates) val(analysis_id) val(matrix) - path(highest_mz) + path(highest_mz_file) output: - path('*_repl_pattern.RData'), emit: patterns - path('*_avg.RData'), emit: binned + path('*_repl_pattern.RData'), emit: pattern_files + path('*_avg.RData'), emit: binned_files path('miss_infusions_negative.txt') path('miss_infusions_positive.txt') script: """ - Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_filepath $params.nr_replicates $analysis_id $matrix $highest_mz + Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_file \ + $params.nr_replicates \ + $analysis_id $matrix \ + $highest_mz_file """ } diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R index ae18ad8..f28e1c9 100755 --- a/DIMS/CollectFilled.R +++ b/DIMS/CollectFilled.R @@ -4,16 +4,15 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -scripts_dir <- cmd_args[1] -ppm <- as.numeric(cmd_args[2]) +scripts_dir <- cmd_args[1] +ppm <- as.numeric(cmd_args[2]) z_score <- as.numeric(cmd_args[3]) -source(paste0(scripts_dir, "AddOnFunctions/mergeDuplicatedRows.R")) -source(paste0(scripts_dir, "AddOnFunctions/statistics_z.R")) +source(paste0(scripts_dir, "mergeDuplicatedRows.R")) +source(paste0(scripts_dir, "statistics_z.R")) # for each scan mode, collect all filled peak group lists scanmodes <- c("positive", "negative") - for (scanmode in scanmodes) { # get list of files filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, "_identified_filled")) @@ -23,17 +22,14 @@ for (scanmode in scanmodes) { peakgrouplist_filled <- get(load(filled_files[file_nr])) outlist_total <- rbind(outlist_total, peakgrouplist_filled) } - # remove duplicates; peak groups with exactly the same m/z outlist_total <- mergeDuplicatedRows(outlist_total) - # sort on mass outlist_total <- outlist_total[order(outlist_total[, "mzmed.pgrp"]), ] - # load replication pattern pattern_file <- paste0(scanmode, "_repl_pattern.RData") repl_pattern <- get(load(pattern_file)) - + # calculate Z-scores if (z_score == 1) { outlist_stats <- statistics_z(outlist_total, sortCol = NULL, adducts = FALSE) nr_removed_samples <- length(which(repl_pattern[] == "character(0)")) @@ -44,7 +40,7 @@ for (scanmode in scanmodes) { outlist_stats[, 8:(length(repl_pattern) - nr_removed_samples + 7)][order_index_int], outlist_stats[, (length(repl_pattern) - nr_removed_samples + 5 + 10):ncol(outlist_stats)] ) - + # sort Z-score columns and append to peak group list tmp_index <- grep("_Zscore", colnames(outlist_stats_more), fixed = TRUE) tmp_index_order <- order(colnames(outlist_stats_more[, tmp_index])) tmp <- outlist_stats_more[, tmp_index[tmp_index_order]] @@ -52,9 +48,10 @@ for (scanmode in scanmodes) { outlist_stats_more <- cbind(outlist_stats_more, tmp) outlist_total <- outlist_stats_more } - + + # make a copy of the outlist outlist_ident <- outlist_total - + # select identified peak groups if ppm deviation is within limits if (z_score == 1) { outlist_ident$ppmdev <- as.numeric(outlist_ident$ppmdev) outlist_ident <- outlist_ident[which(outlist_ident["ppmdev"] >= -ppm & outlist_ident["ppmdev"] <= ppm), ] @@ -70,5 +67,6 @@ for (scanmode in scanmodes) { remove_colindex <- which(colnames(outlist_ident) %in% remove_columns) outlist_ident <- outlist_ident[, -remove_colindex] write.table(outlist_ident, file = paste0("outlist_identified_", scanmode, ".txt"), sep = "\t", row.names = FALSE) + # output in RData format save(outlist_ident, file = paste0("outlist_identified_", scanmode, ".RData")) } diff --git a/DIMS/CollectFilled.nf b/DIMS/CollectFilled.nf index 4261410..53b1967 100644 --- a/DIMS/CollectFilled.nf +++ b/DIMS/CollectFilled.nf @@ -6,7 +6,7 @@ process CollectFilled { input: path(filled_files) - each path(replication_pattern) // input files need to be linked, but called within R script + each path(replication_pattern) output: path('outlist*.txt') diff --git a/DIMS/CollectSumAdducts.R b/DIMS/CollectSumAdducts.R index e935eb8..dcd05af 100755 --- a/DIMS/CollectSumAdducts.R +++ b/DIMS/CollectSumAdducts.R @@ -4,10 +4,10 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) +# collect all AdductSums part files for each scanmode scanmodes <- c("positive", "negative") for (scanmode in scanmodes) { - # collect all AdductSums part files for each scanmode adductsum_part_files <- list.files("./", pattern = scanmode) outlist.tot <- NULL @@ -17,6 +17,6 @@ for (scanmode in scanmodes) { } # save output file - save(outlist.tot, file=paste0("AdductSums_", scanmode, ".RData")) + save(outlist.tot, file = paste0("AdductSums_", scanmode, ".RData")) } diff --git a/DIMS/CollectSumAdducts.nf b/DIMS/CollectSumAdducts.nf index da7ab0c..2bd43ff 100644 --- a/DIMS/CollectSumAdducts.nf +++ b/DIMS/CollectSumAdducts.nf @@ -5,7 +5,7 @@ process CollectSumAdducts { shell = ['/bin/bash', '-euo', 'pipefail'] input: - path(collect_file) // input files need to be linked, but called within R script + path(collect_files) output: path('AdductSums_*.RData') diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R index 1c1540c..f7166d0 100755 --- a/DIMS/FillMissing.R +++ b/DIMS/FillMissing.R @@ -4,37 +4,35 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -# define parameters peakgrouplist_file <- cmd_args[1] -scripts_dir <- cmd_args[2] -thresh <- as.numeric(cmd_args[3]) -resol <- as.numeric(cmd_args[4]) -ppm <- as.numeric(cmd_args[5]) +scripts_dir <- cmd_args[2] +thresh <- as.numeric(cmd_args[3]) +resol <- as.numeric(cmd_args[4]) +ppm <- as.numeric(cmd_args[5]) outdir <- "./" +# load in function scripts +source(paste0(scripts_dir, "replaceZeros.R")) +source(paste0(scripts_dir, "generateGaussian.R")) +source(paste0(scripts_dir, "getFwhm.R")) +source(paste0(scripts_dir, "getSD.R")) +source(paste0(scripts_dir, "getArea.R")) +source(paste0(scripts_dir, "optimizeGauss.R")) +source(paste0(scripts_dir, "ident.hires.noise.HPC.R")) +source(paste0(scripts_dir, "elementInfo.R")) +source(paste0(scripts_dir, "globalAssignments.HPC.R")) + +# determine scan mode if (grepl("_pos", peakgrouplist_file)) { scanmode <- "positive" } else if (grepl("_neg", peakgrouplist_file)) { scanmode <- "negative" } -# load in function scripts -source(paste0(scripts_dir, "AddOnFunctions/replaceZeros.R")) -source(paste0(scripts_dir, "AddOnFunctions/generateGaussian.R")) -source(paste0(scripts_dir, "AddOnFunctions/getFwhm.R")) -source(paste0(scripts_dir, "AddOnFunctions/getSD.R")) -source(paste0(scripts_dir, "AddOnFunctions/getArea.R")) -source(paste0(scripts_dir, "AddOnFunctions/optimizeGauss.R")) -source(paste0(scripts_dir, "AddOnFunctions/ident.hires.noise.HPC.R")) -source(paste0(scripts_dir, "AddOnFunctions/elementInfo.R")) -source(paste0(scripts_dir, "AddOnFunctions/globalAssignments.HPC.R")) - # get replication pattern for sample names pattern_file <- paste0(scanmode, "_repl_pattern.RData") repl_pattern <- get(load(pattern_file)) -print(head(repl_pattern)) - # load peak group list and determine output file name outpgrlist_identified <- get(load(peakgrouplist_file)) @@ -44,4 +42,4 @@ outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) # save output -save(peakgrouplist_filled, file = paste0("./", outputfile_name)) +save(peakgrouplist_filled, file = outputfile_name) diff --git a/DIMS/FillMissing.nf b/DIMS/FillMissing.nf index 4f71b77..b0d1850 100644 --- a/DIMS/FillMissing.nf +++ b/DIMS/FillMissing.nf @@ -1,18 +1,18 @@ process FillMissing { - tag "DIMS FillMissing ${GroupedList_file}" + tag "DIMS FillMissing ${peakgrouplist_file}" label 'FillMissing' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] input: - path(GroupedList_file) - each path(replication_pattern) // input files need to be linked, but called within R script + path(peakgrouplist_file) + each path(replication_pattern) output: path('*_filled.RData') script: """ - Rscript ${baseDir}/CustomModules/DIMS/FillMissing.R $GroupedList_file $params.scripts_dir $params.thresh $params.resolution $params.ppm + Rscript ${baseDir}/CustomModules/DIMS/FillMissing.R $peakgrouplist_file $params.scripts_dir $params.thresh $params.resolution $params.ppm """ } diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index 0de2426..3e38f1c 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -1,6 +1,5 @@ -#### GenerateBreaks.R #### -## adapted from 1-generateBreaksFwhm.HPC.R ## #!/usr/bin/Rscript +## adapted from 1-generateBreaksFwhm.HPC.R ## # load required package suppressPackageStartupMessages(library("xcms")) @@ -9,16 +8,16 @@ suppressPackageStartupMessages(library("xcms")) cmd_args <- commandArgs(trailingOnly = TRUE) filepath <- cmd_args[1] -outdir <- cmd_args[2] -trim <- as.numeric(cmd_args[3]) -resol <- as.numeric(cmd_args[4]) +outdir <- cmd_args[2] +trim <- as.numeric(cmd_args[3]) +resol <- as.numeric(cmd_args[4]) # initialize -trim_left <- NULL -trim_right <- NULL -breaks_fwhm <- NULL -breaks_fwhm_avg <- NULL -bins <- NULL +trim_left <- NULL +trim_right <- NULL +breaks_fwhm <- NULL +breaks_fwhm_avg <- NULL +bins <- NULL # read in mzML file raw_data <- suppressMessages(xcmsRaw(filepath)) @@ -33,15 +32,15 @@ high_mz <- raw_data@mzrange[2] # determine number of segments (bins) nr_segments <- 2 * (high_mz - low_mz) -segment <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1) +segment <- seq(from = low_mz, to = high_mz, length.out = nr_segments + 1) # determine start and end of each bin. for (i in 1:nr_segments) { - start_segment <- segment[i] - end_segment <- segment[i+1] - resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200))) - fwhm_segment <- start_segment / resol_mz - breaks_fwhm <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)) + start_segment <- segment[i] + end_segment <- segment[i+1] + resol_mz <- resol * (1 / sqrt(2) ^ (log2(start_segment / 200))) + fwhm_segment <- start_segment / resol_mz + breaks_fwhm <- c(breaks_fwhm, seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment)) # average the m/z instead of start value range <- seq(from = (start_segment + fwhm_segment), to = end_segment, by = 0.2 * fwhm_segment) delta_mz <- range[2] - range[1] @@ -49,5 +48,5 @@ for (i in 1:nr_segments) { } # generate output file -save(breaks_fwhm, breaks_fwhm_avg, trim_left, trim_right, file = "./breaks.fwhm.RData") -save(high_mz, file = "./highest_mz.RData") +save(breaks_fwhm, breaks_fwhm_avg, trim_left, trim_right, file = "breaks.fwhm.RData") +save(high_mz, file = "highest_mz.RData") diff --git a/DIMS/GenerateBreaks.nf b/DIMS/GenerateBreaks.nf index 4e855f2..af617a8 100644 --- a/DIMS/GenerateBreaks.nf +++ b/DIMS/GenerateBreaks.nf @@ -5,7 +5,7 @@ process GenerateBreaks { shell = ['/bin/bash', '-euo', 'pipefail'] input: - tuple val(file_id), path(mzML_file) + tuple(val(file_id), path(mzML_file)) output: diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index 3e55f7e..420b2e1 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -1,15 +1,6 @@ #!/usr/bin/Rscript ## adapted from 13-excelExport.R -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) - -init_filepath <- cmd_args[1] -project <- cmd_args[2] -dims_matrix <- cmd_args[3] -hmdb <- cmd_args[4] -z_score <- as.numeric(cmd_args[5]) - # load required packages library("ggplot2") library("reshape2") @@ -18,55 +9,65 @@ library("loder") suppressMessages(library("dplyr")) suppressMessages(library("stringr")) +# define parameters +cmd_args <- commandArgs(trailingOnly = TRUE) + +init_file <- cmd_args[1] +project <- cmd_args[2] +dims_matrix <- cmd_args[3] +hmdb_file <- cmd_args[4] +z_score <- as.numeric(cmd_args[5]) + round_df <- function(df, digits) { - #' function for rounding numbers to x digits for numeric values + #' Round numbers to a set number of digits for numeric values #' - #' @param df dataframe - #' @param digits integer number of digits to round off to + #' @param df: Dataframe containing numeric values + #' @param digits: Number of digits to round off to (integer) #' - #' @return df + #' @return df: Dataframe with rounded numbers numeric_columns <- sapply(df, mode) == "numeric" df[numeric_columns] <- round(df[numeric_columns], digits) return(df) } robust_scaler <- function(control_intensities, control_col_ids, perc = 5) { - #' calculate robust scaler: Z-score based on controls without outliers + #' Calculate robust scaler: Z-score based on controls without outliers #' - #' @param control_intensities matrix with intensities for control samples - #' @param control_col_ids vector with column names for control samples - #' @param perc float percentage of outliers which will be removed from controls + #' @param control_intensities: Matrix with intensities for control samples + #' @param control_col_ids: Vector with column names for control samples + #' @param perc: Percentage of outliers which will be removed from controls (float) #' - #' @return trimmed_control_intensities + #' @return trimmed_control_intensities: Intensities trimmed for outliers nr_toremove <- ceiling(length(control_col_ids) * perc / 100) sorted_control_intensities <- sort(as.numeric(control_intensities)) - trimmed_control_intensities <- sorted_control_intensities[(nr_toremove + 1) : (length(sorted_control_intensities) - nr_toremove)] + trimmed_control_intensities <- sorted_control_intensities[(nr_toremove + 1) : + (length(sorted_control_intensities) - nr_toremove)] return(trimmed_control_intensities) } -# Initialise and load data -plot <- TRUE +# Initialise +plot <- TRUE export <- TRUE control_label <- "C" -case_label <- "P" +case_label <- "P" imagesize_multiplier <- 2 -# setting outdir to export files to the project directory +# setting outdir to export files to the working directory outdir <- "./" # percentage of outliers to remove from calculation of robust scaler perc <- 5 # load information on samples -load(init_filepath) +load(init_file) # load the HMDB file with info on biological relevance of metabolites -load(hmdb) +load(hmdb_file) # get current date rundate <- Sys.Date() # create a directory for plots in project directory -plotdir <- paste0(outdir, "/plots/adducts") dir.create(paste0(outdir, "/plots"), showWarnings = FALSE) -dir.create(plotdir, showWarnings = FALSE) +plot_dir <- paste0(outdir, "/plots/adducts") +dir.create(plot_dir, showWarnings = FALSE) # set the number of digits for floats options(digits = 16) @@ -128,11 +129,14 @@ if (z_score == 1) { # Get columns with control intensities control_col_ids <- grep(control_label, colnames(outlist), fixed = TRUE) - control_columns <- outlist[, control_col_ids] + control_columns <- as.data.frame(outlist[, control_col_ids]) + colnames(control_columns) <- colnames(outlist)[control_col_ids] # Get columns with patient intensities patient_col_ids <- grep(case_label, colnames(outlist), fixed = TRUE) - patient_columns <- outlist[, patient_col_ids] + patient_columns <- as.data.frame(outlist[, patient_col_ids]) + colnames(patient_columns) <- colnames(outlist)[patient_col_ids] + intensity_col_ids <- c(control_col_ids, patient_col_ids) # if there are any intensities of 0 left, set them to NA for stats @@ -198,7 +202,13 @@ if (z_score == 1) { for (i in 1:length(patient_ids)) { id <- patient_ids[i] # combine all intensities that start with the same string for patients - patient_int <- as.numeric(as.vector(unlist(outlist[p, names(patient_columns[1, ])[startsWith(names(patient_columns[1, ]), paste0(id, "."))]]))) + # exception: if there is only one patient id, skip this step; nothing to combine + if (ncol(patient_columns) > 1) { + patient_int <- as.numeric(as.vector(unlist(outlist[p, names(patient_columns[1, ]) + [startsWith(names(patient_columns[1, ]), paste0(id, "."))]]))) + } else { + patient_int <- as.numeric(unlist(as.vector(patient_columns))) + } intensities[[i + 1]] <- patient_int } intensities <- setNames(intensities, labels) @@ -207,7 +217,7 @@ if (z_score == 1) { plot.new() if (export) { - png(filename = paste0(plotdir, "/", hmdb_name, "_box.png"), + png(filename = paste0(plot_dir, "/", hmdb_name, "_box.png"), width = plot_width, height = 280) } @@ -220,7 +230,7 @@ if (z_score == 1) { main = hmdb_name) dev.off() - file_png <- paste0(plotdir, "/", hmdb_name, "_box.png") + file_png <- paste0(plot_dir, "/", hmdb_name, "_box.png") if (is.null(temp_png)) { temp_png <- readPng(file_png) img_dim <- dim(temp_png)[c(1, 2)] @@ -248,23 +258,25 @@ if (z_score == 1) { setRowHeights(wb, filelist, rows = c(1:nrow(outlist)), heights = 18) setColWidths(wb, filelist, cols = c(1:ncol(outlist)), widths = 20) } + # write Excel file writeData(wb, sheet = 1, outlist, startCol = 1) xlsx_name <- paste0(outdir, "/", project, ".xlsx") saveWorkbook(wb, xlsx_name, overwrite = TRUE) -cat(xlsx_name) rm(wb) -#### INTERNE STANDAARDEN #### -IS <- outlist[grep("Internal standard", outlist[, "relevance"], fixed = TRUE), ] -IS_codes <- rownames(IS) -cat(IS_codes, "\n") +#### INTERNAL STANDARDS #### +is_list <- outlist[grep("Internal standard", outlist[, "relevance"], fixed = TRUE), ] +is_codes <- rownames(is_list) -# if all data from one samplename (for example P195.1) is filtered out in 3-averageTechReplicates because of too little data (threshold parameter) the init.RData (repl_pattern) will contain more sample_names then the peak data (IS), -# thus this data needs to be removed first, before the retrieval of the summed adducts. Write sample_names to a log file, to let user know that this sample_name contained no data. -sample_names_nodata <- setdiff(names(repl_pattern), names(IS)) +# if all data from one samplename (for example P195.1) is filtered out in 3-averageTechReplicates +# because of too little data (threshold parameter)i, the init.RData (repl_pattern) will contain more sample_names +# than the peak data (IS), so this data needs to be removed first, before the retrieval of the summed adducts. +# Write sample_names to a log file, to let user know that this sample_name contained no data. +sample_names_nodata <- setdiff(names(repl_pattern), names(is_list)) if (!is.null(sample_names_nodata)) { - write.table(sample_names_nodata, file = paste(outdir, "sample_names_nodata.txt", sep = "/"), row.names = FALSE, col.names = FALSE, quote = FALSE) + write.table(sample_names_nodata, file = paste(outdir, "sample_names_nodata.txt", sep = "/"), + row.names = FALSE, col.names = FALSE, quote = FALSE) cat(sample_names_nodata, "\n") for (sample_name in sample_names_nodata) { repl_pattern[[sample_name]] <- NULL @@ -272,324 +284,327 @@ if (!is.null(sample_names_nodata)) { } # Retrieve IS summed adducts -IS_summed <- IS[c(names(repl_pattern), "HMDB_code")] -IS_summed$HMDB.name <- IS$name -IS_summed <- reshape2::melt(IS_summed, id.vars = c("HMDB_code", "HMDB.name")) -colnames(IS_summed) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity") -IS_summed$Intensity <- as.numeric(IS_summed$Intensity) -IS_summed$Matrix <- dims_matrix -IS_summed$Rundate <- rundate -IS_summed$Project <- project -IS_summed$Intensity <- as.numeric(as.character(IS_summed$Intensity)) +is_summed <- is_list[c(names(repl_pattern), "HMDB_code")] +is_summed$HMDB.name <- is_list$name +is_summed <- reshape2::melt(is_summed, id.vars = c("HMDB_code", "HMDB.name")) +colnames(is_summed) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity") +is_summed$Intensity <- as.numeric(is_summed$Intensity) +is_summed$Matrix <- dims_matrix +is_summed$Rundate <- rundate +is_summed$Project <- project +is_summed$Intensity <- as.numeric(as.character(is_summed$Intensity)) # Retrieve IS positive mode -IS_pos <- as.data.frame(subset(outlist_pos_adducts_hmdb, rownames(outlist_pos_adducts_hmdb) %in% IS_codes)) -IS_pos$HMDB_name <- IS[match(row.names(IS_pos), IS$HMDB_code, nomatch = NA), "name"] -IS_pos$HMDB.code <- row.names(IS_pos) -IS_pos <- reshape2::melt(IS_pos, id.vars = c("HMDB.code", "HMDB_name")) -colnames(IS_pos) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity") -IS_pos$Matrix <- dims_matrix -IS_pos$Rundate <- rundate -IS_pos$Project <- project -IS_pos$Intensity <- as.numeric(as.character(IS_pos$Intensity)) +is_pos <- as.data.frame(subset(outlist_pos_adducts_hmdb, rownames(outlist_pos_adducts_hmdb) %in% is_codes)) +is_pos$HMDB_name <- is_list[match(row.names(is_pos), is_list$HMDB_code, nomatch = NA), "name"] +is_pos$HMDB.code <- row.names(is_pos) +is_pos <- reshape2::melt(is_pos, id.vars = c("HMDB.code", "HMDB_name")) +colnames(is_pos) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity") +is_pos$Matrix <- dims_matrix +is_pos$Rundate <- rundate +is_pos$Project <- project +is_pos$Intensity <- as.numeric(as.character(is_pos$Intensity)) # Retrieve IS negative mode -IS_neg <- as.data.frame(subset(outlist_neg_adducts_hmdb, rownames(outlist_neg_adducts_hmdb) %in% IS_codes)) -IS_neg$HMDB_name <- IS[match(row.names(IS_neg), IS$HMDB_code, nomatch = NA), "name"] -IS_neg$HMDB.code <- row.names(IS_neg) -IS_neg <- reshape2::melt(IS_neg, id.vars = c("HMDB.code", "HMDB_name")) -colnames(IS_neg) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity") -IS_neg$Matrix <- dims_matrix -IS_neg$Rundate <- rundate -IS_neg$Project <- project -IS_neg$Intensity <- as.numeric(as.character(IS_neg$Intensity)) +is_neg <- as.data.frame(subset(outlist_neg_adducts_hmdb, rownames(outlist_neg_adducts_hmdb) %in% is_codes)) +is_neg$HMDB_name <- is_list[match(row.names(is_neg), is_list$HMDB_code, nomatch = NA), "name"] +is_neg$HMDB.code <- row.names(is_neg) +is_neg <- reshape2::melt(is_neg, id.vars = c("HMDB.code", "HMDB_name")) +colnames(is_neg) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity") +is_neg$Matrix <- dims_matrix +is_neg$Rundate <- rundate +is_neg$Project <- project +is_neg$Intensity <- as.numeric(as.character(is_neg$Intensity)) # Save results -save(IS_pos, IS_neg, IS_summed, file = paste0(outdir, "/", project, "_IS_results.RData")) +save(is_pos, is_neg, is_summed, file = paste0(outdir, "/", project, "_IS_results.RData")) # number of samples, for plotting length and width sample_count <- length(repl_pattern) # change the order of the x-axis summed plots to a natural sorted one -Sample_naturalorder <- unique(as.character(IS_summed$Sample)) -Sample_naturalorder <- str_sort(Sample_naturalorder, numeric = TRUE) -IS_summed$Sample_level <- factor(IS_summed$Sample, levels = c(Sample_naturalorder)) -IS_pos$Sample_level <- factor(IS_pos$Sample, levels = c(Sample_naturalorder)) -IS_neg$Sample_level <- factor(IS_neg$Sample, levels = c(Sample_naturalorder)) +sample_naturalorder <- unique(as.character(is_summed$Sample)) +sample_naturalorder <- str_sort(sample_naturalorder, numeric = TRUE) +is_summed$Sample_level <- factor(is_summed$Sample, levels = c(sample_naturalorder)) +is_pos$Sample_level <- factor(is_pos$Sample, levels = c(sample_naturalorder)) +is_neg$Sample_level <- factor(is_neg$Sample, levels = c(sample_naturalorder)) ## bar plots with all IS - -# function for ggplot theme # theme for all IS bar plots -theme_IS_bar <- function(myPlot) { - myPlot + +theme_is_bar <- function(my_plot) { + my_plot + scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) + - theme( - legend.position = "none", - axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6), - axis.text.y = element_text(size = 6) + theme(legend.position = "none", + axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6), + axis.text.y = element_text(size = 6) ) } # ggplot functions -IS_neg_bar_plot <- ggplot(IS_neg, aes(Sample_level, Intensity)) + +is_neg_bar_plot <- ggplot(is_neg, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Neg)") + geom_bar(aes(fill = HMDB.name), stat = "identity") + labs(x = "", y = "Intensity") + facet_wrap(~HMDB.name, scales = "free_y") -IS_pos_bar_plot <- ggplot(IS_pos, aes(Sample_level, Intensity)) + +is_pos_bar_plot <- ggplot(is_pos, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Pos)") + geom_bar(aes(fill = HMDB.name), stat = "identity") + labs(x = "", y = "Intensity") + facet_wrap(~HMDB.name, scales = "free_y") -IS_sum_bar_plot <- ggplot(IS_summed, aes(Sample_level, Intensity)) + +is_sum_bar_plot <- ggplot(is_summed, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Summed)") + geom_bar(aes(fill = HMDB.name), stat = "identity") + labs(x = "", y = "Intensity") + facet_wrap(~HMDB.name, scales = "free_y") # add theme to ggplot functions -IS_neg_bar_plot <- theme_IS_bar(IS_neg_bar_plot) -IS_pos_bar_plot <- theme_IS_bar(IS_pos_bar_plot) -IS_sum_bar_plot <- theme_IS_bar(IS_sum_bar_plot) +is_neg_bar_plot <- theme_is_bar(is_neg_bar_plot) +is_pos_bar_plot <- theme_is_bar(is_pos_bar_plot) +is_sum_bar_plot <- theme_is_bar(is_sum_bar_plot) # save plots to disk plot_width <- 9 + 0.35 * sample_count -ggsave(paste0(outdir, "/plots/IS_bar_all_neg.png"), plot = IS_neg_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in") -ggsave(paste0(outdir, "/plots/IS_bar_all_pos.png"), plot = IS_pos_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in") -ggsave(paste0(outdir, "/plots/IS_bar_all_sum.png"), plot = IS_sum_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_bar_all_neg.png"), + plot = is_neg_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_bar_all_pos.png"), + plot = is_pos_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_bar_all_sum.png"), + plot = is_sum_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in") ## Line plots with all IS - # function for ggplot theme # add smaller legend in the "all IS line plots", otherwise out-of-range when more than 13 IS lines -theme_IS_line <- function(myPlot) { - myPlot + - guides( - shape = guide_legend(override.aes = list(size = 0.5)), - color = guide_legend(override.aes = list(size = 0.5)) +theme_is_line <- function(my_plot) { + my_plot + + guides(shape = guide_legend(override.aes = list(size = 0.5)), + color = guide_legend(override.aes = list(size = 0.5)) ) + - theme( - legend.title = element_text(size = 8), - legend.text = element_text(size = 6), - legend.key.size = unit(0.7, "line"), - axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8) + theme(legend.title = element_text(size = 8), + legend.text = element_text(size = 6), + legend.key.size = unit(0.7, "line"), + axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 8) ) } # ggplot functions -IS_neg_line_plot <- ggplot(IS_neg, aes(Sample_level, Intensity)) + +is_neg_line_plot <- ggplot(is_neg, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Neg)") + geom_point(aes(col = HMDB.name)) + geom_line(aes(col = HMDB.name, group = HMDB.name)) + labs(x = "", y = "Intensity") -IS_pos_line_plot <- ggplot(IS_pos, aes(Sample_level, Intensity)) + +is_pos_line_plot <- ggplot(is_pos, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Pos)") + geom_point(aes(col = HMDB.name)) + geom_line(aes(col = HMDB.name, group = HMDB.name)) + labs(x = "", y = "Intensity") -IS_sum_line_plot <- ggplot(IS_summed, aes(Sample_level, Intensity)) + +is_sum_line_plot <- ggplot(is_summed, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Sum)") + geom_point(aes(col = HMDB.name)) + geom_line(aes(col = HMDB.name, group = HMDB.name)) + labs(x = "", y = "Intensity") # add theme to ggplot functions -IS_sum_line_plot <- theme_IS_line(IS_sum_line_plot) -IS_neg_line_plot <- theme_IS_line(IS_neg_line_plot) -IS_pos_line_plot <- theme_IS_line(IS_pos_line_plot) +is_sum_line_plot <- theme_is_line(is_sum_line_plot) +is_neg_line_plot <- theme_is_line(is_neg_line_plot) +is_pos_line_plot <- theme_is_line(is_pos_line_plot) # save plots to disk plot_width <- 8 + 0.2 * sample_count -ggsave(paste0(outdir, "/plots/IS_line_all_neg.png"), plot = IS_neg_line_plot, height = plot_width / 2.5, width = plot_width, units = "in") -ggsave(paste0(outdir, "/plots/IS_line_all_pos.png"), plot = IS_pos_line_plot, height = plot_width / 2.5, width = plot_width, units = "in") -ggsave(paste0(outdir, "/plots/IS_line_all_sum.png"), plot = IS_sum_line_plot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_line_all_neg.png"), + plot = is_neg_line_plot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_line_all_pos.png"), + plot = is_pos_line_plot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_line_all_sum.png"), + plot = is_sum_line_plot, height = plot_width / 2.5, width = plot_width, units = "in") ## bar plots with a selection of IS -IS_neg_selection <- c("2H2-Ornithine (IS)", "2H3-Glutamate (IS)", "2H2-Citrulline (IS)", "2H4_13C5-Arginine (IS)", "13C6-Tyrosine (IS)") -IS_pos_selection <- c("2H4-Alanine (IS)", "13C6-Phenylalanine (IS)", "2H4_13C5-Arginine (IS)", "2H3-Propionylcarnitine (IS)", "2H9-Isovalerylcarnitine (IS)") -IS_sum_selection <- c("2H8-Valine (IS)", "2H3-Leucine (IS)", "2H3-Glutamate (IS)", "2H4_13C5-Arginine (IS)", "13C6-Tyrosine (IS)") +is_neg_selection <- c("2H2-Ornithine (IS)", "2H3-Glutamate (IS)", "2H2-Citrulline (IS)", "2H4_13C5-Arginine (IS)", + "13C6-Tyrosine (IS)") +is_pos_selection <- c("2H4-Alanine (IS)", "13C6-Phenylalanine (IS)", "2H4_13C5-Arginine (IS)", "2H3-Propionylcarnitine (IS)", + "2H9-Isovalerylcarnitine (IS)") +is_sum_selection <- c("2H8-Valine (IS)", "2H3-Leucine (IS)", "2H3-Glutamate (IS)", "2H4_13C5-Arginine (IS)", + "13C6-Tyrosine (IS)") # add minimal intensity lines based on matrix (DBS or Plasma) and machine mode (neg, pos, sum) if (dims_matrix == "DBS") { - hline.data.neg <- + hline_data_neg <- data.frame( z = c(15000, 200000, 130000, 18000, 50000), - HMDB.name = IS_neg_selection + HMDB.name = is_neg_selection ) - hline.data.pos <- + hline_data_pos <- data.frame( z = c(150000, 3300000, 1750000, 150000, 270000), - HMDB.name = IS_pos_selection + HMDB.name = is_pos_selection ) - hline.data.sum <- + hline_data_sum <- data.frame( z = c(1300000, 2500000, 500000, 1800000, 1400000), - HMDB.name = IS_sum_selection + HMDB.name = is_sum_selection ) } else if (dims_matrix == "Plasma") { - hline.data.neg <- + hline_data_neg <- data.frame( z = c(6500, 100000, 75000, 7500, 25000), - HMDB.name = IS_neg_selection + HMDB.name = is_neg_selection ) - hline.data.pos <- + hline_data_pos <- data.frame( z = c(85000, 1000000, 425000, 70000, 180000), - HMDB.name = IS_pos_selection + HMDB.name = is_pos_selection ) - hline.data.sum <- + hline_data_sum <- data.frame( z = c(700000, 1250000, 150000, 425000, 300000), - HMDB.name = IS_sum_selection + HMDB.name = is_sum_selection ) } -# function for ggplot theme -# see bar plots with all IS - # ggplot functions -IS_neg_selection_barplot <- ggplot(subset(IS_neg, HMDB.name %in% IS_neg_selection), aes(Sample_level, Intensity)) + +is_neg_selection_barplot <- ggplot(subset(is_neg, HMDB.name %in% is_neg_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Neg)") + geom_bar(aes(fill = HMDB.name), stat = "identity") + labs(x = "", y = "Intensity") + facet_wrap(~HMDB.name, scales = "free", ncol = 2) + - if (exists("hline.data.neg")) { - geom_hline(aes(yintercept = z), subset(hline.data.neg, HMDB.name %in% IS_neg$HMDB.name)) - } # subset, if some IS have no data, no empty plots will be generated with a line) + if (exists("hline_data_neg")) { + geom_hline(aes(yintercept = z), subset(hline_data_neg, HMDB.name %in% is_neg$HMDB.name)) + } -IS_pos_selection_barplot <- ggplot(subset(IS_pos, HMDB.name %in% IS_pos_selection), aes(Sample_level, Intensity)) + +is_pos_selection_barplot <- ggplot(subset(is_pos, HMDB.name %in% is_pos_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Pos)") + geom_bar(aes(fill = HMDB.name), stat = "identity") + labs(x = "", y = "Intensity") + facet_wrap(~HMDB.name, scales = "free", ncol = 2) + - if (exists("hline.data.pos")) { - geom_hline(aes(yintercept = z), subset(hline.data.pos, HMDB.name %in% IS_pos$HMDB.name)) + if (exists("hline_data_pos")) { + geom_hline(aes(yintercept = z), subset(hline_data_pos, HMDB.name %in% is_pos$HMDB.name)) } -IS_sum_selection_barplot <- ggplot(subset(IS_summed, HMDB.name %in% IS_sum_selection), aes(Sample_level, Intensity)) + +is_sum_selection_barplot <- ggplot(subset(is_summed, HMDB.name %in% is_sum_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Sum)") + geom_bar(aes(fill = HMDB.name), stat = "identity") + labs(x = "", y = "Intensity") + facet_wrap(~HMDB.name, scales = "free", ncol = 2) + - if (exists("hline.data.sum")) { - geom_hline(aes(yintercept = z), subset(hline.data.sum, HMDB.name %in% IS_summed$HMDB.name)) + if (exists("hline_data_sum")) { + geom_hline(aes(yintercept = z), subset(hline_data_sum, HMDB.name %in% is_summed$HMDB.name)) } # add theme to ggplot functions -IS_neg_selection_barplot <- theme_IS_bar(IS_neg_selection_barplot) -IS_pos_selection_barplot <- theme_IS_bar(IS_pos_selection_barplot) -IS_sum_selection_barplot <- theme_IS_bar(IS_sum_selection_barplot) +is_neg_selection_barplot <- theme_is_bar(is_neg_selection_barplot) +is_pos_selection_barplot <- theme_is_bar(is_pos_selection_barplot) +is_sum_selection_barplot <- theme_is_bar(is_sum_selection_barplot) # save plots to disk plot_width <- 9 + 0.35 * sample_count -ggsave(paste0(outdir, "/plots/IS_bar_select_neg.png"), plot = IS_neg_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in") -ggsave(paste0(outdir, "/plots/IS_bar_select_pos.png"), plot = IS_pos_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in") -ggsave(paste0(outdir, "/plots/IS_bar_select_sum.png"), plot = IS_sum_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_bar_select_neg.png"), + plot = is_neg_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_bar_select_pos.png"), + plot = is_pos_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_bar_select_sum.png"), + plot = is_sum_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in") ## line plots with a selection of IS - -# function for ggplot theme -# see line plots with all IS - # ggplot functions -IS_neg_selection_lineplot <- ggplot(subset(IS_neg, HMDB.name %in% IS_neg_selection), aes(Sample_level, Intensity)) + +is_neg_selection_lineplot <- ggplot(subset(is_neg, HMDB.name %in% is_neg_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Neg)") + geom_point(aes(col = HMDB.name)) + geom_line(aes(col = HMDB.name, group = HMDB.name)) + labs(x = "", y = "Intensity") -IS_pos_selection_lineplot <- ggplot(subset(IS_pos, HMDB.name %in% IS_pos_selection), aes(Sample_level, Intensity)) + +is_pos_selection_lineplot <- ggplot(subset(is_pos, HMDB.name %in% is_pos_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Pos)") + geom_point(aes(col = HMDB.name)) + geom_line(aes(col = HMDB.name, group = HMDB.name)) + labs(x = "", y = "Intensity") -IS_sum_selection_lineplot <- ggplot(subset(IS_summed, HMDB.name %in% IS_sum_selection), aes(Sample_level, Intensity)) + +is_sum_selection_lineplot <- ggplot(subset(is_summed, HMDB.name %in% is_sum_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Sum)") + geom_point(aes(col = HMDB.name)) + geom_line(aes(col = HMDB.name, group = HMDB.name)) + labs(x = "", y = "Intensity") # add theme to ggplot functions -IS_neg_selection_lineplot <- theme_IS_line(IS_neg_selection_lineplot) -IS_pos_selection_lineplot <- theme_IS_line(IS_pos_selection_lineplot) -IS_sum_selection_lineplot <- theme_IS_line(IS_sum_selection_lineplot) +is_neg_selection_lineplot <- theme_is_line(is_neg_selection_lineplot) +is_pos_selection_lineplot <- theme_is_line(is_pos_selection_lineplot) +is_sum_selection_lineplot <- theme_is_line(is_sum_selection_lineplot) # save plots to disk plot_width <- 8 + 0.2 * sample_count -ggsave(paste0(outdir, "/plots/IS_line_select_neg.png"), plot = IS_neg_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in") -ggsave(paste0(outdir, "/plots/IS_line_select_pos.png"), plot = IS_pos_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in") -ggsave(paste0(outdir, "/plots/IS_line_select_sum.png"), plot = IS_sum_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_line_select_neg.png"), + plot = is_neg_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_line_select_pos.png"), + plot = is_pos_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in") +ggsave(paste0(outdir, "/plots/IS_line_select_sum.png"), + plot = is_sum_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in") ### POSITIVE CONTROLS CHECK -# these positive controls need to be in the samplesheet, in order to make the Pos_Contr.RData file +# these positive controls need to be in the samplesheet, in order to make the positive_control.RData file # Positive control samples all have the format P1002.x, P1003.x and P1005.x (where x is a number) column_list <- colnames(outlist) patterns <- c("^(P1002\\.)[[:digit:]]+_", "^(P1003\\.)[[:digit:]]+_", "^(P1005\\.)[[:digit:]]+_") positive_controls_index <- grepl(pattern = paste(patterns, collapse = "|"), column_list) -positivecontrol_list <- column_list[positive_controls_index] +positive_control_list <- column_list[positive_controls_index] if (z_score == 1) { # find if one or more positive control samples are missing pos_contr_warning <- c() - # any() grep because you get a vector of FALSE's and TRUE's. only one grep match is needed for each positive control - if (any(grep("^(P1002\\.)[[:digit:]]+_", positivecontrol_list)) && - any(grep("^(P1003\\.)[[:digit:]]+_", positivecontrol_list)) && - any(grep("^(P1005\\.)[[:digit:]]+_", positivecontrol_list))) { + if (any(grep("^(P1002\\.)[[:digit:]]+_", positive_control_list)) && + any(grep("^(P1003\\.)[[:digit:]]+_", positive_control_list)) && + any(grep("^(P1005\\.)[[:digit:]]+_", positive_control_list))) { cat("All three positive controls are present") } else { - pos_contr_warning <- paste0(c("positive controls list is not complete. Only ", positivecontrol_list, " is/are present"), collapse = " ") + pos_contr_warning <- paste0(c("positive controls list is not complete. Only ", + positive_control_list, " is/are present"), collapse = " ") } # you need all positive control samples, thus starting the script only if all are available if (length(pos_contr_warning) == 0) { - ### POSITIVE CONTROLS # make positive control excel with specific HMDB_codes in combination with specific control samples - PA_sample_name <- positivecontrol_list[grepl("P1002", positivecontrol_list)] # P1001.x_Zscore - PKU_sample_name <- positivecontrol_list[grepl("P1003", positivecontrol_list)] # P1003.x_Zscore - LPI_sample_name <- positivecontrol_list[grepl("P1005", positivecontrol_list)] # P1005.x_Zscore + pa_sample_name <- positive_control_list[grepl("P1002", positive_control_list)] + pku_sample_name <- positive_control_list[grepl("P1003", positive_control_list)] + lpi_sample_name <- positive_control_list[grepl("P1005", positive_control_list)] - PA_codes <- c("HMDB00824", "HMDB00783", "HMDB00123") - PKU_codes <- c("HMDB00159") - LPI_codes <- c("HMDB00904", "HMDB00641", "HMDB00182") + pa_codes <- c("HMDB00824", "HMDB00783", "HMDB00123") + pku_codes <- c("HMDB00159") + lpi_codes <- c("HMDB00904", "HMDB00641", "HMDB00182") - PA_data <- outlist[PA_codes, c("HMDB_code", "name", PA_sample_name)] - PA_data <- reshape2::melt(PA_data, id.vars = c("HMDB_code", "name")) - colnames(PA_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore") + pa_data <- outlist[pa_codes, c("HMDB_code", "name", pa_sample_name)] + pa_data <- reshape2::melt(pa_data, id.vars = c("HMDB_code", "name")) + colnames(pa_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore") - PKU_data <- outlist[PKU_codes, c("HMDB_code", "name", PKU_sample_name)] - PKU_data <- reshape2::melt(PKU_data, id.vars = c("HMDB_code", "name")) - colnames(PKU_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore") + pku_data <- outlist[pku_codes, c("HMDB_code", "name", pku_sample_name)] + pku_data <- reshape2::melt(pku_data, id.vars = c("HMDB_code", "name")) + colnames(pku_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore") - LPI_data <- outlist[LPI_codes, c("HMDB_code", "name", LPI_sample_name)] - LPI_data <- reshape2::melt(LPI_data, id.vars = c("HMDB_code", "name")) - colnames(LPI_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore") + lpi_data <- outlist[lpi_codes, c("HMDB_code", "name", lpi_sample_name)] + lpi_data <- reshape2::melt(lpi_data, id.vars = c("HMDB_code", "name")) + colnames(lpi_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore") - Pos_Contr <- rbind(PA_data, PKU_data, LPI_data) - Pos_Contr$Zscore <- as.numeric(Pos_Contr$Zscore) + positive_control <- rbind(pa_data, pku_data, lpi_data) + positive_control$Zscore <- as.numeric(positive_control$Zscore) # extra information added to excel for future reference. made in beginning of this script - Pos_Contr$Matrix <- dims_matrix - Pos_Contr$Rundate <- rundate - Pos_Contr$Project <- project + positive_control$Matrix <- dims_matrix + positive_control$Rundate <- rundate + positive_control$Project <- project # Save results - save(Pos_Contr, file = paste0(outdir, "/", project, "_Pos_Contr.RData")) - Pos_Contr$Zscore <- round_df(Pos_Contr$Zscore, 2) # asked by Lab to round the number to 2 digits - write.xlsx(Pos_Contr, file = paste0(outdir, "/", project, "_Pos_Contr.xlsx"), sheetName = "Sheet1", col.names = TRUE, row.names = TRUE, append = FALSE) + save(positive_control, file = paste0(outdir, "/", project, "_positive_control.RData")) + # round the Z-scores to 2 digits + positive_control$Zscore <- round_df(positive_control$Zscore, 2) + write.xlsx(positive_control, file = paste0(outdir, "/", project, "_positive_control.xlsx"), + sheetName = "Sheet1", col.names = TRUE, row.names = TRUE, append = FALSE) } else { - write.table(pos_contr_warning, file = paste(outdir, "positive_controls_warning.txt", sep = "/"), row.names = FALSE, col.names = FALSE, quote = FALSE) + write.table(pos_contr_warning, file = paste(outdir, "positive_controls_warning.txt", sep = "/"), + row.names = FALSE, col.names = FALSE, quote = FALSE) } } ### MISSING M/Z CHECK # check the outlist_identified_(negative/positive).RData files for missing m/z values and mention in the results mail -print("Nu in missing m/z check") # Load the outlist_identified files + remove the loaded files load(paste0(outdir, "/outlist_identified_negative.RData")) outlist_ident_neg <- outlist_ident diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf index f73b881..32ec928 100644 --- a/DIMS/GenerateExcel.nf +++ b/DIMS/GenerateExcel.nf @@ -5,18 +5,20 @@ process GenerateExcel { shell = ['/bin/bash', '-euo', 'pipefail'] input: - path(collect_file) // input files need to be linked, but called within R script - path(identified_file) // input files need to be lined, but called within R script - path(init_filepath) + path(collect_files) + path(identified_files) + path(init_file) val(analysis_id) path(relevance_file) output: path('AdductSums_*.txt') + path('*IS_results.RData') path('*.xlsx'), emit: excel_file + path('plots'), emit: plot_files script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateExcel.R $init_filepath $analysis_id $params.matrix $relevance_file $params.zscore + Rscript ${baseDir}/CustomModules/DIMS/GenerateExcel.R $init_file $analysis_id $params.matrix $relevance_file $params.zscore """ } diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R index 896918f..097d030 100644 --- a/DIMS/GenerateViolinPlots.R +++ b/DIMS/GenerateViolinPlots.R @@ -7,35 +7,70 @@ # corresponding Z-scores. # 2. All files from github: https://github.com/UMCUGenetics/DIMS +#!/usr/bin/Rscript ## adapted from 15-dIEM_violin.R -library(dplyr) # tidytable is for other_isobaric.R (left_join) -library(reshape2) # used in prepare_data.R -library(openxlsx) # for opening Excel file -library(ggplot2) # for plotting -library(gridExtra) # for table top highest/lowest +# load packages +library(dplyr) +library(reshape2) +library(openxlsx) +library(ggplot2) +library(gridExtra) -# define parameters - check after addition to run.sh +# define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -run_name <- cmd_args[1] +run_name <- cmd_args[1] scripts_dir <- cmd_args[2] -z_score <- as.numeric(cmd_args[3]) +z_score <- as.numeric(cmd_args[3]) +path_metabolite_groups <- cmd_args[4] +file_ratios_metabolites <- cmd_args[5] +file_expected_biomarkers_iem <- cmd_args[6] +file_explanation <- cmd_args[7] +file_isomers <- cmd_args[8] + +print("arguments") +print(run_name) +print(scripts_dir) +print(z_score) +print(path_metabolite_groups) +print(file_ratios_metabolites) +print(file_expected_biomarkers_iem) +print(file_explanation) +print(file_isomers) +print("end arguments") + +# path: output folder for dIEM and violin plots +output_dir <- "./" + +file.copy(file_isomers, output_dir) + +# folder for all metabolite lists (.txt) +# path_metabolite_groups <- "/hpc/dbg_mz/tools/db/metabolite_groups" +# file for ratios step 3 +# file_ratios_metabolites <- "/hpc/dbg_mz/tools/db/dIEM/Ratios_between_metabolites.csv" +# file for algorithm step 4 +# file_expected_biomarkers_iem <- "/hpc/dbg_mz/tools/db/dIEM/Expected_biomarkers_IEM.csv" +# explanation: file with text to be included in violin plots +# file_explanation <- "/hpc/dbg_mz/tools/Explanation_violin_plots.txt" + +# copy list of isomers to project folder. +# file.copy("/hpc/dbg_mz/tools/isomers.txt", output_dir) # load functions -source(paste0(scripts_dir, "AddOnFunctions/check_same_samplename.R")) -source(paste0(scripts_dir, "AddOnFunctions/prepare_data.R")) -source(paste0(scripts_dir, "AddOnFunctions/prepare_data_perpage.R")) -source(paste0(scripts_dir, "AddOnFunctions/prepare_toplist.R")) -source(paste0(scripts_dir, "AddOnFunctions/create_violin_plots.R")) -source(paste0(scripts_dir, "AddOnFunctions/prepare_alarmvalues.R")) +source(paste0(scripts_dir, "check_same_samplename.R")) +source(paste0(scripts_dir, "prepare_data.R")) +source(paste0(scripts_dir, "prepare_data_perpage.R")) +source(paste0(scripts_dir, "prepare_toplist.R")) +source(paste0(scripts_dir, "create_violin_plots.R")) +source(paste0(scripts_dir, "prepare_alarmvalues.R")) # number of diseases that score highest in algorithm to plot -top_nr_IEM <- 5 +top_nr_iem <- 5 # probability score cut-off for plotting the top diseases -threshold_IEM <- 5 +threshold_iem <- 5 # z-score cutoff of axis on the left for top diseases -ratios_cutoff <- -5 +ratios_cutoff <- -5 # number of violin plots per page in PDF nr_plots_perpage <- 20 @@ -46,31 +81,16 @@ if (z_score == 1) { # are the sample names headers on row 1 or row 2 in the DIMS excel? (default 1) header_row <- 1 # column name where the data starts (default B) -col_start <- "B" +col_start <- "B" zscore_cutoff <- 5 -xaxis_cutoff <- 20 - -# path to DIMS excel file -path_dims_file <- paste0("./", run_name, ".xlsx") - -# path: output folder for dIEM and violin plots -output_dir <- "./" - -# folder for all metabolite lists (.txt) -path_metabolite_groups <- "/hpc/dbg_mz/tools/db/metabolite_groups" -# file for ratios step 3 -file_ratios_metabolites <- "/hpc/dbg_mz/tools/db/dIEM/Ratios_between_metabolites.csv" -# file for algorithm step 4 -file_expected_biomarkers_IEM <- "/hpc/dbg_mz/tools/db/dIEM/Expected_biomarkers_IEM.csv" -# explanation: file with text to be included in violin plots -file_explanation <- "/hpc/dbg_mz/tools/Explanation_violin_plots.txt" - -# copy list of isomers to project folder. -file.copy("/hpc/dbg_mz/tools/isomers.txt", output_dir) +xaxis_cutoff <- 20 #### STEP 1: Preparation #### # in: run_name, path_dims_file, header_row ||| out: output_dir, DIMS +# path to DIMS excel file +path_dims_file <- paste0(run_name, ".xlsx") + # Load the excel file. dims_xls <- readWorkbook(xlsxFile = path_dims_file, sheet = 1, startRow = header_row) if (exists("dims_xls")) { @@ -98,11 +118,11 @@ cat(paste0("\n\n------------\n", nr_contr, " controls \n", nr_pat, " patients\n- # Move the columns HMDB_code and HMDB_name to the beginning. hmdb_info_cols <- c(which(colnames(dims_xls) == "HMDB_code"), which(colnames(dims_xls) == "HMDB_name")) -other_cols <- seq_along(1:ncol(dims_xls))[-hmdb_info_cols] -dims_xls_copy <- dims_xls[, c(hmdb_info_cols, other_cols)] +other_cols <- seq_along(1:ncol(dims_xls))[-hmdb_info_cols] +dims_xls_copy <- dims_xls[, c(hmdb_info_cols, other_cols)] # Remove the columns from 'name' to 'pathway' -from_col <- which(colnames(dims_xls_copy) == "name") -to_col <- which(colnames(dims_xls_copy) == "pathway") +from_col <- which(colnames(dims_xls_copy) == "name") +to_col <- which(colnames(dims_xls_copy) == "pathway") dims_xls_copy <- dims_xls_copy[, -c(from_col:to_col)] # in case the excel had an empty "plots" column, remove it if ("plots" %in% colnames(dims_xls_copy)) { @@ -146,18 +166,19 @@ if (ratios == 1) { ncol = ncol(dims_xls_copy), nrow = nrow(ratio_input) )), colnames(dims_xls_copy)) + ratio_list <- as.data.frame(ratio_list) # put HMDB info into first two columns of ratio_list ratio_list[, 1:2] <- ratio_input[, 1:2] # look for intensity columns (exclude Zscore columns) - control_cols <- grep("C", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) - patient_cols <- grep("P", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) + control_cols <- grep("C", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) + patient_cols <- grep("P", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) intensity_cols <- c(control_cols, patient_cols) # calculate each of the ratios of intensities for (ratio_index in 1:nrow(ratio_input)) { - ratio_numerator <- ratio_input[ratio_index, "HMDB_numerator"] - ratio_numerator <- strsplit(ratio_numerator, "plus")[[1]] + ratio_numerator <- ratio_input[ratio_index, "HMDB_numerator"] + ratio_numerator <- strsplit(ratio_numerator, "plus")[[1]] ratio_denominator <- ratio_input[ratio_index, "HMDB_denominator"] ratio_denominator <- strsplit(ratio_denominator, "plus")[[1]] # find these HMDB IDs in dataset. Could be a sum of multiple metabolites @@ -219,13 +240,13 @@ if (ratios == 1) { } #### STEP 4: Run the IEM algorithm ######### -# in: algorithm, file_expected_biomarkers_IEM, zscore_patients ||| out: prob_score (+file) +# in: algorithm, file_expected_biomarkers_iem, zscore_patients ||| out: prob_score (+file) # algorithm taken from DOI: 10.3390/ijms21030979 if (algorithm == 1) { # Load data - cat(paste0("\nloading expected file:\n -> ", file_expected_biomarkers_IEM, "\n")) - expected_biomarkers <- read.csv(file_expected_biomarkers_IEM, sep = ";", stringsAsFactors = FALSE) + cat(paste0("\nloading expected file:\n -> ", file_expected_biomarkers_iem, "\n")) + expected_biomarkers <- read.csv(file_expected_biomarkers_iem, sep = ";", stringsAsFactors = FALSE) # modify column names names(expected_biomarkers) <- gsub("HMDB.code", "HMDB_code", names(expected_biomarkers)) names(expected_biomarkers) <- gsub("Metabolite", "HMDB_name", names(expected_biomarkers)) @@ -241,7 +262,8 @@ if (algorithm == 1) { # Rank all positive zscores highest to lowest rank_patients[1:pos, patient_index] <- as.numeric(ordered(-rank_patients[1:pos, patient_index])) # Rank all negative zscores lowest to highest - rank_patients[(pos + 1):nrow(rank_patients), patient_index] <- as.numeric(ordered(rank_patients[(pos + 1):nrow(rank_patients), patient_index])) + rank_patients[(pos + 1):nrow(rank_patients), patient_index] <- as.numeric(ordered(rank_patients[(pos + 1): + nrow(rank_patients), patient_index])) } # Calculate metabolite score, using the dataframes with only values, and later add the cols without values (1&2). @@ -253,13 +275,16 @@ if (algorithm == 1) { select_info_cols <- 1:(min(select_zscore_cols) - 1) # set some zscores to zero select_incr_indisp <- which(expected_zscores$Change == "Increase" & expected_zscores$Dispensability == "Indispensable") - expected_zscores[select_incr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_incr_indisp, select_zscore_cols], function(x) ifelse (x <= 1.6, 0, x)) + expected_zscores[select_incr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_incr_indisp, + select_zscore_cols], function(x) ifelse (x <= 1.6, 0, x)) select_decr_indisp <- which(expected_zscores$Change == "Decrease" & expected_zscores$Dispensability == "Indispensable") - expected_zscores[select_decr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_decr_indisp, select_zscore_cols], function(x) ifelse (x >= -1.2, 0, x)) + expected_zscores[select_decr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_decr_indisp, + select_zscore_cols], function(x) ifelse (x >= -1.2, 0, x)) # calculate rank score: expected_ranks <- merge(x = expected_biomarkers, y = rank_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code")) - rank_scores <- expected_zscores[order(expected_zscores$HMDB_code), select_zscore_cols] / (expected_ranks[order(expected_ranks$HMDB_code), select_zscore_cols] * 0.9) + rank_scores <- expected_zscores[order(expected_zscores$HMDB_code), select_zscore_cols] / + (expected_ranks[order(expected_ranks$HMDB_code), select_zscore_cols] * 0.9) # combine disease info with rank scores expected_metabscore <- cbind(expected_ranks[order(expected_zscores$HMDB_code), select_info_cols], rank_scores) @@ -292,7 +317,8 @@ if (algorithm == 1) { # determine disease rank per patient disease_rank <- prob_score # rank diseases in decreasing order - disease_rank[2:ncol(disease_rank)] <- lapply(2:ncol(disease_rank), function(x) as.numeric(ordered(-disease_rank[1:nrow(disease_rank), x]))) + disease_rank[2:ncol(disease_rank)] <- lapply(2:ncol(disease_rank), function(x) + as.numeric(ordered(-disease_rank[1:nrow(disease_rank), x]))) # modify column names, Zscores have now been converted to probability scores colnames(prob_score) <- gsub("_Zscore", "_prob_score", colnames(prob_score)) colnames(disease_rank) <- gsub("_Zscore", "", colnames(disease_rank)) @@ -301,8 +327,9 @@ if (algorithm == 1) { wb <- createWorkbook() addWorksheet(wb, "Probability Scores") writeData(wb, "Probability Scores", prob_score) - conditionalFormatting(wb, "Probability Scores", cols = 2:ncol(prob_score), rows = 1:nrow(prob_score), type = "colourScale", style = c("white", "#FFFDA2", "red"), rule = c(1, 10, 100)) - saveWorkbook(wb, file = paste0(output_dir, "/algoritme_output_", run_name, ".xlsx"), overwrite = TRUE) + conditionalFormatting(wb, "Probability Scores", cols = 2:ncol(prob_score), rows = 1:nrow(prob_score), + type = "colourScale", style = c("white", "#FFFDA2", "red"), rule = c(1, 10, 100)) + saveWorkbook(wb, file = paste0(output_dir, "/dIEM_algoritme_output_", run_name, ".xlsx"), overwrite = TRUE) # check whether prob_score df exists and has expected dimensions. if (exists("expected_biomarkers") & (length(disease_rank) == length(prob_score))) { cat("\n### Step 4 # Running the IEM algorithm is done.\n\n") @@ -314,9 +341,10 @@ if (algorithm == 1) { } #### STEP 5: Make violin plots ##### -# in: algorithm / zscore_patients, violin, nr_contr, nr_pat, Data, path_textfiles, zscore_cutoff, xaxis_cutoff, top_diseases, top_metab, output_dir ||| out: pdf file +# in: algorithm / zscore_patients, violin, nr_contr, nr_pat, Data, path_textfiles, zscore_cutoff, xaxis_cutoff, +# top_diseases, top_metab, output_dir ||| out: pdf file -if (violin == 1) { # make violin plots +if (violin == 1) { # preparation zscore_patients_copy <- zscore_patients @@ -351,7 +379,8 @@ if (violin == 1) { # make violin plots cat("making plots in category:", metabolite_dir, "\n") # get a list of all metabolite files - metabolite_files <- list.files(path = paste(path_metabolite_groups, metabolite_dir, sep = "/"), pattern = "*.txt", full.names = FALSE, recursive = FALSE) + metabolite_files <- list.files(path = paste(path_metabolite_groups, metabolite_dir, sep = "/"), + pattern = "*.txt", full.names = FALSE, recursive = FALSE) # put all metabolites into one list metab_list_all <- list() metab_list_names <- c() @@ -359,7 +388,8 @@ if (violin == 1) { # make violin plots # open the text files and add each to a list of dataframes (metab_list_all) for (file_index in seq_along(metabolite_files)) { infile <- metabolite_files[file_index] - metab_list <- read.table(paste(path_metabolite_groups, metabolite_dir, infile, sep = "/"), sep = "\t", header = TRUE, quote = "") + metab_list <- read.table(paste(path_metabolite_groups, metabolite_dir, infile, sep = "/"), + sep = "\t", header = TRUE, quote = "") # put into list of all lists metab_list_all[[file_index]] <- metab_list metab_list_names <- c(metab_list_names, strsplit(infile, ".txt")[[1]][1]) @@ -392,8 +422,8 @@ if (violin == 1) { # make violin plots } # Second step: dIEM plots in separate directory - dIEM_plot_dir <- paste(output_dir, "dIEM_plots", sep = "/") - dir.create(dIEM_plot_dir) + diem_plot_dir <- paste(output_dir, "dIEM_plots", sep = "/") + dir.create(diem_plot_dir) # Select the metabolites that are associated with the top highest scoring IEM, for each patient # disease_rank is from step 4: the dIEM algorithm. The lower the value, the more likely. @@ -401,54 +431,54 @@ if (violin == 1) { # make violin plots pt_name <- patient_list[pt_nr] # get top diseases for this patient pt_colnr <- which(colnames(disease_rank) == pt_name) - pt_top_indices <- which(disease_rank[, pt_colnr] <= top_nr_IEM) - pt_IEMs <- disease_rank[pt_top_indices, "Disease"] - pt_top_IEMs <- pt_prob_score_top_IEMs <- c() - for (single_IEM in pt_IEMs) { + pt_top_indices <- which(disease_rank[, pt_colnr] <= top_nr_iem) + pt_iems <- disease_rank[pt_top_indices, "Disease"] + pt_top_iems <- pt_prob_score_top_iems <- c() + for (single_iem in pt_iems) { # get the probability score - prob_score_IEM <- prob_score[which(prob_score$Disease == single_IEM), pt_colnr] + prob_score_iem <- prob_score[which(prob_score$Disease == single_iem), pt_colnr] # use only diseases for which probability score is above threshold - if (prob_score_IEM >= threshold_IEM) { - pt_top_IEMs <- c(pt_top_IEMs, single_IEM) - pt_prob_score_top_IEMs <- c(pt_prob_score_top_IEMs, prob_score_IEM) + if (prob_score_iem >= threshold_iem) { + pt_top_iems <- c(pt_top_iems, single_iem) + pt_prob_score_top_iems <- c(pt_prob_score_top_iems, prob_score_iem) } } # prepare data for plotting dIEM violin plots - # If prob_score_top_IEM is an empty list, don't make a plot - if (length(pt_top_IEMs) > 0) { - # Sorting from high to low, both prob_score_top_IEMs and pt_top_IEMs. - pt_prob_score_order <- order(-pt_prob_score_top_IEMs) - pt_prob_score_top_IEMs <- round(pt_prob_score_top_IEMs, 1) - pt_prob_score_top_IEM_sorted <- pt_prob_score_top_IEMs[pt_prob_score_order] - pt_top_IEM_sorted <- pt_top_IEMs[pt_prob_score_order] - # getting metabolites for each top_IEM disease exactly like in metab_list_all - metab_IEM_all <- list() - metab_IEM_names <- c() - for (single_IEM_index in 1:length(pt_top_IEM_sorted)) { - single_IEM <- pt_top_IEM_sorted[single_IEM_index] - single_prob_score <- pt_prob_score_top_IEM_sorted[single_IEM_index] - select_rows <- which(expected_biomarkers_select$Disease == single_IEM) + # If prob_score_top_iem is an empty list, don't make a plot + if (length(pt_top_iems) > 0) { + # Sorting from high to low, both prob_score_top_iems and pt_top_iems. + pt_prob_score_order <- order(-pt_prob_score_top_iems) + pt_prob_score_top_iems <- round(pt_prob_score_top_iems, 1) + pt_prob_score_top_iem_sorted <- pt_prob_score_top_iems[pt_prob_score_order] + pt_top_iem_sorted <- pt_top_iems[pt_prob_score_order] + # getting metabolites for each top_iem disease exactly like in metab_list_all + metab_iem_all <- list() + metab_iem_names <- c() + for (single_iem_index in 1:length(pt_top_iem_sorted)) { + single_iem <- pt_top_iem_sorted[single_iem_index] + single_prob_score <- pt_prob_score_top_iem_sorted[single_iem_index] + select_rows <- which(expected_biomarkers_select$Disease == single_iem) metab_list <- expected_biomarkers_select[select_rows, ] - metab_IEM_names <- c(metab_IEM_names, paste0(single_IEM, ", probability score ", single_prob_score)) + metab_iem_names <- c(metab_iem_names, paste0(single_iem, ", probability score ", single_prob_score)) metab_list <- metab_list[, -1] - metab_IEM_all[[single_IEM_index]] <- metab_list + metab_iem_all[[single_iem_index]] <- metab_list } # put all metabolites into one list - names(metab_IEM_all) <- metab_IEM_names + names(metab_iem_all) <- metab_iem_names # get Zscore information from zscore_patients_copy, similar to normal violin plots - metab_IEM_sorted <- prepare_data(metab_IEM_all, zscore_patients_copy) - metab_IEM_controls <- prepare_data(metab_IEM_all, zscore_controls) + metab_iem_sorted <- prepare_data(metab_iem_all, zscore_patients_copy) + metab_iem_controls <- prepare_data(metab_iem_all, zscore_controls) # make sure every page has 20 metabolites - dIEM_metab_perpage <- prepare_data_perpage(metab_IEM_sorted, metab_IEM_controls, nr_plots_perpage, nr_pat) + diem_metab_perpage <- prepare_data_perpage(metab_iem_sorted, metab_iem_controls, nr_plots_perpage, nr_pat) # generate dIEM violin plots - create_violin_plots(dIEM_plot_dir, pt_name, dIEM_metab_perpage, top_metab_pt) + create_violin_plots(diem_plot_dir, pt_name, diem_metab_perpage, top_metab_pt) } else { - cat(paste0("\n\n**** This patient had no prob_scores higher than ", threshold_IEM, ". - Therefore, this pdf was not made:\t ", pt_name, "_IEM \n")) + cat(paste0("\n\n**** This patient had no prob_scores higher than ", threshold_iem, ". + Therefore, this pdf was not made:\t ", pt_name, "_iem \n")) } } diff --git a/DIMS/GenerateViolinPlots.nf b/DIMS/GenerateViolinPlots.nf index f002341..27bc20b 100644 --- a/DIMS/GenerateViolinPlots.nf +++ b/DIMS/GenerateViolinPlots.nf @@ -5,15 +5,22 @@ process GenerateViolinPlots { shell = ['/bin/bash', '-euo', 'pipefail'] input: - path(excel_file) // input files need to be linked, but called within R script + path(excel_file) val(analysis_id) output: - // path('*.pdf') // pdf files are generated, but in different directory - path('*.xlsx') + path('Diagnostics/*.pdf'), emit: diag_plot_files + path('Other/*.pdf'), emit: other_plot_files + path('dIEM/*.pdf'), emit: diem_plot_files + path('*.xlsx'), emit: excel_file script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateViolinPlots.R $analysis_id $params.scripts_dir $params.zscore + Rscript ${baseDir}/CustomModules/DIMS/GenerateViolinPlots.R $analysis_id $params.scripts_dir $params.zscore \ + $params.path_metabolite_groups \ + $params.file_ratios_metabolites \ + $params.file_expected_biomarkers_IEM \ + $params.file_explanation \ + $params.file_isomers """ } diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R index 7441959..4cb343f 100644 --- a/DIMS/HMDBparts.R +++ b/DIMS/HMDBparts.R @@ -4,18 +4,16 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -db_path <- cmd_args[1] -breaks_filepath <- cmd_args[2] -standard_run <- cmd_args[4] +db_file <- cmd_args[1] +breaks_file <- cmd_args[2] +standard_run <- cmd_args[4] -# Cut up entire HMDB into small parts based on the new binning/breaks -load(breaks_filepath) - -# In case of a standard run (m/z 69-606) use external HMDB parts +# load file with binning breaks +load(breaks_file) min_mz <- round(breaks_fwhm[1]) max_mz <- round(breaks_fwhm[length(breaks_fwhm)]) -# test if standard mz range is used +# In case of a standard run (m/z 69-606) use external HMDB parts if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < 610) { # skip generating HMDB parts hmdb_parts_path <- cmd_args[3] @@ -26,7 +24,7 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < } } else { # generate HMDB parts in case of non-standard mz range - load(db_path) + load(db_file) ppm <- as.numeric(cmd_args[5]) scanmodes <- c("positive", "negative") @@ -39,79 +37,74 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < HMDB_add_iso <- HMDB_add_iso.Pos } - # filter mass range meassured!!! + # filter mass range meassured HMDB_add_iso = HMDB_add_iso[which(HMDB_add_iso[ , column_label] >= breaks_fwhm[1] & HMDB_add_iso[ , column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] # sort on mass outlist <- HMDB_add_iso[order(as.numeric(HMDB_add_iso[ , column_label])),] + nr_rows <- dim(outlist)[1] - nr_rows <- dim(outlist)[1] # maximum number of rows per file - sub <- 20000 - end <- 0 - last_line <- sub - check <- 0 + sub <- 20000 + end <- 0 + last_line <- sub + check <- 0 outlist_part <- NULL - + # create parts and save to file if (nr_rows < sub) { outlist_part <- outlist - save(outlist_part, file = paste0("./", scanmode, "_hmdb.1.RData")) - } else { - - if (nr_rows >= sub & (floor(nr_rows / sub) - 1) >= 2) { - for (i in 2:floor(nr_rows / sub) - 1) { - start <- -(sub - 1) + i * sub - end <- i * sub - - if (i > 1){ - outlist_i = outlist[c(start:end),] - - nr_moved = 0 - - # Use ppm to replace border to avoid cut within peakgroup! - while ((as.numeric(outlist_i[1, column_label]) - as.numeric(outlist_part[last_line, column_label])) * 1e+06 / - as.numeric(outlist_i[1, column_label]) < ppm) { - outlist_part <- rbind(outlist_part, outlist_i[1, ]) - outlist_i <- outlist_i[-1, ] - nr_moved <- nr_moved + 1 - } - - save(outlist_part, file = paste("./", scanmode, "_", paste("hmdb", i-1, "RData", sep = "."), sep = "")) - check <- check + dim(outlist_part)[1] - - outlist_part <- outlist_i - last_line <- dim(outlist_part)[1] - - } else { - outlist_part <- outlist[c(start:end),] - } - } - } - - start <- end + 1 - end <- nr_rows - outlist_i <- outlist[c(start:end),] - nr_moved <- 0 - - if (!is.null(outlist_part)) { - # Calculate ppm and replace border, avoid cut within peakgroup! + save(outlist_part, file = paste0(scanmode, "_hmdb.1.RData")) + } else if (nr_rows >= sub & (floor(nr_rows / sub) - 1) >= 2) { + for (i in 2:floor(nr_rows / sub) - 1) { + start <- -(sub - 1) + i * sub + end <- i * sub + + if (i > 1){ + outlist_i = outlist[c(start:end),] + nr_moved = 0 + # Use ppm to replace border to avoid cut within peak group while ((as.numeric(outlist_i[1, column_label]) - as.numeric(outlist_part[last_line, column_label])) * 1e+06 / as.numeric(outlist_i[1, column_label]) < ppm) { - outlist_part <- rbind(outlist_part, outlist_i[1,]) - outlist_i <- outlist_i[-1,] + outlist_part <- rbind(outlist_part, outlist_i[1, ]) + outlist_i <- outlist_i[-1, ] nr_moved <- nr_moved + 1 } - save(outlist_part, file = paste0("./", scanmode, "_hmdb_", i, ".RData")) + save(outlist_part, file = paste(scanmode, "_", paste("hmdb", i-1, "RData", sep = "."), sep = "")) check <- check + dim(outlist_part)[1] + + outlist_part <- outlist_i + last_line <- dim(outlist_part)[1] + + } else { + outlist_part <- outlist[c(start:end),] + } + } + + start <- end + 1 + end <- nr_rows + outlist_i <- outlist[c(start:end), ] + nr_moved <- 0 + + if (!is.null(outlist_part)) { + # Calculate ppm and replace border, avoid cut within peak group + while ((as.numeric(outlist_i[1, column_label]) - as.numeric(outlist_part[last_line, column_label])) * 1e+06 / + as.numeric(outlist_i[1, column_label]) < ppm) { + outlist_part <- rbind(outlist_part, outlist_i[1, ]) + outlist_i <- outlist_i[-1, ] + nr_moved <- nr_moved + 1 } - outlist_part <- outlist_i - save(outlist_part, file = paste0("./", scanmode, "_hmdb_", i + 1, ".RData")) + save(outlist_part, file = paste0(scanmode, "_hmdb_", i, ".RData")) check <- check + dim(outlist_part)[1] } + + outlist_part <- outlist_i + save(outlist_part, file = paste0(scanmode, "_hmdb_", i + 1, ".RData")) + check <- check + dim(outlist_part)[1] } + } } diff --git a/DIMS/HMDBparts.nf b/DIMS/HMDBparts.nf index 50786af..5f19f75 100644 --- a/DIMS/HMDBparts.nf +++ b/DIMS/HMDBparts.nf @@ -1,6 +1,5 @@ process HMDBparts { tag "DIMS HMDBparts" - // Custom process to cut HMDB db into parts label 'HMDBparts' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] @@ -13,7 +12,6 @@ process HMDBparts { path('*.RData') script: - """ Rscript ${baseDir}/CustomModules/DIMS/HMDBparts.R $hmdb_db_file $breaks_file $params.hmdb_parts_files $params.standard_run $params.ppm """ diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R index ff468c7..0486b03 100644 --- a/DIMS/HMDBparts_main.R +++ b/DIMS/HMDBparts_main.R @@ -4,11 +4,11 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -db_path <- cmd_args[1] -breaks_filepath <- cmd_args[2] +db_file <- cmd_args[1] +breaks_file <- cmd_args[2] -load(db_path) -load(breaks_filepath) +load(db_file) +load(breaks_file) # Cut up HMDB minus adducts minus isotopes into small parts scanmodes <- c("positive", "negative") @@ -26,14 +26,14 @@ for (scanmode in scanmodes) { HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] # remove adducts and isotopes, put internal standard at the beginning - outlist_IS <- outlist[grep("IS", outlist[ , "CompoundName"], fixed = TRUE), ] + outlist_is <- outlist[grep("IS", outlist[ , "CompoundName"], fixed = TRUE), ] outlist <- outlist[grep("HMDB", rownames(outlist), fixed = TRUE), ] outlist <- outlist[-grep("_", rownames(outlist), fixed = TRUE), ] - outlist <- rbind(outlist_IS, outlist) + outlist <- rbind(outlist_is, outlist) # sort on m/z value outlist <- outlist[order(outlist[ , column_label]), ] - nr_rows <- dim(outlist)[1] + # size of hmdb parts in lines: sub <- 1000 end <- 0 @@ -42,7 +42,7 @@ for (scanmode in scanmodes) { # generate hmdb parts if (nr_rows >= sub & (floor(nr_rows / sub)) >= 2) { for (i in 1:floor(nr_rows / sub)) { - start <- -(sub-1) + i * sub + start <- -(sub - 1) + i * sub end <- i * sub outlist_part <- outlist[c(start:end), ] save(outlist_part, file=paste0(scanmode, "_hmdb_main.", i, ".RData")) @@ -54,5 +54,5 @@ for (scanmode in scanmodes) { start <- end + 1 end <- nr_rows -outlist_part <- outlist[c(start:end),] -save(outlist_part, file = paste0(scanmode, "_hmdb_main.", i+1, ".RData")) +outlist_part <- outlist[c(start:end), ] +save(outlist_part, file = paste0(scanmode, "_hmdb_main.", i + 1, ".RData")) diff --git a/DIMS/HMDBparts_main.nf b/DIMS/HMDBparts_main.nf index 3f9845b..b38bac0 100644 --- a/DIMS/HMDBparts_main.nf +++ b/DIMS/HMDBparts_main.nf @@ -1,6 +1,5 @@ process HMDBparts_main { tag "DIMS HMDBparts_main" - // Custom process to cut HMDB db into parts for main entry only, no adducts, no isotopes label 'HMDBparts_main' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R index 7d8dd1d..b01a31b 100644 --- a/DIMS/MakeInit.R +++ b/DIMS/MakeInit.R @@ -1,16 +1,19 @@ #!/usr/bin/env Rscript ## adapted from makeInit in old pipeline -args <- commandArgs(trailingOnly = TRUE) -sample_sheet <- read.csv(args[1], sep = "\t") +# define parameters +args <- commandArgs(trailingOnly = TRUE) + +sample_sheet <- read.csv(args[1], sep = "\t") nr_replicates <- as.numeric(args[2]) -sample_names <- trimws(as.vector(unlist(sample_sheet[1]))) -nr_sample_groups <- length(sample_names) / nr_replicates -group_names <- trimws(as.vector(unlist(sample_sheet[2]))) -group_names <- gsub("[^-.[:alnum:]]", "_", group_names) +sample_names <- trimws(as.vector(unlist(sample_sheet[1]))) +nr_sample_groups <- length(sample_names) / nr_replicates +group_names <- trimws(as.vector(unlist(sample_sheet[2]))) +group_names <- gsub("[^-.[:alnum:]]", "_", group_names) group_names_unique <- unique(group_names) +# generate the replication pattern repl_pattern <- c() for (sample_group in 1:nr_sample_groups) { tmp <- c() @@ -26,4 +29,4 @@ names(repl_pattern) <- group_names_unique # preview the replication pattern print(tail(repl_pattern)) -save(repl_pattern, file = "./init.RData") +save(repl_pattern, file = "init.RData") diff --git a/DIMS/MakeInit.nf b/DIMS/MakeInit.nf index 48b1c9b..7aae0e4 100644 --- a/DIMS/MakeInit.nf +++ b/DIMS/MakeInit.nf @@ -5,7 +5,8 @@ process MakeInit { shell = ['/bin/bash', '-euo', 'pipefail'] input: - tuple path(samplesheet), val(nr_replicates) + path(samplesheet) + val(nr_replicates) output: path('init.RData') diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R index e978d4b..e2edc85 100644 --- a/DIMS/PeakFinding.R +++ b/DIMS/PeakFinding.R @@ -4,51 +4,63 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -filepath <- cmd_args[1] -breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData -resol <- as.numeric(cmd_args[3]) -scripts_dir <- cmd_args[4] -thresh <- 2000 +sample_file <- cmd_args[1] +breaks_file <- cmd_args[2] +resol <- as.numeric(cmd_args[3]) +scripts_dir <- cmd_args[4] +thresh <- 2000 +outdir <- "./" # load in function scripts -source(paste0(scripts_dir, "AddOnFunctions/findPeaks.Gauss.HPC.R")) -source(paste0(scripts_dir, "AddOnFunctions/searchMZRange.R")) -source(paste0(scripts_dir, "AddOnFunctions/generateGaussian.R")) -source(paste0(scripts_dir, "AddOnFunctions/fitGaussian.R")) -source(paste0(scripts_dir, "AddOnFunctions/fitGaussianInit.R")) -source(paste0(scripts_dir, "AddOnFunctions/getFwhm.R")) -source(paste0(scripts_dir, "AddOnFunctions/getSD.R")) -source(paste0(scripts_dir, "AddOnFunctions/optimizeGauss.R")) -source(paste0(scripts_dir, "AddOnFunctions/fit1Peak.R")) -source(paste0(scripts_dir, "AddOnFunctions/fit2peaks.R")) -source(paste0(scripts_dir, "AddOnFunctions/fit3peaks.R")) -source(paste0(scripts_dir, "AddOnFunctions/fit4peaks.R")) -source(paste0(scripts_dir, "AddOnFunctions/fitG.R")) -source(paste0(scripts_dir, "AddOnFunctions/fit2G.R")) -source(paste0(scripts_dir, "AddOnFunctions/fit3G.R")) -source(paste0(scripts_dir, "AddOnFunctions/fit4G.R")) -source(paste0(scripts_dir, "AddOnFunctions/getArea.R")) -source(paste0(scripts_dir, "AddOnFunctions/getFitQuality.R")) -source(paste0(scripts_dir, "AddOnFunctions/checkOverlap.R")) -source(paste0(scripts_dir, "AddOnFunctions/sumCurves.R")) -source(paste0(scripts_dir, "AddOnFunctions/isWithinXppm.R")) - -load(breaks_filepath) +source(paste0(scripts_dir, "findPeaks.Gauss.HPC.R")) +source(paste0(scripts_dir, "searchMZRange.R")) +source(paste0(scripts_dir, "generateGaussian.R")) +source(paste0(scripts_dir, "fitGaussian.R")) +source(paste0(scripts_dir, "fitGaussianInit.R")) +source(paste0(scripts_dir, "getFwhm.R")) +source(paste0(scripts_dir, "getSD.R")) +source(paste0(scripts_dir, "optimizeGauss.R")) +source(paste0(scripts_dir, "fit1Peak.R")) +source(paste0(scripts_dir, "fit2peaks.R")) +source(paste0(scripts_dir, "fit3peaks.R")) +source(paste0(scripts_dir, "fit4peaks.R")) +source(paste0(scripts_dir, "fitG.R")) +source(paste0(scripts_dir, "fit2G.R")) +source(paste0(scripts_dir, "fit3G.R")) +source(paste0(scripts_dir, "fit4G.R")) +source(paste0(scripts_dir, "getArea.R")) +source(paste0(scripts_dir, "getFitQuality.R")) +source(paste0(scripts_dir, "checkOverlap.R")) +source(paste0(scripts_dir, "sumCurves.R")) +source(paste0(scripts_dir, "isWithinXppm.R")) + +load(breaks_file) # Load output of AverageTechReplicates for a sample -sample_avgtechrepl <- get(load(filepath)) -if (grepl("_pos", filepath)) { +sample_avgtechrepl <- get(load(sample_file)) +if (grepl("_pos", sample_file)) { scanmode <- "positive" -} else if (grepl("_neg", filepath)) { +} else if (grepl("_neg", sample_file)) { scanmode <- "negative" } # Initialize options(digits = 16) int_factor <- 1 * 10^5 # Number used to calculate area under Gaussian curve -scale <- 2 # Initial value used to estimate scaling parameter -width <- 1024 -height <- 768 +scale <- 2 # Initial value used to estimate scaling parameter +width <- 1024 +height <- 768 # run the findPeaks function -findPeaks.Gauss.HPC(sample_avgtechrepl, breaks.fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) +print(head(sample_avgtechrepl)) +print(head(breaks_fwhm)) +print(int_factor) +print(scale) +print(resol) +print(outdir) +print(scanmode) +print(thresh) +print(width) +print(height) + +findPeaks.Gauss.HPC(sample_avgtechrepl, breaks_fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) diff --git a/DIMS/PeakFinding.nf b/DIMS/PeakFinding.nf index 1752a74..0097d12 100644 --- a/DIMS/PeakFinding.nf +++ b/DIMS/PeakFinding.nf @@ -1,17 +1,17 @@ process PeakFinding { - tag "DIMS PeakFinding ${RData_file}" + tag "DIMS PeakFinding ${rdata_file}" label 'PeakFinding' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] input: - tuple(path(RData_file), path(breaks_file)) + tuple(path(rdata_file), path(breaks_file)) output: path '*tive.RData' script: """ - Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $RData_file $breaks_file $params.resolution $params.scripts_dir + Rscript ${baseDir}/CustomModules/DIMS/PeakFinding.R $rdata_file $breaks_file $params.resolution $params.scripts_dir """ } diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R index b574206..8da74d9 100644 --- a/DIMS/PeakGrouping.R +++ b/DIMS/PeakGrouping.R @@ -56,8 +56,8 @@ while (dim(hmdb_add_iso)[1] > 0) { selp <- which((mzmed > (reference_mass - mass_tolerance)) & (mzmed < (reference_mass + mass_tolerance))) tmplist <- outlist_copy[selp, , drop = FALSE] list_of_peaks_used_in_peak_groups_identified <- rbind(list_of_peaks_used_in_peak_groups_identified, tmplist) - nrsamples <- length(selp) + # if peaks have been found, create a peak group if (nrsamples > 0) { mzmed_pgrp <- mean(as.numeric(outlist_copy[selp, "mzmed.pkt"])) mzmin_pgrp <- reference_mass - mass_tolerance @@ -81,7 +81,7 @@ while (dim(hmdb_add_iso)[1] > 0) { assi_hmdb <- iso_hmdb <- hmdb_code <- NA tmplist_mass_iso <- tmplist_mass_adduct <- NULL - # Identification: find all entries in HMDB part with mass within ppm range + # find all entries in HMDB part with mass within ppm range mass_all <- as.numeric(hmdb_add_iso[, column_label]) index <- which((mass_all > (reference_mass - mass_tolerance)) & (mass_all < (reference_mass + mass_tolerance))) tmplist_mass <- hmdb_add_iso[index, , drop = FALSE] @@ -103,24 +103,30 @@ while (dim(hmdb_add_iso)[1] > 0) { } } - # Compose a list compounds, adducts or isotopes with corresponding m/z + # Compose a list of compounds, adducts or isotopes with corresponding m/z if (dim(tmplist_mass)[1] > 0) { # metabolites - assi_hmdb <- as.character(paste(as.character(tmplist_mass[, "CompoundName"]), collapse = ";")) - hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass)), collapse = ";")) + assi_hmdb <- as.character(paste(as.character(tmplist_mass[, "CompoundName"]), + collapse = ";")) + hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass)), + collapse = ";")) theormz_hmdb <- as.numeric(tmplist_mass[1, column_label]) # adducts of metabolites if (!is.null(tmplist_mass_adduct)) { if (dim(tmplist_mass_adduct)[1] > 0) { if (is.na(assi_hmdb)) { - assi_hmdb <- as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), collapse = ";")) - hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass_adduct)), collapse = ";")) + assi_hmdb <- as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), + collapse = ";")) + hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass_adduct)), + collapse = ";")) } else { assi_hmdb <- paste(assi_hmdb, - as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), collapse = ";")), sep = ";") + as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), + collapse = ";")), sep = ";") hmdb_code <- paste(hmdb_code, - as.character(paste(as.character(rownames(tmplist_mass_adduct)), collapse = ";")), sep = ";") + as.character(paste(as.character(rownames(tmplist_mass_adduct)), + collapse = ";")), sep = ";") } } } @@ -128,7 +134,8 @@ while (dim(hmdb_add_iso)[1] > 0) { # isotopes of metabolites if (!is.null(tmplist_mass_iso)) { if (dim(tmplist_mass_iso)[1] > 0) { - iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), collapse = ";")) + iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), + collapse = ";")) } } @@ -140,13 +147,17 @@ while (dim(hmdb_add_iso)[1] > 0) { if (!is.null(tmplist_mass_adduct)) { if (dim(tmplist_mass_adduct)[1] > 0) { if (is.na(assi_hmdb)) { - assi_hmdb <- as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), collapse = ";")) - hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass_adduct)), collapse = ";")) + assi_hmdb <- as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), + collapse = ";")) + hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass_adduct)), + collapse = ";")) } else { assi_hmdb <- paste(assi_hmdb, - as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), collapse = ";")), sep = ";") + as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), + collapse = ";")), sep = ";") hmdb_code <- paste(hmdb_code, - as.character(paste(as.character(rownames(tmplist_mass_adduct)), collapse = ";")), sep = ";") + as.character(paste(as.character(rownames(tmplist_mass_adduct)), + collapse = ";")), sep = ";") } } } @@ -154,7 +165,8 @@ while (dim(hmdb_add_iso)[1] > 0) { # isotopes of metabolites if (!is.null(tmplist_mass_iso)) { if (dim(tmplist_mass_iso)[1] > 0) { - iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), collapse = ";")) + iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), + collapse = ";")) } } @@ -162,7 +174,8 @@ while (dim(hmdb_add_iso)[1] > 0) { } else if (!is.null(tmplist_mass_iso)) { if (dim(tmplist_mass_iso)[1] > 0) { theormz_hmdb <- as.numeric(tmplist_mass_iso[1, column_label]) - iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), collapse = ";")) + iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), + collapse = ";")) } } } @@ -180,7 +193,6 @@ while (dim(hmdb_add_iso)[1] > 0) { hmdb_add_iso <- hmdb_add_iso[-index, ] } - # save peak list corresponding to masses in HMDB part save(list_of_peaks_used_in_peak_groups_identified, file = paste0(batch_number, "_", scanmode, "_peaks_used.RData")) # save peak group list, identified part diff --git a/DIMS/PeakGrouping.nf b/DIMS/PeakGrouping.nf index 5c48cde..ae4382c 100644 --- a/DIMS/PeakGrouping.nf +++ b/DIMS/PeakGrouping.nf @@ -1,13 +1,13 @@ process PeakGrouping { - tag "DIMS PeakGrouping ${HMDBpart_file}" + tag "DIMS PeakGrouping ${hmdbpart_file}" label 'PeakGrouping' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] input: - path(HMDBpart_file) - each path(SpectrumPeak_file) // input files need to be linked, but called within R script - each path(pattern_file) // Execute the process for each element in the input collection (HMDBpart_file) + path(hmdbpart_file) + each path(spectrumpeak_file) + each path(pattern_file) output: path '*_peaks_used.RData', emit: peaks_used @@ -15,6 +15,6 @@ process PeakGrouping { script: """ - Rscript ${baseDir}/CustomModules/DIMS/PeakGrouping.R $HMDBpart_file $params.ppm + Rscript ${baseDir}/CustomModules/DIMS/PeakGrouping.R $hmdbpart_file $params.ppm """ } diff --git a/DIMS/SpectrumPeakFinding.R b/DIMS/SpectrumPeakFinding.R index 5c77b12..cf6665c 100644 --- a/DIMS/SpectrumPeakFinding.R +++ b/DIMS/SpectrumPeakFinding.R @@ -4,7 +4,7 @@ # define parameters scanmodes <- c("positive", "negative") -# Check whether all jobs terminated correct! +# Check whether all jobs terminated correctly not_run <- NULL # collect spectrum peaks for each scanmode @@ -13,10 +13,10 @@ for (scanmode in scanmodes) { peaklist_files <- list.files(pattern = paste0("_", scanmode, ".RData")) # get sample names - load(paste0("./", scanmode, "_repl_pattern", ".RData")) + load(paste0(scanmode, "_repl_pattern.RData")) group_names <- names(repl_pattern_filtered) for (sample_nr in 1:length(group_names)) { - group <- paste0("./", group_names[sample_nr], "_", scanmode, ".RData") + group <- paste0(group_names[sample_nr], "_", scanmode, ".RData") if (!(group %in% peaklist_files)) { not_run <- c(not_run, group) } diff --git a/DIMS/SpectrumPeakFinding.nf b/DIMS/SpectrumPeakFinding.nf index 775cd2c..3590265 100644 --- a/DIMS/SpectrumPeakFinding.nf +++ b/DIMS/SpectrumPeakFinding.nf @@ -5,8 +5,8 @@ process SpectrumPeakFinding { shell = ['/bin/bash', '-euo', 'pipefail'] input: - path(RData_file) // input files need to be linked, but called within R script - path(replication_pattern) // input files need to be linked, but called within R script + path(rdata_files) + path(replication_pattern) output: path 'SpectrumPeaks_*.RData' diff --git a/DIMS/SumAdducts.R b/DIMS/SumAdducts.R index db74623..870a914 100755 --- a/DIMS/SumAdducts.R +++ b/DIMS/SumAdducts.R @@ -7,36 +7,13 @@ cmd_args <- commandArgs(trailingOnly = TRUE) hmdbpart_main_file <- cmd_args[1] scripts_dir <- cmd_args[2] z_score <- as.numeric(cmd_args[3]) -# NB: scripts_dir not used yet, but function SumAdducts needs to be placed in AddOnFunctions folder - -if (grepl("positive_hmdb", hmdbpart_main_file)) { - scanmode <- "positive" - # for the adduct sum: include adducts M+Na (1) and M+K (2) - adducts <- c(1, 2) -} else if (grepl("negative_hmdb", hmdbpart_main_file)) { - scanmode <- "negative" - # for the adduct sum: include adduct M+Cl (1) - adducts <- c(1) -} - -# load input files -collect_file <- paste0("outlist_identified_", scanmode, ".RData") -load(collect_file) -repl_file <- paste0(scanmode, "_repl_pattern.RData") -load(repl_file) -outlist_part <- get(load(hmdbpart_main_file)) - -# get the number from the file name -batch_number <- strsplit(basename(hmdbpart_main_file), ".", fixed = TRUE)[[1]][2] - -outlist_total <- unique(outlist_ident) sum_adducts <- function(peaklist, theor_mz, grpnames_long, adducts, batch_number, scanmode, outdir, z_score) { hmdb_codes <- rownames(theor_mz) hmdb_names <- theor_mz[, 1, drop = FALSE] hmdb_names[] <- lapply(hmdb_names, as.character) - # remove isotopes!!! + # remove isotopes index <- grep("HMDB", hmdb_codes, fixed = TRUE) hmdb_codes <- hmdb_codes[index] hmdb_names <- hmdb_names[index, ] @@ -86,4 +63,26 @@ sum_adducts <- function(peaklist, theor_mz, grpnames_long, adducts, batch_number } } +if (grepl("positive_hmdb", hmdbpart_main_file)) { + scanmode <- "positive" + # for the adduct sum: include adducts M+Na (1) and M+K (2) + adducts <- c(1, 2) +} else if (grepl("negative_hmdb", hmdbpart_main_file)) { + scanmode <- "negative" + # for the adduct sum: include adduct M+Cl (1) + adducts <- c(1) +} + +# load input files +collect_file <- paste0("outlist_identified_", scanmode, ".RData") +load(collect_file) +repl_file <- paste0(scanmode, "_repl_pattern.RData") +load(repl_file) +outlist_part <- get(load(hmdbpart_main_file)) + +# get the number from the file name +batch_number <- strsplit(basename(hmdbpart_main_file), ".", fixed = TRUE)[[1]][2] + +outlist_total <- unique(outlist_ident) + sum_adducts(outlist_total, outlist_part, names(repl_pattern_filtered), adducts, batch_number, scanmode, outdir, z_score) diff --git a/DIMS/SumAdducts.nf b/DIMS/SumAdducts.nf index 66a404f..bbd6420 100644 --- a/DIMS/SumAdducts.nf +++ b/DIMS/SumAdducts.nf @@ -1,12 +1,12 @@ process SumAdducts { - tag "DIMS SumAdducts ${collect_file}" + tag "DIMS SumAdducts" label 'SumAdducts' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] input: - each path(collect_file) // input files need to be linked, but called within R script - each path(replication_pattern) // input files need to be linked, but called within R script + each path(collect_files) + each path(replication_pattern) path(HMDBpart_main_file) output: diff --git a/DIMS/ThermoRawFileParser.nf b/DIMS/ThermoRawFileParser.nf index 17cee27..6e51984 100644 --- a/DIMS/ThermoRawFileParser.nf +++ b/DIMS/ThermoRawFileParser.nf @@ -1,14 +1,13 @@ process ConvertRawFile { tag "DIMS ConvertRawFile ${file_id}" - // Custom process to convert raw file to mzML format label 'ThermoRawFileParser_1_1_11' shell = ['/bin/bash', '-euo', 'pipefail'] input: - tuple val(file_id), path(raw_file) + tuple(val(file_id), path(raw_file)) output: - tuple val(file_id), path("${file_id}.mzML") + tuple(val(file_id), path("${file_id}.mzML")) script: diff --git a/DIMS/UnidentifiedCalcZscores.R b/DIMS/UnidentifiedCalcZscores.R index 68dbfe6..2bce990 100755 --- a/DIMS/UnidentifiedCalcZscores.R +++ b/DIMS/UnidentifiedCalcZscores.R @@ -4,19 +4,19 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -scripts_dir <- cmd_args[1] -ppm <- as.numeric(cmd_args[2]) +scripts_dir <- cmd_args[1] +ppm <- as.numeric(cmd_args[2]) z_score <- as.numeric(cmd_args[3]) -source(paste0(scripts_dir, "AddOnFunctions/mergeDuplicatedRows.R")) -source(paste0(scripts_dir, "AddOnFunctions/statistics_z.R")) +source(paste0(scripts_dir, "mergeDuplicatedRows.R")) +source(paste0(scripts_dir, "statistics_z.R")) # for each scan mode, collect all filled peak group lists scanmodes <- c("positive", "negative") for (scanmode in scanmodes) { # get list of files - filled_file <- paste0("./PeakGroupList_", scanmode, "_Unidentified_filled.RData") + filled_file <- paste0("PeakGroupList_", scanmode, "_Unidentified_filled.RData") # load file outlist_total <- get(load(filled_file)) @@ -36,7 +36,8 @@ for (scanmode in scanmodes) { nr_removed_samples <- length(which(repl_pattern[] == "character(0)")) order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)]) outlist_stats_more <- cbind(outlist_stats[, 1:7], - outlist_stats[, (length(repl_pattern) - nr_removed_samples + 8):(length(repl_pattern) - nr_removed_samples + 8 + 6)], + outlist_stats[, (length(repl_pattern) - nr_removed_samples + 8): + (length(repl_pattern) - nr_removed_samples + 8 + 6)], outlist_stats[, 8:(length(repl_pattern) - nr_removed_samples + 7)][order_index_int], outlist_stats[, (length(repl_pattern) - nr_removed_samples + 5 + 10):ncol(outlist_stats)]) @@ -50,10 +51,9 @@ for (scanmode in scanmodes) { outlist_not_ident <- outlist_total - # Extra output in Excel-readable format: + # Save output remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp") remove_colindex <- which(colnames(outlist_not_ident) %in% remove_columns) outlist_not_ident <- outlist_not_ident[, -remove_colindex] - write.table(outlist_not_ident, file = paste0("unidentified_outlist_", scanmode, ".txt"), sep = "\t", row.names = FALSE) - + save(outlist_not_ident, file = paste0("unidentified_outlist_", scanmode, ".RData")) } diff --git a/DIMS/UnidentifiedCalcZscores.nf b/DIMS/UnidentifiedCalcZscores.nf index 1687b76..675d9cd 100644 --- a/DIMS/UnidentifiedCalcZscores.nf +++ b/DIMS/UnidentifiedCalcZscores.nf @@ -6,10 +6,10 @@ process UnidentifiedCalcZscores { input: path(unidentified_filled_files) - path(replication_pattern) // input files need to be linked, but called within R script + each path(replication_pattern) output: - path('unidentified_outlist*.txt') + path('unidentified_outlist*.RData') script: """ diff --git a/DIMS/UnidentifiedCollectPeaks.R b/DIMS/UnidentifiedCollectPeaks.R index 5f1d5d7..a5674f1 100755 --- a/DIMS/UnidentifiedCollectPeaks.R +++ b/DIMS/UnidentifiedCollectPeaks.R @@ -5,7 +5,6 @@ cmd_args <- commandArgs(trailingOnly = TRUE) ppm <- as.numeric(cmd_args[1]) -outdir <- "./" scanmodes <- c("positive", "negative") @@ -17,16 +16,16 @@ for (scanmode in scanmodes) { # Make a list of indexes of peaks that have been identified, then remove these from the peaklist. remove <- NULL - for (i in 1:length(files)) { + for (file_index in 1:length(files)) { # load list_of_peaks_used_in_peak_groups_identified - load(files[i]) - remove <- c(remove, which(outlist_total[, "mzmed.pkt"] %in% list_of_peaks_used_in_peak_groups_identified[i, "mzmed.pkt"])) + load(files[file_index]) + remove <- c(remove, + which(outlist_total[, "mzmed.pkt"] %in% list_of_peaks_used_in_peak_groups_identified[file_index, "mzmed.pkt"])) } outlist_rest <- outlist_total[-remove, ] # sort on mass outlist <- outlist_rest[order(as.numeric(outlist_rest[, "mzmed.pkt"])), ] # save output - save(outlist, file = paste0(outdir, "/SpectrumPeaks_", scanmode, "_Unidentified.RData")) - + save(outlist, file = paste0("SpectrumPeaks_", scanmode, "_Unidentified.RData")) } diff --git a/DIMS/UnidentifiedCollectPeaks.nf b/DIMS/UnidentifiedCollectPeaks.nf index bcd1264..6b2656e 100644 --- a/DIMS/UnidentifiedCollectPeaks.nf +++ b/DIMS/UnidentifiedCollectPeaks.nf @@ -5,16 +5,14 @@ process UnidentifiedCollectPeaks { shell = ['/bin/bash', '-euo', 'pipefail'] input: - path(SpectrumPeaks_file) // input files need to be linked, but called within R script - path(PeakList_identified) + path(spectrumpeaks_file) + path(peaklist_identified) output: path('SpectrumPeaks_*_Unidentified.RData') - // path('SpectrumPeaks_negative_Unidentified.RData') - // path('SpectrumPeaks_positive_Unidentified.RData') script: """ - Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedCollectPeaks.R $SpectrumPeaks_file $params.ppm + Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedCollectPeaks.R $spectrumpeaks_file $params.ppm """ } diff --git a/DIMS/UnidentifiedFillMissing.R b/DIMS/UnidentifiedFillMissing.R index 5416b15..2ee085f 100755 --- a/DIMS/UnidentifiedFillMissing.R +++ b/DIMS/UnidentifiedFillMissing.R @@ -4,25 +4,24 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -# define parameters peakgrouplist_file1 <- cmd_args[1] peakgrouplist_file2 <- cmd_args[2] -scripts_dir <- cmd_args[3] -thresh <- as.numeric(cmd_args[4]) -resol <- as.numeric(cmd_args[5]) -ppm <- as.numeric(cmd_args[6]) -outdir <- "./" +scripts_dir <- cmd_args[3] +thresh <- as.numeric(cmd_args[4]) +resol <- as.numeric(cmd_args[5]) +ppm <- as.numeric(cmd_args[6]) +outdir <- "./" # load in function scripts -source(paste0(scripts_dir, "AddOnFunctions/replaceZeros.R")) -source(paste0(scripts_dir, "AddOnFunctions/generateGaussian.R")) -source(paste0(scripts_dir, "AddOnFunctions/getFwhm.R")) -source(paste0(scripts_dir, "AddOnFunctions/getSD.R")) -source(paste0(scripts_dir, "AddOnFunctions/getArea.R")) -source(paste0(scripts_dir, "AddOnFunctions/optimizeGauss.R")) -source(paste0(scripts_dir, "AddOnFunctions/ident.hires.noise.HPC.R")) -source(paste0(scripts_dir, "AddOnFunctions/elementInfo.R")) -source(paste0(scripts_dir, "AddOnFunctions/globalAssignments.HPC.R")) +source(paste0(scripts_dir, "replaceZeros.R")) +source(paste0(scripts_dir, "generateGaussian.R")) +source(paste0(scripts_dir, "getFwhm.R")) +source(paste0(scripts_dir, "getSD.R")) +source(paste0(scripts_dir, "getArea.R")) +source(paste0(scripts_dir, "optimizeGauss.R")) +source(paste0(scripts_dir, "ident.hires.noise.HPC.R")) +source(paste0(scripts_dir, "elementInfo.R")) +source(paste0(scripts_dir, "globalAssignments.HPC.R")) peakgrouplist_files <- c(peakgrouplist_file1, peakgrouplist_file2) for (peakgrouplist_file in peakgrouplist_files) { @@ -46,5 +45,5 @@ for (peakgrouplist_file in peakgrouplist_files) { peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) # save output - save(peakgrouplist_filled, file = paste0("./", outputfile_name)) + save(peakgrouplist_filled, file = outputfile_name) } diff --git a/DIMS/UnidentifiedFillMissing.nf b/DIMS/UnidentifiedFillMissing.nf index 50c9783..3f9098e 100644 --- a/DIMS/UnidentifiedFillMissing.nf +++ b/DIMS/UnidentifiedFillMissing.nf @@ -1,18 +1,18 @@ process UnidentifiedFillMissing { - tag "DIMS UnidentifiedFillMissing ${GroupedList_file}" + tag "DIMS UnidentifiedFillMissing ${groupedlist_file}" label 'UnidentifiedFillMissing' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] input: - path(GroupedList_file) - path(replication_pattern) // input files need to be linked, but called within R script + path(groupedlist_file) + each path(replication_pattern) output: path('*_filled.RData') script: """ - Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedFillMissing.R $GroupedList_file $params.scripts_dir $params.thresh $params.resolution $params.ppm + Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedFillMissing.R $groupedlist_file $params.scripts_dir $params.thresh $params.resolution $params.ppm """ } diff --git a/DIMS/UnidentifiedPeakGrouping.R b/DIMS/UnidentifiedPeakGrouping.R index 73ac4c1..16dd8c1 100755 --- a/DIMS/UnidentifiedPeakGrouping.R +++ b/DIMS/UnidentifiedPeakGrouping.R @@ -4,25 +4,27 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -resol <- as.numeric(cmd_args[1]) -ppm <- as.numeric(cmd_args[2]) +resol <- as.numeric(cmd_args[1]) +ppm <- as.numeric(cmd_args[2]) outdir <- "./" -options(digits = 16) - -print(list.files(outdir, pattern = "RData")) -scanmodes <- c("positive", "negative") +options(digits = 16) # function for grouping unidentified peaks grouping_rest <- function(outdir, unidentified_peaklist, scanmode, ppm) { outlist_copy <- get(load(unidentified_peaklist)) - load(paste0("./", scanmode, "_repl_pattern.RData")) + load(paste0(scanmode, "_repl_pattern.RData")) outpgrlist <- NULL # group on highest peaks range <- ppm * 1e-06 - while (dim(outlist_copy)[1] > 0) { + # temporary: speed up this step by limiting the number of rows used in while loop + # this script needs to be parallellized + nrow_div <- nrow(outlist_copy) / 1.1 + + # while (dim(outlist_copy)[1] > 0) { + while (dim(outlist_copy)[1] > nrow_div) { sel <- which(as.numeric(outlist_copy[, "height.pkt"]) == max(as.numeric(outlist_copy[, "height.pkt"])))[1] # ppm range around max @@ -58,22 +60,27 @@ grouping_rest <- function(outdir, unidentified_peaklist, scanmode, ppm) { sum(as.numeric(tmplist[which(tmplist[, "samplenr"] == x), "height.pkt"])) }))) - outpgrlist <- rbind(outpgrlist, c(mzmed_pgrp, fq_best_pgrp, fq_worst_pgrp, nrsamples, mzmin_pgrp, mzmax_pgrp, ints_allsamps, NA, NA, NA, NA)) + # combine all information + outpgrlist <- rbind(outpgrlist, c(mzmed_pgrp, fq_best_pgrp, fq_worst_pgrp, nrsamples, + mzmin_pgrp, mzmax_pgrp, ints_allsamps, NA, NA, NA, NA)) } outlist_copy <- outlist_copy[-which(selp == TRUE), , drop = FALSE] } outpgrlist <- as.data.frame(outpgrlist) - colnames(outpgrlist)[1:6] <- c("mzmed_pgrp", "fq.best", "fq.worst", "nrsamples", "mzmin_pgrp", "mzmax_pgrp") - colnames(outpgrlist)[(length(repl_pattern_filtered) + 7):ncol(outpgrlist)] <- c("assi_HMDB", "iso_HMDB", "HMDB_code", "theormz_HMDB") + colnames(outpgrlist)[1:6] <- c("mzmed.pgrp", "fq.best", "fq.worst", "nrsamples", "mzmin.pgrp", "mzmax.pgrp") + colnames(outpgrlist)[(length(repl_pattern_filtered) + 7):ncol(outpgrlist)] <- c("assi_HMDB", "iso_HMDB", + "HMDB_code", "theormz_HMDB") return(outpgrlist) } +scanmodes <- c("positive", "negative") + for (scanmode in scanmodes) { # generate peak group lists of the unidentified peaks - unidentified_peaklist <- paste0("./SpectrumPeaks_", scanmode, "_Unidentified.RData") + unidentified_peaklist <- paste0("SpectrumPeaks_", scanmode, "_Unidentified.RData") outpgrlist <- grouping_rest(outdir, unidentified_peaklist, scanmode, ppm = ppm) write.table(outpgrlist, file = paste0("PeakGroupList_", scanmode, "_Unidentified.txt")) diff --git a/DIMS/UnidentifiedPeakGrouping.nf b/DIMS/UnidentifiedPeakGrouping.nf index 00fc576..2988195 100644 --- a/DIMS/UnidentifiedPeakGrouping.nf +++ b/DIMS/UnidentifiedPeakGrouping.nf @@ -1,12 +1,12 @@ process UnidentifiedPeakGrouping { - tag "DIMS UnidentifiedPeakGrouping ${UnidentifiedSpectrumPeaks_file}" + tag "DIMS UnidentifiedPeakGrouping ${unidentified_spectrumpeaks_files}" label 'UnidentifiedPeakGrouping' container = 'docker://umcugenbioinf/dims:1.3' shell = ['/bin/bash', '-euo', 'pipefail'] input: - path(UnidentifiedSpectrumPeaks_file) // input files need to be linked, but called within R script - path(replication_pattern) // input files need to be linked, but called within R script + path(unidentified_spectrumpeaks_files) + each path(replication_pattern) output: path('*_Unidentified.txt') From 95df3e945ffc6a8e31097d5171ba9ecde4802bd7 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Wed, 27 Mar 2024 17:10:56 +0100 Subject: [PATCH 40/73] cleaned up AddOnFunctions --- DIMS/Utils/do_peakfinding.R | 58 +++++++++++ DIMS/Utils/fit_gaussians.R | 198 ++++++++++++++++++++++++++++++++++++ DIMS/Utils/fit_init.R | 49 +++++++++ DIMS/Utils/get_fwhm.R | 21 ++++ DIMS/Utils/get_stdev.R | 23 +++++ DIMS/Utils/search_mzrange.R | 188 ++++++++++++++++++++++++++++++++++ 6 files changed, 537 insertions(+) create mode 100644 DIMS/Utils/do_peakfinding.R create mode 100644 DIMS/Utils/fit_gaussians.R create mode 100644 DIMS/Utils/fit_init.R create mode 100644 DIMS/Utils/get_fwhm.R create mode 100644 DIMS/Utils/get_stdev.R create mode 100644 DIMS/Utils/search_mzrange.R diff --git a/DIMS/Utils/do_peakfinding.R b/DIMS/Utils/do_peakfinding.R new file mode 100644 index 0000000..004d96f --- /dev/null +++ b/DIMS/Utils/do_peakfinding.R @@ -0,0 +1,58 @@ +## adapted from findpeaks.Gauss.HPC.R +# NB: this function will be taken up into PeakFinding.R +# variables with fixed values will be removed from function parameters +# int_factor, scale, outdir, plot, thresh, width, height +do_peakfinding <- function(sample_avgtechrepl, int_factor, scale, resol, outdir, scanmode, plot, thresh, width, height) { + #' start peak finding + #' + #' @param sample_avgtechrepl: Dataframe with binned intensities averaged over technical replicates for a sample + #' @param int_factor: Value used to calculate area under Gaussian curve (integer) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param resol: Value for resolution (integer) + #' @param outdir: Path for output directory (string) + #' @param scanmode: Scan mode, positive or negative (string) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' @param thresh: Value for noise level threshold (integer) + #' @param width: Value for width of plot (integer) + #' @param height: Value for height of plot (integer) + #' + #' @return save output to file + + sample_name <- colnames(sample_avgtechrepl)[1] + + # turn dataframe with intensities into a named list + ints_fullrange <- as.vector(sample_avgtechrepl) + names(ints_fullrange) <- rownames(sample_avgtechrepl) + + # initialise list to store results for all peaks + allpeaks_values <- list("mean" = NULL, "area" = NULL, "nr" = NULL, + "min" = NULL, "max" = NULL, "qual" = NULL, "spikes" = 0) + + # look for m/z range for all peaks + allpeaks_values <- search_mzrange(ints_fullrange, allpeaks_values, int_factor, scale, resol, + outdir, sample_name, scanmode, + plot, width, height, thresh) + + # turn the list into a dataframe + outlist_persample <- NULL + outlist_persample <- cbind("samplenr" = allpeaks_values$nr, + "mzmed.pkt" = allpeaks_values$mean, + "fq" = allpeaks_values$qual, + "mzmin.pkt" = allpeaks_values$min, + "mzmax.pkt" = allpeaks_values$max, + "height.pkt" = allpeaks_values$area) + + # remove peaks with height = 0 + index <- which(outlist_persample[, "height.pkt"] == 0) + if (length(index) > 0) { + outlist_persample <- outlist_persample[-index, ] + } + + # save output to file + save(outlist_persample, file = paste0(sample_name, "_", scanmode, ".RData")) + + # generate text output to log file on number of spikes for this sample + # spikes are peaks that are too narrow, e.g. 1 data point + cat(paste("There were", allpeaks_values$spikes, "spikes")) +} + diff --git a/DIMS/Utils/fit_gaussians.R b/DIMS/Utils/fit_gaussians.R new file mode 100644 index 0000000..007b8ed --- /dev/null +++ b/DIMS/Utils/fit_gaussians.R @@ -0,0 +1,198 @@ +# Gaussian fit functions +## adapted from fitG.R, fit2G.R, fit3G.R and fit4G.R (combined) +fit_1gaussian <- function(mass_vector, int_vector, sigma, query_mass, scale, use_bounds) { + #' Fit a Gaussian curve for a peak with given parameters + #' + #' @param mass_vector: Vector of masses (float) + #' @param int_vector: Vector of intensities (float) + #' @param sigma: Value for width of the peak (float) + #' @param query_mass: Value for mass at center of peak (float) + #' @param scale: Value for scaling intensities (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' + #' @return opt_fit: list of parameters and values describing the optimal fit + + # define optimization function for optim based on normal distribution + opt_f <- function(params) { + d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma) + sum((d - int_vector) ^ 2) + } + + if (use_bounds) { + # determine lower and upper boundaries + lower <- c(mass_vector[1], 0, mass_vector[1], 0) + upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf) + # get optimal value for fitted Gaussian curve + opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)), + opt_f, control = list(maxit = 10000), method = "L-BFGS-B", + lower = lower, upper = upper) + } else { + opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)), + opt_f, control = list(maxit = 10000)) + } + return(opt_fit) +} + + +fit_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2, + query_mass1, scale1, + query_mass2, scale2, use_bounds) { + #' Fit two Gaussian curves for a peak with given parameters + #' + #' @param mass_vector: Vector of masses (float) + #' @param int_vector: Vector of intensities (float) + #' @param sigma1: Value for width of the first peak (float) + #' @param sigma2: Value for width of the second peak (float) + #' @param query_mass1: Value for mass at center of first peak (float) + #' @param scale1: Value for scaling intensities for first peak (float) + #' @param query_mass2: Value for mass at center of second peak (float) + #' @param scale2: Value for scaling intensities for second peak (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' + #' @return opt_fit: list of parameters and values describing the optimal fit + + # define optimization function for optim based on normal distribution + opt_f <- function(params) { + d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma1) + + params[4] * dnorm(mass_vector, mean = params[3], sd = sigma2) + sum((d - int_vector) ^ 2) + } + + if (use_bounds) { + # determine lower and upper boundaries + lower <- c(mass_vector[1], 0, mass_vector[1], 0) + upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf) + # get optimal value for 2 fitted Gaussian curves + if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) { + sigma2 <- sigma1 + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), + as.numeric(query_mass1), as.numeric(scale1)), + opt_f, control = list(maxit = 10000), + method = "L-BFGS-B", lower = lower, upper = upper) + } else { + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), + as.numeric(query_mass2), as.numeric(scale2)), + opt_f, control = list(maxit = 10000), + method = "L-BFGS-B", lower = lower, upper = upper) + } + } else { + if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) { + sigma2 <- sigma1 + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), + as.numeric(query_mass1), as.numeric(scale1)), + opt_f, control = list(maxit = 10000)) + } else { + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), + as.numeric(query_mass2), as.numeric(scale2)), + opt_f, control = list(maxit = 10000)) + } + } + return(opt_fit) +} + + +fit_3gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sigma3, + query_mass1, scale1, + query_mass2, scale2, + query_mass3, scale3, use_bounds) { + #' Fit three Gaussian curves for a peak with given parameters + #' + #' @param mass_vector: Vector of masses (float) + #' @param int_vector: Vector of intensities (float) + #' @param sigma1: Value for width of the first peak (float) + #' @param sigma2: Value for width of the second peak (float) + #' @param sigma3: Value for width of the third peak (float) + #' @param query_mass1: Value for mass at center of first peak (float) + #' @param scale1: Value for scaling intensities for first peak (float) + #' @param query_mass2: Value for mass at center of second peak (float) + #' @param scale2: Value for scaling intensities for second peak (float) + #' @param query_mass3: Value for mass at center of third peak (float) + #' @param scale3: Value for scaling intensities for third peak (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' + #' @return opt_fit: list of parameters and values describing the optimal fit + + # define optimization function for optim based on normal distribution + opt_f <- function(params) { + d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma1) + + params[4] * dnorm(mass_vector, mean = params[3], sd = sigma2) + + params[6] * dnorm(mass_vector, mean = params[5], sd = sigma3) + sum((d - int_vector) ^ 2) + } + + if (use_bounds) { + # determine lower and upper boundaries + lower <- c(mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0) + upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf, + mass_vector[length(mass_vector)], Inf) + # get optimal value for 3 fitted Gaussian curves + opt_fit <- optim(c(query_mass1, scale1, + query_mass2, scale2, + query_mass3, scale3), + opt_f, control = list(maxit = 10000), + method = "L-BFGS-B", lower = lower, upper = upper) + } else { + opt_fit <- optim(c(query_mass1, scale1, + query_mass2, scale2, + query_mass3, scale3), + opt_f, control = list(maxit = 10000)) + } + return(opt_fit) +} + + +fit_4gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sigma3, sigma4, + query_mass1, scale1, + query_mass2, scale2, + query_mass3, scale3, + query_mass4, scale4, use_bounds) { + #' Fit four Gaussian curves for a peak with given parameters + #' + #' @param mass_vector: Vector of masses (float) + #' @param int_vector: Vector of intensities (float) + #' @param sigma1: Value for width of the first peak (float) + #' @param sigma2: Value for width of the second peak (float) + #' @param sigma3: Value for width of the third peak (float) + #' @param sigma4: Value for width of the fourth peak (float) + #' @param query_mass1: Value for mass at center of first peak (float) + #' @param scale1: Value for scaling intensities for first peak (float) + #' @param query_mass2: Value for mass at center of second peak (float) + #' @param scale2: Value for scaling intensities for second peak (float) + #' @param query_mass3: Value for mass at center of third peak (float) + #' @param scale3: Value for scaling intensities for third peak (float) + #' @param query_mass4: Value for mass at center of fourth peak (float) + #' @param scale4: Value for scaling intensities for fourth peak (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' + #' @return opt_fit: list of parameters and values describing the optimal fit + + # define optimization function for optim based on normal distribution + opt_f <- function(params) { + d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma1) + + params[4] * dnorm(mass_vector, mean = params[3], sd = sigma2) + + params[6] * dnorm(mass_vector, mean = params[5], sd = sigma3) + + params[8] * dnorm(mass_vector, mean = params[7], sd = sigma4) + sum((d - int_vector) ^ 2) + } + + if (use_bounds) { + # determine lower and upper boundaries + lower <- c(mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0) + upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf, + mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf) + # get optimal value for 3 fitted Gaussian curves + opt_fit <- optim(c(query_mass1, scale1, + query_mass2, scale2, + query_mass3, scale3, + query_mass4, scale4), + opt_f, control = list(maxit = 10000), + method = "L-BFGS-B", lower = lower, upper = upper) + } else { + opt_fit <- optim(c(query_mass1, scale1, + query_mass2, scale2, + query_mass3, scale3, + query_mass4, scale4), + opt_f, control = list(maxit = 10000)) + } + return(opt_fit) +} diff --git a/DIMS/Utils/fit_init.R b/DIMS/Utils/fit_init.R new file mode 100644 index 0000000..5e0b823 --- /dev/null +++ b/DIMS/Utils/fit_init.R @@ -0,0 +1,49 @@ +## adapted from fitGaussianInit.R +# variables with fixed values will be removed from function parameters +# int_factor, scale, outdir, plot, thresh, width, height +# mz_index, start_index, end_index, sample_name not used. +# fit_gaussian should be defined before this function is called. +fit_init <- function(mass_vector, int_vector, int_factor, scale, resol, + outdir, sample_name, scanmode, plot, width, height, + mz_index, start_index, end_index) { + #' Initial fit of Gaussian curve to small region of m/z + #' + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Vector of intensities for a region of interest (float) + #' @param int_factor: Value used to calculate area under Gaussian curve (integer) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param resol: Value for resolution (integer) + #' @param outdir: Path for output directory (string) + #' @param sample_name: Sample name (string) + #' @param scanmode: Scan mode, positive or negative (string) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' @param width: Value for width of plot (integer) + #' @param height: Value for height of plot (integer) + #' @param mz_index: Index of m/z value with non-zero intensity (integer) + #' @param start_index: Index of start of m/z range in mass_vector (integer) + #' @param end_index: Index of end of m/z range in mass_vector (integer) + #' + #' @return roi_value_list: list of fit values for region of interest (list) + + # define mass_diff as difference between last and first value of mass_vector + mass_diff <- mass_vector[length(mass_vector)] - mass_vector[1] + # generate a second mass_vector with equally spaced m/z values + mass_vector2 <- seq(mass_vector[1], mass_vector[length(mass_vector)], + length = mass_diff * int_factor) + + # Find the index in int_vector with the highest intensity + max_index <- which(int_vector == max(int_vector)) + + roi_values <- fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, + scale, resol, outdir, force = length(max_index), + useBounds = FALSE, plot, scanmode, int_factor, width, height) + + roi_value_list <- list("mean" = roi_values$mean, + "area" = roi_values$area, + "qual" = roi_values$qual, + "min" = roi_values$min, + "max" = roi_values$max) + + return(roi_value_list) +} + diff --git a/DIMS/Utils/get_fwhm.R b/DIMS/Utils/get_fwhm.R new file mode 100644 index 0000000..7543921 --- /dev/null +++ b/DIMS/Utils/get_fwhm.R @@ -0,0 +1,21 @@ +## adapted from getFwhm.R +get_fwhm <- function(query_mass, resol) { + #' Calculate fwhm (full width at half maximum intensity) for a peak + #' + #' @param query_mass: Value for mass (float) + #' @param resol: Value for resolution (integer) + #' + #' @return fwhm: Value for full width at half maximum (float) + + # set aberrant values of query_mass to zero + if (is.nan(query_mass)) query_mass <- 0 + if (is.na(query_mass)) query_mass <- 0 + if (is.null(query_mass)) query_mass <- 0 + if (query_mass < 0) query_mass <- 0 + # calculate resolution at given m/z value + resol_mz <- resol * (1 / sqrt(2) ^ (log2(query_mass / 200))) + # calculate full width at half maximum + fwhm <- query_mass / resol_mz + return(fwhm) +} + diff --git a/DIMS/Utils/get_stdev.R b/DIMS/Utils/get_stdev.R new file mode 100644 index 0000000..cfe0aa4 --- /dev/null +++ b/DIMS/Utils/get_stdev.R @@ -0,0 +1,23 @@ +## adapted from getSD.R +get_stdev <- function(mass_vector, int_vector, resol = 140000) { + #' Calculate standard deviation to determine width of a peak + #' + #' @param mass_vector: Vector of 3 mass values (float) + #' @param int_vector: Vector of 3 intensities (float) + #' @param resol: Value for resolution (integer) + #' + #' @return stdev: Value for standard deviation + + # find maximum intensity in vector + max_index <- which(int_vector == max(int_vector)) + # find corresponding mass at maximum intensity + max_mass <- mass_vector[max_index] + # calculate resolution at given m/z value + resol_mz <- resol * (1 / sqrt(2) ^ (log2(max_mass / 200))) + # calculate full width at half maximum + fwhm <- mean / resol_mz + # calculate standard deviation + stdev <- (fwhm / 2) * 0.85 + return(stdev) +} + diff --git a/DIMS/Utils/search_mzrange.R b/DIMS/Utils/search_mzrange.R new file mode 100644 index 0000000..aa0abbd --- /dev/null +++ b/DIMS/Utils/search_mzrange.R @@ -0,0 +1,188 @@ +## adapted from searchMZRange.R +# variables with fixed values will be removed from function parameters +# int_factor, scale, outdir, plot, thresh, width, height +# allpeaks_values should be generated here, not passed on from do_peakfinding +search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, resol, + outdir, sample_name, scanmode, plot, width, height, thresh) { + #' Divide the full m/z range into regions of interest with min, max and mean m/z + #' + #' @param ints_fullrange: Named list of intensities (float) + #' @param allpeaks_values: Empty list to store results for all peaks + #' @param int_factor: Value used to calculate area under Gaussian curve (integer) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param resol: Value for resolution (integer) + #' @param outdir: Path for output directory (string) + #' @param sample_name: Sample name (string) + #' @param scanmode: Scan mode, positive or negative (string) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' @param width: Value for width of plot (integer) + #' @param height: Value for height of plot (integer) + #' @param thresh: Value for noise level threshold (integer) + #' + #' @return allpeaks_values: list of m/z regions of interest + + # initialise list to store results for all peaks. Currently in do_peakfinding + # allpeaks_values <- list("mean" = NULL, "area" = NULL, "nr" = NULL, + # "min" = NULL, "max" = NULL, "qual" = NULL, "spikes" = 0) + + # find indices where intensity is not equal to zero + nonzero_indices <- as.vector(which(ints_fullrange != 0)) + + # bad infusion. These should have been taken out in AverageTechReplicates + if (length(nonzero_indices) == 0) return(allpeaks_values) + + # initialize + end_index <- NULL + start_index <- nonzero_indices[1] + # maximum length of region of interest + max_roi_length <- 15 + + # find regions of interest + for (mz_index in 1:length(nonzero_indices)) { + # check whether mz_index is smaller than length(nonzero_indices). + # only false if mz_index == length(nonzero_indixes). + # second check is true at the end of a peak. + if (mz_index < length(nonzero_indices) && (nonzero_indices[mz_index + 1] - nonzero_indices[mz_index]) > 1) { + end_index <- nonzero_indices[mz_index] + # get m/z values and intensities for this region of interest + mass_vector <- as.numeric(names(ints_fullrange)[c(start_index:end_index)]) + int_vector <- as.vector(ints_fullrange[c(start_index:end_index)]) + # check whether the vector of intensities is not empty. + if (length(int_vector) != 0) { + # check if intensity is above threshold or the maximum intensity is NaN + if (max(int_vector) < thresh || is.nan(max(int_vector))) { + # go to next region of interest + start_index <- nonzero_indices[mz_index + 1] + next + } + # check if there are more intensities than maximum for region of interest + if (length(int_vector) > max_roi_length) { + # trim lowest intensities to zero + int_vector[which(int_vector < min(int_vector) * 1.1)] <- 0 + # split the range into multiple sub ranges + sub_range <- int_vector + names(sub_range) <- mass_vector + allpeaks_values <- search_mzrange(sub_range, allpeaks_values, int_factor, + scale, resol, outdir, sample_name, scanmode, + plot, width, height, thresh) + # A proper peak needs to have at least 3 intensities above threshold + } else if (length(int_vector) > 3) { + # check if the sum of intensities is above zero. Why is this necessary? + if (sum(int_vector) == 0) next + # get initial fit values + roi_values <- fit_init(mass_vector, int_vector, int_factor, scale, resol, + outdir, sample_name, scanmode, plot, width, height, + mz_index, start_index, end_index) + + if (roi_values$qual[1] == 1) { + # get optimized fit values + roi_values <- generateGaussian(mass_vector, int_vector, resol, plot, + scanmode, int_factor, width, height) + # add region of interest to list of all peaks + allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) + allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) + allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) + allpeaks_values$min <- c(allpeaks_values$min, roi_values$min) + allpeaks_values$max <- c(allpeaks_values$max, roi_values$max) + allpeaks_values$qual <- c(allpeaks_values$qual, 0) + allpeaks_values$spikes <- allpeaks_values$spikes + 1 + + } else { + for (j in 1:length(roi_values$mean)){ + allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean[j]) + allpeaks_values$area <- c(allpeaks_values$area, roi_values$area[j]) + allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) + allpeaks_values$min <- c(allpeaks_values$min, roi_values$min[1]) + allpeaks_values$max <- c(allpeaks_values$max, roi_values$max[1]) + allpeaks_values$qual <- c(allpeaks_values$qual, roi_values$qual[1]) + } + } + + } else { + + roi_values <- generateGaussian(mass_vector, int_vector, resol, + plot, scanmode, int_factor, width, height) + + allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) + allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) + allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) + allpeaks_values$min <- c(allpeaks_values$min, roi_values$min) + allpeaks_values$max <- c(allpeaks_values$max, roi_values$max) + allpeaks_values$qual <- c(allpeaks_values$qual, 0) + allpeaks_values$spikes <- allpeaks_values$spikes + 1 + } + } + start_index <- nonzero_indices[mz_index + 1] + } + } + + # last little range + end_index <- nonzero_indices[length(nonzero_indices)] + mass_vector <- as.numeric(names(ints_fullrange)[c(start_index:end_index)]) + int_vector <- as.vector(ints_fullrange[c(start_index:end_index)]) + + if (length(int_vector) != 0) { + # check if intensity above threshold + if (max(int_vector) < thresh || is.nan(max(int_vector))) { + # do nothing + } else { + # check if there are more intensities than maximum for region of interest + if (length(int_vector) > max_roi_length) { + # trim lowest intensities to zero + int_vector[which(int_vector < min(int_vector) * 1.1)] <- 0 + # split the range into multiple sub ranges + sub_range <- int_vector + names(sub_range) <- mass_vector + + allpeaks_values <- search_mzrange(sub_range, allpeaks_values, int_factor, scale, resol, + outdir, sample_name, scanmode, + plot, width, height, thresh) + + } else if (length(int_vector) > 3) { + # Check only zeros + if (sum(int_vector) == 0) next + + roi_values <- fit_init(mass_vector, int_vector, int_factor, scale, + resol, outdir, sample_name, scanmode, + plot, width, height, + mz_index, start_index, end_index) + + if (roi_values$qual[1] == 1) { + roi_values <- generateGaussian(mass_vector, int_vector, resol, + plot, scanmode, int_factor, width, height) + + allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) + allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) + allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) + allpeaks_values$min <- c(allpeaks_values$min, roi_values$min) + allpeaks_values$max <- c(allpeaks_values$max, roi_values$max) + allpeaks_values$qual <- c(allpeaks_values$qual, 0) + allpeaks_values$spikes <- allpeaks_values$spikes + 1 + + } else { + for (j in 1:length(roi_values$mean)){ + allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean[j]) + allpeaks_values$area <- c(allpeaks_values$area, roi_values$area[j]) + allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) + allpeaks_values$min <- c(allpeaks_values$min, roi_values$min[1]) + allpeaks_values$max <- c(allpeaks_values$max, roi_values$max[1]) + allpeaks_values$qual <- c(allpeaks_values$qual, roi_values$qual[1]) + } + } + } else { + roi_values <- generateGaussian(mass_vector, int_vector, resol, + plot, scanmode, int_factor, width, height) + + allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) + allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) + allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) + allpeaks_values$min <- c(allpeaks_values$min, roi_values$min) + allpeaks_values$max <- c(allpeaks_values$max, roi_values$max) + allpeaks_values$qual <- c(allpeaks_values$qual, 0) + allpeaks_values$spikes <- allpeaks_values$spikes + 1 + } + } + } + return(allpeaks_values) +} + From 77a53c18cdf61cba24265adffca2de3bc1a01cae Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Wed, 27 Mar 2024 17:41:26 +0100 Subject: [PATCH 41/73] cleaned up AddOnFunctions --- DIMS/Utils/fit_init.R | 6 +++--- DIMS/Utils/fit_optim.R | 48 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 3 deletions(-) create mode 100644 DIMS/Utils/fit_optim.R diff --git a/DIMS/Utils/fit_init.R b/DIMS/Utils/fit_init.R index 5e0b823..f9e0c27 100644 --- a/DIMS/Utils/fit_init.R +++ b/DIMS/Utils/fit_init.R @@ -1,12 +1,12 @@ ## adapted from fitGaussianInit.R # variables with fixed values will be removed from function parameters -# int_factor, scale, outdir, plot, thresh, width, height +# scale, outdir, plot, width, height # mz_index, start_index, end_index, sample_name not used. # fit_gaussian should be defined before this function is called. fit_init <- function(mass_vector, int_vector, int_factor, scale, resol, outdir, sample_name, scanmode, plot, width, height, mz_index, start_index, end_index) { - #' Initial fit of Gaussian curve to small region of m/z + #' Determine initial fit of Gaussian curve to small region of m/z #' #' @param mass_vector: Vector of m/z values for a region of interest (float) #' @param int_vector: Vector of intensities for a region of interest (float) @@ -37,7 +37,7 @@ fit_init <- function(mass_vector, int_vector, int_factor, scale, resol, roi_values <- fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, outdir, force = length(max_index), useBounds = FALSE, plot, scanmode, int_factor, width, height) - + # put all values for this region of interest into a list roi_value_list <- list("mean" = roi_values$mean, "area" = roi_values$area, "qual" = roi_values$qual, diff --git a/DIMS/Utils/fit_optim.R b/DIMS/Utils/fit_optim.R new file mode 100644 index 0000000..ac3a9ee --- /dev/null +++ b/DIMS/Utils/fit_optim.R @@ -0,0 +1,48 @@ +## adapted from generateGaussian.R +# variables with fixed values will be removed from function parameters +# plot, width, height +# fit_gaussian should be defined before this function is called. +fit_op4tim <- function(mass_vector, int_vector, resol, + plot, scanmode, int_factor, width, height) { + #' Determine optimized fit of Gaussian curve to small region of m/z + #' + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Vector of intensities for a region of interest (float) + #' @param resol: Value for resolution (integer) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' @param scanmode: Scan mode, positive or negative (string) + #' @param int_factor: Value used to calculate area under Gaussian curve (integer) + #' @param width: Value for width of plot (integer) + #' @param height: Value for height of plot (integer) + #' + #' @return roi_value_list: list of fit values for region of interest (list) + + factor <- 1.5 + # Find the index in int_vector with the highest intensity + max_index <- which(int_vector == max(int_vector))[1] + mass_max <- mass_vector[max_index] + int_max <- int_vector[max_index] + # get peak width + fwhm <- get_fwhm(mass_max, resol) + # simplify the peak shape: represent it by a triangle + mass_max_simple <- c(mass_max - factor * fwhm, mass_max, mass_max + factor * fwhm) + int_max_simple <- c(0, int_max, 0) + + # define mass_diff as difference between last and first value of mass_max_simple + mass_diff <- mass_max_simple[length(mass_max_simple)] - mass_max_simple[1] + # generate a second mass_vector with equally spaced m/z values + mass_vector2 <- seq(mass_max_simple[1], mass_max_simple[length(mass_max_simple)], + length = mz_diff * int_factor) + sigma <- get_stdev(mass_vector2, int_max_simple) + scale <- optimizeGauss(mass_vector2, int_max_simple, sigma, mass_max) + + # get an estimate of the area under the peak + area <- getArea(mass_max, resol, scale, sigma, int_factor) + + # put all values for this region of interest into a list + roi_value_list <- list("mean" = mass_max, + "area" = area, + "min" = mass_vector2[1], + "max" = mass_vector2[length(mass_vector2)]) + return(roi_value_list) +} From 777db937bf637a19713ba2919a6ac146b20dc0a0 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 5 Apr 2024 09:11:31 +0200 Subject: [PATCH 42/73] cleaned up version of fitGaussian.R --- DIMS/Utils/fit_gaussian.R | 333 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 333 insertions(+) create mode 100644 DIMS/Utils/fit_gaussian.R diff --git a/DIMS/Utils/fit_gaussian.R b/DIMS/Utils/fit_gaussian.R new file mode 100644 index 0000000..9dc7e52 --- /dev/null +++ b/DIMS/Utils/fit_gaussian.R @@ -0,0 +1,333 @@ +## adapted from fitGaussian.R +# variables with fixed values will be removed from function parameters +# scale, outdir, plot, width, height +# max_index doesn't need to be passed to this function, can be determined here. +# remove plot sections (commented out) +# several functions need to be loaded before this function can run +fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + outdir, force, use_bounds, plot, scanmode, + int_factor, width, height) { + #' Fit 1, 2, 3 or 4 Gaussian peaks in small region of m/z + #' + #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Value used to calculate area under Gaussian curve (integer) + #' @param max_index: Index in int_vector with the highest intensity (integer) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param resol: Value for resolution (integer) + #' @param outdir: Path for output directory (string) + #' @param force: Number of local maxima in int_vector (integer) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' @param scanmode: Scan mode, positive or negative (string) + #' @param int_factor: Value used to calculate area under Gaussian curve (integer) + #' @param width: Value for width of plot (integer) + #' @param height: Value for height of plot (integer) + #' + #' @return roi_value_list: list of fit values for region of interest (list) + + # Initialise + peak_mean <- NULL + peak_area <- NULL + peak_qual <- NULL + peak_min <- NULL + peak_max <- NULL + fit_quality1 <- 0.15 + fit_quality <- 0.2 + + # One local maximum: + if (force == 1) { + # determine fit values for 1 Gaussian peak (mean, scale, sigma, qual) + fit_values <- fit1Peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol, plot, fit_quality1, use_bounds) + # set initial value for scale factor + scale <- 2 + # test if the mean is outside the m/z range + if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)]) { + # run this function again with fixed boundaries + return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + outdir, force = 1, use_bounds = TRUE, plot, scanmode, int_factor, width, height)) + + } else { + # test if the fit is bad + if (fit_values$qual > fit_quality1) { + # Try to fit two curves; find two local maxima + new_index <- which(diff(sign(diff(int_vector))) == -2) + 1 + # test if there are two indices in new_index + if (length(new_index) != 2) { + new_index <- round(length(mass_vector) / 3) + new_index <- c(new_index, 2 * new_index) + } + # run this function again with two local maxima + return(fit_gaussian(mass_vector2, mass_vector, int_vector, new_index, + scale, resol, outdir, force = 2, use_bounds = FALSE, + plot, scanmode, int_factor, width, height)) + # good fit + } else { + peak_mean <- c(peak_mean, fit_values$mean) + peak_area <- c(peak_area, getArea(fit_values$mean, resol, fit_values$scale, + fit_values$sigma, int_factor)) + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + } + } + + #### Two local maxima; need at least 6 data points for this #### + } else if (force == 2 && (length(mass_vector) > 6)) { + # determine fit values for 2 Gaussian peaks (mean, scale, sigma, qual) + fit_values <- fit2peaks(mass_vector2, mass_vector, int_vector, new_index, scale, resol, + use_bounds, plot, fit_quality, int_factor) + # test if one of the means is outside the m/z range + if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || + fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)]) { + # check if fit quality is bad + if (fit_values$qual > fit_quality) { + # run this function again with fixed boundaries + return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + outdir, force = 2, use_bounds = TRUE, + plot, scanmode, int_factor, width, height)) + } else { + # check which mean is outside range and remove it from the list of means + # NB: peak_mean and other variables have not been given values from 2-peak fit yet! + for (i in 1:length(fit_values$mean)){ + if (fit_values$mean[i] < mass_vector[1] || fit_values$mean[i] > mass_vector[length(mass_vector)]) { + peak_mean <- c(peak_mean, -i) + peak_area <- c(peak_area, -i) + } else { + peak_mean <- c(peak_mean, fit_values$mean[i]) + peak_area <- c(peak_area, fit_values$area[i]) + } + } + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + } + # if all means are within range + } else { + # check for bad fit + if (fit_values$qual > fit_quality) { + # Try to fit three curves; find three local maxima + new_index <- which(diff(sign(diff(int_vector))) == -2) + 1 + # test if there are three indices in new_index + if (length(new_index) != 3) { + new_index <- round(length(mass_vector) / 4) + new_index <- c(new_index, 2 * new_index, 3 * new_index) + } + # run this function again with three local maxima + return(fitGaussian(mass_vector2, mass_vector, int_vector, new_index, + scale, resol, outdir, force = 3, use_bounds = FALSE, + plot, scanmode, int_factor, width, height)) + # good fit, all means are within m/z range + } else { + # check if means are within 3 ppm and sum if so + tmp <- fit_values$qual + nr_means_new <- -1 + nr_means <- length(fit_values$mean) + while (nr_means != nr_means_new) { + nr_means <- length(fit_values$mean) + fit_values <- isWithinXppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, + mass_vector2, mass_vector, ppm = 4, resol, plot) + nr_means_new <- length(fit_values$mean) + } + # restore original quality score + fit_values$qual <- tmp + # section for plot + # plot_header <- NULL + # for (i in 1:length(fit_values$mean)) { + # plot_header <- c(plot_header, paste("mean =", fit_values$mean[i], sep = " ")) + # peak_mean <- c(peak_mean, fit_values$mean[i]) + # peak_area <- c(peak_area, fit_values$area[i]) + # } + # peak_qual <- fit_values$qual + # peak_min <- mass_vector[1] + # peak_max <- mass_vector[length(mass_vector)] + # plot_header <- c(plot_header, paste("fq =", fit_values$qual, sep = " ")) + # if (plot) legend("topright", legend=plot_header) + } + } + + #### Three local maxima; need at least 6 data points for this #### + } else if (force == 3 && (length(mass_vector) > 6)) { + # determine fit values for 3 Gaussian peaks (mean, scale, sigma, qual) + fit_values <- fit3peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + use_bounds, plot, fit_quality, int_factor) + # test if one of the means is outside the m/z range + if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || + fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] || + fit_values$mean[3] < mass_vector[1] || fit_values$mean[3] > mass_vector[length(mass_vector)]) { + + # check if fit quality is bad + if (fit_values$qual > fit_quality) { + # run this function again with fixed boundaries + return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + outdir, force, use_bounds = TRUE, + plot, scanmode, int_factor, width, height)) + } else { + # check which mean is outside range and remove it from the list of means + # NB: peak_mean and other variables have not been given values from 2-peak fit yet! + for (i in 1:length(fit_values$mean)) { + if (fit_values$mean[i] < mass_vector[1] || fit_values$mean[i] > mass_vector[length(mass_vector)]) { + peak_mean <- c(peak_mean, -i) + peak_area <- c(peak_area, -i) + } else { + peak_mean <- c(peak_mean, fit_values$mean[i]) + peak_area <- c(peak_area, fit_values$area[i]) + } + } + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + } + # if all means are within range + } else { + # check for bad fit + if (fit_values$qual > fit_quality) { + # Try to fit four curves; find four local maxima + new_index <- which(diff(sign(diff(int_vector))) == -2) + 1 + # test if there are four indices in new_index + if (length(new_index) != 4) { + new_index <- round(length(mass_vector) / 5) + new_index <- c(new_index, 2 * new_index, 3 * new_index, 4 * new_index) + } + # run this function again with four local maxima + return(fit_gaussian(mass_vector2, mass_vector, int_vector, new_index, scale, resol, + outdir, force = 4, use_bounds = FALSE, plot, scanmode, + int_factor, width, height)) + # good fit, all means are within m/z range + } else { + # check if means are within 4 ppm and sum if so + tmp <- fit_values$qual + nr_means_new <- -1 + nr_means <- length(fit_values$mean) + while (nr_means!=nr_means_new) { + nr_means <- length(fit_values$mean) + fit_values <- isWithinXppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, + mass_vector2, mass_vector, ppm = 4, resol, plot) + nr_means_new <- length(fit_values$mean) + } + # restore original quality score + fit_values$qual <- tmp + + # plot_header <- NULL + # for (i in 1:length(fit_values$mean)){ + # plot_header <- c(plot_header, paste("mean =", fit_values$mean[i], sep = " ")) + # + # peak_mean <- c(peak_mean, fit_values$mean[i]) + # peak_area <- c(peak_area, fit_values$area[i]) + # } + # peak_qual <- fit_values$qual + # peak_min <- mass_vector[1] + # peak_max <- mass_vector[length(mass_vector)] + # plot_header <- c(plot_header, paste("fq =", fit_values$qual, sep = " ")) + # if (plot) legend("topright", legend=plot_header) + } + } + + #### Four local maxima; need at least 6 data points for this #### + } else if (force == 4 && (length(mass_vector) > 6)) { + # determine fit values for 4 Gaussian peaks (mean, scale, sigma, qual) + fit_values <- fit4peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + use_bounds, plot, fit_quality, int_factor) + # test if one of the means is outside the m/z range + if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || + fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] || + fit_values$mean[3] < mass_vector[1] || fit_values$mean[3] > mass_vector[length(mass_vector)] || + fit_values$mean[4] < mass_vector[1] || fit_values$mean[4] > mass_vector[length(mass_vector)]) { + + # check if quality of fit is bad + if (fit_values$qual > fit_quality) { + # run this function again with fixed boundaries + return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + outdir, force, use_bounds = TRUE, + plot, scanmode, int_factor, width, height)) + + } else { + # check which mean is outside range and remove it from the list of means + # NB: peak_mean and other variables have not been given values from 2-peak fit yet! + for (i in 1:length(fit_values$mean)) { + if (fit_values$mean[i] < mass_vector[1] | fit_values$mean[i] > mass_vector[length(mass_vector)]) { + peak_mean <- c(peak_mean, -i) + peak_area <- c(peak_area, -i) + } else { + peak_mean <- c(peak_mean, fit_values$mean[i]) + peak_area <- c(peak_area, fit_values$area[i]) + } + } + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + } + # if all means are within range + } else { + # check for bad fit + if (fit_values$qual > fit_quality) { + # Try to fit 1 curve, force = 5 + return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + outdir, force = 5, use_bounds = FALSE, + plot, scanmode, int_factor, width, height)) + # good fit, all means are within m/z range + } else { + # check if means are within 4 ppm and sum if so + tmp <- fit_values$qual + nr_means_new <- -1 + nr_means <- length(fit_values$mean) + while (nr_means != nr_means_new) { + nr_means <- length(fit_values$mean) + fit_values <- isWithinXppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, + mass_vector2, mass_vector, ppm = 4, resol, plot) + nr_means_new <- length(fit_values$mean) + } + # restore original quality score + fit_values$qual <- tmp + + # plot_header<-NULL + # for (i in 1:length(fit_values$mean)){ + # plot_header <- c(plot_header, paste("mean =", fit_values$mean[i], sep = " ")) + # peak_mean <- c(peak_mean, fit_values$mean[i]) + # peak_area <- c(peak_area, fit_values$area[i]) + # } + # peak_qual <- fit_values$qual + # peak_min <- mass_vector[1] + # peak_max <- mass_vector[length(mass_vector)] + # plot_header <- c(plot_header, paste("fq =", fit_values$qual, sep = " ")) + # if (plot) legend("topright", legend=plot_header) + } + } + + # More than four local maxima; fit 1 peak. + } else { + scale <- 2 + fit_quality1 <- 0.40 + use_bounds <- TRUE + max_index <- which(int_vector == max(int_vector)) + fit_values <- fit1Peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + plot, fit_quality1, use_bounds) + # check for bad fit + if (fit_values$qual > fit_quality1) { + # remove + if (plot) dev.off() + # get fit values from fit_optim + fit_values <- fit_optim(mass_vector, int_vector, resol, plot, scanmode, int_factor, width, height) + peak_mean <- c(peak_mean, fit_values$mean) + peak_area <- c(peak_area, fit_values$area) + peak_min <- fit_values$min + peak_max <- fit_values$max + peak_qual <- 0 + } else { + peak_mean <- c(peak_mean, fit_values$mean) + peak_area <- c(peak_area, getArea(fit_values$mean, resol, fit_values$scale, fit_values$sigma, int_factor)) + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + } + } + + # put all values for this region of interest into a list + roi_value_list <- list("mean" = peak_mean, + "area" = peak_area, + "qual" = peak_qual, + "min" = peak_min, + "max" = peak_max) + return(roi_value_list) +} + From 5ff20a6bd62e498656fc7eb76cb5a0ca7c688675 Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Mon, 8 Apr 2024 12:54:38 +0200 Subject: [PATCH 43/73] Add changes v2.7 --- .../add_lab_id_and_onderzoeksnummer.R | 10 ++++ DIMS/AddOnFunctions/create_violin_plots.R | 46 ++++++++++----- .../get_patient_data_to_helix.R | 31 ++++++++++ DIMS/AddOnFunctions/is_diagnostic_patient.R | 6 ++ DIMS/AddOnFunctions/prepare_alarmvalues.R | 56 +++++++++++++------ DIMS/GenerateViolinPlots.R | 36 +++++++++--- 6 files changed, 146 insertions(+), 39 deletions(-) create mode 100644 DIMS/AddOnFunctions/add_lab_id_and_onderzoeksnummer.R create mode 100644 DIMS/AddOnFunctions/get_patient_data_to_helix.R create mode 100644 DIMS/AddOnFunctions/is_diagnostic_patient.R diff --git a/DIMS/AddOnFunctions/add_lab_id_and_onderzoeksnummer.R b/DIMS/AddOnFunctions/add_lab_id_and_onderzoeksnummer.R new file mode 100644 index 0000000..9d91a55 --- /dev/null +++ b/DIMS/AddOnFunctions/add_lab_id_and_onderzoeksnummer.R @@ -0,0 +1,10 @@ +add_lab_id_and_onderzoeksnummer <- function(df_metabs_helix) { + # Split patient number into labnummer and Onderzoeksnummer + for (row in 1:nrow(df_metabs_helix)) { + df_metabs_helix[row,"labnummer"] <- gsub("^P|\\.[0-9]*", "", df_metabs_helix[row,"Patient"]) + labnummer_split <- strsplit(as.character(df_metabs_helix[row, "labnummer"]), "M")[[1]] + df_metabs_helix[row, "Onderzoeksnummer"] <- paste0("MB", labnummer_split[1], "/", labnummer_split[2]) + } + + return(df_metabs_helix) +} diff --git a/DIMS/AddOnFunctions/create_violin_plots.R b/DIMS/AddOnFunctions/create_violin_plots.R index 35e6afb..df398be 100644 --- a/DIMS/AddOnFunctions/create_violin_plots.R +++ b/DIMS/AddOnFunctions/create_violin_plots.R @@ -1,5 +1,5 @@ create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt=NULL) { - + # set parameters for plots plot_height <- 9.6 plot_width <- 6 @@ -10,18 +10,26 @@ create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt=NU # green blue blue/purple purple orange red # patient plots, create the PDF device - if (grepl("Diagnostics", pdf_dir)) { - prefix <- "Dx_" - } else if (grepl("IEM", pdf_dir)) { - prefix <- "IEM_" - } else { - prefix <- "R_" + pt_name_sub <- pt_name + suffix <- "" + if (grepl("Diagnostics", pdf_dir) & is_diagnostic_patient(pt_name)) { + prefix <- "MB" + suffix <- "_DIMS_PL_DIAG" + # substitute P and M in P2020M00001 into right format for Helix + pt_name_sub <- gsub("[PM]", "", pt_name) + pt_name_sub <- gsub("\\..*", "", pt_name_sub) + } else if (grepl("Diagnostics", pdf_dir)) { + prefix <- "Dx_" + } else if (grepl("IEM", pdf_dir)) { + prefix <- "IEM_" + } else { + prefix <- "R_" } - - pdf(paste0(pdf_dir, "/", prefix, pt_name, ".pdf"), + + pdf(paste0(pdf_dir, "/", prefix, pt_name_sub, suffix, ".pdf"), onefile = TRUE, - width = plot_width, - height = plot_height) + width = plot_width, + height = plot_height) # page headers: page_headers <- names(metab_perpage) @@ -42,17 +50,25 @@ create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt=NU for (page_index in 1:length(metab_perpage)) { # extract list of metabolites to plot on a page metab_list_2plot <- metab_perpage[[page_index]] + # extract original data for patient of interest (pt_name) before cut-offs + pt_list_2plot_orig <- metab_list_2plot[which(metab_list_2plot$variable == pt_name), ] # cut off Z-scores higher than 20 or lower than -5 (for nicer plots) metab_list_2plot$value[metab_list_2plot$value > 20] <- 20 metab_list_2plot$value[metab_list_2plot$value < -5] <- -5 # extract data for patient of interest (pt_name) pt_list_2plot <- metab_list_2plot[which(metab_list_2plot$variable == pt_name), ] + # restore original Z-score before cut-off, for showing Z-scores in PDF + pt_list_2plot$value_orig <- pt_list_2plot_orig$value # remove patient of interest (pt_name) from list; violins will be made up of controls and other patients metab_list_2plot <- metab_list_2plot[-which(metab_list_2plot$variable == pt_name), ] # subtitle per page sub_perpage <- gsub("_", " ", page_headers[page_index]) # for IEM plots, put subtitle on two lines sub_perpage <- gsub("probability", "\nprobability", sub_perpage) + # add size parameter for showing Z-score of patient per metabolite + Z_size <- rep(3, nrow(pt_list_2plot)) + # set size to 0 if row is empty + Z_size[is.na(pt_list_2plot$value)] <- 0 # draw violin plot. shape=22 gives square for patient of interest ggplot_object <- ggplot(metab_list_2plot, aes(x=value, y=HMDB_name)) + @@ -61,6 +77,8 @@ create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt=NU geom_violin(scale="width") + geom_point(data = pt_list_2plot, aes(color=value), size = 3.5*circlesize, shape=22, fill="white") + scale_fill_gradientn(colors = colors_4plot, values = NULL, space = "Lab", na.value = "grey50", guide = "colourbar", aesthetics = "colour") + + # add Z-score value for patient of interest at x=16 + geom_text(data = pt_list_2plot, aes(16, label = paste0("Z=", round(value_orig, 2))), hjust = "left", vjust = +0.2, size = Z_size) + # add labels. Use font Courier to get all the plots in the same location. labs(x = "Z-scores", y = "Metabolites", subtitle = sub_perpage, color = "z-score") + theme(axis.text.y = element_text(family = "Courier", size=6)) + @@ -72,11 +90,11 @@ create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt=NU # add vertical lines geom_vline(xintercept = 2, col = "grey", lwd = 0.5, lty=2) + geom_vline(xintercept = -2, col = "grey", lwd = 0.5, lty=2) - + suppressWarnings(print(ggplot_object)) } - + # add explanation of violin plots, version number etc. # plot.new() plot(NA, xlim=c(0,5), ylim=c(0,5), bty='n', xaxt='n', yaxt='n', xlab='', ylab='') @@ -91,7 +109,7 @@ create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt=NU #explanation_grob=textGrob(apply(full_explanation, 2, paste, collapse="\n")) #grid.arrange(explanation_grob) } - + # close the PDF file dev.off() diff --git a/DIMS/AddOnFunctions/get_patient_data_to_helix.R b/DIMS/AddOnFunctions/get_patient_data_to_helix.R new file mode 100644 index 0000000..c582177 --- /dev/null +++ b/DIMS/AddOnFunctions/get_patient_data_to_helix.R @@ -0,0 +1,31 @@ +get_patient_data_to_helix <- function(metab_interest_sorted, metab_list_all){ + # Combine Z-scores of metab groups together + df_all_metabs_zscores <- bind_rows(metab_interest_sorted) + # Change columnnames + colnames(df_all_metabs_zscores) <- c("HMDB_name", "Patient", "Z_score") + # Change Patient column to character instead of factor + df_all_metabs_zscores$Patient <- as.character(df_all_metabs_zscores$Patient) + + # Delete whitespaces HMDB_name + df_all_metabs_zscores$HMDB_name <- str_trim(df_all_metabs_zscores$HMDB_name, "right") + + # Split HMDB_name column on "nitine;" for match dims_helix_table + df_all_metabs_zscores$HMDB_name_split <- str_split_fixed(df_all_metabs_zscores$HMDB_name, "nitine;", 2)[,1] + + # Combine stofgroepen + dims_helix_table <- bind_rows(metab_list_all) + # Filter table for metabolites for Helix + dims_helix_table <- dims_helix_table %>% filter(Helix == "ja") + # Split HMDB_name column on "nitine;" for match df_all_metabs_zscores + dims_helix_table$HMDB_name_split <- str_split_fixed(dims_helix_table$HMDB_name, "nitine;", 2)[,1] + dims_helix_table <- dims_helix_table %>% select(HMDB_name_split, Helix_naam, high_zscore, low_zscore) + + # Filter DIMS results for metabolites for Helix + df_metabs_helix <- df_all_metabs_zscores %>% filter(HMDB_name_split %in% dims_helix_table$HMDB_name_split) + # Combine dims_helix_table and df_metabs_helix, adding Helix codes etc. + df_metabs_helix <- df_metabs_helix %>% left_join(dims_helix_table, by = join_by(HMDB_name_split)) + + df_metabs_helix <- df_metabs_helix %>% select(HMDB_name, Patient, Z_score, Helix_naam, high_zscore, low_zscore) + + return(df_metabs_helix) +} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/is_diagnostic_patient.R b/DIMS/AddOnFunctions/is_diagnostic_patient.R new file mode 100644 index 0000000..839f41d --- /dev/null +++ b/DIMS/AddOnFunctions/is_diagnostic_patient.R @@ -0,0 +1,6 @@ +is_diagnostic_patient <- function(patient_column){ + # Check for Diagnostics patients with correct patientnumber (e.g. starting with "P2024M") + diagnostic_patients <- grepl("^P[0-9]{4}M",patient_column) + + return(diagnostic_patients) +} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/prepare_alarmvalues.R b/DIMS/AddOnFunctions/prepare_alarmvalues.R index ff94fce..c09c536 100644 --- a/DIMS/AddOnFunctions/prepare_alarmvalues.R +++ b/DIMS/AddOnFunctions/prepare_alarmvalues.R @@ -1,23 +1,43 @@ -prepare_alarmvalues <- function(pt_name, metab_interest_sorted) { - # set parameters for table - high_zscore_cutoff <- 5 - low_zscore_cutoff <- -3 +prepare_alarmvalues <- function(pt_name, dims_helix_table) { + # extract data for patient of interest (pt_name) + pt_metabs_helix <- dims_helix_table %>% filter(Patient == pt_name) + pt_metabs_helix$Z_score <- round(pt_metabs_helix$Z_score, 2) + + # Make empty dataframes for metabolites above or below alarmvalues + pt_list_high <- data.frame(HMDB_name = character(), Z_score = numeric()) + pt_list_low <- data.frame(HMDB_name = character(), Z_score = numeric()) - # make table of all metabolites - all_metab <- c() - for (page_nr in 1:length(metab_interest_sorted)) { - all_metab <- rbind(all_metab, metab_interest_sorted[[page_nr]]) + # Loop over individual metabolites + for (metab in unique(pt_metabs_helix$HMDB_name)){ + # Get data for individual metabolite + pt_metab <- pt_metabs_helix %>% filter(HMDB_name == metab) + # print(pt_metab) + + # Check if zscore is positive of negative + if(pt_metab$Z_score > 0) { + # Get specific alarmvalue for metabolite + high_zscore_cutoff_metab <- pt_metabs_helix %>% filter(HMDB_name == metab) %>% pull(high_zscore) + + # If zscore is above the alarmvalue, add to pt_list_high table + if(pt_metab$Z_score > high_zscore_cutoff_metab) { + pt_metab_high <- pt_metab %>% select(HMDB_name, Z_score) + pt_list_high <- rbind(pt_list_high, pt_metab_high) + } + } else { + # Get specific alarmvalue for metabolite + low_zscore_cutoff_metab <- pt_metabs_helix %>% filter(HMDB_name == metab) %>% pull(low_zscore) + + # If zscore is below the alarmvalue, add to pt_list_low table + if(pt_metab$Z_score < low_zscore_cutoff_metab) { + pt_metab_low <- pt_metab %>% select(HMDB_name, Z_score) + pt_list_low <- rbind(pt_list_low, pt_metab_low) + } + } } - # extract data for patient of interest (pt_name) - pt_list <- all_metab[which(all_metab$variable==pt_name), ] - # remove column with patient name - pt_list <- pt_list[ , -2] - # round off Z-scores - pt_list$value <- round(as.numeric(pt_list$value), 2) - # determine alarms for this patient: Z-score above 5 or below -3 - pt_list_high <- pt_list[pt_list$value > high_zscore_cutoff, ] - pt_list_low <- pt_list[pt_list$value < low_zscore_cutoff, ] + # sort tables on zscore + pt_list_high <- pt_list_high %>% arrange(desc(Z_score)) + pt_list_low <- pt_list_low %>% arrange(Z_score) # add lines for increased, decreased extra_line1 <- c("Increased", "") extra_line2 <- c("Decreased", "") @@ -27,6 +47,6 @@ prepare_alarmvalues <- function(pt_name, metab_interest_sorted) { rownames(top_metab_pt) <- NULL # change column names for display colnames(top_metab_pt) <- c("Metabolite", "Z-score") - + return(top_metab_pt) } diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R index 097d030..2a79d40 100644 --- a/DIMS/GenerateViolinPlots.R +++ b/DIMS/GenerateViolinPlots.R @@ -11,11 +11,11 @@ ## adapted from 15-dIEM_violin.R # load packages -library(dplyr) +suppressPackageStartupMessages(library("dplyr")) library(reshape2) library(openxlsx) library(ggplot2) -library(gridExtra) +suppressPackageStartupMessages(library("gridExtra")) # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) @@ -64,6 +64,10 @@ source(paste0(scripts_dir, "prepare_data_perpage.R")) source(paste0(scripts_dir, "prepare_toplist.R")) source(paste0(scripts_dir, "create_violin_plots.R")) source(paste0(scripts_dir, "prepare_alarmvalues.R")) +source(paste0(scripts_dir, "output_helix.R")) +source(paste0(scripts_dir, "get_patient_data_to_helix.R")) +source(paste0(scripts_dir, "add_lab_id_and_onderzoeksnummer.R")) +source(paste0(scripts_dir, "is_diagnostic_patient.R")) # number of diseases that score highest in algorithm to plot top_nr_iem <- 5 @@ -75,15 +79,18 @@ ratios_cutoff <- -5 nr_plots_perpage <- 20 # binary variable: run function, yes(1) or no(0) -if (z_score == 1) { - algorithm <- ratios <- violin <- 1 -} +if (z_score == 1) { + algorithm <- ratios <- violin <- 1 +} else { + algorithm <- ratios <- violin <- 0 +} # are the sample names headers on row 1 or row 2 in the DIMS excel? (default 1) header_row <- 1 # column name where the data starts (default B) col_start <- "B" zscore_cutoff <- 5 xaxis_cutoff <- 20 +protocol_name <- "DIMS_PL_DIAG" #### STEP 1: Preparation #### # in: run_name, path_dims_file, header_row ||| out: output_dir, DIMS @@ -342,7 +349,7 @@ if (algorithm == 1) { #### STEP 5: Make violin plots ##### # in: algorithm / zscore_patients, violin, nr_contr, nr_pat, Data, path_textfiles, zscore_cutoff, xaxis_cutoff, -# top_diseases, top_metab, output_dir ||| out: pdf file +# top_diseases, top_metab, output_dir ||| out: pdf file, Helix csv file if (violin == 1) { @@ -403,13 +410,28 @@ if (violin == 1) { metab_interest_controls <- prepare_data(metab_list_all, zscore_controls) metab_perpage <- prepare_data_perpage(metab_interest_sorted, metab_interest_controls, nr_plots_perpage, nr_pat, nr_contr) + # for Diagnostics metabolites to be saved in Helix + if(grepl("Diagnost", pdf_dir)) { + # get table that combines DIMS results with stofgroepen/Helix table + dims_helix_table <- get_patient_data_to_helix(metab_interest_sorted, metab_list_all) + + # check if run contains Diagnostics patients (e.g. "P2024M"), not for research runs + if(any(is_diagnostic_patient(dims_helix_table$Patient))){ + # get output file for Helix + output_helix <- output_for_helix(protocol_name, dims_helix_table) + # write output to file + path_helixfile <- paste0(outdir, "/output_Helix_", run_name,".csv") + write.csv(output_helix, path_helixfile, quote = F, row.names = F) + } + } + # make violin plots per patient for (pt_nr in 1:length(patient_list)) { pt_name <- patient_list[pt_nr] # for category Diagnostics, make list of metabolites that exceed alarm values for this patient # for category Other, make list of top highest and lowest Z-scores for this patient if (grepl("Diagnost", pdf_dir)) { - top_metab_pt <- prepare_alarmvalues(pt_name, metab_interest_sorted) + top_metab_pt <- prepare_alarmvalues(pt_name, dims_helix_table) } else { top_metab_pt <- prepare_toplist(pt_name, zscore_patients) } From c7a2cfd2f21ea8768bf8fcf1bd4046a9dde16f29 Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Mon, 8 Apr 2024 13:58:28 +0200 Subject: [PATCH 44/73] Added helix output file --- DIMS/GenerateViolinPlots.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/DIMS/GenerateViolinPlots.nf b/DIMS/GenerateViolinPlots.nf index 27bc20b..a1f5259 100644 --- a/DIMS/GenerateViolinPlots.nf +++ b/DIMS/GenerateViolinPlots.nf @@ -13,6 +13,7 @@ process GenerateViolinPlots { path('Other/*.pdf'), emit: other_plot_files path('dIEM/*.pdf'), emit: diem_plot_files path('*.xlsx'), emit: excel_file + path('*.csv'), emit: helix_file script: """ From c2c3eb494e5ccc849bb6d18634a2737f314329c1 Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Thu, 11 Apr 2024 15:00:43 +0200 Subject: [PATCH 45/73] Added output_helix.R --- DIMS/AddOnFunctions/output_helix.R | 31 ++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 DIMS/AddOnFunctions/output_helix.R diff --git a/DIMS/AddOnFunctions/output_helix.R b/DIMS/AddOnFunctions/output_helix.R new file mode 100644 index 0000000..c098675 --- /dev/null +++ b/DIMS/AddOnFunctions/output_helix.R @@ -0,0 +1,31 @@ +output_for_helix <- function(protocol_name, df_metabs_helix){ + + # Remove positive controls + df_metabs_helix <- df_metabs_helix %>% filter(is_diagnostic_patient(Patient)) + + # Add 'Vial' column, each patient has unique ID + df_metabs_helix <- df_metabs_helix %>% + group_by(Patient) %>% + mutate(Vial = cur_group_id()) %>% + ungroup() + + # Split patient number into labnummer and Onderzoeksnummer + df_metabs_helix <- add_lab_id_and_onderzoeksnummer(df_metabs_helix) + + # Add column with protocol name + df_metabs_helix$Protocol <- protocol_name + + # Change name Z_score and Helix_naam columns to Amount and Name + change_columns <- c(Amount = "Z_score", Name = "Helix_naam") + df_metabs_helix <- df_metabs_helix %>% rename(all_of(change_columns)) + + # Select only necessary columns and set them in correct order + df_metabs_helix <- df_metabs_helix %>% + select(c(Vial, labnummer, Onderzoeksnummer, Protocol, Name, Amount)) + + # Remove duplicate patient-metabolite combinations ("leucine + isoleucine + allo-isoleucin_Z-score" is added 3 times) + df_metabs_helix <- df_metabs_helix %>% + group_by(Onderzoeksnummer, Name) %>% distinct() %>% ungroup() + + return(df_metabs_helix) +} \ No newline at end of file From 3fe61b0d2a10103cc627e45a7d07efd5e62bb721 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Fri, 12 Apr 2024 09:12:13 +0200 Subject: [PATCH 46/73] flake8 changes --- CheckQC/check_qc.py | 46 +++++++++++++++---------------- CheckQC/test_check_qc.py | 15 ++++++++-- Utils/create_hsmetrics_summary.py | 2 +- Utils/get_stats_from_flagstat.py | 24 ++++++++++++---- 4 files changed, 54 insertions(+), 33 deletions(-) diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py index e6f2cc1..dd236d4 100644 --- a/CheckQC/check_qc.py +++ b/CheckQC/check_qc.py @@ -25,7 +25,7 @@ def non_empty_existing_path(file_or_dir): Returns: string: Provided input file or directory. If dir, suffix '/' might be added. - """ + """ input_path = Path(file_or_dir) if not input_path.is_file() and not input_path.is_dir(): raise FileNotFoundError(errno_ENOENT, os_strerror(errno_ENOENT), file_or_dir) @@ -44,7 +44,7 @@ def parse_arguments_and_check(args_in): Returns: Namespace: Convert argument strings to objects and assign them as attributes of the namespace. - """ + """ parser = argparse.ArgumentParser( description="Check and summarize sample quality using qc metrics and their thresholds." ) @@ -82,7 +82,7 @@ def read_yaml(yaml_file): Returns: Object: Content of the YAML file. - """ + """ yaml_loaded = yaml.safe_load(open(yaml_file)) if not yaml_loaded: raise ValueError("Could not load YAML.") @@ -97,7 +97,7 @@ def check_allowed_operators(qc_operator): Raises: ValueError: If provided qc_operator is invalid / unsupported. - """ + """ operators = ["<", "<=", ">", ">=", "==", "!=", "match"] if qc_operator not in operators: raise ValueError(f"Unsupported operator provided: {qc_operator}. Please select from: {operators}") @@ -111,7 +111,7 @@ def check_required_keys_metrics(qc_metrics): Raises: KeyError: Required key is not provided for the qc metric settings - """ + """ for req_key in ["filename", "qc_col", "threshold", "operator", "report_cols"]: if any([req_key not in setting.keys() for setting in qc_metrics]): raise KeyError(f"Required key {req_key} not in all metrics settings.") @@ -129,7 +129,7 @@ def select_metrics(filename, input_files): """ # If filename is string, change into regex to match absolute and relative paths in input_files. if filename.isalpha(): - filename=".*" + filename + filename = ".*" + filename metrics = list(filter(re.compile(f"{filename}").match, input_files)) if not metrics: warnings.warn(UserWarning(f"No input file provided with filename pattern {filename}")) @@ -151,7 +151,7 @@ def get_columns_to_report(qc_report_cols, qc_metric_cols, qc_col): Returns: list of strings: Valid column names to include in report. - """ + """ not_existing_cols = list(set(qc_report_cols) - set(qc_metric_cols)) if qc_report_cols == "@all": qc_report_cols = qc_metric_cols @@ -190,7 +190,7 @@ def add_and_rename_columns(qc_metric, qc_title, qc_col, qc_operator, qc_threshol def get_failed_rows(qc_metric, qc_col, qc_operator, qc_threshold): - """Get rows that fail provided qc threshold + """Get rows that fail provided qc threshold Args: qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values @@ -203,7 +203,7 @@ def get_failed_rows(qc_metric, qc_col, qc_operator, qc_threshold): Returns: object: The indexes of failed rows in qc_metric - """ + """ # Select failed rows using qc_threshold regex pattern and qc_operator 'match' if qc_operator == "match" and isinstance(qc_threshold, str): return qc_metric[qc_col].str.match(qc_threshold) @@ -231,7 +231,7 @@ def add_failed_samples_metric(qc_metric, failed_rows, report_cols, sample_cols): Returns: qc_metric (DataFrame): DataFrame of qc metric without failed rows qc_metric_out (DataFrame): DataFrame of qc metric to report with failed rows - """ + """ qc_metric_out = DataFrame(columns=["sample", "qc_check", "qc_status", "qc_msg", "qc_value"]) failed_samples = [] if failed_rows.to_list(): @@ -272,7 +272,7 @@ def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols): Returns: pandas DataFrame: Sorted qc metric for both passed and failed samples, without duplicates. - """ + """ # Add passed samples to output for sample_col in sample_cols: qc_metric_out = concat([ @@ -284,7 +284,7 @@ def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols): ) ]) # Try to convert column qc_value to float. - # If ValueError is raised, probably because column is a string, continue. + # If ValueError is raised, probably because column is a string, continue. try: qc_metric_out["qc_value"] = qc_metric_out["qc_value"].astype("float") except ValueError: @@ -295,13 +295,13 @@ def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols): def create_and_write_output(qc_output, output_path, output_prefix): - """Joined qc metrics is created and written to output file. + """Joined qc metrics is created and written to output file. Args: qc_output (pandas DataFrame): Sorted judged qc metric for both passed and failed samples, without duplicates. - output_path (string): Relative or absolute path where output should be saved. + output_path (string): Relative or absolute path where output should be saved. output_prefix (string): Output prefix for output file. - """ + """ # Add qc_summary qc_output.insert(1, "qc_summary", "PASS") qc_output.loc[qc_output.isin(["FAIL"]).any(axis=1), "qc_summary"] = "FAIL" @@ -310,7 +310,7 @@ def create_and_write_output(qc_output, output_path, output_prefix): def read_and_judge_metrics(qc, metrics): - """Read and judge each single qc metric and join results. + """Read and judge each single qc metric and join results. Args: qc (dict): qc settings of the metric @@ -318,7 +318,7 @@ def read_and_judge_metrics(qc, metrics): Returns: pandas DataFrame: Joined and judged qc metrics. - """ + """ for qc_file in metrics: qc_metric_raw = read_csv(qc_file, comment=qc.get("comment", None), delimiter=qc.get("delim", "\t"), quotechar='"') report_cols = get_columns_to_report(qc["report_cols"], qc_metric_raw.columns.to_list(), qc["qc_col"]) @@ -355,26 +355,26 @@ def read_and_judge_metrics(qc, metrics): def check_qc(input_files, settings, output_path, output_prefix): """ - Main function to judge input files on configured qc settings. - It creates a single results table, each row representing + Main function to judge input files on configured qc settings. + It creates a single results table, each row representing sample (string): sample name qc_summary: Summarized status of all qcs for sample (pass or fail) qc columns (5 per each qc metric); qc_check (string): QC check consiting of qc title, operator and threshold qc_status (string): Status of performed qc check (pass or fail) - qc_msg (string): String with human readable message if sample failed qc check, empty if passed. + qc_msg (string): String with human readable message if sample failed qc check, empty if passed. qc_value (string, int, float): qc value/score to check. Args: input_files (list): All qc metrics input files. settings (string): Path to yaml file - output_path (string): Relative or absolute path where output should be saved. + output_path (string): Relative or absolute path where output should be saved. output_prefix (string): Output prefix for output file. Raises: ValueError: No input files found to match any qc metric patterns defined in settings. - ValueError: Duplicated samples with different values found in some of the input files. - """ + ValueError: Duplicated samples with different values found in some of the input files. + """ # A single qc metric file can be used multiple times, by defining a metric section for each check in the qc settings. qc_settings = read_yaml(settings) check_required_keys_metrics(qc_settings["metrics"]) diff --git a/CheckQC/test_check_qc.py b/CheckQC/test_check_qc.py index a757fac..629d439 100644 --- a/CheckQC/test_check_qc.py +++ b/CheckQC/test_check_qc.py @@ -113,13 +113,19 @@ class TestSelectMetrics(): # match on word truth and SNP ( ".*truth.*SNP", - ['2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv', '12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'], + [ + '2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv', + '12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv' + ], ['2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv'] ), # match on word truth and SNP ( ".*truth.*SNP", - ['U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv', 'U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'], + [ + 'U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv', + 'U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv' + ], ['U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv'] ), # match if 'truth' is absent and contains 'SNP' @@ -127,7 +133,10 @@ class TestSelectMetrics(): # ?! Match if 'truth' is absent. ( "(?:(?!truth).)*SNP.*$", - ['2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv', '12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'], + [ + '2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv', + '12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv' + ], ['12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'] ), diff --git a/Utils/create_hsmetrics_summary.py b/Utils/create_hsmetrics_summary.py index 405e1ce..4256fc3 100644 --- a/Utils/create_hsmetrics_summary.py +++ b/Utils/create_hsmetrics_summary.py @@ -8,7 +8,7 @@ parser.add_argument('hsmetrics_files', type=argparse.FileType('r'), nargs='*', help='HSMetric file') arguments = parser.parse_args() - interval_files_pattern = re.compile("BAIT_INTERVALS=\[(\S*)\].TARGET_INTERVALS=\[(\S*)\]") + interval_files_pattern = re.compile("BAIT_INTERVALS=\[(\S*)\].TARGET_INTERVALS=\[(\S*)\]") # noqa: W605 summary_header = [] summary_data = {} for hsmetrics_file in arguments.hsmetrics_files: diff --git a/Utils/get_stats_from_flagstat.py b/Utils/get_stats_from_flagstat.py index 2f46abb..52560bc 100644 --- a/Utils/get_stats_from_flagstat.py +++ b/Utils/get_stats_from_flagstat.py @@ -43,12 +43,24 @@ print("\n\t{0} %duplication\n".format(100*sample_dups/sample_mapped)) - print("Total raw reads: {total:,} reads (Total throughput, 75bp={total_75bp:,} bp, 100bp={total_100bp:,} bp, 150bp={total_150bp:,} bp)".format( - total=counts['total'], total_75bp=counts['total']*75, total_100bp=counts['total']*100, total_150bp=counts['total']*150 - )) - print("Total mapped reads: {total:,} reads (Total throughput, 75bp={total_75bp:,} bp, 100bp={total_100bp:,} bp, 150bp={total_150bp:,} bp)".format( - total=counts['mapped'], total_75bp=counts['mapped']*75, total_100bp=counts['mapped']*100, total_150bp=counts['mapped']*150 - )) + print( + "Total raw reads: {total:,} reads (Total throughput, 75bp={total_75bp:,} bp, 100bp={total_100bp:,} bp, " + "150bp={total_150bp:,} bp)".format( + total=counts['total'], + total_75bp=counts['total']*75, + total_100bp=counts['total']*100, + total_150bp=counts['total']*150 + ) + ) + print( + "Total mapped reads: {total:,} reads (Total throughput, 75bp={total_75bp:,} bp, 100bp={total_100bp:,} bp, " + "150bp={total_150bp:,} bp)".format( + total=counts['mapped'], + total_75bp=counts['mapped']*75, + total_100bp=counts['mapped']*100, + total_150bp=counts['mapped']*150 + ) + ) print("Average mapped per lib: {:,} reads".format(int(round(float(counts['mapped'])/float(counts['files']))))) print("Average dups per lib: {:,} reads".format(int(round(float(counts['dups'])/float(counts['files']))))) print("Average dups % per lib: {:.2f} %".format(100*float(counts['dups'])/float(counts['mapped']))) From 24399fccf714f75d668b1be28526e7a53a37a367 Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Fri, 12 Apr 2024 14:22:06 +0200 Subject: [PATCH 47/73] Added package for Helix output --- DIMS/GenerateViolinPlots.R | 1 + 1 file changed, 1 insertion(+) diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R index 2a79d40..38a8c61 100644 --- a/DIMS/GenerateViolinPlots.R +++ b/DIMS/GenerateViolinPlots.R @@ -16,6 +16,7 @@ library(reshape2) library(openxlsx) library(ggplot2) suppressPackageStartupMessages(library("gridExtra")) +library(stringr) # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) From c22e1f0baa76261bdebaae3414c60e9d31357bee Mon Sep 17 00:00:00 2001 From: ellendejong Date: Mon, 15 Apr 2024 08:12:00 +0200 Subject: [PATCH 48/73] Docstrings summary to newline and comments with uppercase --- CheckQC/check_qc.py | 39 ++++++++---- CheckQC/test_check_qc.py | 128 ++++++++++++++++++++++----------------- 2 files changed, 100 insertions(+), 67 deletions(-) diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py index dd236d4..3e65352 100644 --- a/CheckQC/check_qc.py +++ b/CheckQC/check_qc.py @@ -14,7 +14,8 @@ def non_empty_existing_path(file_or_dir): - """This function checks whether the provided file or dir exists and is not empty. + """ + This function checks whether the provided file or dir exists and is not empty. Args: file_or_dir (string): Input file or directory @@ -37,7 +38,8 @@ def non_empty_existing_path(file_or_dir): def parse_arguments_and_check(args_in): - """Parses arguments and validates / checks format of input. + """ + Parses arguments and validates / checks format of input. Args: args_in (list of strings): Commandline input arguments. @@ -72,7 +74,8 @@ def parse_arguments_and_check(args_in): def read_yaml(yaml_file): - """Read input yaml + """ + Read input yaml Args: yaml_file (string): String with path to yaml file @@ -90,7 +93,8 @@ def read_yaml(yaml_file): def check_allowed_operators(qc_operator): - """Check if provided qc_operator is allowed. + """ + Check if provided qc_operator is allowed. Args: qc_operator (string): (Custom / math) operator @@ -104,7 +108,8 @@ def check_allowed_operators(qc_operator): def check_required_keys_metrics(qc_metrics): - """Check if all required settings are included in the qc_settings + """ + Check if all required settings are included in the qc_settings Args: qc_metrics (list with dicts): qc settings per qc metric @@ -118,7 +123,8 @@ def check_required_keys_metrics(qc_metrics): def select_metrics(filename, input_files): - """Using regular expression to match the qc metric filename with the input files + """ + Using regular expression to match the qc metric filename with the input files Args: filename (string): Filename of qc metric, could be regex. @@ -138,7 +144,8 @@ def select_metrics(filename, input_files): def get_columns_to_report(qc_report_cols, qc_metric_cols, qc_col): - """Get valid columns to include in final output report + """ + Get valid columns to include in final output report Args: qc_report_cols (list, string): column name(s) to include in report. @@ -167,7 +174,8 @@ def get_columns_to_report(qc_report_cols, qc_metric_cols, qc_col): def add_and_rename_columns(qc_metric, qc_title, qc_col, qc_operator, qc_threshold): - """Add and rename columns in qc_metric. + """ + Add and rename columns in qc_metric. Args: qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values @@ -190,7 +198,8 @@ def add_and_rename_columns(qc_metric, qc_title, qc_col, qc_operator, qc_threshol def get_failed_rows(qc_metric, qc_col, qc_operator, qc_threshold): - """Get rows that fail provided qc threshold + """ + Get rows that fail provided qc threshold Args: qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values @@ -220,7 +229,8 @@ def get_failed_rows(qc_metric, qc_col, qc_operator, qc_threshold): def add_failed_samples_metric(qc_metric, failed_rows, report_cols, sample_cols): - """Failed samples are added to the output metric, and removed from qc_metric. + """ + Failed samples are added to the output metric, and removed from qc_metric. Args: qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values @@ -263,7 +273,8 @@ def add_failed_samples_metric(qc_metric, failed_rows, report_cols, sample_cols): def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols): - """Passed samples are added to the output metric + """ + Passed samples are added to the output metric Args: qc_metric (DataFrame): DataFrame of qc metric without failed rows @@ -295,7 +306,8 @@ def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols): def create_and_write_output(qc_output, output_path, output_prefix): - """Joined qc metrics is created and written to output file. + """ + Joined qc metrics is created and written to output file. Args: qc_output (pandas DataFrame): Sorted judged qc metric for both passed and failed samples, without duplicates. @@ -310,7 +322,8 @@ def create_and_write_output(qc_output, output_path, output_prefix): def read_and_judge_metrics(qc, metrics): - """Read and judge each single qc metric and join results. + """ + Read and judge each single qc metric and join results. Args: qc (dict): qc settings of the metric diff --git a/CheckQC/test_check_qc.py b/CheckQC/test_check_qc.py index 629d439..9d572ed 100644 --- a/CheckQC/test_check_qc.py +++ b/CheckQC/test_check_qc.py @@ -15,7 +15,7 @@ @pytest.fixture(scope="module", autouse=True) def setup_test_path(tmp_path_factory): test_tmp_path = str(tmp_path_factory.mktemp("test")) + "/" - # create empty files + # Create empty files open(str(test_tmp_path) + "/empty.txt", "a").close() open(str(test_tmp_path) + "/empty.yaml", "a").close() return test_tmp_path @@ -78,11 +78,14 @@ def test_required_keys_present(self): [{"filename": "fakename"}], [ {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake", "report_cols": "fake"}, - {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"}, # missing report_cols + # Missing report_cols + {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"}, ], [ - {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"}, # missing report_cols - {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"}, # missing report_cols + # Missing report_cols + {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"}, + # Missing report_cols + {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"}, ] ] ) @@ -96,21 +99,21 @@ def test_missing_keys(self, incomplete_qc_metrics): class TestSelectMetrics(): @pytest.mark.parametrize("filename_or_regex,input_files,expected", [ - # multi match + # Multi match ("test", ["test1.txt", "test2.txt"], ["test1.txt", "test2.txt"]), - # single match + # Single match ("test", ["test1.txt", "fake2.txt"], ["test1.txt"]), - # # match with relative path + # Match with relative path ("test", ["./random/path/to/test1.txt"], ["./random/path/to/test1.txt"]), - # match with absolute path + # Match with absolute path ("test", ["/random/path/to/test1.txt"], ["/random/path/to/test1.txt"]), - # match regex: kinship file suffix + # Match regex: kinship file suffix ( ".*.kinship_check.out$", ["240101_A00295_0001_AHWCFKDSX7_CREv4_1.kinship_check.out"], ["240101_A00295_0001_AHWCFKDSX7_CREv4_1.kinship_check.out"] ), - # match on word truth and SNP + # Match on word truth and SNP ( ".*truth.*SNP", [ @@ -119,7 +122,7 @@ class TestSelectMetrics(): ], ['2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv'] ), - # match on word truth and SNP + # Match on word truth and SNP ( ".*truth.*SNP", [ @@ -128,7 +131,7 @@ class TestSelectMetrics(): ], ['U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv'] ), - # match if 'truth' is absent and contains 'SNP' + # Match if 'truth' is absent and contains 'SNP' # ?: Match expression but do not capture it # ?! Match if 'truth' is absent. ( @@ -157,10 +160,14 @@ def test_no_match(self): class TestGetColumnsToReport(): @pytest.mark.parametrize("report_cols,metric_cols,qc_col,expected", [ (["col1"], ["col1"], "col1", ["qc_title", "qc_value"]), - (["col1", "col2"], ["col1", "col2"], "col1", ["qc_title", "qc_value", "col2"]), # additional report col - (["col1", "col2"], ["col1", "col2"], "col2", ["qc_title", "qc_value", "col1"]), # different order output - (["col1"], ["col1", "col3"], "col1", ["qc_title", "qc_value"]), # additional metric col - ("@all", ["col1", "col2"], "col1", ["qc_title", "qc_value", "col2"]), # special @all option + # Additional report col + (["col1", "col2"], ["col1", "col2"], "col1", ["qc_title", "qc_value", "col2"]), + # Different order output + (["col1", "col2"], ["col1", "col2"], "col2", ["qc_title", "qc_value", "col1"]), + # Additional metric col + (["col1"], ["col1", "col3"], "col1", ["qc_title", "qc_value"]), + # Special @all option + ("@all", ["col1", "col2"], "col1", ["qc_title", "qc_value", "col2"]), ]) def test_get_columns_to_report(self, report_cols, metric_cols, qc_col, expected): qc_report_cols = check_qc.get_columns_to_report(report_cols, metric_cols, qc_col) @@ -182,11 +189,11 @@ class TestAddAndRenameColumns(): def test_add_and_rename_columns(self): fake_qc_metric = DataFrame({"sample": ["sample1"], "fake_qc_col": ["0.01"]}) qc_metric_out = check_qc.add_and_rename_columns(fake_qc_metric, "FAKE_title", "fake_qc_col", "fake_op", "fake_thres") - # assert expected column values + # Assert expected column values assert qc_metric_out["qc_title"].values == "fake_title" assert qc_metric_out["qc_status"].values == "PASS" assert qc_metric_out["qc_check"].values == "fake_thres fake_op fake_qc_col" - # assert all expected columns exist + # Assert all expected columns exist assert not list( set(['sample', 'qc_value', 'qc_title', 'qc_status', 'qc_check', 'qc_msg']) - set(qc_metric_out.columns) ) @@ -195,10 +202,14 @@ def test_add_and_rename_columns(self): class TestGetFailedRows(): @pytest.mark.parametrize("qc_op,qc_thres", [ - ("match", "fake_thres"), # test match - ("==", "FAIL"), # test string - ("==", 1), # test int - ("==", 0.1), # test float + # Test match + ("match", "fake_thres"), + # Test string + ("==", "FAIL"), + # Test int + ("==", 1), + # Test float + ("==", 0.1), ]) def test_correct(self, qc_op, qc_thres): fake_qc_metric = DataFrame({"sample": ["sample1"], "fake_qc_col": [qc_thres]}) @@ -223,9 +234,12 @@ def test_add_failed_samples_single_sample_col(self): failed_rows = fake_qc_metric.loc[fake_qc_metric["sample_col"] == "sample2"].index qc_metric, qc_metric_out = check_qc.add_failed_samples_metric( fake_qc_metric, failed_rows, fake_qc_metric.columns.to_list(), ["sample_col"]) - assert "sample" in qc_metric_out.columns.to_list() # test rename column - assert "sample2" not in qc_metric["sample_col"].to_list() # test removal failed sample - assert "sample2" in qc_metric_out["sample"].to_list() # test added failed sample + # Test rename column + assert "sample" in qc_metric_out.columns.to_list() + # Test removal failed sample + assert "sample2" not in qc_metric["sample_col"].to_list() + # Test added failed sample + assert "sample2" in qc_metric_out["sample"].to_list() assert len(qc_metric) == 1 and len(qc_metric_out) == 1 assert qc_metric_out["qc_status"].values == "FAIL" @@ -235,18 +249,21 @@ def test_add_failed_samples_multi_sample_col(self): columns=["sample_col1", "sample_col2"] ) fake_kinship_metric = fake_kinship_metric.assign(qc_check="checks", qc_value="wrong") - failed_rows = fake_kinship_metric.iloc[0:2].index # define sample1 vs sample2 and sample1 vs sample3 as failed + # Define sample1 vs sample2 and sample1 vs sample3 as failed + failed_rows = fake_kinship_metric.iloc[0:2].index qc_metric, qc_metric_out = check_qc.add_failed_samples_metric( fake_kinship_metric, failed_rows, fake_kinship_metric.columns.to_list(), ["sample_col1", "sample_col2"]) for failed_sample in ["sample1", "sample2", "sample3"]: - # test removal failed sample + # Test removal failed sample assert failed_sample not in list(qc_metric[["sample_col1", "sample_col2"]].values.ravel()) - assert failed_sample in qc_metric_out["sample"].to_list() # test added failed sample + # Test added failed sample + assert failed_sample in qc_metric_out["sample"].to_list() assert qc_metric_out["qc_status"].values.all() == "FAIL" assert len(qc_metric) == 1 twice_failed = qc_metric_out.loc[qc_metric_out["sample"] == "sample1"] - assert "wrong;wrong" == twice_failed["qc_value"].item() # assert join with ';' on column qc_value - # assert join with ';' on column qc_msg + # Assert join with ';' on column qc_value + assert "wrong;wrong" == twice_failed["qc_value"].item() + # Assert join with ';' on column qc_msg assert "sample1 sample2 checks wrong;sample1 sample3 checks wrong" == twice_failed["qc_msg"].item() for passed_sample in ["sample4", "sample5"]: assert passed_sample in list(qc_metric[["sample_col1", "sample_col2"]].values.ravel()) @@ -276,20 +293,23 @@ def test_add_passed_samples_multi_sample_col(self): ) qc_metric_out = check_qc.add_passed_samples_metric( fake_qc_metric, fake_sample_qc, ["sample_col1", "sample_col2"]) - assert "sample" in qc_metric_out.columns.to_list() # test rename column - assert qc_metric_out["sample"].to_list().count("s4") == 1 # test removal duplicates - assert "new_col" not in qc_metric_out.columns.to_list() # test additional columns ignored + # Test rename column + assert "sample" in qc_metric_out.columns.to_list() + # Test removal duplicates + assert qc_metric_out["sample"].to_list().count("s4") == 1 + # Test additional columns ignored + assert "new_col" not in qc_metric_out.columns.to_list() class TestCreateAndWriteOutput(): @pytest.mark.parametrize("exp_summary,qc_output", [ - # all qc checks passed + # All qc checks passed ("PASS", DataFrame({"sample": ["s1"], "qc_status_cov": ["PASS"], "qc_status_kinship": ["PASS"]})), - # single qc check failed + # Single qc check failed ("FAIL", DataFrame({"sample": ["s1"], "qc_status_cov": ["PASS"], "qc_status_kinship": ["FAIL"]})), - # all qc check failed + # All qc check failed ("FAIL", DataFrame({"sample": ["s1"], "qc_status_cov": ["FAIL"], "qc_status_kinship": ["FAIL"]})), - # not restricted to qc_status_ column name + # Not restricted to qc_status_ column name ("PASS", DataFrame({"sample": ["s1"], "random_col1": ["PASS"], "random_col2": ["PASS"]})), ]) def test_create_and_write_output(self, setup_test_path, exp_summary, qc_output): @@ -304,61 +324,61 @@ def test_create_and_write_output(self, setup_test_path, exp_summary, qc_output): class TestGetOutputMetrics(): @pytest.mark.parametrize("data_in,nr_rows", [ - # single sample + # Single sample (["sample1_fake_check.txt"], 1), - # multiple single samples + # Multiple single samples (["sample1_fake_check.txt", "sample2_fake_check.txt"], 2), - # single multi samples + # Single multi samples (["240101_fake_check.txt"], 2), - # multiple multi samples + # Multiple multi samples (["240101_fake_check.txt", "240102_fake_check.txt"], 4), - # multi and single sample + # Multi and single sample (["sample1_fake_check.txt", "240101_fake_check.txt"], 3), ]) def test_input_ok(self, data_in, nr_rows, dataset, datadir): datadir_files = [f"{datadir}/{filename}" for filename in data_in] - # input1 = datadir / "sample1_fake_check.txt" df_output = check_qc.read_and_judge_metrics(dataset["settings_single_metric"]["metrics"][0], datadir_files) assert not df_output.empty observed_cols = df_output.columns.to_list() - assert df_output.shape[0] == nr_rows # shape results in tuple with no. rows and no. cols + # Shape results in tuple with no. rows and no. cols + assert df_output.shape[0] == nr_rows assert len(observed_cols) == 5 assert observed_cols == ['sample', 'qc_check_fc', 'qc_status_fc', 'qc_msg_fc', 'qc_value_fc'] @pytest.mark.parametrize("data_in,nr_rows,exp_warn_msg", [ - # single sample duplicate + # Single sample duplicate (["sample1_fake_check.txt"]*2, 1, "Sample IDs occur multiple times in input:"), - # single multi samples duplicate + # Single multi samples duplicate (["240101_fake_check.txt"]*2, 2, "Sample IDs occur multiple times in input:"), - # multiple multi samples, duplicate samples + # Multiple multi samples, duplicate samples (["240101_fake_check.txt", "240101_v2_fake_check.txt"], 4, "Different qc values for duplicated sample IDs in input:"), ]) def test_input_warn(self, data_in, nr_rows, exp_warn_msg, dataset, datadir): datadir_files = [f"{datadir}/{filename}" for filename in data_in] - # input1 = datadir / "sample1_fake_check.txt" with pytest.warns(UserWarning) as match_warning: df_output = check_qc.read_and_judge_metrics(dataset["settings_single_metric"]["metrics"][0], datadir_files) warn_msg = match_warning[0].message.args[0] assert exp_warn_msg in warn_msg assert not df_output.empty observed_cols = df_output.columns.to_list() - assert df_output.shape[0] == nr_rows # Shape: tuple with no. rows and no. cols + # Shape: tuple with no. rows and no. cols + assert df_output.shape[0] == nr_rows assert len(observed_cols) == 5 assert observed_cols == ['sample', 'qc_check_fc', 'qc_status_fc', 'qc_msg_fc', 'qc_value_fc'] class TestCheckQc(): @pytest.mark.parametrize("settings,data_in,exp_shape", [ - # single metric, single sample input + # Single metric, single sample input ("settings_single_metric", ["sample1_fake_check.txt"], (1, 5)), - # two metrics, single sample input + # Two metrics, single sample input ("settings_two_metrics", ["sample1_fake_check.txt"], (1, 9)), - # single metric, multiple samples input + # Single metric, multiple samples input ("settings_single_metric", ["240101_fake_check.txt"], (2, 5)), ("settings_single_metric", ["240101_fake_check.txt", "240102_fake_check.txt"], (4, 5)), - # two metrics, multiple sample input + # Two metrics, multiple sample input ("settings_two_metrics", ["240101_fake_check.txt", "240102_fake_check.txt"], (4, 9)), - # two metric, multi and single sample input + # Two metricS, multi and single sample input ("settings_two_metrics", ["sample1_fake_check.txt", "240101_fake_check.txt"], (3, 9)), ]) def test_ok(self, settings, data_in, exp_shape, datadir, dataset, mocker, ): From f66b1f87a01eea7efb42725b7c771fc668eba0b6 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Mon, 15 Apr 2024 08:15:28 +0200 Subject: [PATCH 49/73] Change isalpha to punctuation check to determine if filename string or regex --- CheckQC/check_qc.py | 9 ++++++--- CheckQC/test_check_qc.py | 8 +++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py index 3e65352..472bf89 100644 --- a/CheckQC/check_qc.py +++ b/CheckQC/check_qc.py @@ -5,6 +5,7 @@ from os import strerror as os_strerror from pathlib import Path import re +from string import punctuation import sys import warnings @@ -127,14 +128,16 @@ def select_metrics(filename, input_files): Using regular expression to match the qc metric filename with the input files Args: - filename (string): Filename of qc metric, could be regex. + filename (string): Filename of qc metric, could be a regex. input_files (list): All qc metrics input files. Returns: list: Input files matching the given filename. """ - # If filename is string, change into regex to match absolute and relative paths in input_files. - if filename.isalpha(): + # Change filename without special characters (except for '_', '-' and '.') into + # a regex pattern to match absolute and relative paths in input_files. + special_symbols = set(punctuation) - set(".", "_", "-") + if not any([char in special_symbols for char in set(filename)]): filename = ".*" + filename metrics = list(filter(re.compile(f"{filename}").match, input_files)) if not metrics: diff --git a/CheckQC/test_check_qc.py b/CheckQC/test_check_qc.py index 9d572ed..baedf3d 100644 --- a/CheckQC/test_check_qc.py +++ b/CheckQC/test_check_qc.py @@ -148,7 +148,13 @@ def test_select_metric(self, filename_or_regex, input_files, expected): metrics = check_qc.select_metrics(filename_or_regex, input_files) assert metrics == expected - def test_no_match(self): + @pytest.mark.parametrize("filename_or_regex,input_files", [ + # No match + ("test", ["fake1.txt", "fake2.txt"]), + # No match, filename with specialchar assumed to be a regex + ("specialchar_@", ["12/specialchar_@.txt"]), + ]) + def test_no_match(self, filename_or_regex, input_files): with pytest.warns(UserWarning) as match_warning: return_val = check_qc.select_metrics("test", ["fake1.txt", "fake2.txt"]) warn_msg = match_warning[0].message.args[0] From d83ca8d58eb4fcd42afcb5cb215ee27f8ec1c7ab Mon Sep 17 00:00:00 2001 From: ellendejong Date: Mon, 15 Apr 2024 08:51:22 +0200 Subject: [PATCH 50/73] Moved comments to new lines and resolved flake8 errors --- GenderCheck/test_calculate_gender.py | 15 ++++++++------- .../1.0.0/test_get_gender_from_bam_chrx.py | 4 +++- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/GenderCheck/test_calculate_gender.py b/GenderCheck/test_calculate_gender.py index 0e84e0b..334c158 100644 --- a/GenderCheck/test_calculate_gender.py +++ b/GenderCheck/test_calculate_gender.py @@ -4,15 +4,12 @@ class TestIsValidRead(): - - class MyObject: def __init__(self, qual, start, end): self.mapping_quality = qual self.reference_start = start self.reference_end = end - @pytest.mark.parametrize("read,mapping_qual,expected", [ (MyObject(19, True, True), 20, False), # mapping quality is below the threshold (MyObject(20, True, True), 20, True), # mapping quality is equal to the threshold @@ -35,10 +32,14 @@ def test_get_gender_from_bam(self, bam, mapping_qual, locus_y, ratio_y, expected class TestCompareGender(): @pytest.mark.parametrize("sample_id,analysis_id,test_gender,true_gender,expected", [ - ("test_sample", "test_analyse", "male", "male", "test_sample\ttest_analyse\tmale\tmale\tPASS\n"), # test_gender and true_gender identical, PASS - ("test_sample", "test_analyse", "male", "female", "test_sample\ttest_analyse\tmale\tfemale\tFAIL\n"), # test_gender and true_gender not identical , FAIL - ("test_sample", "test_analyse", "male", "unknown", "test_sample\ttest_analyse\tmale\tunknown\tPASS\n"), # true_gender unknown, PASS - ("test_sample", "test_analyse", "male", "not_detected", "test_sample\ttest_analyse\tmale\tnot_detected\tFAIL\n"), # true_gender not_detected, FAIL + # test_gender and true_gender identical, PASS + ("test_sample", "test_analyse", "male", "male", "test_sample\ttest_analyse\tmale\tmale\tPASS\n"), + # test_gender and true_gender not identical , FAIL + ("test_sample", "test_analyse", "male", "female", "test_sample\ttest_analyse\tmale\tfemale\tFAIL\n"), + # true_gender unknown, PASS + ("test_sample", "test_analyse", "male", "unknown", "test_sample\ttest_analyse\tmale\tunknown\tPASS\n"), + # true_gender not_detected, FAIL + ("test_sample", "test_analyse", "male", "not_detected", "test_sample\ttest_analyse\tmale\tnot_detected\tFAIL\n"), ]) def test_compare_gender(self, sample_id, analysis_id, test_gender, true_gender, expected): assert expected == calculate_gender.compare_gender(sample_id, analysis_id, test_gender, true_gender) diff --git a/MosaicHunter/1.0.0/test_get_gender_from_bam_chrx.py b/MosaicHunter/1.0.0/test_get_gender_from_bam_chrx.py index 0a494ea..7674f45 100644 --- a/MosaicHunter/1.0.0/test_get_gender_from_bam_chrx.py +++ b/MosaicHunter/1.0.0/test_get_gender_from_bam_chrx.py @@ -26,7 +26,9 @@ class TestGetGenderFromBam: ("./test_bam.bam", 20, "X:2699520-154931044", 5.5, 7.5, ("M", False)), ("./test_bam.bam", 20, "X:2699520-154931044", 4.5, 6.5, ("F", True)), ]) - def test_get_gender_from_bam(self, bam, mapping_qual, locus_x, ratio_x_threshold_male, ratio_x_threshold_female, expected_outcome): + def test_get_gender_from_bam( + self, bam, mapping_qual, locus_x, ratio_x_threshold_male, ratio_x_threshold_female, expected_outcome + ): assert expected_outcome == get_gender_from_bam_chrx.get_gender_from_bam_chrx( bam, mapping_qual, locus_x, ratio_x_threshold_male, ratio_x_threshold_female) From b78741dce7fe90b7c4a8e70dd5a8fd6dcfbaaad9 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Mon, 15 Apr 2024 09:02:05 +0200 Subject: [PATCH 51/73] Single newline at end of file (.nf) --- ExomeDepth/CallCNV.nf | 2 +- ExomeDepth/GetRefset.nf | 2 +- ExomeDepth/Summary.nf | 2 +- ExonCov/ImportBam.nf | 2 +- ExonCov/SampleQC.nf | 2 +- Kinship/Kinship.nf | 2 +- TrendAnalysis/TrendAnalysis.nf | 2 +- UPD/IGV.nf | 2 +- Utils/CreateHSmetricsSummary.nf | 2 +- Utils/EditSummaryFileHappy.nf | 2 +- Utils/GetStatsFromFlagstat.nf | 2 +- Utils/ParseChildFromFullTrio.nf | 2 +- Utils/SavePedFile.nf | 2 +- Utils/VersionLog.nf | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) diff --git a/ExomeDepth/CallCNV.nf b/ExomeDepth/CallCNV.nf index 9a3197e..fda8ef1 100644 --- a/ExomeDepth/CallCNV.nf +++ b/ExomeDepth/CallCNV.nf @@ -22,4 +22,4 @@ process CallCNV { source ${params.dx_resources_path}/${params.exomedepth_path}/venv/bin/activate python ${params.dx_resources_path}/${params.exomedepth_path}/run_ExomeDepth.py callcnv ./ ${bam_file} ${analysis_id} ${sample_id} """ -} \ No newline at end of file +} diff --git a/ExomeDepth/GetRefset.nf b/ExomeDepth/GetRefset.nf index efff299..fbe9d7b 100644 --- a/ExomeDepth/GetRefset.nf +++ b/ExomeDepth/GetRefset.nf @@ -18,4 +18,4 @@ process GetRefset { python ${params.dx_resources_path}/${params.exomedepth_path}/exomedepth_db.py add_sample_return_refset_bam ${bam_file} --print_refset_stdout | \ tr -d '\n' """ -} \ No newline at end of file +} diff --git a/ExomeDepth/Summary.nf b/ExomeDepth/Summary.nf index 3244d8c..a883376 100644 --- a/ExomeDepth/Summary.nf +++ b/ExomeDepth/Summary.nf @@ -17,4 +17,4 @@ process Summary { source ${params.dx_resources_path}/${params.exomedepth_path}/venv/bin/activate python ${params.dx_resources_path}/${params.exomedepth_path}/run_ExomeDepth.py summary ${exomedepth_logs} > ${analysis_id}_exomedepth_summary.txt """ -} \ No newline at end of file +} diff --git a/ExonCov/ImportBam.nf b/ExonCov/ImportBam.nf index 474ae35..ff193da 100644 --- a/ExonCov/ImportBam.nf +++ b/ExonCov/ImportBam.nf @@ -21,4 +21,4 @@ process ImportBam { --exon_bed ${params.dxtracks_path}/${params.exoncov_bed} \ ${analysis_id} WES ${bam_file} | tr -d '\n' """ -} \ No newline at end of file +} diff --git a/ExonCov/SampleQC.nf b/ExonCov/SampleQC.nf index 49ecf2d..0c5bd78 100644 --- a/ExonCov/SampleQC.nf +++ b/ExonCov/SampleQC.nf @@ -19,4 +19,4 @@ process SampleQC { python ${params.exoncov_path}/ExonCov.py sample_qc \ -s ${samples} -p ${panels} > ${analysis_id}.ExonCovQC_check.out """ -} \ No newline at end of file +} diff --git a/Kinship/Kinship.nf b/Kinship/Kinship.nf index 9241dbf..fd051fe 100644 --- a/Kinship/Kinship.nf +++ b/Kinship/Kinship.nf @@ -19,4 +19,4 @@ process Kinship { cp king.kin0 ${analysis_id}.kinship python ${projectDir}/CustomModules/Kinship/check_kinship.py ${analysis_id}.kinship ${ped_file} > ${analysis_id}.kinship_check.out """ -} \ No newline at end of file +} diff --git a/TrendAnalysis/TrendAnalysis.nf b/TrendAnalysis/TrendAnalysis.nf index c8968ab..5009861 100644 --- a/TrendAnalysis/TrendAnalysis.nf +++ b/TrendAnalysis/TrendAnalysis.nf @@ -12,4 +12,4 @@ process TrendAnalysis { source ${params.trend_analysis_path}/venv/bin/activate python ${params.trend_analysis_path}/trend_analysis.py upload processed_data ${analysis_id} . """ -} \ No newline at end of file +} diff --git a/UPD/IGV.nf b/UPD/IGV.nf index 2ea72af..f2974c7 100644 --- a/UPD/IGV.nf +++ b/UPD/IGV.nf @@ -19,4 +19,4 @@ process IGV { source ${params.upd_path}/venv/bin/activate python ${params.upd_path}/make_UPD_igv.py ${ped_file} ${analysis_id} $trio_sample ${vcf_files} -c """ -} \ No newline at end of file +} diff --git a/Utils/CreateHSmetricsSummary.nf b/Utils/CreateHSmetricsSummary.nf index 2ca443f..45553cb 100644 --- a/Utils/CreateHSmetricsSummary.nf +++ b/Utils/CreateHSmetricsSummary.nf @@ -14,4 +14,4 @@ process CreateHSmetricsSummary { """ python2 ${projectDir}/CustomModules/Utils/create_hsmetrics_summary.py ${hsmetrics_files} > HSMetrics_summary.txt """ -} \ No newline at end of file +} diff --git a/Utils/EditSummaryFileHappy.nf b/Utils/EditSummaryFileHappy.nf index 8050c46..a4aba58 100644 --- a/Utils/EditSummaryFileHappy.nf +++ b/Utils/EditSummaryFileHappy.nf @@ -27,4 +27,4 @@ process EditSummaryFileHappy { # Remove tmp files rm ${summary_csv}.tmp """ -} \ No newline at end of file +} diff --git a/Utils/GetStatsFromFlagstat.nf b/Utils/GetStatsFromFlagstat.nf index 99018ed..27e6bbe 100644 --- a/Utils/GetStatsFromFlagstat.nf +++ b/Utils/GetStatsFromFlagstat.nf @@ -14,4 +14,4 @@ process GetStatsFromFlagstat { """ python2 ${projectDir}/CustomModules/Utils/get_stats_from_flagstat.py ${flagstat_files} > run_stats.txt """ -} \ No newline at end of file +} diff --git a/Utils/ParseChildFromFullTrio.nf b/Utils/ParseChildFromFullTrio.nf index 9963277..b9377bd 100644 --- a/Utils/ParseChildFromFullTrio.nf +++ b/Utils/ParseChildFromFullTrio.nf @@ -17,4 +17,4 @@ process ParseChildFromFullTrio { """ python2 ${projectDir}/CustomModules/Utils/parse_child_from_fulltrio.py ${ped_file} ${sample_ids} | tr -d '\n' """ -} \ No newline at end of file +} diff --git a/Utils/SavePedFile.nf b/Utils/SavePedFile.nf index 1c49c07..00f45a3 100644 --- a/Utils/SavePedFile.nf +++ b/Utils/SavePedFile.nf @@ -14,4 +14,4 @@ process SavePedFile { """ cp --remove-destination "\$(readlink ${ped_file})" ./ """ -} \ No newline at end of file +} diff --git a/Utils/VersionLog.nf b/Utils/VersionLog.nf index 9d5c654..cff4f49 100644 --- a/Utils/VersionLog.nf +++ b/Utils/VersionLog.nf @@ -22,4 +22,4 @@ process VersionLog { echo "\${git_dir}: \"\${described_tags}\"" >> versions.yml done """ -} \ No newline at end of file +} From 37c67e605ddbc32bd84bdc903a4f8f4cf43990e2 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Mon, 15 Apr 2024 09:04:45 +0200 Subject: [PATCH 52/73] replace noqa W605 with a regex string (r"") --- Utils/create_hsmetrics_summary.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Utils/create_hsmetrics_summary.py b/Utils/create_hsmetrics_summary.py index 4256fc3..582b61e 100644 --- a/Utils/create_hsmetrics_summary.py +++ b/Utils/create_hsmetrics_summary.py @@ -8,7 +8,7 @@ parser.add_argument('hsmetrics_files', type=argparse.FileType('r'), nargs='*', help='HSMetric file') arguments = parser.parse_args() - interval_files_pattern = re.compile("BAIT_INTERVALS=\[(\S*)\].TARGET_INTERVALS=\[(\S*)\]") # noqa: W605 + interval_files_pattern = re.compile(r"BAIT_INTERVALS=\[(\S*)\].TARGET_INTERVALS=\[(\S*)\]") summary_header = [] summary_data = {} for hsmetrics_file in arguments.hsmetrics_files: From 54670e7153830761fc07e98fc60b539fbd4eb3c1 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Mon, 15 Apr 2024 09:11:04 +0200 Subject: [PATCH 53/73] get quotechar from qc settings or use default. --- CheckQC/check_qc.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py index 472bf89..791094f 100644 --- a/CheckQC/check_qc.py +++ b/CheckQC/check_qc.py @@ -336,7 +336,9 @@ def read_and_judge_metrics(qc, metrics): pandas DataFrame: Joined and judged qc metrics. """ for qc_file in metrics: - qc_metric_raw = read_csv(qc_file, comment=qc.get("comment", None), delimiter=qc.get("delim", "\t"), quotechar='"') + qc_metric_raw = read_csv( + qc_file, comment=qc.get("comment", None), delimiter=qc.get("delim", "\t"), quotechar=qc.get("quotechar", '"') + ) report_cols = get_columns_to_report(qc["report_cols"], qc_metric_raw.columns.to_list(), qc["qc_col"]) qc_metric_edit = add_and_rename_columns(qc_metric_raw, qc["title"], qc["qc_col"], qc["operator"], qc["threshold"]) failed_rows = get_failed_rows(qc_metric_edit, "qc_value", qc["operator"], qc["threshold"]) From 5202e47ccb93540a5d75bbf036112f13cb0502b7 Mon Sep 17 00:00:00 2001 From: ellendejong Date: Mon, 15 Apr 2024 10:47:04 +0200 Subject: [PATCH 54/73] add comment floats to string conversion --- CheckQC/check_qc.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py index 791094f..d0b86f7 100644 --- a/CheckQC/check_qc.py +++ b/CheckQC/check_qc.py @@ -297,6 +297,8 @@ def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols): .loc[:, qc_metric_out.columns] ) ]) + # Some qc metrics did result in errors when merging the tables. + # The merge failed when the column qc_value has floats stored as strings. # Try to convert column qc_value to float. # If ValueError is raised, probably because column is a string, continue. try: From 038521dda6441cff824293be807a95d67d7e363f Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Tue, 16 Apr 2024 09:40:52 +0200 Subject: [PATCH 55/73] Cleaned up GenerateViolinPlots AddOnFunctions --- DIMS/Utils/add_lab_id_and_onderzoeksnummer.R | 16 +++ DIMS/Utils/check_same_samplename.R | 10 ++ DIMS/Utils/create_violin_plots.R | 122 +++++++++++++++++++ DIMS/Utils/get_patient_data_to_helix.R | 39 ++++++ DIMS/Utils/is_diagnostic_patient.R | 11 ++ DIMS/Utils/output_helix.R | 39 ++++++ DIMS/Utils/prepare_alarmvalues.R | 59 +++++++++ DIMS/Utils/prepare_data.R | 51 ++++++++ DIMS/Utils/prepare_data_perpage.R | 58 +++++++++ DIMS/Utils/prepare_toplist.R | 35 ++++++ 10 files changed, 440 insertions(+) create mode 100644 DIMS/Utils/add_lab_id_and_onderzoeksnummer.R create mode 100644 DIMS/Utils/check_same_samplename.R create mode 100644 DIMS/Utils/create_violin_plots.R create mode 100644 DIMS/Utils/get_patient_data_to_helix.R create mode 100644 DIMS/Utils/is_diagnostic_patient.R create mode 100644 DIMS/Utils/output_helix.R create mode 100644 DIMS/Utils/prepare_alarmvalues.R create mode 100644 DIMS/Utils/prepare_data.R create mode 100644 DIMS/Utils/prepare_data_perpage.R create mode 100644 DIMS/Utils/prepare_toplist.R diff --git a/DIMS/Utils/add_lab_id_and_onderzoeksnummer.R b/DIMS/Utils/add_lab_id_and_onderzoeksnummer.R new file mode 100644 index 0000000..f2ff13e --- /dev/null +++ b/DIMS/Utils/add_lab_id_and_onderzoeksnummer.R @@ -0,0 +1,16 @@ +add_lab_id_and_onderzoeksnummer <- function(df_metabs_helix) { + #' Adding labnummer and Onderzoeksnummer to a dataframe + #' + #' @param df_metabs_helix: dataframe with patient data to be uploaded to Helix + #' + #' @return: dataframe with added labnummer and Onderzoeksnummer columns + + # Split patient number into labnummer and Onderzoeksnummer + for (row in 1:nrow(df_metabs_helix)) { + df_metabs_helix[row, "labnummer"] <- gsub("^P|\\.[0-9]*", "", df_metabs_helix[row, "Patient"]) + labnummer_split <- strsplit(as.character(df_metabs_helix[row, "labnummer"]), "M")[[1]] + df_metabs_helix[row, "Onderzoeksnummer"] <- paste0("MB", labnummer_split[1], "/", labnummer_split[2]) + } + + return(df_metabs_helix) +} diff --git a/DIMS/Utils/check_same_samplename.R b/DIMS/Utils/check_same_samplename.R new file mode 100644 index 0000000..6e80841 --- /dev/null +++ b/DIMS/Utils/check_same_samplename.R @@ -0,0 +1,10 @@ +check_same_samplename <- function(int_col_name, zscore_col_name) { + #' A check to see if intensity and Z-score columns match + #' + #' @param int_col_name: name of an intensity column (string) + #' @param zscore_col_name: name of a Z-score column (string) + #' + #' @return: match or mismatch of the columns (boolean) + + paste0(int_col_name, "_Zscore") == zscore_col_name +} diff --git a/DIMS/Utils/create_violin_plots.R b/DIMS/Utils/create_violin_plots.R new file mode 100644 index 0000000..be4f1f6 --- /dev/null +++ b/DIMS/Utils/create_violin_plots.R @@ -0,0 +1,122 @@ +# remove parameter default value +# add explanation variable to parameters +create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt = NULL) { + #' Create violin plots for each patient + #' + #' @param pdf_dir: location where to save the pdf file (string) + #' @param pt_name: patient code (string) + #' @param metab_perpage: list of dataframe with a dataframe for each page in the violinplot pdf (list) + #' @param top_metab_pt: dataframe with increased and decrease metabolites for this patient (dataframe) + + # set parameters for plots + plot_height <- 9.6 + plot_width <- 6 + fontsize <- 1 + circlesize <- 0.8 + colors_4plot <- c("#22E4AC", "#00B0F0", "#504FFF", "#A704FD", "#F36265", "#DA0641") + # green blue blue/purple purple orange red + + # patient plots, create the PDF device + pt_name_sub <- pt_name + suffix <- "" + if (grepl("Diagnostics", pdf_dir) & is_diagnostic_patient(pt_name)) { + prefix <- "MB" + suffix <- "_DIMS_PL_DIAG" + # substitute P and M in P2020M00001 into right format for Helix + pt_name_sub <- gsub("[PM]", "", pt_name) + pt_name_sub <- gsub("\\..*", "", pt_name_sub) + } else if (grepl("Diagnostics", pdf_dir)) { + prefix <- "Dx_" + } else if (grepl("IEM", pdf_dir)) { + prefix <- "IEM_" + } else { + prefix <- "R_" + } + + pdf(paste0(pdf_dir, "/", prefix, pt_name_sub, suffix, ".pdf"), + onefile = TRUE, + width = plot_width, + height = plot_height) + + # page headers: + page_headers <- names(metab_perpage) + + # put table into PDF file, if not empty + if (!is.null(dim(top_metab_pt))) { + plot.new() + # get the names and numbers in the table aligned + table_theme <- ttheme_default(core = list(fg_params = list(hjust = 0, x = 0.05, fontsize = 6)), + colhead = list(fg_params = list(fontsize = 8, fontface = "bold"))) + grid.table(top_metab_pt, theme = table_theme, rows = NULL) + # g <- tableGrob(top_metab_pt) + # grid.draw(g) + text(x = 0.45, y = 1.02, paste0("Top deviating metabolites for patient: ", pt_name), font = 1, cex = 1) + } + + # violin plots + for (page_index in 1:length(metab_perpage)) { + # extract list of metabolites to plot on a page + metab_list_2plot <- metab_perpage[[page_index]] + # extract original data for patient of interest (pt_name) before cut-offs + pt_list_2plot_orig <- metab_list_2plot[which(metab_list_2plot$variable == pt_name), ] + # cut off Z-scores higher than 20 or lower than -5 (for nicer plots) + metab_list_2plot$value[metab_list_2plot$value > 20] <- 20 + metab_list_2plot$value[metab_list_2plot$value < -5] <- -5 + # extract data for patient of interest (pt_name) + pt_list_2plot <- metab_list_2plot[which(metab_list_2plot$variable == pt_name), ] + # restore original Z-score before cut-off, for showing Z-scores in PDF + pt_list_2plot$value_orig <- pt_list_2plot_orig$value + # remove patient of interest (pt_name) from list; violins will be made up of controls and other patients + metab_list_2plot <- metab_list_2plot[-which(metab_list_2plot$variable == pt_name), ] + # subtitle per page + sub_perpage <- gsub("_", " ", page_headers[page_index]) + # for IEM plots, put subtitle on two lines + sub_perpage <- gsub("probability", "\nprobability", sub_perpage) + # add size parameter for showing Z-score of patient per metabolite + z_size <- rep(3, nrow(pt_list_2plot)) + # set size to 0 if row is empty + z_size[is.na(pt_list_2plot$value)] <- 0 + + # draw violin plot. shape=22 gives square for patient of interest + ggplot_object <- ggplot(metab_list_2plot, aes(x = value, y = HMDB_name)) + + theme(axis.text.y = element_text(size = rel(fontsize)), plot.caption = element_text(size = rel(fontsize))) + + xlim(-5, 20) + + geom_violin(scale = "width") + + geom_point(data = pt_list_2plot, aes(color = value), size = 3.5 * circlesize, shape = 22, fill = "white") + + scale_fill_gradientn( + colors = colors_4plot, values = NULL, space = "Lab", na.value = "grey50", guide = "colourbar", + aesthetics = "colour" + ) + + # add Z-score value for patient of interest at x=16 + geom_text( + data = pt_list_2plot, aes(16, label = paste0("Z=", round(value_orig, 2))), hjust = "left", vjust = +0.2, + size = z_size + ) + + # add labels. Use font Courier to get all the plots in the same location. + labs(x = "Z-scores", y = "Metabolites", subtitle = sub_perpage, color = "z-score") + + theme(axis.text.y = element_text(family = "Courier", size = 6)) + + # do not show legend + theme(legend.position = "none") + + # add title + ggtitle(label = paste0("Results for patient ", pt_name)) + + # add vertical lines + geom_vline(xintercept = 2, col = "grey", lwd = 0.5, lty = 2) + + geom_vline(xintercept = -2, col = "grey", lwd = 0.5, lty = 2) + + suppressWarnings(print(ggplot_object)) + + } + + # add explanation of violin plots, version number etc. + plot(NA, xlim = c(0, 5), ylim = c(0, 5), bty = "n", xaxt = "n", yaxt = "n", xlab = "", ylab = "") + if (length(explanation) > 0) { + text(0.2, 5, explanation[1], pos = 4, cex = 0.8) + for (line_index in 2:length(explanation)) { + text_y_position <- 5 - (line_index * 0.2) + text(-0.2, text_y_position, explanation[line_index], pos = 4, cex = 0.5) + } + } + + # close the PDF file + dev.off() +} diff --git a/DIMS/Utils/get_patient_data_to_helix.R b/DIMS/Utils/get_patient_data_to_helix.R new file mode 100644 index 0000000..1eeaf7a --- /dev/null +++ b/DIMS/Utils/get_patient_data_to_helix.R @@ -0,0 +1,39 @@ +get_patient_data_to_helix <- function(metab_interest_sorted, metab_list_all) { + #' Get patient data to be uploaded to Helix + #' + #' @param metab_interest_sorted: list of dataframes with metabolite Z-scores for each sample/patient (list) + #' @param metab_list_all: list of tables with metabolites for Helix and violin plots (list) + #' + #' @return: dataframe with patient data with only metabolites for Helix and violin plots + #' with Helix name, high/low Z-score cutoffs + + # Combine Z-scores of metab groups together + df_all_metabs_zscores <- bind_rows(metab_interest_sorted) + # Change columnnames + colnames(df_all_metabs_zscores) <- c("HMDB_name", "Patient", "Z_score") + # Change Patient column to character instead of factor + df_all_metabs_zscores$Patient <- as.character(df_all_metabs_zscores$Patient) + + # Delete whitespaces HMDB_name + df_all_metabs_zscores$HMDB_name <- str_trim(df_all_metabs_zscores$HMDB_name, "right") + + # Split HMDB_name column on "nitine;" for match dims_helix_table + df_all_metabs_zscores$HMDB_name_split <- str_split_fixed(df_all_metabs_zscores$HMDB_name, "nitine;", 2)[, 1] + + # Combine stofgroepen + dims_helix_table <- bind_rows(metab_list_all) + # Filter table for metabolites for Helix + dims_helix_table <- dims_helix_table %>% filter(Helix == "ja") + # Split HMDB_name column on "nitine;" for match df_all_metabs_zscores + dims_helix_table$HMDB_name_split <- str_split_fixed(dims_helix_table$HMDB_name, "nitine;", 2)[, 1] + dims_helix_table <- dims_helix_table %>% select(HMDB_name_split, Helix_naam, high_zscore, low_zscore) + + # Filter DIMS results for metabolites for Helix + df_metabs_helix <- df_all_metabs_zscores %>% filter(HMDB_name_split %in% dims_helix_table$HMDB_name_split) + # Combine dims_helix_table and df_metabs_helix, adding Helix codes etc. + df_metabs_helix <- df_metabs_helix %>% left_join(dims_helix_table, by = join_by(HMDB_name_split)) + + df_metabs_helix <- df_metabs_helix %>% select(HMDB_name, Patient, Z_score, Helix_naam, high_zscore, low_zscore) + + return(df_metabs_helix) +} diff --git a/DIMS/Utils/is_diagnostic_patient.R b/DIMS/Utils/is_diagnostic_patient.R new file mode 100644 index 0000000..5820838 --- /dev/null +++ b/DIMS/Utils/is_diagnostic_patient.R @@ -0,0 +1,11 @@ +is_diagnostic_patient <- function(patient_column) { + #' Check for Diagnostics patients with correct patient number (e.g. starting with "P2024M") + #' + #' @param patient_column: a column from dataframe with IDs (character vector) + #' + #' @return: a logical vector with TRUE or FALSE for each element (vector) + + diagnostic_patients <- grepl("^P[0-9]{4}M", patient_column) + + return(diagnostic_patients) +} diff --git a/DIMS/Utils/output_helix.R b/DIMS/Utils/output_helix.R new file mode 100644 index 0000000..09fa1a9 --- /dev/null +++ b/DIMS/Utils/output_helix.R @@ -0,0 +1,39 @@ +output_for_helix <- function(protocol_name, df_metabs_helix) { + #' Get the output dataframe for Helix + #' + #' @param protocol_name: protocol name (string) + #' @param df_metabs_helix: dataframe with metabolite Z-scores for patients (dataframe) + #' + #' @return: dataframe with patient metabolite Z-scores in correct format for Helix + + # Remove positive controls + df_metabs_helix <- df_metabs_helix %>% filter(is_diagnostic_patient(Patient)) + + # Add 'Vial' column, each patient has unique ID + df_metabs_helix <- df_metabs_helix %>% + group_by(Patient) %>% + mutate(Vial = cur_group_id()) %>% + ungroup() + + # Split patient number into labnummer and Onderzoeksnummer + df_metabs_helix <- add_lab_id_and_onderzoeksnummer(df_metabs_helix) + + # Add column with protocol name + df_metabs_helix$Protocol <- protocol_name + + # Change name Z_score and Helix_naam columns to Amount and Name + change_columns <- c(Amount = "Z_score", Name = "Helix_naam") + df_metabs_helix <- df_metabs_helix %>% rename(all_of(change_columns)) + + # Select only necessary columns and set them in correct order + df_metabs_helix <- df_metabs_helix %>% + select(c(Vial, labnummer, Onderzoeksnummer, Protocol, Name, Amount)) + + # Remove duplicate patient-metabolite combinations ("leucine + isoleucine + allo-isoleucin_Z-score" is added 3 times) + df_metabs_helix <- df_metabs_helix %>% + group_by(Onderzoeksnummer, Name) %>% + distinct() %>% + ungroup() + + return(df_metabs_helix) +} diff --git a/DIMS/Utils/prepare_alarmvalues.R b/DIMS/Utils/prepare_alarmvalues.R new file mode 100644 index 0000000..94ffbfd --- /dev/null +++ b/DIMS/Utils/prepare_alarmvalues.R @@ -0,0 +1,59 @@ +prepare_alarmvalues <- function(pt_name, dims_helix_table) { + #' Create a dataframe with all metabolites that exceed the min and max Z-score cutoffs + #' + #' @param pt_name: patient code (string) + #' @param dims_helix_table: dataframe with metabolite Z-scores for each patient and Helix info (dataframe) + #' + #' @return: dataframe with metabolites that exceed the min and max Z-score cutoffs for the selected patient + + # extract data for patient of interest (pt_name) + pt_metabs_helix <- dims_helix_table %>% filter(Patient == pt_name) + pt_metabs_helix$Z_score <- round(pt_metabs_helix$Z_score, 2) + + # Make empty dataframes for metabolites above or below alarmvalues + pt_list_high <- data.frame(HMDB_name = character(), Z_score = numeric()) + pt_list_low <- data.frame(HMDB_name = character(), Z_score = numeric()) + + # Loop over individual metabolites + for (metab in unique(pt_metabs_helix$HMDB_name)){ + # Get data for individual metabolite + pt_metab <- pt_metabs_helix %>% filter(HMDB_name == metab) + # print(pt_metab) + + # Check if zscore is positive of negative + if (pt_metab$Z_score > 0) { + # Get specific alarmvalue for metabolite + high_zscore_cutoff_metab <- pt_metabs_helix %>% filter(HMDB_name == metab) %>% pull(high_zscore) + + # If zscore is above the alarmvalue, add to pt_list_high table + if (pt_metab$Z_score > high_zscore_cutoff_metab) { + pt_metab_high <- pt_metab %>% select(HMDB_name, Z_score) + pt_list_high <- rbind(pt_list_high, pt_metab_high) + } + } else { + # Get specific alarmvalue for metabolite + low_zscore_cutoff_metab <- pt_metabs_helix %>% filter(HMDB_name == metab) %>% pull(low_zscore) + + # If zscore is below the alarmvalue, add to pt_list_low table + if (pt_metab$Z_score < low_zscore_cutoff_metab) { + pt_metab_low <- pt_metab %>% select(HMDB_name, Z_score) + pt_list_low <- rbind(pt_list_low, pt_metab_low) + } + } + } + + # sort tables on zscore + pt_list_high <- pt_list_high %>% arrange(desc(Z_score)) + pt_list_low <- pt_list_low %>% arrange(Z_score) + # add lines for increased, decreased + extra_line1 <- c("Increased", "") + extra_line2 <- c("Decreased", "") + # combine the two lists + top_metab_pt <- rbind(extra_line1, pt_list_high, extra_line2, pt_list_low) + # remove row names + rownames(top_metab_pt) <- NULL + # change column names for display + colnames(top_metab_pt) <- c("Metabolite", "Z-score") + + return(top_metab_pt) +} diff --git a/DIMS/Utils/prepare_data.R b/DIMS/Utils/prepare_data.R new file mode 100644 index 0000000..ea3efe5 --- /dev/null +++ b/DIMS/Utils/prepare_data.R @@ -0,0 +1,51 @@ +# unused variables will be removed: metab_list_alarm +prepare_data <- function(metab_list_all, zscore_patients_local) { + #' Combine patient Z-scores with metabolite info + #' + #' @param metab_list_all: list of dataframes with metabolite information for different stofgroepen (list) + #' @param zscore_patients_local: dataframe with metabolite Z-scores for all patient + #' + #' @return: list of dataframes for each stofgroep with data for each metabolite and patient/control per row + + # remove "_Zscore" from column (patient) names + colnames(zscore_patients_local) <- gsub("_Zscore", "", colnames(zscore_patients_local)) + # put data into pages, max 20 violin plots per page in PDF + metab_interest_sorted <- list() + metab_category <- c() + for (metab_class_index in 1:length(metab_list_all)) { + metab_class <- names(metab_list_all)[metab_class_index] + metab_list <- metab_list_all[[metab_class_index]] + if (ncol(metab_list) > 2) { + # third column are the alarm values, so reduce the data frame to 2 columns and save list + metab_list_alarm <- metab_list + metab_list <- metab_list[, c(1, 2)] + } + # make sure that all HMDB_names have 45 characters + for (metab_index in 1:length(metab_list$HMDB_name)) { + if (is.character(metab_list$HMDB_name[metab_index])) { + hmdb_name_separated <- strsplit(metab_list$HMDB_name[metab_index], "")[[1]] + } else { + hmdb_name_separated <- "strspliterror" + } + if (length(hmdb_name_separated) <= 45) { + hmdb_name_separated <- c(hmdb_name_separated, rep(" ", 45 - length(hmdb_name_separated))) + } else { + hmdb_name_separated <- c(hmdb_name_separated[1:42], "...") + } + metab_list$HMDB_name[metab_index] <- paste0(hmdb_name_separated, collapse = "") + } + # find metabolites and ratios in data frame zscore_patients_local + metab_interest <- inner_join(metab_list, zscore_patients_local[-2], by = "HMDB_code") + # remove column "HMDB_code" + metab_interest <- metab_interest[, -which(colnames(metab_interest) == "HMDB_code")] + # put the data frame in long format + metab_interest_melt <- reshape2::melt(metab_interest, id.vars = "HMDB_name") + # sort on metabolite names (HMDB_name) + sort_order <- order(metab_interest_melt$HMDB_name) + metab_interest_sorted[[metab_class_index]] <- metab_interest_melt[sort_order, ] + metab_category <- c(metab_category, metab_class) + } + names(metab_interest_sorted) <- metab_category + + return(metab_interest_sorted) +} diff --git a/DIMS/Utils/prepare_data_perpage.R b/DIMS/Utils/prepare_data_perpage.R new file mode 100644 index 0000000..61d2e15 --- /dev/null +++ b/DIMS/Utils/prepare_data_perpage.R @@ -0,0 +1,58 @@ +# remove default variable values +prepare_data_perpage <- function(metab_interest_sorted, metab_interest_contr, nr_plots_perpage, nr_pat = 20, nr_contr = 30) { + #' Combine patient and control data for each page of the violinplot pdf + #' + #' @param metab_interest_sorted: list of dataframes with data for each metabolite and patient (list) + #' @param metab_interest_contr: list of dataframes with data for each metabolite and control (list) + #' @param nr_plots_perpage: number of plots per page in the violinplot pdf (integer) + #' @param nr_pat: number of patients (integer) + #' @param nr_contr: number of controls (integer) + #' + #' @return: list of dataframes with metabolite Z-scores for each patient and control, + #' the length of list is the number of pages for the violinplot pdf (list) + + total_nr_pages <- 0 + metab_perpage <- list() + metab_category <- c() + for (metab_class_index in 1:length(metab_interest_sorted)) { + # split list into pages, each page containing max nr_plots_perpage (20) compounds + metab_interest_perclass <- metab_interest_sorted[[metab_class_index]] + metab_class <- names(metab_interest_sorted)[metab_class_index] + # add controls + metab_interest_contr_perclass <- metab_interest_contr[[metab_class_index]] + # number of pages for this class + nr_pages <- ceiling(length(unique(metab_interest_perclass$HMDB_name)) / nr_plots_perpage) + for (page_nr in 1:nr_pages) { + total_nr_pages <- total_nr_pages + 1 + select_rows_start <- (nr_pat * nr_plots_perpage * (page_nr - 1)) + 1 + select_rows_end <- nr_pat * nr_plots_perpage * page_nr + metab_onepage_pat <- metab_interest_perclass[select_rows_start:select_rows_end, ] + # same for controls + select_rows_start_contr <- (nr_contr * nr_plots_perpage * (page_nr - 1)) + 1 + select_rows_end_contr <- nr_contr * nr_plots_perpage * page_nr + metab_onepage_pcontr <- metab_interest_contr_perclass[select_rows_start_contr:select_rows_end_contr, ] + # add controls + metab_onepage <- rbind(metab_onepage_pat, metab_onepage_pcontr) + # if a page has fewer than nr_plots_perpage plots, fill page with empty plots + na_rows <- which(is.na(metab_onepage$HMDB_name)) + if (length(na_rows) > 0) { + # repeat the patient and control variables + metab_onepage$variable[na_rows] <- metab_onepage$variable[1:(nr_pat + nr_contr)] + # for HMDB name, substitute a number of spaces + for (row_nr in na_rows) { + metab_onepage$HMDB_name[row_nr] <- paste0(rep("_", ceiling(row_nr / (nr_pat + nr_contr))), collapse = "") + } + metab_onepage$HMDB_name <- gsub("_", " ", metab_onepage$HMDB_name) + # leave the values at NA + } + # put data for one page into object with data for all pages + metab_perpage[[total_nr_pages]] <- metab_onepage + # create list of page headers + metab_category <- c(metab_category, paste(metab_class, page_nr, sep = "_")) + } + } + # add page headers to list + names(metab_perpage) <- metab_category + + return(metab_perpage) +} diff --git a/DIMS/Utils/prepare_toplist.R b/DIMS/Utils/prepare_toplist.R new file mode 100644 index 0000000..cc616c4 --- /dev/null +++ b/DIMS/Utils/prepare_toplist.R @@ -0,0 +1,35 @@ +prepare_toplist <- function(pt_name, zscore_patients_copy) { + #' Create a dataframe with the top 20 highest and top 10 lowest metabolites per patient + #' + #' @param pt_name: patient code (string) + #' @param zscore_patients_copy: dataframe with metabolite Z-scores per patient (dataframe) + #' + #' @return: dataframe with 30 metabolites and Z-scores (dataframe) + + # set parameters for table + top_highest <- 20 + top_lowest <- 10 + + # extract data for patient of interest (pt_name) + pt_list <- zscore_patients_copy[, c(1, 2, which(colnames(zscore_patients_copy) == pt_name))] + # sort metabolites on Z-scores for this patient + pt_list_sort <- sort(pt_list[, 3], index.return = TRUE) + # determine top highest and lowest Z-scores for this patient + pt_list_sort <- sort(pt_list[, 3], index.return = TRUE) + pt_list_low <- pt_list[pt_list_sort$ix[1:top_lowest], ] + pt_list_high <- pt_list[pt_list_sort$ix[length(pt_list_sort$ix):(length(pt_list_sort$ix) - top_highest + 1)], ] + # round off Z-scores + pt_list_low[, 3] <- round(as.numeric(pt_list_low[, 3]), 2) + pt_list_high[, 3] <- round(as.numeric(pt_list_high[, 3]), 2) + # add lines for increased, decreased + extra_line1 <- c("Increased", "", "") + extra_line2 <- c("Decreased", "", "") + top_metab_pt <- rbind(extra_line1, pt_list_high, extra_line2, pt_list_low) + # remove row names + rownames(top_metab_pt) <- NULL + + # change column names for display + colnames(top_metab_pt) <- c("HMDB_ID", "Metabolite", "Z-score") + + return(top_metab_pt) +} From b75fabadc79387c0ee0e7e9ece15003b8324163a Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Thu, 18 Apr 2024 17:13:45 +0200 Subject: [PATCH 56/73] Output paths added --- DIMS/AverageTechReplicates.nf | 1 + DIMS/GenerateExcel.nf | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf index 9fe9a98..9e47af3 100644 --- a/DIMS/AverageTechReplicates.nf +++ b/DIMS/AverageTechReplicates.nf @@ -18,6 +18,7 @@ process AverageTechReplicates { path('*_avg.RData'), emit: binned_files path('miss_infusions_negative.txt') path('miss_infusions_positive.txt') + path('*_TICplots.pdf') script: """ diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf index 32ec928..86055bd 100644 --- a/DIMS/GenerateExcel.nf +++ b/DIMS/GenerateExcel.nf @@ -14,8 +14,9 @@ process GenerateExcel { output: path('AdductSums_*.txt') path('*IS_results.RData') - path('*.xlsx'), emit: excel_file - path('plots'), emit: plot_files + path('*_positive_control.RData') + path('*.xlsx'), emit: excel_files + path('plots/*.png'), emit: plot_files script: """ From 3687f14b880d75437645edfc037b3597ccccd3ea Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Thu, 18 Apr 2024 17:14:34 +0200 Subject: [PATCH 57/73] Changed to Unidentified steps to parallel --- DIMS/UnidentifiedCalcZscores.R | 11 +++++--- DIMS/UnidentifiedCollectPeaks.R | 48 ++++++++++++++++++++++++++++++-- DIMS/UnidentifiedCollectPeaks.nf | 3 +- DIMS/UnidentifiedFillMissing.R | 48 ++++++++++++++++---------------- DIMS/UnidentifiedPeakGrouping.R | 40 +++++++++++++++++++------- DIMS/UnidentifiedPeakGrouping.nf | 2 +- 6 files changed, 110 insertions(+), 42 deletions(-) diff --git a/DIMS/UnidentifiedCalcZscores.R b/DIMS/UnidentifiedCalcZscores.R index 2bce990..e58063a 100755 --- a/DIMS/UnidentifiedCalcZscores.R +++ b/DIMS/UnidentifiedCalcZscores.R @@ -15,10 +15,13 @@ source(paste0(scripts_dir, "statistics_z.R")) scanmodes <- c("positive", "negative") for (scanmode in scanmodes) { - # get list of files - filled_file <- paste0("PeakGroupList_", scanmode, "_Unidentified_filled.RData") - # load file - outlist_total <- get(load(filled_file)) + filled_files <- list.files("./", full.names = TRUE, pattern = paste0(scanmode, ".{1,}_Unidentified_filled")) + # load files and combine into one object + outlist_total <- NULL + for (file_nr in 1:length(filled_files)) { + peakgrouplist_filled <- get(load(filled_files[file_nr])) + outlist_total <- rbind(outlist_total, peakgrouplist_filled) + } # remove duplicates; peak groups with exactly the same m/z outlist_total <- mergeDuplicatedRows(outlist_total) diff --git a/DIMS/UnidentifiedCollectPeaks.R b/DIMS/UnidentifiedCollectPeaks.R index a5674f1..3de66d5 100755 --- a/DIMS/UnidentifiedCollectPeaks.R +++ b/DIMS/UnidentifiedCollectPeaks.R @@ -1,6 +1,8 @@ #!/usr/bin/Rscript ## adapted from 7-collectSamplesGroupedHMDB.R +options(digits = 16) + # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) @@ -20,12 +22,54 @@ for (scanmode in scanmodes) { # load list_of_peaks_used_in_peak_groups_identified load(files[file_index]) remove <- c(remove, - which(outlist_total[, "mzmed.pkt"] %in% list_of_peaks_used_in_peak_groups_identified[file_index, "mzmed.pkt"])) + which(outlist_total[, "mzmed.pkt"] %in% list_of_peaks_used_in_peak_groups_identified[, "mzmed.pkt"])) # nolint } outlist_rest <- outlist_total[-remove, ] # sort on mass outlist <- outlist_rest[order(as.numeric(outlist_rest[, "mzmed.pkt"])), ] # save output - save(outlist, file = paste0("SpectrumPeaks_", scanmode, "_Unidentified.RData")) + # save(outlist, file = paste0("SpectrumPeaks_", scanmode, "_Unidentified.RData")) + + size_parts <- 10000 + # start <- 1 + + num_parts <- ceiling(nrow(outlist) / size_parts) + + for (part in 1:num_parts){ + if (part == 1) { + start_part <- 1 + end_part <- size_parts + } else { + start_part <- (part - 1) * size_parts + 1 + } + if (part == num_parts) { + end_part <- nrow(outlist) + } else if (part != 1) { + end_part <- part * size_parts + } + + outlist_part <- outlist[c(start_part:end_part), ] + # # add ppm extra before start + # if (part != 1) { + # mz_start <- outlist_part[1, "mzmed.pkt"] + # mz_ppm_range <- ppm * as.numeric(mz_start) / 1e+06 + # mz_start_min_ppm <- mz_start - mz_ppm_range + # outlist_before_part <- outlist %>% filter(mzmed.pkt >= mz_start_min_ppm & mzmed.pkt < mz_start) + + # outlist_part <- rbind(outlist_before_part, outlist_part) + # } + + # add ppm extra after end + if (part != num_parts) { + mz_end <- outlist_part[nrow(outlist_part), "mzmed.pkt"] + mz_ppm_range <- ppm * as.numeric(mz_end) / 1e+06 + mz_end_plus_ppm <- mz_end + mz_ppm_range + outlist_after_part <- outlist %>% filter(mzmed.pkt > mz_end & mzmed.pkt <= mz_end_plus_ppm) + + outlist_part <- rbind(outlist_part, outlist_after_part) + } + + save(outlist_part, file = paste0(scanmode, "_", paste0("unidentified_part_", part, ".RData"))) + } } diff --git a/DIMS/UnidentifiedCollectPeaks.nf b/DIMS/UnidentifiedCollectPeaks.nf index 6b2656e..4f419a7 100644 --- a/DIMS/UnidentifiedCollectPeaks.nf +++ b/DIMS/UnidentifiedCollectPeaks.nf @@ -9,7 +9,8 @@ process UnidentifiedCollectPeaks { path(peaklist_identified) output: - path('SpectrumPeaks_*_Unidentified.RData') + // path('SpectrumPeaks_*_Unidentified.RData') + path('*.RData') script: """ diff --git a/DIMS/UnidentifiedFillMissing.R b/DIMS/UnidentifiedFillMissing.R index 2ee085f..0fbdea4 100755 --- a/DIMS/UnidentifiedFillMissing.R +++ b/DIMS/UnidentifiedFillMissing.R @@ -4,12 +4,12 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -peakgrouplist_file1 <- cmd_args[1] -peakgrouplist_file2 <- cmd_args[2] -scripts_dir <- cmd_args[3] -thresh <- as.numeric(cmd_args[4]) -resol <- as.numeric(cmd_args[5]) -ppm <- as.numeric(cmd_args[6]) +peakgrouplist_file <- cmd_args[1] +# peakgrouplist_file2 <- cmd_args[2] +scripts_dir <- cmd_args[2] +thresh <- as.numeric(cmd_args[3]) +resol <- as.numeric(cmd_args[4]) +ppm <- as.numeric(cmd_args[5]) outdir <- "./" # load in function scripts @@ -23,27 +23,27 @@ source(paste0(scripts_dir, "ident.hires.noise.HPC.R")) source(paste0(scripts_dir, "elementInfo.R")) source(paste0(scripts_dir, "globalAssignments.HPC.R")) -peakgrouplist_files <- c(peakgrouplist_file1, peakgrouplist_file2) -for (peakgrouplist_file in peakgrouplist_files) { +# peakgrouplist_files <- c(peakgrouplist_file1, peakgrouplist_file2) +# for (peakgrouplist_file in peakgrouplist_files) { - if (grepl("_pos", peakgrouplist_file)) { - scanmode <- "positive" - } else if (grepl("_neg", peakgrouplist_file)) { - scanmode <- "negative" - } +if (grepl("_pos", basename(peakgrouplist_file))) { + scanmode <- "positive" +} else if (grepl("_neg", basename(peakgrouplist_file))) { + scanmode <- "negative" +} - # get replication pattern for sample names - pattern_file <- paste0(scanmode, "_repl_pattern.RData") - repl_pattern <- get(load(pattern_file)) +# get replication pattern for sample names +pattern_file <- paste0(scanmode, "_repl_pattern.RData") +repl_pattern <- get(load(pattern_file)) - # load peak group list and determine output file name - outpgrlist_identified <- get(load(peakgrouplist_file)) +# load peak group list and determine output file name +outpgrlist_identified <- get(load(peakgrouplist_file)) - outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) +outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) - # replace missing values (zeros) with random noise - peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) +# replace missing values (zeros) with random noise +peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) - # save output - save(peakgrouplist_filled, file = outputfile_name) -} +# save output +save(peakgrouplist_filled, file = outputfile_name) +# } diff --git a/DIMS/UnidentifiedPeakGrouping.R b/DIMS/UnidentifiedPeakGrouping.R index 16dd8c1..4600820 100755 --- a/DIMS/UnidentifiedPeakGrouping.R +++ b/DIMS/UnidentifiedPeakGrouping.R @@ -4,8 +4,9 @@ # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) -resol <- as.numeric(cmd_args[1]) -ppm <- as.numeric(cmd_args[2]) +unidentified_peaklist <- cmd_args[1] +resol <- as.numeric(cmd_args[2]) +ppm <- as.numeric(cmd_args[3]) outdir <- "./" options(digits = 16) @@ -76,14 +77,33 @@ grouping_rest <- function(outdir, unidentified_peaklist, scanmode, ppm) { return(outpgrlist) } -scanmodes <- c("positive", "negative") +# scanmodes <- c("positive", "negative") -for (scanmode in scanmodes) { - # generate peak group lists of the unidentified peaks - unidentified_peaklist <- paste0("SpectrumPeaks_", scanmode, "_Unidentified.RData") - outpgrlist <- grouping_rest(outdir, unidentified_peaklist, scanmode, ppm = ppm) - write.table(outpgrlist, file = paste0("PeakGroupList_", scanmode, "_Unidentified.txt")) +# for (scanmode in scanmodes) { +# # generate peak group lists of the unidentified peaks +# unidentified_peaklist <- paste0("SpectrumPeaks_", scanmode, "_Unidentified.RData") +# outpgrlist <- grouping_rest(outdir, unidentified_peaklist, scanmode, ppm = ppm) +# write.table(outpgrlist, file = paste0("PeakGroupList_", scanmode, "_Unidentified.txt")) - # save output in RData format for further processing - save(outpgrlist, file=paste0("PeakGroupList_", scanmode, "_Unidentified.RData")) +# # save output in RData format for further processing +# save(outpgrlist, file=paste0("PeakGroupList_", scanmode, "_Unidentified.RData")) +# } + +# determine appropriate scanmode based on unidentified_peaklist file +if (grepl("negative", basename(unidentified_peaklist))) { + scanmode <- "negative" +} else if (grepl("positive", basename(unidentified_peaklist))) { + scanmode <- "positive" } + +# generate peak group lists of the unidentified peaks +outpgrlist <- grouping_rest(outdir, unidentified_peaklist, scanmode, ppm = ppm) + +# determine part number of unidentified_peaklist file +part_number <- gsub("\\D", "", basename(unidentified_peaklist)) + +# save output in txt format +write.table(outpgrlist, file = paste0("PeakGroupList_", scanmode, "_part_", part_number, "_Unidentified.txt")) + +# save output in RData format for further processing +save(outpgrlist, file=paste0("PeakGroupList_", scanmode, "_part_", part_number, "_Unidentified.RData")) diff --git a/DIMS/UnidentifiedPeakGrouping.nf b/DIMS/UnidentifiedPeakGrouping.nf index 2988195..ce140ec 100644 --- a/DIMS/UnidentifiedPeakGrouping.nf +++ b/DIMS/UnidentifiedPeakGrouping.nf @@ -14,6 +14,6 @@ process UnidentifiedPeakGrouping { script: """ - Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedPeakGrouping.R $params.resolution $params.ppm + Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedPeakGrouping.R $unidentified_spectrumpeaks_files $params.resolution $params.ppm """ } From 7187c2f478a4093e34b1fb24269a88c70a9a8e2b Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Fri, 19 Apr 2024 13:48:23 +0200 Subject: [PATCH 58/73] Added changes HPC and parallelisation Unidentified --- DIMS/GenerateExcel.R | 4 +++- DIMS/GenerateExcel.nf | 7 +++---- DIMS/GenerateViolinPlots.R | 2 +- DIMS/GenerateViolinPlots.nf | 2 +- DIMS/UnidentifiedCollectPeaks.R | 4 ++-- DIMS/Utils/fit_optim.R | 2 +- 6 files changed, 11 insertions(+), 10 deletions(-) diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index 420b2e1..b2edf5c 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -181,9 +181,11 @@ if (z_score == 1) { } colnames(outlist_noZ)[startcol:ncol(outlist_noZ)] <- cnames_robust - # output metabolites filtered on relevance into tab-separated file + # output metabolites filtered on relevance + save(outlist, file = paste0("AdductSums_filtered_Zscores.RData")) write.table(outlist, file = paste0("AdductSums_filtered_Zscores.txt"), sep = "\t", row.names = FALSE) # output filtered metabolites with robust scaled Zscores + save(outlist_noZ, file = paste0("AdductSums_filtered_robustZ.RData")) write.table(outlist_noZ, file = paste0("AdductSums_filtered_robustZ.txt"), sep = "\t", row.names = FALSE) # get the IDs of the patients and sort diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf index 86055bd..689d3ef 100644 --- a/DIMS/GenerateExcel.nf +++ b/DIMS/GenerateExcel.nf @@ -12,11 +12,10 @@ process GenerateExcel { path(relevance_file) output: - path('AdductSums_*.txt') - path('*IS_results.RData') - path('*_positive_control.RData') + path('*.RData') + path('*.txt') path('*.xlsx'), emit: excel_files - path('plots/*.png'), emit: plot_files + path('*.png'), emit: plot_files script: """ diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R index 38a8c61..562c794 100644 --- a/DIMS/GenerateViolinPlots.R +++ b/DIMS/GenerateViolinPlots.R @@ -421,7 +421,7 @@ if (violin == 1) { # get output file for Helix output_helix <- output_for_helix(protocol_name, dims_helix_table) # write output to file - path_helixfile <- paste0(outdir, "/output_Helix_", run_name,".csv") + path_helixfile <- paste0(output_dir, "/output_Helix_", run_name,".csv") write.csv(output_helix, path_helixfile, quote = F, row.names = F) } } diff --git a/DIMS/GenerateViolinPlots.nf b/DIMS/GenerateViolinPlots.nf index a1f5259..641d27a 100644 --- a/DIMS/GenerateViolinPlots.nf +++ b/DIMS/GenerateViolinPlots.nf @@ -11,7 +11,7 @@ process GenerateViolinPlots { output: path('Diagnostics/*.pdf'), emit: diag_plot_files path('Other/*.pdf'), emit: other_plot_files - path('dIEM/*.pdf'), emit: diem_plot_files + path('dIEM_plots/*.pdf'), emit: diem_plot_files path('*.xlsx'), emit: excel_file path('*.csv'), emit: helix_file diff --git a/DIMS/UnidentifiedCollectPeaks.R b/DIMS/UnidentifiedCollectPeaks.R index 3de66d5..61e00f3 100755 --- a/DIMS/UnidentifiedCollectPeaks.R +++ b/DIMS/UnidentifiedCollectPeaks.R @@ -64,8 +64,8 @@ for (scanmode in scanmodes) { if (part != num_parts) { mz_end <- outlist_part[nrow(outlist_part), "mzmed.pkt"] mz_ppm_range <- ppm * as.numeric(mz_end) / 1e+06 - mz_end_plus_ppm <- mz_end + mz_ppm_range - outlist_after_part <- outlist %>% filter(mzmed.pkt > mz_end & mzmed.pkt <= mz_end_plus_ppm) + mz_end_plus_ppm <- as.numeric(mz_end) + mz_ppm_range + outlist_after_part <- as.data.frame(outlist) %>% filter(mzmed.pkt > mz_end & mzmed.pkt <= mz_end_plus_ppm) outlist_part <- rbind(outlist_part, outlist_after_part) } diff --git a/DIMS/Utils/fit_optim.R b/DIMS/Utils/fit_optim.R index ac3a9ee..bc5f7d3 100644 --- a/DIMS/Utils/fit_optim.R +++ b/DIMS/Utils/fit_optim.R @@ -2,7 +2,7 @@ # variables with fixed values will be removed from function parameters # plot, width, height # fit_gaussian should be defined before this function is called. -fit_op4tim <- function(mass_vector, int_vector, resol, +fit_optim <- function(mass_vector, int_vector, resol, plot, scanmode, int_factor, width, height) { #' Determine optimized fit of Gaussian curve to small region of m/z #' From f9335619d9fd7bc05e3c564f5bc2f3f7eb2a4d36 Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Fri, 19 Apr 2024 14:22:25 +0200 Subject: [PATCH 59/73] Added loading package --- DIMS/UnidentifiedCollectPeaks.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/DIMS/UnidentifiedCollectPeaks.R b/DIMS/UnidentifiedCollectPeaks.R index 61e00f3..be1ac0e 100755 --- a/DIMS/UnidentifiedCollectPeaks.R +++ b/DIMS/UnidentifiedCollectPeaks.R @@ -1,6 +1,9 @@ #!/usr/bin/Rscript ## adapted from 7-collectSamplesGroupedHMDB.R +# load required packages +suppressMessages(library("dplyr")) + options(digits = 16) # define parameters From 8397c12319f93ed5876ae7374e65fbf01f66e65a Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Fri, 19 Apr 2024 15:31:17 +0200 Subject: [PATCH 60/73] Changed output --- DIMS/GenerateExcel.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf index 689d3ef..6134766 100644 --- a/DIMS/GenerateExcel.nf +++ b/DIMS/GenerateExcel.nf @@ -15,7 +15,7 @@ process GenerateExcel { path('*.RData') path('*.txt') path('*.xlsx'), emit: excel_files - path('*.png'), emit: plot_files + path('plots/*.png'), emit: plot_files script: """ From 507c88dd405edbbcd217383a388697e15871525d Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Mon, 22 Apr 2024 14:05:10 +0200 Subject: [PATCH 61/73] cleaned up peak finding, fill missing functions --- DIMS/FillMissing.R | 2 +- DIMS/PeakFinding.R | 2 +- DIMS/Utils/atomic_info.R | 92 ++++++++ DIMS/Utils/calculate_zscores.R | 63 +++++ DIMS/Utils/check_overlap.R | 24 ++ DIMS/Utils/estimate_area.R | 30 +++ DIMS/Utils/fit_gaussian.R | 78 ++---- DIMS/Utils/fit_gaussians.R | 1 - DIMS/Utils/fit_init.R | 1 + DIMS/Utils/fit_optim.R | 4 +- DIMS/Utils/fit_peaks.R | 381 ++++++++++++++++++++++++++++++ DIMS/Utils/get_element_info.R | 24 ++ DIMS/Utils/get_fit_quality.R | 34 +++ DIMS/Utils/identify_noisepeaks.R | 103 ++++++++ DIMS/Utils/merge_duplicate_rows.R | 59 +++++ DIMS/Utils/optimize_gaussfit.R | 23 ++ DIMS/Utils/replace_zeros.R | 64 +++++ DIMS/Utils/search_mzrange.R | 20 +- DIMS/Utils/sum_curves.R | 35 +++ DIMS/Utils/within_ppm.R | 64 +++++ 20 files changed, 1030 insertions(+), 74 deletions(-) create mode 100644 DIMS/Utils/atomic_info.R create mode 100644 DIMS/Utils/calculate_zscores.R create mode 100644 DIMS/Utils/check_overlap.R create mode 100644 DIMS/Utils/estimate_area.R create mode 100644 DIMS/Utils/fit_peaks.R create mode 100644 DIMS/Utils/get_element_info.R create mode 100644 DIMS/Utils/get_fit_quality.R create mode 100644 DIMS/Utils/identify_noisepeaks.R create mode 100644 DIMS/Utils/merge_duplicate_rows.R create mode 100644 DIMS/Utils/optimize_gaussfit.R create mode 100644 DIMS/Utils/replace_zeros.R create mode 100644 DIMS/Utils/sum_curves.R create mode 100644 DIMS/Utils/within_ppm.R diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R index f7166d0..f96e257 100755 --- a/DIMS/FillMissing.R +++ b/DIMS/FillMissing.R @@ -39,7 +39,7 @@ outpgrlist_identified <- get(load(peakgrouplist_file)) outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) # replace missing values (zeros) with random noise -peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) +peakgrouplist_filled <- replace_zeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) # save output save(peakgrouplist_filled, file = outputfile_name) diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R index e2edc85..08eb546 100644 --- a/DIMS/PeakFinding.R +++ b/DIMS/PeakFinding.R @@ -63,4 +63,4 @@ print(thresh) print(width) print(height) -findPeaks.Gauss.HPC(sample_avgtechrepl, breaks_fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) +do_peakfinding(sample_avgtechrepl, breaks_fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) diff --git a/DIMS/Utils/atomic_info.R b/DIMS/Utils/atomic_info.R new file mode 100644 index 0000000..94d0e83 --- /dev/null +++ b/DIMS/Utils/atomic_info.R @@ -0,0 +1,92 @@ +## adapted from globalAssignments.HPC.R +# relative abundancies from theoretical mass and composition +# not a function, but a large amount of objects that will become available in memory +# refactor: include in get_element_info +# check if all these elements are necessary. Only use snake_make for Hmass -> hydrogen_mass. +# move source library to higher level +options(digits = 16) + +suppressPackageStartupMessages(library(lattice)) + +# The following list was copied from Rdisop elements.R and corrected for C, H, O, Cl, S according to NIST +Ba <- list(name = "Ba", mass = 130, isotope = list(mass = c(-0.093718, 0, -0.094958, 0, -0.095514, -0.094335, -0.095447, -0.094188, -0.094768), abundance = c(0.00106, 0, 0.00101, 0, 0.02417, 0.06592, 0.07854, 0.1123, 0.717))) +Br <- list(name = "Br", mass = 79, isotope = list(mass = c(-0.0816639, 0, -0.083711), abundance = c(0.5069, 0, 0.4931))) +C <- list(name = "C", mass = 12, isotope = list(mass = c(0, 0.003354838, 0.003241989), abundance = c(0.9893, 0.0107, 0))) +Ca <- list(name = "Ca", mass = 40, isotope = list(mass = c(-0.0374094, 0, -0.0413824, -0.0412338, -0.0445194, 0, -0.046311, 0, -0.047467), abundance = c(0.96941, 0, 0.00647, 0.00135, 0.02086, 0, 4e-05, 0, 0.00187))) +Cl <- list(name = "Cl", mass = 35, isotope = list(mass = c(-0.03114732, 0, -0.03409741), abundance = c(0.7576, 0, 0.2424))) +Cr <- list(name = "Cr", mass = 50, isotope = list(mass = c(-0.0539536, 0, -0.0594902, -0.0593487, -0.0611175), abundance = c(0.04345, 0, 0.83789, 0.09501, 0.02365))) +Cu <- list(name = "Cu", mass = 63, isotope = list(mass = c(-0.0704011, 0, -0.0722071), abundance = c(0.6917, 0, 0.3083))) +F <- list(name = "F", mass = 19, isotope = list(mass = c(-0.00159678), abundance = c(1))) +Fe <- list(name = "Fe", mass = 54, isotope = list(mass = c(-0.0603873, 0, -0.0650607, -0.0646042, -0.0667227), abundance = c(0.058, 0, 0.9172, 0.022, 0.0028))) +H <- list(name = "H", mass = 1, isotope = list(mass = c(0.00782503207, 0.014101778, 0.01604928), abundance = c(0.999885, 0.000115, 0))) +Hg <- list(name = "Hg", mass = 196, isotope = list(mass = c(-0.034193, 0, -0.033257, -0.031746, -0.0317, -0.029723, -0.029383, 0, -0.026533), abundance = c(0.0015, 0, 0.0997, 0.1687, 0.231, 0.1318, 0.2986, 0, 0.0687))) +I <- list(name = "I", mass = 127, isotope = list(mass = c(-0.095527), abundance = c(1))) +K <- list(name = "K", mass = 39, isotope = list(mass = c(-0.0362926, -0.0360008, -0.0381746), abundance = c(0.932581, 0.000117, 0.067302))) +Li <- list(name = "Li", mass = 6, isotope = list(mass = c(0.0151214, 0.016003), abundance = c(0.075, 0.925))) +Mg <- list(name = "Mg", mass = 24, isotope = list(mass = c(-0.0149577, -0.0141626, -0.0174063), abundance = c(0.7899, 0.1, 0.1101))) +Mn <- list(name = "Mn", mass = 55, isotope = list(mass = c(-0.0619529), abundance = c(1))) +N <- list(name = "N", mass = 14, isotope = list(mass = c(0.003074002, 0.00010897), abundance = c(0.99634, 0.00366))) +Na <- list(name = "Na", mass = 23, isotope = list(mass = c(-0.0102323), abundance = c(1))) +Ni <- list(name = "Ni", mass = 58, isotope = list(mass = c(-0.0646538, 0, -0.0692116, -0.0689421, -0.0716539, 0, -0.0720321), abundance = c(0.68077, 0, 0.26223, 0.0114, 0.03634, 0, 0.00926))) +O <- list(name = "O", mass = 16, isotope = list(mass = c(-0.00508538044, -0.0008683, -0.0008397), abundance = c(0.99757, 0.000381, 0.00205))) +P <- list(name = "P", mass = 31, isotope = list(mass = c(-0.026238), abundance = c(1))) +S <- list(name = "S", mass = 32, isotope = list(mass = c(-0.027929, -0.02854124, -0.0321331, 0, -0.03291924), abundance = c(0.9499, 0.0075, 0.0425, 0, 1e-04))) +Se <- list(name = "Se", mass = 74, isotope = list(mass = c(-0.0775254, 0, -0.080788, -0.0800875, -0.0826924, 0, -0.0834804, 0, -0.0833022), abundance = c(0.0089, 0, 0.0936, 0.0763, 0.2378, 0, 0.4961, 0, 0.0873))) +Si <- list(name = "Si", mass = 28, isotope = list(mass = c(-0.0230729, -0.0235051, -0.0262293), abundance = c(0.9223, 0.0467, 0.031))) +Sn <- list(name = "Sn", mass = 112, isotope = list(mass = c(-0.095174, 0, -0.097216, -0.096652, -0.098253, -0.097044, -0.098391, -0.09669, -0.0978009, 0, -0.0965596, 0, -0.0947257), abundance = c(0.0097, 0, 0.0065, 0.0034, 0.1453, 0.0768, 0.2423, 0.0859, 0.3259, 0, 0.0463, 0, 0.0579))) +Zn <- list(name = "Zn", mass = 64, isotope = list(mass = c(-0.0708552, 0, -0.0739653, -0.0728709, -0.0751541, 0, -0.074675), abundance = c(0.486, 0, 0.279, 0.041, 0.188, 0, 0.006))) + +# The following list is for our own use +NH4 <- list(name = "NH4", mass = 18, isotope = list(mass = c(0.03437, 0.03141, -0.95935)), abundance = c(0.995, 0.004, 0.001)) # SISweb: 18.03437 100 19.03141 0.4 19.04065 0.1 +Ac <- list(name = "Ac", mass = 60, isotope = list(mass = c(0.02114, 0.02450, 0.02538)), abundance = c(0.975, 0.021, 0.004)) # SISweb: 60.02114 100 61.02450 2.2 62.02538 0.4 +NaCl <- list(name = "NaCl", mass = 58, isotope = list(mass = c(-0.04137, 0, -0.04433)), abundance = c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +NaCl2 <- list(name = "NaCl2", mass = 116, isotope = list(mass = c(-0.08274, 0, -0.08866)), abundance = c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +NaCl3 <- list(name = "NaCl3", mass = 174, isotope = list(mass = c(-0.12411, 0, -0.13299)), abundance = c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +NaCl4 <- list(name = "NaCl4", mass = 232, isotope = list(mass = c(-0.16548, 0, -0.17732)), abundance = c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +NaCl5 <- list(name = "NaCl5", mass = 290, isotope = list(mass = c(-0.20685, 0, -0.22165)), abundance = c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 +For <- list(name = "For", mass = 45, isotope = list(mass = c(-0.00233, 0.00103)), abundance = c(0.989, 0.011)) # SISweb: 46.00549 100 47.00885 1.1 (47.0097 0.1) 48.00973 0.4 +Na2 <- list(name = "2Na-H", mass = 46, isotope = list(mass = c(-1.0282896)), abundance = c(1)) # SISweb for Na2: 45.97954 100 # minus 1 H ! +Met <- list(name = "CH3OH", mass = 32, isotope = list(mass = c(1.034045, 1.037405)), abundance = c(0.989, 0.011)) # SISweb: 32.02622 100 33.02958 1.1 33.0325 0.1 34.03046 0.2 +CH3OH <- list(name = "CH3OH", mass = 32, isotope = list(mass = c(1.034045, 1.037405)), abundance = c(0.989, 0.011)) # SISweb: 32.02622 100 33.02958 1.1 33.0325 0.1 34.03046 0.2 +Na3 <- list(name = "3Na-2H", mass = 69, isotope = list(mass = c(-2.0463469)), abundance = c(1)) # SISweb for Na2: 45.97954 100 # minus 1 H ! +KCl <- list(name = "KCl", mass = 74, isotope = list(mass = c(-0.06744, 0.92961, -0.06744, 0.92772)), abundance = c(0.7047, 0.2283, 0.0507, 0.0162)) # SISweb: 73.93256 100 75.92961 32.4 75.93067 7.2 77.92772 2.3 +H2PO4 <- list(name = "H2PO4", mass = 97, isotope = list(mass = c(-0.03091)), abundance = c(1)) +HSO4 <- list(name = "HSO4", mass = 97, isotope = list(mass = c(-0.04042, 0, -0.04462)), abundance = c(0.96, 0, 0.04)) +Met2 <- list(name = "Met2", mass = 64, isotope = list(mass = c(1.060265, 1.013405)), abundance = c(0.978, 0.022)) +Met3 <- list(name = "Met3", mass = 96, isotope = list(mass = c(1.086485, 1.089845)), abundance = c(0.969, 0.031)) +Met4 <- list(name = "Met4", mass = 128, isotope = list(mass = c(1.112705, 1.116065)), abundance = c(0.959, 0.041)) +Met5 <- list(name = "Met5", mass = 160, isotope = list(mass = c(1.20935, 1.142285)), abundance = c(0.949, 0.051)) +NaminH <- list(name = "Na-H", mass = 21, isotope = list(mass = c(-0.02571416)), abundance = c(1)) +KminH <- list(name = "K-H", mass = 37, isotope = list(mass = c(-0.05194, 0.94617)), abundance = c(0.9328, 0.0672)) +H2O <- list(name = "H2O", mass = -19, isotope = list(mass = c(-0.01894358)), abundance = c(1)) +NaK <- list(name = "NaK-H", mass = 61, isotope = list(mass = c(-0.054345, 0.943765)), abundance = c(0.9328, 0.0672)) +min2H <- list(name = "min2H", mass = -2, isotope = list(mass = c(-0.0151014)), abundance = c(1)) +plus2H <- list(name = "plus2H", mass = 2, isotope = list(mass = c(0.0151014)), abundance = c(1)) +plus2Na <- list(name = "plus2Na", mass = 46, isotope = list(mass = c(-0.02046), abundance = c(1))) +plusNaH <- list(name = "plusNaH", mass = 24, isotope = list(mass = c(-0.00295588), abundance = c(1))) +plusKH <- list(name = "plusKH", mass = 40, isotope = list(mass = c(-0.029008, 0.969101)), abundance = c(0.9328, 0.0672)) +plusHNH <- list(name = "plusHNH", mass = 19, isotope = list(mass = c(0.04164642)), abundance = c(1)) +min3H <- list(name = "min3H", mass = -3, isotope = list(mass = c(-0.02182926)), abundance = c(1)) +plus3H <- list(name = "plus3H", mass = 3, isotope = list(mass = c(0.02182926)), abundance = c(1)) +plus3Na <- list(name = "plus3Na", mass = 68, isotope = list(mass = c(0.96931)), abundance = c(1)) +plus2NaH <- list(name = "plus2NaH", mass = 47, isotope = list(mass = c(0.2985712)), abundance = c(1)) +plusNa2H <- list(name = "plusNa2H", mass = 25, isotope = list(mass = c(0.00432284)), abundance = c(1)) + +# add first isotope of Cl because of high abundance +Cl37 <- list(name = "Cl37", mass = 37, isotope = list(mass = c(-0.03409741)), abundance = c(1)) + +all_elements <- list(Ba, Br, C, Ca, Cl, Cr, Cu, F, Fe, H, Hg, I, K, Li, Mg, Mn, N, Na, Ni, O, P, S, Se, Si, Sn, Zn) +all_adducts <- list(Ba, Br, Ca, Cl, Cl37, Cr, Cu, Fe, Hg, I, K, Li, Mg, Mn, Na, Ni, Se, Si, Sn, Zn, + NH4, Ac, NaCl, For, Na2, CH3OH, NaCl2, NaCl3, NaCl4, NaCl5, Na3, KCl, H2PO4, HSO4, + Met2, Met3, Met4, Met5, NaminH, KminH, H2O, NaK, min2H, plus2H, plus2Na, plusNaH, + plusKH, min3H, plus3H, plusHNH, plus3Na, plus2NaH, plusNa2H) + +atoms_inuse <- c("P", "O", "N", "C", "H", "S", "Cl", "D", "13C", "34S", "18O", "37Cl") +atomic_weights <- c(30.97376163, 15.99491463, 14.0030740052, 12.0000, 1.0078250321, 31.9720707, 34.968852721, 2.0141017778, 13.0033548378, 33.96786690, 17.9991610, 36.96590259) +electron <- 0.00054858 + +hydrogen_mass <- Hmass <- H$mass + H$isotope$mass[1] +Dmass <- H$mass + 1 + H$isotope$mass[2] +Tmass <- H$mass + 2 + H$isotope$mass[3] +C13mass <- C$mass + 1 + C$isotope$mass[2] +N15mass <- N$mass + 1 + N$isotope$mass[2] diff --git a/DIMS/Utils/calculate_zscores.R b/DIMS/Utils/calculate_zscores.R new file mode 100644 index 0000000..9791e45 --- /dev/null +++ b/DIMS/Utils/calculate_zscores.R @@ -0,0 +1,63 @@ +## adapted from statistics_z.R +# refactor: change column names from avg.ctrls to avg_ctrls, sd.ctrls to sd_ctrls +# remove parameter sort_col +# check logic of parameter adducts +calculate_zscores <- function(peakgroup_list, sort_col, adducts) { + #' Calculate Z-scores for peak groups based on average and standard deviation of controls + #' + #' @param peakgroup_list: Peak group list (matrix) + #' @param sort_col: Column to sort on (string) + #' @param adducts: Parameter indicating whether there are adducts in the list (boolean) + #' + #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix) + + case_label <- "P" + control_label <- "C" + + # calculate mean and standard deviation for Control group + ctrl_cols <- grep(control_label, colnames(peakgroup_list), fixed = TRUE) + case_cols <- grep(case_label, colnames(peakgroup_list), fixed = TRUE) + int_cols <- c(ctrl_cols, case_cols) + # set al zeros to NA + peakgroup_list[, int_cols][peakgroup_list[, int_cols] == 0] <- NA + ctrl_ints <- peakgroup_list[, ctrl_cols, drop = FALSE] + peakgroup_list$avg.ctrls <- apply(ctrl_ints, 1, function(x) mean(as.numeric(x), na.rm = TRUE)) + peakgroup_list$sd.ctrls <- apply(ctrl_ints, 1, function(x) sd(as.numeric(x), na.rm = TRUE)) + + # set new column names and calculate Z-scores + colnames_zscores <- NULL + for (col_index in int_cols) { + col_name <- colnames(peakgroup_list)[col_index] + colnames_zscores <- c(colnames_zscores, paste0(col_name, "_Zscore")) + zscores_1col <- (as.numeric(as.vector(unlist(peakgroup_list[, col_index]))) - + peakgroup_list$avg.ctrls) / peakgroup_list$sd.ctrls + peakgroup_list <- cbind(peakgroup_list, zscores_1col) + } + + # apply new column names to columns at end plus avg and sd columns + startcol <- ncol(peakgroup_list) + 3 + colnames(peakgroup_list)[startcol:ncol(peakgroup_list)] <- colnames_zscores + + # add ppm deviation column + zscore_cols <- grep("Zscore", colnames(peakgroup_list), fixed = TRUE) + if (!adducts) { + if ((dim(peakgroup_list[, zscore_cols])[2] + 6) != (startcol - 1)) { + ppmdev <- array(1:nrow(peakgroup_list), dim = c(nrow(peakgroup_list))) + # calculate ppm deviation + for (i in 1:nrow(peakgroup_list)) { + if (!is.na(peakgroup_list$theormz_HMDB[i]) && + !is.null(peakgroup_list$theormz_HMDB[i]) && + (peakgroup_list$theormz_HMDB[i] != "")) { + ppmdev[i] <- 10^6 * (as.numeric(as.vector(peakgroup_list$mzmed.pgrp[i])) - + as.numeric(as.vector(peakgroup_list$theormz_HMDB[i]))) / + as.numeric(as.vector(peakgroup_list$theormz_HMDB[i])) + } else { + ppmdev[i] <- NA + } + } + peakgroup_list <- cbind(peakgroup_list[, 1:6], ppmdev = ppmdev, peakgroup_list[, 7:ncol(peakgroup_list)]) + } + } + + return(peakgroup_list) +} diff --git a/DIMS/Utils/check_overlap.R b/DIMS/Utils/check_overlap.R new file mode 100644 index 0000000..66492e8 --- /dev/null +++ b/DIMS/Utils/check_overlap.R @@ -0,0 +1,24 @@ +## adapted from checkOverlap.R +check_overlap <- function(range1, range2) { + #' Modify range1 and range2 in case of overlap + #' + #' @param range1: Vector of m/z values for first peak (float) + #' @param range2: Vector of m/z values for second peak (float) + #' + #' @return new_ranges: list of two ranges (list) + + # Check for overlap + if (length(intersect(range1, range2)) == 2) { + if (length(range1) >= length(range2)) { + range1 <- range1[-length(range1)] + } else { + range2 <- range2[-1] + } + } else if (length(intersect(range1, range2)) == 3) { + range1 <- range1[-length(range1)] + range2 <- range2[-1] + } + new_ranges <- list("range1" = range1, "range2" = range2) + return(new_ranges) +} + diff --git a/DIMS/Utils/estimate_area.R b/DIMS/Utils/estimate_area.R new file mode 100644 index 0000000..1d24de2 --- /dev/null +++ b/DIMS/Utils/estimate_area.R @@ -0,0 +1,30 @@ +## adapted from getArea.R +# variables with fixed values will be removed from function parameters +# int_factor +estimate_area <- function(mass_max, resol, scale, sigma, int_factor) { + #' Estimate area of Gaussian curve + #' + #' @param mass_max: Value for m/z at maximum intensity of a peak (float) + #' @param resol: Value for resolution (integer) + #' @param scale: Value for peak width (float) + #' @param sigma: Value for standard deviation (float) + #' @param int_factor: Value used to calculate area under Gaussian curve (integer) + #' + #' @return area_curve: Value for area under the Gaussian curve (float) + + # avoid vectors that are too big (cannot allocate vector of size ...) + if (mass_max > 1200) return(0) + + # generate a mass_vector with equally spaced m/z values + fwhm <- get_fwhm(mass_max, resol) + mz_min <- mass_max - 2 * fwhm + mz_max <- mass_max + 2 * fwhm + mz_range <- mz_max - mz_min + mass_vector2 <- seq(mz_min, mz_max, length = mz_range * int.factor) + + # estimate area under the curve + area_curve <- sum(scale * dnorm(mass_vector2, mass_max, sigma)) / 100 + + return(area_curve) +} + diff --git a/DIMS/Utils/fit_gaussian.R b/DIMS/Utils/fit_gaussian.R index 9dc7e52..35275cd 100644 --- a/DIMS/Utils/fit_gaussian.R +++ b/DIMS/Utils/fit_gaussian.R @@ -38,7 +38,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale # One local maximum: if (force == 1) { # determine fit values for 1 Gaussian peak (mean, scale, sigma, qual) - fit_values <- fit1Peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol, plot, fit_quality1, use_bounds) + fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + plot, fit_quality1, use_bounds) # set initial value for scale factor scale <- 2 # test if the mean is outside the m/z range @@ -75,8 +76,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale #### Two local maxima; need at least 6 data points for this #### } else if (force == 2 && (length(mass_vector) > 6)) { # determine fit values for 2 Gaussian peaks (mean, scale, sigma, qual) - fit_values <- fit2peaks(mass_vector2, mass_vector, int_vector, new_index, scale, resol, - use_bounds, plot, fit_quality, int_factor) + fit_values <- fit_2peaks(mass_vector2, mass_vector, int_vector, new_index, scale, resol, + use_bounds, plot, fit_quality, int_factor) # test if one of the means is outside the m/z range if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)]) { @@ -114,9 +115,9 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale new_index <- c(new_index, 2 * new_index, 3 * new_index) } # run this function again with three local maxima - return(fitGaussian(mass_vector2, mass_vector, int_vector, new_index, - scale, resol, outdir, force = 3, use_bounds = FALSE, - plot, scanmode, int_factor, width, height)) + return(fit_gaussian(mass_vector2, mass_vector, int_vector, new_index, + scale, resol, outdir, force = 3, use_bounds = FALSE, + plot, scanmode, int_factor, width, height)) # good fit, all means are within m/z range } else { # check if means are within 3 ppm and sum if so @@ -125,32 +126,20 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale nr_means <- length(fit_values$mean) while (nr_means != nr_means_new) { nr_means <- length(fit_values$mean) - fit_values <- isWithinXppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, + fit_values <- within_ppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, mass_vector2, mass_vector, ppm = 4, resol, plot) nr_means_new <- length(fit_values$mean) } # restore original quality score fit_values$qual <- tmp - # section for plot - # plot_header <- NULL - # for (i in 1:length(fit_values$mean)) { - # plot_header <- c(plot_header, paste("mean =", fit_values$mean[i], sep = " ")) - # peak_mean <- c(peak_mean, fit_values$mean[i]) - # peak_area <- c(peak_area, fit_values$area[i]) - # } - # peak_qual <- fit_values$qual - # peak_min <- mass_vector[1] - # peak_max <- mass_vector[length(mass_vector)] - # plot_header <- c(plot_header, paste("fq =", fit_values$qual, sep = " ")) - # if (plot) legend("topright", legend=plot_header) } } - #### Three local maxima; need at least 6 data points for this #### + # Three local maxima; need at least 6 data points for this } else if (force == 3 && (length(mass_vector) > 6)) { # determine fit values for 3 Gaussian peaks (mean, scale, sigma, qual) - fit_values <- fit3peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol, - use_bounds, plot, fit_quality, int_factor) + fit_values <- fit_3peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + use_bounds, plot, fit_quality, int_factor) # test if one of the means is outside the m/z range if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] || @@ -199,35 +188,22 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale tmp <- fit_values$qual nr_means_new <- -1 nr_means <- length(fit_values$mean) - while (nr_means!=nr_means_new) { + while (nr_means != nr_means_new) { nr_means <- length(fit_values$mean) - fit_values <- isWithinXppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, - mass_vector2, mass_vector, ppm = 4, resol, plot) + fit_values <- within_ppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, + mass_vector2, mass_vector, ppm = 4, resol, plot) nr_means_new <- length(fit_values$mean) } # restore original quality score fit_values$qual <- tmp - - # plot_header <- NULL - # for (i in 1:length(fit_values$mean)){ - # plot_header <- c(plot_header, paste("mean =", fit_values$mean[i], sep = " ")) - # - # peak_mean <- c(peak_mean, fit_values$mean[i]) - # peak_area <- c(peak_area, fit_values$area[i]) - # } - # peak_qual <- fit_values$qual - # peak_min <- mass_vector[1] - # peak_max <- mass_vector[length(mass_vector)] - # plot_header <- c(plot_header, paste("fq =", fit_values$qual, sep = " ")) - # if (plot) legend("topright", legend=plot_header) } } #### Four local maxima; need at least 6 data points for this #### } else if (force == 4 && (length(mass_vector) > 6)) { # determine fit values for 4 Gaussian peaks (mean, scale, sigma, qual) - fit_values <- fit4peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol, - use_bounds, plot, fit_quality, int_factor) + fit_values <- fit_4peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + use_bounds, plot, fit_quality, int_factor) # test if one of the means is outside the m/z range if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] || @@ -273,24 +249,12 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale nr_means <- length(fit_values$mean) while (nr_means != nr_means_new) { nr_means <- length(fit_values$mean) - fit_values <- isWithinXppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, - mass_vector2, mass_vector, ppm = 4, resol, plot) + fit_values <- within_ppm(fit_values$mean, fit_values$scale, fit_values$sigma, fit_values$area, + mass_vector2, mass_vector, ppm = 4, resol, plot) nr_means_new <- length(fit_values$mean) } # restore original quality score fit_values$qual <- tmp - - # plot_header<-NULL - # for (i in 1:length(fit_values$mean)){ - # plot_header <- c(plot_header, paste("mean =", fit_values$mean[i], sep = " ")) - # peak_mean <- c(peak_mean, fit_values$mean[i]) - # peak_area <- c(peak_area, fit_values$area[i]) - # } - # peak_qual <- fit_values$qual - # peak_min <- mass_vector[1] - # peak_max <- mass_vector[length(mass_vector)] - # plot_header <- c(plot_header, paste("fq =", fit_values$qual, sep = " ")) - # if (plot) legend("topright", legend=plot_header) } } @@ -300,8 +264,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale fit_quality1 <- 0.40 use_bounds <- TRUE max_index <- which(int_vector == max(int_vector)) - fit_values <- fit1Peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol, - plot, fit_quality1, use_bounds) + fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol, + plot, fit_quality1, use_bounds) # check for bad fit if (fit_values$qual > fit_quality1) { # remove @@ -315,7 +279,7 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale peak_qual <- 0 } else { peak_mean <- c(peak_mean, fit_values$mean) - peak_area <- c(peak_area, getArea(fit_values$mean, resol, fit_values$scale, fit_values$sigma, int_factor)) + peak_area <- c(peak_area, get_area(fit_values$mean, resol, fit_values$scale, fit_values$sigma, int_factor)) peak_qual <- fit_values$qual peak_min <- mass_vector[1] peak_max <- mass_vector[length(mass_vector)] diff --git a/DIMS/Utils/fit_gaussians.R b/DIMS/Utils/fit_gaussians.R index 007b8ed..c4924e1 100644 --- a/DIMS/Utils/fit_gaussians.R +++ b/DIMS/Utils/fit_gaussians.R @@ -140,7 +140,6 @@ fit_3gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sigma3, return(opt_fit) } - fit_4gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sigma3, sigma4, query_mass1, scale1, query_mass2, scale2, diff --git a/DIMS/Utils/fit_init.R b/DIMS/Utils/fit_init.R index f9e0c27..c761194 100644 --- a/DIMS/Utils/fit_init.R +++ b/DIMS/Utils/fit_init.R @@ -2,6 +2,7 @@ # variables with fixed values will be removed from function parameters # scale, outdir, plot, width, height # mz_index, start_index, end_index, sample_name not used. +# use_bounds is used, but not defined. # fit_gaussian should be defined before this function is called. fit_init <- function(mass_vector, int_vector, int_factor, scale, resol, outdir, sample_name, scanmode, plot, width, height, diff --git a/DIMS/Utils/fit_optim.R b/DIMS/Utils/fit_optim.R index bc5f7d3..e280b57 100644 --- a/DIMS/Utils/fit_optim.R +++ b/DIMS/Utils/fit_optim.R @@ -34,10 +34,10 @@ fit_optim <- function(mass_vector, int_vector, resol, mass_vector2 <- seq(mass_max_simple[1], mass_max_simple[length(mass_max_simple)], length = mz_diff * int_factor) sigma <- get_stdev(mass_vector2, int_max_simple) - scale <- optimizeGauss(mass_vector2, int_max_simple, sigma, mass_max) + scale <- optimize_gaussfit(mass_vector2, int_max_simple, sigma, mass_max) # get an estimate of the area under the peak - area <- getArea(mass_max, resol, scale, sigma, int_factor) + area <- get_area(mass_max, resol, scale, sigma, int_factor) # put all values for this region of interest into a list roi_value_list <- list("mean" = mass_max, diff --git a/DIMS/Utils/fit_peaks.R b/DIMS/Utils/fit_peaks.R new file mode 100644 index 0000000..92e08d6 --- /dev/null +++ b/DIMS/Utils/fit_peaks.R @@ -0,0 +1,381 @@ +## adapted from fit1Peak.R, fit2peaks.R, fit3peaks.R and fit4peaks.R (combined) +# variables with fixed values will be removed from function parameters +# plot, int_factor +fit_1peak <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol, plot, fit_quality, use_bounds) { + #' Fit 1 Gaussian peak in small region of m/z + #' + #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Value used to calculate area under Gaussian curve (integer) + #' @param max_index: Index in int_vector with the highest intensity (integer) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param resol: Value for resolution (integer) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' + #' @return roi_value_list: list of fit values for region of interest (list) + + if (length(int_vector) < 3) { + message("Range too small, no fit possible!") + } else { + if ((length(int_vector) == 4)) { + # fit 1 peak + mu <- weighted.mean(mass_vector, int_vector) + sigma <- get_stdev(mass_vector, int_vector) + fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, scale, use_bounds) + } else { + # set range vector + if ((length(mass_vector) - length(max_index)) < 2) { + range1 <- c((length(mass_vector) - 4) : length(mass_vector)) + } else if (length(max_index) < 2) { + range1 <- c(1:5) + } else { + range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2) + } + if (range1[1] == 0) range1 <- range1[-1] + # remove NA + if (length(which(is.na(int_vector[range1]))) != 0) { + range1 <- range1[-which(is.na(int_vector[range1]))] + } + # fit 1 peak + mu <- weighted.mean(mass_vector[range1], int_vector[range1]) + sigma <- get_stdev(mass_vector[range1], int_vector[range1]) + fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, scale, use_bounds) + } + + p1 <- fitted_peak$par + + # get new value for fit quality and scale + fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new + scale_new <- 1.2 * scale + + # bad fit + if (fq_new > fit_quality) { + # optimize scaling factor + fq <- 0 + scale <- 0 + if (sum(int_vector) > sum(p1[2] * dnorm(mass_vector, p1[1], sigma))) { + while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_new < 10000)) { + fq <- fq_new + scale <- scale_new + # fit 1 peak + fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, scale, use_bounds) + p1 <- fitted_peak$par + # get new value for fit quality and scale + fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new + scale_new <- 1.2 * scale + } + } else { + while ((round(fq, digits = 3) != round(fq_new, digits = 3)) && (scale_new < 10000)) { + fq <- fq_new + scale <- scale_new + # fit 1 peak + fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, scale, use_bounds) + p1 <- fitted_peak$par + # get new value for fit quality and scale + fq_new <- get_fit_quality(mass_vector, int_vector, p1[1], p1[1], resol, p1[2], sigma)$fq_new + scale_new <- 0.8 * scale + } + } + # use optimized scale factor to fit 1 peak + if (fq < fq_new) { + fitted_peak <- fit_1gaussian(mass_vector, int_vector, sigma, mu, scale, use_bounds) + p1 <- fitted_peak$par + fq_new <- fq + } + } + } + + roi_value_list <- list("mean" = p1[1], "scale" = p1[2], "sigma" = sigma, "qual" = fq_new) + return(roi_value_list) +} + +fit_2peaks <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol, use_bounds = FALSE, + plot = FALSE, fit_quality, int_factor) { + #' Fit 2 Gaussian peaks in small region of m/z + #' + #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Value used to calculate area under Gaussian curve (integer) + #' @param max_index: Index in int_vector with the highest intensity (integer) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param resol: Value for resolution (integer) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' @param int_factor: Value used to calculate area under Gaussian curve (integer) + #' + #' @return roi_value_list: list of fit values for region of interest (list) + + peak_mean <- NULL + peak_area <- NULL + peak_scale <- NULL + peak_sigma <- NULL + + # set range vectors for 2 peaks + range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2) + if (range1[1] == 0) range1 <- range1[-1] + range2 <- c(max_index[2] - 2, max_index[2] - 1, max_index[2], max_index[2] + 1, max_index[2] + 2) + if (length(mass_vector) < range2[length(range2)]) range2 <- range2[-length(range2)] + range1 <- check_overlap(range1, range2)[[1]] + range2 <- check_overlap(range1, range2)[[2]] + # check for negative or 0 + remove <- which(range1 < 1) + if (length(remove) > 0) range1 <- range1[-remove] + remove <- which(range2 < 1) + if (length(remove) > 0) range2 <- range2[-remove] + # remove NA + if (length(which(is.na(int_vector[range1]))) != 0) range1 <- range1[-which(is.na(int_vector[range1]))] + if (length(which(is.na(int_vector[range2]))) != 0) range2 <- range2[-which(is.na(int_vector[range2]))] + + # fit 2 peaks, first separately, then together + mu1 <- weighted.mean(mass_vector[range1], int_vector[range1]) + sigma1 <- get_stdev(mass_vector[range1], int_vector[range1]) + fitted_peak <- fit_1gaussian(mass_vector[range1], int_vector[range1], sigma1, mu1, scale, use_bounds) + p1 <- fitted_peak$par + # second peak + mu2 <- weighted.mean(mass_vector[range2], int_vector[range2]) + sigma2 <- get_stdev(mass_vector[range2], int_vector[range2]) + fitted_peak <- fit_1gaussian(mass_vector[range2], int_vector[range2], sigma2, mu2, scale, use_bounds) + p2 <- fitted_peak$par + # combined + fitted_2peaks <- fit_2gaussians(mass_vector, int_vector, sigma1, sigma2, p1[1], p1[2], p2[1], p2[2], use_bounds) + pc <- fitted_2peaks$par + + # get fit quality + if (is.null(sigma2)) sigma2 <- sigma1 + sum_fit <- (pc[2] * dnorm(mass_vector, pc[1], sigma1)) + + (pc[4] * dnorm(mass_vector, pc[3], sigma2)) + fq <- get_fit_quality(mass_vector, int_vector, sort(c(pc[1], pc[3]))[1], sort(c(pc[1], pc[3]))[2], + resol, sum_fit = sum_fit)$fq_new + + # get parameter values + area1 <- get_area(pc[1], resol, pc[2], sigma1, int_factor) + area2 <- get_area(pc[3], resol, pc[4], sigma2, int_factor) + peak_area <- c(peak_area, area1) + peak_area <- c(peak_area, area2) + peak_mean <- c(peak_mean, pc[1]) + peak_mean <- c(peak_mean, pc[3]) + peak_scale <- c(peak_scale, pc[2]) + peak_scale <- c(peak_scale, pc[4]) + peak_sigma <- c(peak_sigma, sigma1) + peak_sigma <- c(peak_sigma, sigma2) + + roi_value_list <- list("mean" = peak_mean, "scale" = peak_scale, "sigma" = peak_sigma, "area" = peak_area, "qual" = fq) + return(roi_value_list) +} + +fit_3peaks <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol, use_bounds = FALSE, + plot = FALSE, fit_quality, int_factor) { + #' Fit 3 Gaussian peaks in small region of m/z + #' + #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Value used to calculate area under Gaussian curve (integer) + #' @param max_index: Index in int_vector with the highest intensity (integer) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param resol: Value for resolution (integer) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' @param int_factor: Value used to calculate area under Gaussian curve (integer) + #' + #' @return roi_value_list: list of fit values for region of interest (list) + + peak_mean <- NULL + peak_area <- NULL + peak_scale <- NULL + peak_sigma <- NULL + + # set range vectors for 3 peaks + range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2) + range2 <- c(max_index[2] - 2, max_index[2] - 1, max_index[2], max_index[2] + 1, max_index[2] + 2) + range3 <- c(max_index[3] - 2, max_index[3] - 1, max_index[3], max_index[3] + 1, max_index[3] + 2) + remove <- which(range1 < 1) + if (length(remove) > 0) { + range1 <- range1[-remove] + } + remove <- which(range2 < 1) + if (length(remove) > 0) { + range2 <- range2[-remove] + } + if (length(mass_vector) < range3[length(range3)]) range3 <- range3[-length(range3)] + range1 <- check_overlap(range1, range2)[[1]] + range2 <- check_overlap(range1, range2)[[2]] + range2 <- check_overlap(range2, range3)[[1]] + range3 <- check_overlap(range2, range3)[[2]] + # check for negative or 0 + remove <- which(range1 < 1) + if (length(remove) > 0) range1 <- range1[-remove] + remove <- which(range2 < 1) + if (length(remove) > 0) range2 <- range2[-remove] + remove <- which(range3 < 1) + if (length(remove) > 0) range3 <- range3[-remove] + # remove NA + if (length(which(is.na(int_vector[range1]))) != 0) range1 <- range1[-which(is.na(int_vector[range1]))] + if (length(which(is.na(int_vector[range2]))) != 0) range2 <- range2[-which(is.na(int_vector[range2]))] + if (length(which(is.na(int_vector[range3]))) != 0) range3 <- range3[-which(is.na(int_vector[range3]))] + + # fit 3 peaks, first separately, then together + mu1 <- weighted.mean(mass_vector[range1], int_vector[range1]) + sigma1 <- get_stdev(mass_vector[range1], int_vector[range1]) + fitted_peak <- fit_1gaussian(mass_vector[range1], int_vector[range1], sigma1, mu1, scale, use_bounds) + p1 <- fitted_peak$par + # second peak + mu2 <- weighted.mean(mass_vector[range2], int_vector[range2]) + sigma2 <- get_stdev(mass_vector[range2], int_vector[range2]) + fitted_peak <- fit_1gaussian(mass_vector[range2], int_vector[range2], sigma2, mu2, scale, use_bounds) + p2 <- fitted_peak$par + # third peak + mu3 <- weighted.mean(mass_vector[range3], int_vector[range3]) + sigma3 <- get_stdev(mass_vector[range3], int_vector[range3]) + fitted_peak <- fit_1gaussian(mass_vector[range3], int_vector[range3], sigma3, mu3, scale, use_bounds) + p3 <- fitted_peak$par + # combined + fitted_3peaks <- fit_3gaussians(mass_vector, int_vector, sigma1, sigma2, sigma3, + p1[1], p1[2], p2[1], p2[2], p3[1], p3[2], use_bounds) + pc <- fitted_3peaks$par + + # get fit quality + sum_fit = (pc[2] * dnorm(mass_vector, pc[1], sigma1)) + + (pc[4] * dnorm(mass_vector, pc[3], sigma2)) + + (pc[6] * dnorm(mass_vector, pc[5], sigma3)) + fq <- get_fit_quality(mass_vector, int_vector, sort(c(pc[1], pc[3], pc[5]))[1], sort(c(pc[1], pc[3], pc[5]))[3], + resol, sum_fit = sum_fit)$fq_new + + # get parameter values + area1 <- get_area(pc[1], resol, pc[2], sigma1, int_factor) + area2 <- get_area(pc[3], resol, pc[4], sigma2, int_factor) + area3 <- get_area(pc[5], resol, pc[6], sigma3, int_factor) + peak_area <- c(peak_area, area1) + peak_area <- c(peak_area, area2) + peak_area <- c(peak_area, area3) + peak_mean <- c(peak_mean, pc[1]) + peak_mean <- c(peak_mean, pc[3]) + peak_mean <- c(peak_mean, pc[5]) + peak_scale <- c(peak_scale, pc[2]) + peak_scale <- c(peak_scale, pc[4]) + peak_scale <- c(peak_scale, pc[6]) + peak_sigma <- c(peak_sigma, sigma1) + peak_sigma <- c(peak_sigma, sigma2) + peak_sigma <- c(peak_sigma, sigma3) + + roi_value_list <- list("mean" = peak_mean, "scale" = peak_scale, "sigma" = peak_sigma, "area" = peak_area, "qual" = fq) + return(roi_value_list) +} + +fit_4peaks <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol, use_bounds = FALSE, + plot = FALSE, fit_quality, int_factor) { + #' Fit 4 Gaussian peaks in small region of m/z + #' + #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Value used to calculate area under Gaussian curve (integer) + #' @param max_index: Index in int_vector with the highest intensity (integer) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param resol: Value for resolution (integer) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' @param fit_quality: Value indicating quality of fit of Gaussian curve (float) + #' @param use_bounds: Boolean to indicate whether boundaries are to be used + #' @param int_factor: Value used to calculate area under Gaussian curve (integer) + #' + #' @return roi_value_list: list of fit values for region of interest (list) + + peak_mean <- NULL + peak_area <- NULL + peak_scale <- NULL + peak_sigma <- NULL + + # set range vectors for 4 peaks + range1 <- c(max_index[1] - 2, max_index[1] - 1, max_index[1], max_index[1] + 1, max_index[1] + 2) + range2 <- c(max_index[2] - 2, max_index[2] - 1, max_index[2], max_index[2] + 1, max_index[2] + 2) + range3 <- c(max_index[3] - 2, max_index[3] - 1, max_index[3], max_index[3] + 1, max_index[3] + 2) + range4 <- c(max_index[4] - 2, max_index[4] - 1, max_index[4], max_index[4] + 1, max_index[4] + 2) + if (range1[1] == 0) range1 <- range1[-1] + if (length(mass_vector) < range4[length(range4)]) range4 <- range4[-length(range4)] + range1 <- check_overlap(range1, range2)[[1]] + range2 <- check_overlap(range1, range2)[[2]] + range2 <- check_overlap(range2, range3)[[1]] + range3 <- check_overlap(range2, range3)[[2]] + range3 <- check_overlap(range3, range4)[[1]] + range4 <- check_overlap(range3, range4)[[2]] + remove <- which(range4 > length(mass_vector)) + if (length(remove) > 0) { + range4 <- range4[-remove] + } + # check for negative or 0 + remove <- which(range1 < 1) + if (length(remove) > 0) range1 <- range1[-remove] + remove <- which(range2 < 1) + if (length(remove) > 0) range2 <- range2[-remove] + remove <- which(range3 < 1) + if (length(remove) > 0) range3 <- range3[-remove] + remove <- which(range4 < 1) + if (length(remove) > 0) range4 <- range4[-remove] + # remove NA + if (length(which(is.na(int_vector[range1]))) != 0) range1 <- range1[-which(is.na(int_vector[range1]))] + if (length(which(is.na(int_vector[range2]))) != 0) range2 <- range2[-which(is.na(int_vector[range2]))] + if (length(which(is.na(int_vector[range3]))) != 0) range3 <- range3[-which(is.na(int_vector[range3]))] + if (length(which(is.na(int_vector[range4]))) != 0) range4 <- range4[-which(is.na(int_vector[range4]))] + + # fit 4 peaks, first separately, then together + mu1 <- weighted.mean(mass_vector[range1], int_vector[range1]) + sigma1 <- get_stdev(mass_vector[range1], int_vector[range1]) + fitted_peak <- fit_1gaussian(mass_vector[range1], int_vector[range1], sigma1, mu1, scale, use_bounds) + p1 <- fitted_peak$par + # second peak + mu2 <- weighted.mean(mass_vector[range2], int_vector[range2]) + sigma2 <- get_stdev(mass_vector[range2], int_vector[range2]) + fitted_peak <- fit_1gaussian(mass_vector[range2], int_vector[range2], sigma2, mu2, scale, use_bounds) + p2 <- fitted_peak$par + # third peak + mu3 <- weighted.mean(mass_vector[range3], int_vector[range3]) + sigma3 <- get_stdev(mass_vector[range3], int_vector[range3]) + fitted_peak <- fit_1gaussian(mass_vector[range3], int_vector[range3], sigma3, mu3, scale, use_bounds) + p3 <- fitted_peak$par + # fourth peak + mu4 <- weighted.mean(mass_vector[range4], int_vector[range4]) + sigma4 <- get_stdev(mass_vector[range4], int_vector[range4]) + fitted_peak <- fit_1gaussian(mass_vector[range4], int_vector[range4], sigma4, mu4, scale, use_bounds) + p4 <- fitted_peak$par + # combined + fitted_4peaks <- fit_4gaussians(mass_vector, int_vector, sigma1, sigma2, sigma3, sigma3, + p1[1], p1[2], p2[1], p2[2], p3[1], p3[2], p4[1], p4[2], use_bounds) + pc <- fitted_4peaks$par + + # get fit quality + sum_fit <- (pc[2] * dnorm(mass_vector, pc[1], sigma1)) + + (pc[4] * dnorm(mass_vector, pc[3], sigma2)) + + (pc[6] * dnorm(mass_vector, pc[5], sigma3)) + + (pc[8] * dnorm(mass_vector, pc[7], sigma3)) + fq <- get_fit_quality(mass_vector, int_vector, + sort(c(pc[1], pc[3], pc[5], pc[7]))[1], sort(c(pc[1], pc[3], pc[5], pc[7]))[4], + resol, sum_fit = sum_fit)$fq_new + + # get parameter values + area1 <- get_area(pc[1], resol, pc[2], sigma1, int_factor) + area2 <- get_area(pc[3], resol, pc[4], sigma2, int_factor) + area3 <- get_area(pc[5], resol, pc[6], sigma3, int_factor) + area4 <- get_area(pc[7], resol, pc[8], sigma4, int_factor) + peak_area <- c(peak_area, area1) + peak_area <- c(peak_area, area2) + peak_area <- c(peak_area, area3) + peak_area <- c(peak_area, area4) + peak_mean <- c(peak_mean, pc[1]) + peak_mean <- c(peak_mean, pc[3]) + peak_mean <- c(peak_mean, pc[5]) + peak_mean <- c(peak_mean, pc[7]) + peak_scale <- c(peak_scale, pc[2]) + peak_scale <- c(peak_scale, pc[4]) + peak_scale <- c(peak_scale, pc[6]) + peak_scale <- c(peak_scale, pc[8]) + peak_sigma <- c(peak_sigma, sigma1) + peak_sigma <- c(peak_sigma, sigma2) + peak_sigma <- c(peak_sigma, sigma3) + peak_sigma <- c(peak_sigma, sigma4) + + roi_value_list <- list("mean" = peak_mean, "scale" = peak_scale, "sigma" = peak_sigma, "area" = peak_area, "qual" = fq) + return(roi_value_list) +} + diff --git a/DIMS/Utils/get_element_info.R b/DIMS/Utils/get_element_info.R new file mode 100644 index 0000000..c4bdd92 --- /dev/null +++ b/DIMS/Utils/get_element_info.R @@ -0,0 +1,24 @@ +## adapted from elementInfo.R, which is adapted from Rdisop function .getElement +# refactor: check where library is initialised: library(Rdisop) +get_element_info <- function(name, elements = NULL) { + #' Get info on m/z and isotopes for all chemical elements + #' + #' @param name: Name of adduct, e.g. Na (string) + #' @param elements: List of all adducts to take into account (list of strings) + #' + #' @return element_info: peak group list with filled-in intensities (matrix) + + # get information on all elements + if (!is.list(elements) || length(elements) == 0 ) { + all_elements <- initializePSE() + } + # extract information for a particular adduct + if (name == "CH3OH+H") { + # regular_expr should be exact match for name, except for methanol + regular_expr <- "^CH3OH\\+H$" + } else { + regular_expr <- paste0("^", name, "$") + } + element_info <- all_elements[[grep(regular_expr, sapply(all_elements, function(x) { x$name }))]] + return(element_info) +} \ No newline at end of file diff --git a/DIMS/Utils/get_fit_quality.R b/DIMS/Utils/get_fit_quality.R new file mode 100644 index 0000000..23b1947 --- /dev/null +++ b/DIMS/Utils/get_fit_quality.R @@ -0,0 +1,34 @@ +## adapted from getFitQuality.R +# parameter not used: mu_last +get_fit_quality <- function(mass_vector, int_vector, mu_first, mu_last, resol, scale = NULL, sigma = NULL, sum_fit = NULL) { + #' Fit 1 Gaussian peak in small region of m/z + #' + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Value used to calculate area under Gaussian curve (integer) + #' @param mu_first: Value for first peak (float) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param resol: Value for resolution (integer) + #' @param sum_fit: Value indicating quality of fit of Gaussian curve (float) + #' + #' @return list_params: list of parameters indicating quality of fit (list) + if (is.null(sum_fit)) { + mass_vector_int <- mass_vector + int_vector_int <- int_vector + # get new fit quality + fq_new <- mean(abs((scale * dnorm(mass_vector_int, mu_first, sigma)) - int_vector_int) / + rep((max(scale * dnorm(mass_vector_int, mu_first, sigma)) / 2), length(mass_vector_int))) + } else { + sum_fit_int <- sum_fit + int_vector_int <- int_vector + mass_vector_int <- mass_vector + # get new fit quality + fq_new <- mean(abs(sum_fit_int - int_vector_int) / rep(max(sum_fit_int) /2, length(sum_fit_int))) + } + + # Prevent division by 0 + if (is.nan(fq_new)) fq_new <- 1 + + list_params <- list("fq_new" = fq_new, "x_int" = mass_vector_int, "y_int" = int_vector_int) + return(list_params) +} + diff --git a/DIMS/Utils/identify_noisepeaks.R b/DIMS/Utils/identify_noisepeaks.R new file mode 100644 index 0000000..5e557cb --- /dev/null +++ b/DIMS/Utils/identify_noisepeaks.R @@ -0,0 +1,103 @@ +## adapted from ident.hires.noise.HPC +# refactor: remove variables slope, incpt, ppm_iso_fixed +# combine with function get_element_info +# modified identify function to also look for adducts and their isotopes +identify_noisepeaks <- function(peakgroup_list, all_adducts, scanmode = "Negative", look4 = c("Cl", "Ac"), + noise_mz = NULL, resol = 140000, slope = 0, incpt = 0, ppm_fixed, ppm_iso_fixed) { + #' Replace intensities that are zero with random value + #' + #' @param peakgroup_list: Peak group list (matrix) + #' @param all_adducts: List of adducts to take into account (list of strings) + #' @param scanmode: Scan mode, positive or negative (string) + #' @param look4: List of adducts to look for (list of strings) + #' @param noise_mz: All known noise peaks (matrix) + #' @param resol: Value for resolution (integer) + #' @param slope: Value for slope for mass correction (float) + #' @param incpt: Value for intercept for mass correction (float) + #' @param ppm_fixed: Value for distance between two values of mass (integer) + #' @param ppm_iso_fixed: Value for distance between two values of mass for isotope peaks (integer) + #' + #' @return final_outlist: peak group list with filled-in intensities (matrix) + + options(stringsAsFactors = FALSE) + metlin <- assi <- iso <- rep("", nrow(peakgroup_list)) + theormz <- nisos <- expint <- conf <- rep(0, nrow(peakgroup_list)) + + # add adducts to identification list + if (scanmode == "Positive") { + adduct_scanmode <- "+" + } else { + adduct_scanmode <- "-" + } + # make a copy of noise_mz + noise_mz_orig <- noise_mz + + # loop over type of adduct + for (adduct_index in 1:length(look4)) { + noise_mz_adduct <- noise_mz_orig + noise_mz_adduct[, "CompoundName"] <- as.character(noise_mz_orig[, "CompoundName"]) + + if (look4[adduct_index] == "H2O") { + add2label <- paste0("[M-", look4[adduct_index], "]", adduct_scanmode) + } else { + add2label <- paste0("[M+", look4[adduct_index], "]", adduct_scanmode) + } + + noise_mz_adduct[, "CompoundName"] <- paste0(noise_mz_adduct[, "CompoundName "], add2label) + adduct_info <- get_element_info(look4[adduct_index], all_adducts) + if (scanmode == "Positive") { + adduct_mass <- adduct_info$mass[1] + adduct_info$isotope$mass[1] - hydrogen_mass + } else { + adduct_mass <- adduct_info$mass[1] + adduct_info$isotope$mass[1] + hydrogen_mass + } + + # loop over compounds in database + for (compound_index in 1:nrow(noise_mz_adduct)) { + # construct information for compound + adduct: + if (scanmode == "Positive") { + noise_mz_adduct[compound_index, "Mpos"] <- as.numeric(noise_mz_adduct[compound_index, "Mpos"]) + adduct_mass + noise_mz_adduct[compound_index, "MNeg"] <- 0 + } else { + noise_mz_adduct[compound_index, "Mpos"] <- 0 + noise_mz_adduct[compound_index, "MNeg"] <- as.numeric(noise_mz_adduct[compound_index, "MNeg"]) + adduct_mass + } + } + noise_mz <- rbind(noise_mz, noise_mz_adduct) + } + + if (scanmode == "Positive") { + theor_mcol <- as.numeric(noise_mz[, "Mpos"]) + } else { + theor_mcol <- as.numeric(noise_mz[, "MNeg"]) + } + + # get mz information from peakgroup_list + mcol <- peakgroup_list[, "mzmed.pgrp"] + # if column with average intensities is missing, calculate it: + if (!("avg.int" %in% colnames(peakgroup_list))) { + mzmaxcol <- which(colnames(peakgroup_list) == "mzmax.pgrp") + endcol <- ncol(peakgroup_list) + peakgroup_list[, "avg.int"] <- apply(peakgroup_list[, (mzmaxcol + 1):(endcol)], 1, mean) + } + + # do indentification using own database: + for (row_index in 1:nrow(noise_mz)) { + theor_mz <- theor_mcol[row_index] + + # set tolerance for mz accuracy of main peak + mtol <- theor_mz * ppm_fixed / 1000000 + # find main peak + selp <- which(mcol > (theor_mz - mtol) & mcol < (theor_mz + mtol)) + # if there is more than one candidate peak for main, select best one based on mz_diff + if (length(selp) > 1) { + selp <- selp[abs(mcol[selp] - theor_mz) == min(abs(mcol[selp] - theor_mz))] + } + if (length(selp) == 1) { + assi[selp] <- paste(assi[selp], as.character(noise_mz[row_index, "CompoundName"]), sep = ";") + theormz[selp] <- theor_mz + } + } + + final_outlist <- cbind(peakgroup_list, assi, theormz, conf, nisos, iso, expint, metlin) + return(final_outlist) +} diff --git a/DIMS/Utils/merge_duplicate_rows.R b/DIMS/Utils/merge_duplicate_rows.R new file mode 100644 index 0000000..da3bc52 --- /dev/null +++ b/DIMS/Utils/merge_duplicate_rows.R @@ -0,0 +1,59 @@ +## adapted from mergeDuplicatedRows.R +merge_duplicate_rows <- function(peakgroup_list) { + #' Merge identification info for peak groups with the same mass + #' + #' @param peakgroup_list: Peak group list (matrix) + #' + #' @return peakgroup_list_dedup: de-duplicated peak group list (matrix) + + collapse <- function(column_label, peakgroup_list, index_dup) { + #' Collapse identification info for peak groups with the same mass + #' + #' @param column_label: Name of column in peakgroup_list (string) + #' @param peakgroup_list: Peak group list (matrix) + #' @param index_dup: Index of duplicate peak group (integer) + #' + #' @return collapsed_items: Semicolon-separated list of info (string) + # get the item(s) that need to be collapsed + list_items <- as.vector(peakgroup_list[index_dup, column_label]) + # remove NA + if (length(which(is.na(list_items))) > 0) list_items <- list_items[-which(is.na(list_items))] + collapsed_items <- paste(list_items, collapse = ";") + return(collapsed_items) + } + + options(digits = 16) + collect <- NULL + remove <- NULL + + # check for peak groups with identical mass + index_dup <- which(duplicated(peakgroup_list[, "mzmed.pgrp"])) + + while (length(index_dup) > 0) { + # get the index for the peak group which is double + peaklist_index <- which(peakgroup_list[, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"]) + single_peakgroup <- peakgroup_list[peaklist_index[1], , drop = FALSE] + + # use function collapse to concatenate info + single_peakgroup[, "assi_HMDB"] <- collapse("assi_HMDB", peakgroup_list, peaklist_index) + single_peakgroup[, "iso_HMDB"] <- collapse("iso_HMDB", peakgroup_list, peaklist_index) + single_peakgroup[, "HMDB_code"] <- collapse("HMDB_code", peakgroup_list, peaklist_index) + single_peakgroup[, "assi_noise"] <- collapse("assi_noise", peakgroup_list, peaklist_index) + if (single_peakgroup[, "assi_noise"] == ";") single_peakgroup[, "assi_noise"] <- NA + single_peakgroup[, "theormz_noise"] <- collapse("theormz_noise", peakgroup_list, peaklist_index) + if (single_peakgroup[,"theormz_noise"] == "0;0") single_peakgroup[, "theormz_noise"] <- NA + + # keep track of deduplicated entries + collect <- rbind(collect, single_peakgroup) + remove <- c(remove, peaklist_index) + + # remove current entry from index + index_dup <- index_dup[-which(peakgroup_list[index_dup, "mzmed.pgrp"] == peakgroup_list[index_dup[1], "mzmed.pgrp"])] + } + + # remove duplicate entries + if (!is.null(remove)) peakgroup_list <- peakgroup_list[-remove, ] + # append deduplicated entries + peakgroup_list_dedup <- rbind(peakgroup_list, collect) + return(peakgroup_list_dedup) +} diff --git a/DIMS/Utils/optimize_gaussfit.R b/DIMS/Utils/optimize_gaussfit.R new file mode 100644 index 0000000..2d7f95d --- /dev/null +++ b/DIMS/Utils/optimize_gaussfit.R @@ -0,0 +1,23 @@ +## adapted from optimizeGauss.R +optimize_gaussfit <- function(mass_vector, int_vector, sigma, mass_max) { + #' Optimize fit of Gaussian curve to small region of m/z + #' + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param int_vector: Vector of intensities for a region of interest (float) + #' @param sigma: Value for standard deviation (float) + #' @param mass_max: Value for mass at center of peak (float) + #' + #' @return opt_fit: list of fit values for region of interest (list) + + # define optimization function for optim based on normal distribution + opt_f <- function(p, mass_vector, int_vector, sigma, mass_max) { + curve <- p * dnorm(mass_vector, mass_max, sigma) + return((max(curve) - max(int_vector))^2) + } + + # get optimal value for fitted Gaussian curve + opt_fit <- optimize(opt_f, c(0, 100000), tol = 0.0001, mass_vector, int_vector, sigma, mass_max) + + return(opt_fit$minimum) +} + diff --git a/DIMS/Utils/replace_zeros.R b/DIMS/Utils/replace_zeros.R new file mode 100644 index 0000000..cf65340 --- /dev/null +++ b/DIMS/Utils/replace_zeros.R @@ -0,0 +1,64 @@ +## adapted from replaceZeros.R +# this function does two things: replace zeros with random value and identify noise peaks +# refactor: split into two functions +# remove parameters outdir and thresh +# make hard-coded path to file with noise peaks into variable +# remove variables outdir, thresh +replace_zeros <- function(peakgroup_list, repl_pattern, scanmode, resol, outdir, thresh, ppm) { + #' Replace intensities that are zero with random value + #' + #' @param peakgroup_list: Peak group list (matrix) + #' @param repl_pattern: Replication pattern (list of strings) + #' @param scanmode: Scan mode, positive or negative (string) + #' @param resol: Value for resolution (integer) + #' @param outdir: Path for output directory (string) + #' @param thresh: Value for threshold (integer) + #' @param ppm: Value for distance between two values of mass (integer) + #' + #' @return final_outlist: peak group list with filled-in intensities (matrix) + + # replace zeros + if (!is.null(peakgroup_list)) { + for (sample_index in 1:length(names(repl_pattern))) { + sample_peaks <- peakgroup_list[, names(repl_pattern)[sample_index]] + zero_intensity <- which(sample_peaks <= 0) + if (!length(zero_intensity)) { + next + } + for (zero_index in 1:length(zero_intensity)) { + area <- generate_gaussian(peakgroup_list[zero_intensity[zero_index], "mzmed.pgrp"], thresh, + resol, FALSE, scanmode, int_factor = 1 * 10^5, 1, 1)$area + peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1, mean = area, + sd = 0.25 * area) + } + } + + # Add column with average intensity + peakgroup_list <- cbind(peakgroup_list, "avg.int" = apply(peakgroup_list[, 7:(ncol(peakgroup_list) - 4)], 1, mean)) + + if (scanmode == "negative") { + label <- "MNeg" + label2 <- "Negative" + # look for adducts in negative mode + look4_adducts <- c("Cl", "Cl37", "For", "NaCl", "KCl", "H2PO4", "HSO4", "Na-H", "K-H", "H2O", "I") + } else { + label <- "Mpos" + label2 <- "Positive" + # look for adducts in positive mode + look4_adducts <- c("Na", "K", "NaCl", "NH4", "2Na-H", "CH3OH", "KCl", "NaK-H") + } + + # Identify noise peaks + noise_mz <- read.table(file = "/hpc/dbg_mz/tools/db/TheoreticalMZ_NegPos_incNaCl.txt", + sep = "\t", header = TRUE, quote = "") + noise_mz <- noise_mz[(noise_mz[, label] != 0), 1:4] + outlist_withnoise <- identify_noisepeaks(peakgroup_list, all_adducts, scanmode = label2, + noise_mz, look4 = look4_adducts, resol = resol, + slope = 0, incpt = 0, ppm_fixed = ppm, ppm_iso_fixed = ppm) + noise_info <- outlist_withnoise[, c("assi", "theormz")] + colnames(noise_info) <- c("assi_noise", "theormz_noise") + + final_outlist <- cbind(peakgroup_list, noise_info) + return(final_outlist) + } +} diff --git a/DIMS/Utils/search_mzrange.R b/DIMS/Utils/search_mzrange.R index aa0abbd..ba6920e 100644 --- a/DIMS/Utils/search_mzrange.R +++ b/DIMS/Utils/search_mzrange.R @@ -21,10 +21,6 @@ search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, r #' #' @return allpeaks_values: list of m/z regions of interest - # initialise list to store results for all peaks. Currently in do_peakfinding - # allpeaks_values <- list("mean" = NULL, "area" = NULL, "nr" = NULL, - # "min" = NULL, "max" = NULL, "qual" = NULL, "spikes" = 0) - # find indices where intensity is not equal to zero nonzero_indices <- as.vector(which(ints_fullrange != 0)) @@ -76,8 +72,8 @@ search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, r if (roi_values$qual[1] == 1) { # get optimized fit values - roi_values <- generateGaussian(mass_vector, int_vector, resol, plot, - scanmode, int_factor, width, height) + roi_values <- fit_optim(mass_vector, int_vector, resol, plot, + scanmode, int_factor, width, height) # add region of interest to list of all peaks allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) @@ -100,8 +96,8 @@ search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, r } else { - roi_values <- generateGaussian(mass_vector, int_vector, resol, - plot, scanmode, int_factor, width, height) + roi_values <- fit_optim(mass_vector, int_vector, resol, + plot, scanmode, int_factor, width, height) allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) @@ -148,8 +144,8 @@ search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, r mz_index, start_index, end_index) if (roi_values$qual[1] == 1) { - roi_values <- generateGaussian(mass_vector, int_vector, resol, - plot, scanmode, int_factor, width, height) + roi_values <- fit_optim(mass_vector, int_vector, resol, + plot, scanmode, int_factor, width, height) allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) @@ -170,8 +166,8 @@ search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, r } } } else { - roi_values <- generateGaussian(mass_vector, int_vector, resol, - plot, scanmode, int_factor, width, height) + roi_values <- fit_optim(mass_vector, int_vector, resol, + plot, scanmode, int_factor, width, height) allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) diff --git a/DIMS/Utils/sum_curves.R b/DIMS/Utils/sum_curves.R new file mode 100644 index 0000000..542026b --- /dev/null +++ b/DIMS/Utils/sum_curves.R @@ -0,0 +1,35 @@ +## adapted from sumCurves.R +# variables with fixed values will be removed from function parameters +# plot +# parameter half_max not used +sum_curves <- function(mean1, mean2, scale1, scale2, sigma1, sigma2, mass_vector2, mass_vector, resol, plot) { + #' Sum two curves + #' + #' @param mean1: Value for mean m/z of first peak (float) + #' @param mean2: Value for mean m/z of second peak (float) + #' @param scale1: Initial value used to estimate scaling parameter for first peak (integer) + #' @param scale2: Initial value used to estimate scaling parameter for second peak (integer) + #' @param sigma1: Value for standard deviation for first peak (float) + #' @param sigma2: Value for standard deviation for second peak (float) + #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param resol: Value for resolution (integer) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' + #' @return list_params: list of parameters indicating quality of fit (list) + + sum_fit <- (scale1 * dnorm(mass_vector2, mean1, sigma1)) + (scale2 * dnorm(mass_vector2, mean2, sigma2)) + + mean1_plus2 <- weighted.mean(c(mean1, mean2), c(max(scale1 * dnorm(mass_vector2, mean1, sigma1)), + max(scale2 * dnorm(mass_vector2, mean2, sigma2)))) + + # get new values for parameters + fwhm <- get_fwhm(mean1_plus2, resol) + area <- max(sum_fit) + scale <- scale1 + scale2 + sigma <- (fwhm / 2) * 0.85 + + list_params <- list("mean" = mean1_plus2, "area" = area, "scale" = scale, "sigma" = sigma) + return(list_params) +} + diff --git a/DIMS/Utils/within_ppm.R b/DIMS/Utils/within_ppm.R new file mode 100644 index 0000000..abdb0d4 --- /dev/null +++ b/DIMS/Utils/within_ppm.R @@ -0,0 +1,64 @@ +## adapted from isWithinXppm.R +# variables with fixed values will be removed from function parameters +# plot +within_ppm <- function(mean, scale, sigma, area, mass_vector2, mass_vector, ppm = 4, resol, plot) { + #' Test whether two mass ranges are within ppm distance of each other + #' + #' @param mean: Value for mean m/z (float) + #' @param scale: Initial value used to estimate scaling parameter (integer) + #' @param sigma: Value for standard deviation (float) + #' @param area: Value for area under the curve (float) + #' @param mass_vector2: Vector of equally spaced m/z values (float) + #' @param mass_vector: Vector of m/z values for a region of interest (float) + #' @param ppm: Value for distance between two values of mass (integer) + #' @param resol: Value for resolution (integer) + #' @param plot: Parameter indicating whether plots should be made (boolean) + #' + #' @return list_params: list of parameters indicating quality of fit (list) + + # sort + index <- order(mean) + mean <- mean[index] + scale <- scale[index] + sigma <- sigma[index] + area <- area[index] + + summed <- NULL + remove <- NULL + + if (length(mean) > 1) { + for (i in 2:length(mean)) { + if ((abs(mean[i - 1] - mean[i]) / mean[i - 1]) * 10^6 < ppm) { + + # avoid double occurance in sum + if ((i - 1) %in% summed) next + + result_values <- sum_curves(mean[i - 1], mean[i], scale[i - 1], scale[i], sigma[i - 1], sigma[i], + mass_vector2, mass_vector, resol, plot) + summed <- c(summed, i - 1, i) + if (is.nan(result_values$mean)) result_values$mean <- 0 + mean[i - 1] <- result_values$mean + mean[i] <- result_values$mean + area[i - 1] <- result_values$area + area[i] <- result_values$area + scale[i - 1] <- result_values$scale + scale[i] <- result_values$scale + sigma[i - 1] <- result_values$sigma + sigma[i] <- result_values$sigma + + remove <- c(remove, i) + } + } + } + + if (length(remove) != 0) { + mean <- mean[-c(remove)] + area <- area[-c(remove)] + scale <- scale[-c(remove)] + sigma <- sigma[-c(remove)] + } + + list_params <- list("mean" = mean, "area" = area, "scale" = scale, "sigma" = sigma, "qual" = NULL) + return(list_params) +} + From 0a8171971baf97600f5100f58d8cde954bbe489f Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Fri, 26 Apr 2024 14:24:53 +0200 Subject: [PATCH 62/73] TIC plots and TRFP flag --- DIMS/AverageTechReplicates.R | 6 ++++-- DIMS/ThermoRawFileParser.nf | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index b243025..bf27b3e 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -46,7 +46,7 @@ load(init_file) # lower the threshold below which a sample will be removed for DBS and for high m/z if (dims_matrix == "DBS") { - thresh2remove <- 50000000 + thresh2remove <- 500000000 } if (highest_mz > 700) { thresh2remove <- 1000000 @@ -66,6 +66,7 @@ for (sample_nr in 1:length(repl_pattern)) { nr_neg <- 0 for (file_nr in 1:length(tech_reps)) { load(paste(tech_reps[file_nr], ".RData", sep = "")) + cat("\n\nParsing", tech_reps[file_nr]) # negative scanmode cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1])) if (sum(peak_list$neg[, 1]) < thresh2remove) { @@ -146,7 +147,8 @@ for (sample_nr in c(1:length(repl_pattern))) { sample_name <- names(repl_pattern)[sample_nr] for (file_nr in 1:length(tech_reps)) { plot_nr <- plot_nr + 1 - repl1_nr <- read.table(tic_files[file_nr]) + repl1_nr <- read.table(paste(paste(outdir, "2-pklist/", sep = "/"), tech_reps[file_nr], "_TIC.txt", sep = "")) + # repl1_nr <- read.table(tic_files[file_nr]) bad_color_pos <- tech_reps[file_nr] %in% remove_pos[[1]] bad_color_neg <- tech_reps[file_nr] %in% remove_neg[[1]] if (bad_color_neg & bad_color_pos) { diff --git a/DIMS/ThermoRawFileParser.nf b/DIMS/ThermoRawFileParser.nf index 6e51984..022a2c9 100644 --- a/DIMS/ThermoRawFileParser.nf +++ b/DIMS/ThermoRawFileParser.nf @@ -13,6 +13,6 @@ process ConvertRawFile { """ source /hpc/dbg_mz/tools/mono/etc/profile - mono /hpc/dbg_mz/tools/ThermoRawFileParser_1.1.11/ThermoRawFileParser.exe -i=${raw_file} --output=./ + mono /hpc/dbg_mz/tools/ThermoRawFileParser_1.1.11/ThermoRawFileParser.exe -i=${raw_file} --output=./ -p """ } From 4041d978fd9989114b930462b757a91c3a9a3ed2 Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Fri, 26 Apr 2024 15:18:30 +0200 Subject: [PATCH 63/73] Fix TIC plot pdf --- DIMS/AverageTechReplicates.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index bf27b3e..81f6b46 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -147,8 +147,8 @@ for (sample_nr in c(1:length(repl_pattern))) { sample_name <- names(repl_pattern)[sample_nr] for (file_nr in 1:length(tech_reps)) { plot_nr <- plot_nr + 1 - repl1_nr <- read.table(paste(paste(outdir, "2-pklist/", sep = "/"), tech_reps[file_nr], "_TIC.txt", sep = "")) - # repl1_nr <- read.table(tic_files[file_nr]) + # repl1_nr <- read.table(paste(paste(outdir, "2-pklist/", sep = "/"), tech_reps[file_nr], "_TIC.txt", sep = "")) + repl1_nr <- read.table(paste0(tech_reps[file_nr], "_TIC.txt")) bad_color_pos <- tech_reps[file_nr] %in% remove_pos[[1]] bad_color_neg <- tech_reps[file_nr] %in% remove_neg[[1]] if (bad_color_neg & bad_color_pos) { From 2089babdd517e1024746874336d3b0416a5e9b30 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 24 May 2024 10:46:20 +0200 Subject: [PATCH 64/73] corrected error in aggregate lines --- DIMS/AssignToBins.R | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index 7c7a95a..472b1e6 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -76,19 +76,23 @@ bin_indices_neg <- cut( if (nrow(pos_raw_data_matrix) > 0) { # set NA in intensities to zero pos_raw_data_matrix[is.na(pos_raw_data_matrix[, "intensity"]), "intensity"] <- 0 - # use only values above dims_thresh - pos_intensity_above_threshold <- pos_raw_data_matrix[which(pos_raw_data_matrix[, "intensity"] > dims_thresh), "intensity"] - # aggregate intensities, calculate mean - aggr_int_pos <- stats::aggregate(pos_intensity_above_threshold, list(bin_indices_pos), mean) + # aggregate intensities, calculate mean, use only values above dims_thresh + aggr_int_pos <- stats::aggregate(pos_raw_data_matrix[, "intensity"], + list(bin_indices_pos), + FUN = function(x) { mean(x[which(x > dims_thresh)]) }) + # set NA to zero in second column + aggr_int_pos[is.na(aggr_int_pos[, 2]), 2] <- 0 pos_bins[aggr_int_pos[, 1]] <- aggr_int_pos[, 2] } if (nrow(neg_raw_data_matrix) > 0) { # set NA in intensities to zero neg_raw_data_matrix[is.na(neg_raw_data_matrix[, "intensity"]), "intensity"] <- 0 - # use only values above dims_thresh - neg_intensity_above_threshold <- neg_raw_data_matrix[which(neg_raw_data_matrix[, "intensity"] > dims_thresh), "intensity"] - # aggregate intensities, calculate mean - aggr_int_neg <- stats::aggregate(neg_intensity_above_threshold, list(bin_indices_neg), mean) + # aggregate intensities, calculate mean, use only values above dims_thresh + aggr_int_neg <- stats::aggregate(neg_raw_data_matrix[, "intensity"], + list(bin_indices_neg), + FUN = function(x) { mean(x[which(x > dims_thresh)]) }) + # set NA to zero in second column + aggr_int_neg[is.na(aggr_int_neg[, 2]), 2] <- 0 neg_bins[aggr_int_neg[, 1]] <- aggr_int_neg[, 2] } From a9c267c6629ef2dcdbad52b813b854ef6cb968e7 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 24 May 2024 14:57:07 +0200 Subject: [PATCH 65/73] code review changes applied --- DIMS/AssignToBins.R | 1 - DIMS/AverageTechReplicates.R | 1 - DIMS/AverageTechReplicates.nf | 3 +- DIMS/CollectFilled.R | 5 ++- DIMS/CollectSumAdducts.R | 1 - DIMS/FillMissing.R | 21 ++++++------ DIMS/GenerateBreaks.R | 1 - DIMS/GenerateExcel.R | 2 -- DIMS/GenerateViolinPlots.R | 1 - DIMS/HMDBparts.R | 1 - DIMS/HMDBparts_main.R | 1 - DIMS/MakeInit.R | 1 - DIMS/PeakFinding.R | 59 +++++++++++++-------------------- DIMS/PeakGrouping.R | 1 - DIMS/SpectrumPeakFinding.R | 3 +- DIMS/SumAdducts.R | 1 - DIMS/UnidentifiedCalcZscores.R | 1 - DIMS/UnidentifiedCollectPeaks.R | 1 - DIMS/UnidentifiedFillMissing.R | 1 - DIMS/UnidentifiedPeakGrouping.R | 1 - 20 files changed, 38 insertions(+), 69 deletions(-) diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index 472b1e6..8479c09 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -1,4 +1,3 @@ -#!/usr/bin/Rscript ## adapted from 2-DIMS.R # load required packages diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index 81f6b46..6a773f5 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -1,4 +1,3 @@ -#!/usr/bin/Rscript # adapted from 3-AverageTechReplicates.R # load packages diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf index 9e47af3..5387e61 100644 --- a/DIMS/AverageTechReplicates.nf +++ b/DIMS/AverageTechReplicates.nf @@ -24,7 +24,8 @@ process AverageTechReplicates { """ Rscript ${baseDir}/CustomModules/DIMS/AverageTechReplicates.R $init_file \ $params.nr_replicates \ - $analysis_id $matrix \ + $analysis_id \ + $matrix \ $highest_mz_file """ } diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R index f28e1c9..48fba8c 100755 --- a/DIMS/CollectFilled.R +++ b/DIMS/CollectFilled.R @@ -1,4 +1,3 @@ -#!/usr/bin/Rscript ## adapted from 10-collectSamplesFilled.R # define parameters @@ -8,8 +7,8 @@ scripts_dir <- cmd_args[1] ppm <- as.numeric(cmd_args[2]) z_score <- as.numeric(cmd_args[3]) -source(paste0(scripts_dir, "mergeDuplicatedRows.R")) -source(paste0(scripts_dir, "statistics_z.R")) +source(paste0(scripts_dir, "merge_duplicate_rows.R")) +source(paste0(scripts_dir, "calculate_zscores.R")) # for each scan mode, collect all filled peak group lists scanmodes <- c("positive", "negative") diff --git a/DIMS/CollectSumAdducts.R b/DIMS/CollectSumAdducts.R index dcd05af..f4f4723 100755 --- a/DIMS/CollectSumAdducts.R +++ b/DIMS/CollectSumAdducts.R @@ -1,4 +1,3 @@ -#!/usr/bin/Rscript ## adapted from 12-collectSamplesAdded.R # define parameters diff --git a/DIMS/FillMissing.R b/DIMS/FillMissing.R index f96e257..a76c163 100755 --- a/DIMS/FillMissing.R +++ b/DIMS/FillMissing.R @@ -1,5 +1,4 @@ -#!/usr/bin/Rscript -# adapted from 9-runFillMissing.R +## adapted from 9-runFillMissing.R # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) @@ -12,15 +11,15 @@ ppm <- as.numeric(cmd_args[5]) outdir <- "./" # load in function scripts -source(paste0(scripts_dir, "replaceZeros.R")) -source(paste0(scripts_dir, "generateGaussian.R")) -source(paste0(scripts_dir, "getFwhm.R")) -source(paste0(scripts_dir, "getSD.R")) -source(paste0(scripts_dir, "getArea.R")) -source(paste0(scripts_dir, "optimizeGauss.R")) -source(paste0(scripts_dir, "ident.hires.noise.HPC.R")) -source(paste0(scripts_dir, "elementInfo.R")) -source(paste0(scripts_dir, "globalAssignments.HPC.R")) +source(paste0(scripts_dir, "replace_zeros.R")) +source(paste0(scripts_dir, "fit_optim.R")) +source(paste0(scripts_dir, "get_fwhm.R")) +source(paste0(scripts_dir, "get_stdev.R")) +source(paste0(scripts_dir, "estimate_area.R")) +source(paste0(scripts_dir, "optimize_gaussfit.R")) +source(paste0(scripts_dir, "identify_noisepeaks.R")) +source(paste0(scripts_dir, "get_element_info.R")) +source(paste0(scripts_dir, "atomic_info.R")) # determine scan mode if (grepl("_pos", peakgrouplist_file)) { diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index 3e38f1c..e475b34 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -1,4 +1,3 @@ -#!/usr/bin/Rscript ## adapted from 1-generateBreaksFwhm.HPC.R ## # load required package diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index b2edf5c..7d34204 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -1,4 +1,3 @@ -#!/usr/bin/Rscript ## adapted from 13-excelExport.R # load required packages @@ -290,7 +289,6 @@ is_summed <- is_list[c(names(repl_pattern), "HMDB_code")] is_summed$HMDB.name <- is_list$name is_summed <- reshape2::melt(is_summed, id.vars = c("HMDB_code", "HMDB.name")) colnames(is_summed) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity") -is_summed$Intensity <- as.numeric(is_summed$Intensity) is_summed$Matrix <- dims_matrix is_summed$Rundate <- rundate is_summed$Project <- project diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R index 562c794..c7026a9 100644 --- a/DIMS/GenerateViolinPlots.R +++ b/DIMS/GenerateViolinPlots.R @@ -7,7 +7,6 @@ # corresponding Z-scores. # 2. All files from github: https://github.com/UMCUGenetics/DIMS -#!/usr/bin/Rscript ## adapted from 15-dIEM_violin.R # load packages diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R index 4cb343f..520286b 100644 --- a/DIMS/HMDBparts.R +++ b/DIMS/HMDBparts.R @@ -1,4 +1,3 @@ -#!/usr/bin/Rscript # adapted from hmdb_parts.R # define parameters diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R index 0486b03..932c9e4 100644 --- a/DIMS/HMDBparts_main.R +++ b/DIMS/HMDBparts_main.R @@ -1,4 +1,3 @@ -#!/usr/bin/Rscript ## adapted from hmdb_part_adductSums.R # define parameters diff --git a/DIMS/MakeInit.R b/DIMS/MakeInit.R index b01a31b..44d4996 100644 --- a/DIMS/MakeInit.R +++ b/DIMS/MakeInit.R @@ -1,4 +1,3 @@ -#!/usr/bin/env Rscript ## adapted from makeInit in old pipeline # define parameters diff --git a/DIMS/PeakFinding.R b/DIMS/PeakFinding.R index 08eb546..697f8f7 100644 --- a/DIMS/PeakFinding.R +++ b/DIMS/PeakFinding.R @@ -1,5 +1,4 @@ -#!/usr/bin/Rscript -# adapted from 4-peakFinding.R +## adapted from 4-peakFinding.R # define parameters cmd_args <- commandArgs(trailingOnly = TRUE) @@ -12,27 +11,22 @@ thresh <- 2000 outdir <- "./" # load in function scripts -source(paste0(scripts_dir, "findPeaks.Gauss.HPC.R")) -source(paste0(scripts_dir, "searchMZRange.R")) -source(paste0(scripts_dir, "generateGaussian.R")) -source(paste0(scripts_dir, "fitGaussian.R")) -source(paste0(scripts_dir, "fitGaussianInit.R")) -source(paste0(scripts_dir, "getFwhm.R")) -source(paste0(scripts_dir, "getSD.R")) -source(paste0(scripts_dir, "optimizeGauss.R")) -source(paste0(scripts_dir, "fit1Peak.R")) -source(paste0(scripts_dir, "fit2peaks.R")) -source(paste0(scripts_dir, "fit3peaks.R")) -source(paste0(scripts_dir, "fit4peaks.R")) -source(paste0(scripts_dir, "fitG.R")) -source(paste0(scripts_dir, "fit2G.R")) -source(paste0(scripts_dir, "fit3G.R")) -source(paste0(scripts_dir, "fit4G.R")) -source(paste0(scripts_dir, "getArea.R")) -source(paste0(scripts_dir, "getFitQuality.R")) -source(paste0(scripts_dir, "checkOverlap.R")) -source(paste0(scripts_dir, "sumCurves.R")) -source(paste0(scripts_dir, "isWithinXppm.R")) +source(paste0(scripts_dir, "do_peakfinding.R")) +source(paste0(scripts_dir, "check_overlap.R")) +source(paste0(scripts_dir, "search_mzrange.R")) +source(paste0(scripts_dir, "fit_optim.R")) +source(paste0(scripts_dir, "fit_gaussian.R")) +source(paste0(scripts_dir, "fit_init.R")) +source(paste0(scripts_dir, "get_fwhm.R")) +source(paste0(scripts_dir, "get_stdev.R")) +source(paste0(scripts_dir, "optimize_gaussfit.R")) +source(paste0(scripts_dir, "fit_peaks.R")) +source(paste0(scripts_dir, "fit_gaussians.R")) +source(paste0(scripts_dir, "estimate_area.R")) +source(paste0(scripts_dir, "get_fit_quality.R")) +source(paste0(scripts_dir, "check_overlap.R")) +source(paste0(scripts_dir, "sum_curves.R")) +source(paste0(scripts_dir, "within_ppm.R")) load(breaks_file) @@ -46,21 +40,14 @@ if (grepl("_pos", sample_file)) { # Initialize options(digits = 16) -int_factor <- 1 * 10^5 # Number used to calculate area under Gaussian curve -scale <- 2 # Initial value used to estimate scaling parameter +# Number used to calculate area under Gaussian curve +int_factor <- 1 * 10^5 +# Initial value used to estimate scaling parameter +scale <- 2 width <- 1024 height <- 768 # run the findPeaks function -print(head(sample_avgtechrepl)) -print(head(breaks_fwhm)) -print(int_factor) -print(scale) -print(resol) -print(outdir) -print(scanmode) -print(thresh) -print(width) -print(height) -do_peakfinding(sample_avgtechrepl, breaks_fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) +# do_peakfinding(sample_avgtechrepl, breaks_fwhm, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) +do_peakfinding(sample_avgtechrepl, int_factor, scale, resol, outdir, scanmode, FALSE, thresh, width, height) diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R index 8da74d9..dc29e76 100644 --- a/DIMS/PeakGrouping.R +++ b/DIMS/PeakGrouping.R @@ -1,4 +1,3 @@ -#!/usr/bin/Rscript # adapted from 6-peakGrouping.R # define parameters diff --git a/DIMS/SpectrumPeakFinding.R b/DIMS/SpectrumPeakFinding.R index cf6665c..143299e 100644 --- a/DIMS/SpectrumPeakFinding.R +++ b/DIMS/SpectrumPeakFinding.R @@ -1,5 +1,4 @@ -#!/usr/bin/Rscript -# adapted from 5-collectSamples.R +## adapted from 5-collectSamples.R # define parameters scanmodes <- c("positive", "negative") diff --git a/DIMS/SumAdducts.R b/DIMS/SumAdducts.R index 870a914..877b3c8 100755 --- a/DIMS/SumAdducts.R +++ b/DIMS/SumAdducts.R @@ -1,4 +1,3 @@ -#!/usr/bin/Rscript ## adapted from 11-runSumAdducts.R # define parameters diff --git a/DIMS/UnidentifiedCalcZscores.R b/DIMS/UnidentifiedCalcZscores.R index e58063a..0192d0f 100755 --- a/DIMS/UnidentifiedCalcZscores.R +++ b/DIMS/UnidentifiedCalcZscores.R @@ -1,4 +1,3 @@ -#!/usr/bin/Rscript ## adapted from 10-collectSamplesFilled.R # define parameters diff --git a/DIMS/UnidentifiedCollectPeaks.R b/DIMS/UnidentifiedCollectPeaks.R index be1ac0e..7376eb6 100755 --- a/DIMS/UnidentifiedCollectPeaks.R +++ b/DIMS/UnidentifiedCollectPeaks.R @@ -1,4 +1,3 @@ -#!/usr/bin/Rscript ## adapted from 7-collectSamplesGroupedHMDB.R # load required packages diff --git a/DIMS/UnidentifiedFillMissing.R b/DIMS/UnidentifiedFillMissing.R index 0fbdea4..1611986 100755 --- a/DIMS/UnidentifiedFillMissing.R +++ b/DIMS/UnidentifiedFillMissing.R @@ -1,4 +1,3 @@ -#!/usr/bin/Rscript # adapted from 9-runFillMissing.R # define parameters diff --git a/DIMS/UnidentifiedPeakGrouping.R b/DIMS/UnidentifiedPeakGrouping.R index 4600820..08da5d2 100755 --- a/DIMS/UnidentifiedPeakGrouping.R +++ b/DIMS/UnidentifiedPeakGrouping.R @@ -1,4 +1,3 @@ -#!/usr/bin/Rscript ## adapted from 8-peakGrouping.rest.R # define parameters From 79bc26a8ba09e60b0d8a553e31f7363e2b8f0381 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 24 May 2024 14:58:23 +0200 Subject: [PATCH 66/73] code review changes applied --- DIMS/HMDBparts_old.R | 126 ------------------------------- DIMS/UnidentifiedCollectPeaks.nf | 3 +- 2 files changed, 1 insertion(+), 128 deletions(-) delete mode 100644 DIMS/HMDBparts_old.R diff --git a/DIMS/HMDBparts_old.R b/DIMS/HMDBparts_old.R deleted file mode 100644 index 10fdc37..0000000 --- a/DIMS/HMDBparts_old.R +++ /dev/null @@ -1,126 +0,0 @@ -#!/usr/bin/Rscript - -# load required packages -# none - -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) cat(" ", arg, "\n") - -# outdir <- cmd_args[1] -# scanmode <- cmd_args[2] -db_path <- cmd_args[1] # location of HMDB db file -breaks_filepath <- cmd_args[2] # location of breaks.fwhm.RData -standard_run <- "no" # cmd_args[5] # "yes" - -# Cut up entire HMDB into small parts based on the new binning/breaks - -# load(paste(outdir, "breaks.fwhm.RData", sep = "/")) -load(breaks_filepath) -#outdir_hmdb <- paste(outdir, "hmdb_part", sep = "/") -#dir.create(outdir_hmdb, showWarnings = FALSE) - -# New: in case of a standard run (m/z 69-606) use external HMDB parts -min_mz <- round(breaks.fwhm[1]) -max_mz <- round(breaks.fwhm[length(breaks.fwhm)]) -# test if standard mz range is used -if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 600 & max_mz < 610) { - # skip generating HMDB parts - use_external_HMDB <- TRUE - save(use_external_HMDB, file="./using_external_HMDB_parts.RData") - hmdb_parts_dir <- cmd_args[6] # "/hpc/dbg_mz/production/DIMS/hmdb_preparts/" - hmdb_parts <- list.files(hmdb_parts_dir, pattern=scanmode) # only positive or negative files - # save(HMDBstukken, file=paste(outdir, "HMDBstukken.RData", sep="/")) - for (hmdb_file in hmdb_parts) { - file.copy(paste(hmdb_parts_dir, hmdb_file, sep="/"), outdir_hmdb, recursive = TRUE) - } -} else { - # generate HMDB parts in case of non-standard mz range - use_external_HMDB <- FALSE - save(use_external_HMDB, file="not_using_external_HMDB_parts.RData") - # db <- cmd_args[3] - load(db_path) - ppm <- as.numeric(cmd_args[4]) - if (scanmode=="negative"){ - label = "MNeg" - HMDB_add_iso=HMDB_add_iso.Neg - } else { - label = "Mpos" - HMDB_add_iso=HMDB_add_iso.Pos - } - - # filter mass range meassured!!! - HMDB_add_iso = HMDB_add_iso[which(HMDB_add_iso[,label]>=breaks.fwhm[1] & HMDB_add_iso[,label]<=breaks.fwhm[length(breaks.fwhm)]),] - - # sort on mass - outlist = HMDB_add_iso[order(as.numeric(HMDB_add_iso[,label])),] - - n=dim(outlist)[1] - sub=20000 # max rows per file - end=0 - min_1_last=sub - check=0 - outlist_part=NULL - - - if (n < sub) { - outlist_part <- outlist - save(outlist_part, file = paste(outdir_hmdb, paste0(scanmode, "_hmdb.1.RData"), sep = "/")) - } else { - - if (n >= sub & (floor(n/sub) - 1) >= 2){ - for (i in 2:floor(n/sub) - 1){ - start <- -(sub - 1) + i*sub - end <- i*sub - - if (i > 1){ - outlist_i = outlist[c(start:end),] - - n_moved = 0 - - # Calculate 3ppm and replace border, avoid cut within peakgroup! - while ((as.numeric(outlist_i[1,label]) - as.numeric(outlist_part[min_1_last,label]))*1e+06/as.numeric(outlist_i[1,label]) < ppm) { - outlist_part <- rbind(outlist_part, outlist_i[1,]) - outlist_i <- outlist_i[-1,] - n_moved <- n_moved + 1 - } - - # message(paste("Process", i-1,":", dim(outlist_part)[1])) - save(outlist_part, file = paste(outdir_hmdb, paste(scanmode, paste("hmdb",i-1,"RData", sep="."), sep="_"), sep = "/")) - check <- check + dim(outlist_part)[1] - - outlist_part <- outlist_i - min_1_last <- dim(outlist_part)[1] - - } else { - outlist_part <- outlist[c(start:end),] - } - } - } - - start <- end + 1 - end <- n - outlist_i <- outlist[c(start:end),] - n_moved <- 0 - - if (!is.null(outlist_part)) { - # Calculate 3ppm and replace border, avoid cut within peakgroup! - while ((as.numeric(outlist_i[1,label]) - as.numeric(outlist_part[min_1_last,label]))*1e+06/as.numeric(outlist_i[1,label]) < ppm) { - outlist_part = rbind(outlist_part, outlist_i[1,]) - outlist_i = outlist_i[-1,] - n_moved = n_moved + 1 - } - - # message(paste("Process", i+1-1,":", dim(outlist_part)[1])) - save(outlist_part, file = paste(scanmode, paste("hmdb", i, "RData", sep = "."), sep = "_")) - check <- check + dim(outlist_part)[1] - } - - outlist_part <- outlist_i - # message(paste("Process", i+2-1,":", dim(outlist_part)[1])) - save(outlist_part, file = paste(scanmode, paste("hmdb", i + 1, "RData", sep="."), sep="_")) - check <- check + dim(outlist_part)[1] - cat("\n", "Check", check == dim(outlist)[1]) - - } -} diff --git a/DIMS/UnidentifiedCollectPeaks.nf b/DIMS/UnidentifiedCollectPeaks.nf index 4f419a7..0983e9e 100644 --- a/DIMS/UnidentifiedCollectPeaks.nf +++ b/DIMS/UnidentifiedCollectPeaks.nf @@ -9,11 +9,10 @@ process UnidentifiedCollectPeaks { path(peaklist_identified) output: - // path('SpectrumPeaks_*_Unidentified.RData') path('*.RData') script: """ - Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedCollectPeaks.R $spectrumpeaks_file $params.ppm + Rscript ${baseDir}/CustomModules/DIMS/UnidentifiedCollectPeaks.R $params.ppm """ } From 40da954972d7557dd717bc09c1fa8c661ada29b7 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Mon, 27 May 2024 12:59:16 +0200 Subject: [PATCH 67/73] fixed names of Utils function calls --- DIMS/Utils/fit_gaussian.R | 17 +++++++++++---- DIMS/Utils/fit_gaussians.R | 43 ++++++++++++++++++++++++++++---------- DIMS/Utils/fit_init.R | 6 ++---- DIMS/Utils/fit_optim.R | 7 ++++--- DIMS/Utils/fit_peaks.R | 18 ++++++++-------- DIMS/Utils/get_stdev.R | 3 +-- 6 files changed, 61 insertions(+), 33 deletions(-) diff --git a/DIMS/Utils/fit_gaussian.R b/DIMS/Utils/fit_gaussian.R index 35275cd..b22ba01 100644 --- a/DIMS/Utils/fit_gaussian.R +++ b/DIMS/Utils/fit_gaussian.R @@ -40,6 +40,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale # determine fit values for 1 Gaussian peak (mean, scale, sigma, qual) fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol, plot, fit_quality1, use_bounds) + print("nu in fit_gaussian, if force == 1") + print(fit_values) # set initial value for scale factor scale <- 2 # test if the mean is outside the m/z range @@ -47,7 +49,7 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale # run this function again with fixed boundaries return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, outdir, force = 1, use_bounds = TRUE, plot, scanmode, int_factor, width, height)) - + print("nu in fit_gaussian, mean outside m/z range") } else { # test if the fit is bad if (fit_values$qual > fit_quality1) { @@ -62,10 +64,11 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale return(fit_gaussian(mass_vector2, mass_vector, int_vector, new_index, scale, resol, outdir, force = 2, use_bounds = FALSE, plot, scanmode, int_factor, width, height)) + print("nu in fit_gaussian, bad fit ") # good fit } else { peak_mean <- c(peak_mean, fit_values$mean) - peak_area <- c(peak_area, getArea(fit_values$mean, resol, fit_values$scale, + peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale, fit_values$sigma, int_factor)) peak_qual <- fit_values$qual peak_min <- mass_vector[1] @@ -76,8 +79,10 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale #### Two local maxima; need at least 6 data points for this #### } else if (force == 2 && (length(mass_vector) > 6)) { # determine fit values for 2 Gaussian peaks (mean, scale, sigma, qual) - fit_values <- fit_2peaks(mass_vector2, mass_vector, int_vector, new_index, scale, resol, + fit_values <- fit_2peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol, use_bounds, plot, fit_quality, int_factor) + print("nu in fit_gaussian force == 2") + print(fit_values) # test if one of the means is outside the m/z range if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)]) { @@ -140,6 +145,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale # determine fit values for 3 Gaussian peaks (mean, scale, sigma, qual) fit_values <- fit_3peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol, use_bounds, plot, fit_quality, int_factor) + print("nu in fit_gaussian force == 3") + print(fit_values) # test if one of the means is outside the m/z range if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] || @@ -204,6 +211,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale # determine fit values for 4 Gaussian peaks (mean, scale, sigma, qual) fit_values <- fit_4peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol, use_bounds, plot, fit_quality, int_factor) + print("nu in fit_gaussian force == 4") + print(fit_values) # test if one of the means is outside the m/z range if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] || @@ -279,7 +288,7 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale peak_qual <- 0 } else { peak_mean <- c(peak_mean, fit_values$mean) - peak_area <- c(peak_area, get_area(fit_values$mean, resol, fit_values$scale, fit_values$sigma, int_factor)) + peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale, fit_values$sigma, int_factor)) peak_qual <- fit_values$qual peak_min <- mass_vector[1] peak_max <- mass_vector[length(mass_vector)] diff --git a/DIMS/Utils/fit_gaussians.R b/DIMS/Utils/fit_gaussians.R index c4924e1..1a4c34e 100644 --- a/DIMS/Utils/fit_gaussians.R +++ b/DIMS/Utils/fit_gaussians.R @@ -17,15 +17,21 @@ fit_1gaussian <- function(mass_vector, int_vector, sigma, query_mass, scale, use d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma) sum((d - int_vector) ^ 2) } - +print("nu in fit_1gaussian") if (use_bounds) { # determine lower and upper boundaries lower <- c(mass_vector[1], 0, mass_vector[1], 0) upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf) # get optimal value for fitted Gaussian curve - opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)), + tryCatch(opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)), opt_f, control = list(maxit = 10000), method = "L-BFGS-B", - lower = lower, upper = upper) + lower = lower, upper = upper), + error = function(e) { + # in case of error, use regular optim without boundaries + opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)), + opt_f, control = list(maxit = 10000)) + write.table(opt_fit, file = paste0("tryCatch_error_", query_mass, ".txt"), row.names = FALSE) + } ) } else { opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)), opt_f, control = list(maxit = 10000)) @@ -58,6 +64,7 @@ fit_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sum((d - int_vector) ^ 2) } +print("nu in fit_2gaussians") if (use_bounds) { # determine lower and upper boundaries lower <- c(mass_vector[1], 0, mass_vector[1], 0) @@ -65,15 +72,27 @@ fit_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2, # get optimal value for 2 fitted Gaussian curves if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) { sigma2 <- sigma1 - opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), - as.numeric(query_mass1), as.numeric(scale1)), - opt_f, control = list(maxit = 10000), - method = "L-BFGS-B", lower = lower, upper = upper) + tryCatch(opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), + as.numeric(query_mass1), as.numeric(scale1)), + opt_f, control = list(maxit = 10000), + method = "L-BFGS-B", lower = lower, upper = upper), + error = function(e) { + # in case of error, use regular optim without boundaries + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), + as.numeric(query_mass1), as.numeric(scale1)), + opt_f, control = list(maxit = 10000)) + write.table(res, file = paste0("tryCatch_error_2gauss_", query_mass, ".txt"), row.names = FALSE) } ) } else { - opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), - as.numeric(query_mass2), as.numeric(scale2)), - opt_f, control = list(maxit = 10000), - method = "L-BFGS-B", lower = lower, upper = upper) + tryCatch(opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), + as.numeric(query_mass2), as.numeric(scale2)), + opt_f, control = list(maxit = 10000), + method = "L-BFGS-B", lower = lower, upper = upper), + error = function(e) { + # in case of error, use regular optim without boundaries + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), + as.numeric(query_mass1), as.numeric(scale1)), + opt_f, control = list(maxit = 10000)) + write.table(res, file = paste0("tryCatch_error_2gauss_else_", query_mass, ".txt"), row.names = FALSE) } ) } } else { if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) { @@ -120,6 +139,7 @@ fit_3gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sigma3, sum((d - int_vector) ^ 2) } +print("nu in fit_3gaussians") if (use_bounds) { # determine lower and upper boundaries lower <- c(mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0) @@ -174,6 +194,7 @@ fit_4gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sigma3, sigm sum((d - int_vector) ^ 2) } +print("nu in fit_4gaussians") if (use_bounds) { # determine lower and upper boundaries lower <- c(mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0) diff --git a/DIMS/Utils/fit_init.R b/DIMS/Utils/fit_init.R index c761194..0c4e513 100644 --- a/DIMS/Utils/fit_init.R +++ b/DIMS/Utils/fit_init.R @@ -2,7 +2,6 @@ # variables with fixed values will be removed from function parameters # scale, outdir, plot, width, height # mz_index, start_index, end_index, sample_name not used. -# use_bounds is used, but not defined. # fit_gaussian should be defined before this function is called. fit_init <- function(mass_vector, int_vector, int_factor, scale, resol, outdir, sample_name, scanmode, plot, width, height, @@ -34,17 +33,16 @@ fit_init <- function(mass_vector, int_vector, int_factor, scale, resol, # Find the index in int_vector with the highest intensity max_index <- which(int_vector == max(int_vector)) - +print("nu in fit_init") roi_values <- fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, outdir, force = length(max_index), - useBounds = FALSE, plot, scanmode, int_factor, width, height) + use_bounds = FALSE, plot, scanmode, int_factor, width, height) # put all values for this region of interest into a list roi_value_list <- list("mean" = roi_values$mean, "area" = roi_values$area, "qual" = roi_values$qual, "min" = roi_values$min, "max" = roi_values$max) - return(roi_value_list) } diff --git a/DIMS/Utils/fit_optim.R b/DIMS/Utils/fit_optim.R index e280b57..ed71622 100644 --- a/DIMS/Utils/fit_optim.R +++ b/DIMS/Utils/fit_optim.R @@ -32,17 +32,18 @@ fit_optim <- function(mass_vector, int_vector, resol, mass_diff <- mass_max_simple[length(mass_max_simple)] - mass_max_simple[1] # generate a second mass_vector with equally spaced m/z values mass_vector2 <- seq(mass_max_simple[1], mass_max_simple[length(mass_max_simple)], - length = mz_diff * int_factor) + length = mass_diff * int_factor) sigma <- get_stdev(mass_vector2, int_max_simple) scale <- optimize_gaussfit(mass_vector2, int_max_simple, sigma, mass_max) # get an estimate of the area under the peak - area <- get_area(mass_max, resol, scale, sigma, int_factor) - + area <- estimate_area(mass_max, resol, scale, sigma, int_factor) +print("nu in fit_optim") # put all values for this region of interest into a list roi_value_list <- list("mean" = mass_max, "area" = area, "min" = mass_vector2[1], "max" = mass_vector2[length(mass_vector2)]) +print(roi_value_list) return(roi_value_list) } diff --git a/DIMS/Utils/fit_peaks.R b/DIMS/Utils/fit_peaks.R index 92e08d6..02d5408 100644 --- a/DIMS/Utils/fit_peaks.R +++ b/DIMS/Utils/fit_peaks.R @@ -151,8 +151,8 @@ fit_2peaks <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol, sum_fit = sum_fit)$fq_new # get parameter values - area1 <- get_area(pc[1], resol, pc[2], sigma1, int_factor) - area2 <- get_area(pc[3], resol, pc[4], sigma2, int_factor) + area1 <- estimate_area(pc[1], resol, pc[2], sigma1, int_factor) + area2 <- estimate_area(pc[3], resol, pc[4], sigma2, int_factor) peak_area <- c(peak_area, area1) peak_area <- c(peak_area, area2) peak_mean <- c(peak_mean, pc[1]) @@ -245,9 +245,9 @@ fit_3peaks <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol, sum_fit = sum_fit)$fq_new # get parameter values - area1 <- get_area(pc[1], resol, pc[2], sigma1, int_factor) - area2 <- get_area(pc[3], resol, pc[4], sigma2, int_factor) - area3 <- get_area(pc[5], resol, pc[6], sigma3, int_factor) + area1 <- estimate_area(pc[1], resol, pc[2], sigma1, int_factor) + area2 <- estimate_area(pc[3], resol, pc[4], sigma2, int_factor) + area3 <- estimate_area(pc[5], resol, pc[6], sigma3, int_factor) peak_area <- c(peak_area, area1) peak_area <- c(peak_area, area2) peak_area <- c(peak_area, area3) @@ -354,10 +354,10 @@ fit_4peaks <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol, sum_fit = sum_fit)$fq_new # get parameter values - area1 <- get_area(pc[1], resol, pc[2], sigma1, int_factor) - area2 <- get_area(pc[3], resol, pc[4], sigma2, int_factor) - area3 <- get_area(pc[5], resol, pc[6], sigma3, int_factor) - area4 <- get_area(pc[7], resol, pc[8], sigma4, int_factor) + area1 <- estimate_area(pc[1], resol, pc[2], sigma1, int_factor) + area2 <- estimate_area(pc[3], resol, pc[4], sigma2, int_factor) + area3 <- estimate_area(pc[5], resol, pc[6], sigma3, int_factor) + area4 <- estimate_area(pc[7], resol, pc[8], sigma4, int_factor) peak_area <- c(peak_area, area1) peak_area <- c(peak_area, area2) peak_area <- c(peak_area, area3) diff --git a/DIMS/Utils/get_stdev.R b/DIMS/Utils/get_stdev.R index cfe0aa4..a385187 100644 --- a/DIMS/Utils/get_stdev.R +++ b/DIMS/Utils/get_stdev.R @@ -7,7 +7,6 @@ get_stdev <- function(mass_vector, int_vector, resol = 140000) { #' @param resol: Value for resolution (integer) #' #' @return stdev: Value for standard deviation - # find maximum intensity in vector max_index <- which(int_vector == max(int_vector)) # find corresponding mass at maximum intensity @@ -15,7 +14,7 @@ get_stdev <- function(mass_vector, int_vector, resol = 140000) { # calculate resolution at given m/z value resol_mz <- resol * (1 / sqrt(2) ^ (log2(max_mass / 200))) # calculate full width at half maximum - fwhm <- mean / resol_mz + fwhm <- max_mass / resol_mz # calculate standard deviation stdev <- (fwhm / 2) * 0.85 return(stdev) From c37ddb8e97c15b3a4f870a8c0444ab8edec1205e Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Thu, 30 May 2024 15:58:25 +0200 Subject: [PATCH 68/73] Added Zscore if statement --- DIMS/GenerateViolinPlots.R | 879 ++++++++++++++++++------------------- 1 file changed, 429 insertions(+), 450 deletions(-) diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R index c7026a9..29f41cf 100644 --- a/DIMS/GenerateViolinPlots.R +++ b/DIMS/GenerateViolinPlots.R @@ -29,480 +29,459 @@ file_expected_biomarkers_iem <- cmd_args[6] file_explanation <- cmd_args[7] file_isomers <- cmd_args[8] -print("arguments") -print(run_name) -print(scripts_dir) -print(z_score) -print(path_metabolite_groups) -print(file_ratios_metabolites) -print(file_expected_biomarkers_iem) -print(file_explanation) -print(file_isomers) -print("end arguments") - -# path: output folder for dIEM and violin plots -output_dir <- "./" - -file.copy(file_isomers, output_dir) - -# folder for all metabolite lists (.txt) -# path_metabolite_groups <- "/hpc/dbg_mz/tools/db/metabolite_groups" -# file for ratios step 3 -# file_ratios_metabolites <- "/hpc/dbg_mz/tools/db/dIEM/Ratios_between_metabolites.csv" -# file for algorithm step 4 -# file_expected_biomarkers_iem <- "/hpc/dbg_mz/tools/db/dIEM/Expected_biomarkers_IEM.csv" -# explanation: file with text to be included in violin plots -# file_explanation <- "/hpc/dbg_mz/tools/Explanation_violin_plots.txt" - -# copy list of isomers to project folder. -# file.copy("/hpc/dbg_mz/tools/isomers.txt", output_dir) - -# load functions -source(paste0(scripts_dir, "check_same_samplename.R")) -source(paste0(scripts_dir, "prepare_data.R")) -source(paste0(scripts_dir, "prepare_data_perpage.R")) -source(paste0(scripts_dir, "prepare_toplist.R")) -source(paste0(scripts_dir, "create_violin_plots.R")) -source(paste0(scripts_dir, "prepare_alarmvalues.R")) -source(paste0(scripts_dir, "output_helix.R")) -source(paste0(scripts_dir, "get_patient_data_to_helix.R")) -source(paste0(scripts_dir, "add_lab_id_and_onderzoeksnummer.R")) -source(paste0(scripts_dir, "is_diagnostic_patient.R")) - -# number of diseases that score highest in algorithm to plot -top_nr_iem <- 5 -# probability score cut-off for plotting the top diseases -threshold_iem <- 5 -# z-score cutoff of axis on the left for top diseases -ratios_cutoff <- -5 -# number of violin plots per page in PDF -nr_plots_perpage <- 20 - -# binary variable: run function, yes(1) or no(0) -if (z_score == 1) { - algorithm <- ratios <- violin <- 1 -} else { - algorithm <- ratios <- violin <- 0 -} -# are the sample names headers on row 1 or row 2 in the DIMS excel? (default 1) -header_row <- 1 -# column name where the data starts (default B) -col_start <- "B" -zscore_cutoff <- 5 -xaxis_cutoff <- 20 -protocol_name <- "DIMS_PL_DIAG" - -#### STEP 1: Preparation #### -# in: run_name, path_dims_file, header_row ||| out: output_dir, DIMS - -# path to DIMS excel file -path_dims_file <- paste0(run_name, ".xlsx") - -# Load the excel file. -dims_xls <- readWorkbook(xlsxFile = path_dims_file, sheet = 1, startRow = header_row) -if (exists("dims_xls")) { - cat(paste0("\nThe excel file is succesfully loaded:\n -> ", path_dims_file)) -} else { - cat(paste0("\n\n**** Error: Could not find an Excel file. Please check location of file:\n -> ", path_dims_file, "\n")) -} - -#### STEP 2: Edit DIMS data ##### -# in: dims_xls ||| out: Data, nr_contr, nr_pat -# Input: the xlsx file that comes out of the pipeline with format: -# [plots] [C] [P] [summary columns] [C_Zscore] [P_Zscore] -# Output: "_CSV.csv" file that is suited for the algorithm in shiny. - -# Determine the number of Contols and Patients in column names: -nr_contr <- length(grep("C", names(dims_xls))) / 2 -nr_pat <- length(grep("P", names(dims_xls))) / 2 -# total number of samples -nrsamples <- nr_contr + nr_pat -# check whether the number of intensity columns equals the number of Zscore columns -if (nr_contr + nr_pat != length(grep("_Zscore", names(dims_xls)))) { - cat("\n**** Error: there aren't as many intensities listed as Zscores") -} -cat(paste0("\n\n------------\n", nr_contr, " controls \n", nr_pat, " patients\n------------\n\n")) - -# Move the columns HMDB_code and HMDB_name to the beginning. -hmdb_info_cols <- c(which(colnames(dims_xls) == "HMDB_code"), which(colnames(dims_xls) == "HMDB_name")) -other_cols <- seq_along(1:ncol(dims_xls))[-hmdb_info_cols] -dims_xls_copy <- dims_xls[, c(hmdb_info_cols, other_cols)] -# Remove the columns from 'name' to 'pathway' -from_col <- which(colnames(dims_xls_copy) == "name") -to_col <- which(colnames(dims_xls_copy) == "pathway") -dims_xls_copy <- dims_xls_copy[, -c(from_col:to_col)] -# in case the excel had an empty "plots" column, remove it -if ("plots" %in% colnames(dims_xls_copy)) { - dims_xls_copy <- dims_xls_copy[, -grep("plots", colnames(dims_xls_copy))] -} -# Rename columns -names(dims_xls_copy) <- gsub("avg.ctrls", "Mean_controls", names(dims_xls_copy)) -names(dims_xls_copy) <- gsub("sd.ctrls", "SD_controls", names(dims_xls_copy)) -names(dims_xls_copy) <- gsub("HMDB_code", "HMDB.code", names(dims_xls_copy)) -names(dims_xls_copy) <- gsub("HMDB_name", "HMDB.name", names(dims_xls_copy)) - -# intensity columns and mean and standard deviation of controls -numeric_cols <- c(3:ncol(dims_xls_copy)) -# make sure all values are numeric -dims_xls_copy[, numeric_cols] <- sapply(dims_xls_copy[, numeric_cols], as.numeric) - -if (exists("dims_xls_copy") & (length(dims_xls_copy) < length(dims_xls))) { - cat("\n### Step 2 # Edit dims data is done.\n") -} else { - cat("\n**** Error: Could not execute step 2 \n") -} - -#### STEP 3: Calculate ratios of intensities for metabolites #### -# in: ratios, file_ratios_metabolites, dims_xls_copy, nr_contr, nr_pat ||| out: Zscore (+file) -# This script loads the file with Ratios (file_ratios_metabolites) and calculates -# the ratios of the intensities of the given metabolites. It also calculates -# Zs-cores based on the avg and sd of the ratios of the controls. - -# Input: dataframe with intenstities and Zscores of controls and patients: -# [HMDB.code] [HMDB.name] [C] [P] [Mean_controls] [SD_controls] [C_Zscore] [P_Zscore] - -# Output: "_CSV.csv" file that is suited for the algorithm, with format: -# "_Ratios_CSV.csv" file, same file as above, but with ratio rows added. - -if (ratios == 1) { - cat(paste0("\nloading ratios file:\n -> ", file_ratios_metabolites, "\n")) - ratio_input <- read.csv(file_ratios_metabolites, sep = ";", stringsAsFactors = FALSE) - - # Prepare empty data frame to fill with ratios - ratio_list <- setNames(data.frame(matrix( - ncol = ncol(dims_xls_copy), - nrow = nrow(ratio_input) - )), colnames(dims_xls_copy)) - ratio_list <- as.data.frame(ratio_list) - - # put HMDB info into first two columns of ratio_list - ratio_list[, 1:2] <- ratio_input[, 1:2] - - # look for intensity columns (exclude Zscore columns) - control_cols <- grep("C", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) - patient_cols <- grep("P", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) - intensity_cols <- c(control_cols, patient_cols) - # calculate each of the ratios of intensities - for (ratio_index in 1:nrow(ratio_input)) { - ratio_numerator <- ratio_input[ratio_index, "HMDB_numerator"] - ratio_numerator <- strsplit(ratio_numerator, "plus")[[1]] - ratio_denominator <- ratio_input[ratio_index, "HMDB_denominator"] - ratio_denominator <- strsplit(ratio_denominator, "plus")[[1]] - # find these HMDB IDs in dataset. Could be a sum of multiple metabolites - sel_denominator <- sel_numerator <- c() - for (numerator_index in 1:length(ratio_numerator)) { - sel_numerator <- c(sel_numerator, which(dims_xls_copy[, "HMDB.code"] == ratio_numerator[numerator_index])) - } - for (denominator_index in 1:length(ratio_denominator)) { - # special case for sum of metabolites (dividing by one) +if (z_score == 1){ + # path: output folder for dIEM and violin plots + output_dir <- "./" + + file.copy(file_isomers, output_dir) + + # load functions + source(paste0(scripts_dir, "check_same_samplename.R")) + source(paste0(scripts_dir, "prepare_data.R")) + source(paste0(scripts_dir, "prepare_data_perpage.R")) + source(paste0(scripts_dir, "prepare_toplist.R")) + source(paste0(scripts_dir, "create_violin_plots.R")) + source(paste0(scripts_dir, "prepare_alarmvalues.R")) + source(paste0(scripts_dir, "output_helix.R")) + source(paste0(scripts_dir, "get_patient_data_to_helix.R")) + source(paste0(scripts_dir, "add_lab_id_and_onderzoeksnummer.R")) + source(paste0(scripts_dir, "is_diagnostic_patient.R")) + + # number of diseases that score highest in algorithm to plot + top_nr_iem <- 5 + # probability score cut-off for plotting the top diseases + threshold_iem <- 5 + # z-score cutoff of axis on the left for top diseases + ratios_cutoff <- -5 + # number of violin plots per page in PDF + nr_plots_perpage <- 20 + + # binary variable: run function, yes(1) or no(0) + if (z_score == 1) { + algorithm <- ratios <- violin <- 1 + } else { + algorithm <- ratios <- violin <- 0 + } + # are the sample names headers on row 1 or row 2 in the DIMS excel? (default 1) + header_row <- 1 + # column name where the data starts (default B) + col_start <- "B" + zscore_cutoff <- 5 + xaxis_cutoff <- 20 + protocol_name <- "DIMS_PL_DIAG" + + #### STEP 1: Preparation #### + # in: run_name, path_dims_file, header_row ||| out: output_dir, DIMS + + # path to DIMS excel file + path_dims_file <- paste0(run_name, ".xlsx") + + # Load the excel file. + dims_xls <- readWorkbook(xlsxFile = path_dims_file, sheet = 1, startRow = header_row) + if (exists("dims_xls")) { + cat(paste0("\nThe excel file is succesfully loaded:\n -> ", path_dims_file)) + } else { + cat(paste0("\n\n**** Error: Could not find an Excel file. Please check location of file:\n -> ", path_dims_file, "\n")) + } + + #### STEP 2: Edit DIMS data ##### + # in: dims_xls ||| out: Data, nr_contr, nr_pat + # Input: the xlsx file that comes out of the pipeline with format: + # [plots] [C] [P] [summary columns] [C_Zscore] [P_Zscore] + # Output: "_CSV.csv" file that is suited for the algorithm in shiny. + + # Determine the number of Contols and Patients in column names: + nr_contr <- length(grep("C", names(dims_xls))) / 2 + nr_pat <- length(grep("P", names(dims_xls))) / 2 + # total number of samples + nrsamples <- nr_contr + nr_pat + # check whether the number of intensity columns equals the number of Zscore columns + if (nr_contr + nr_pat != length(grep("_Zscore", names(dims_xls)))) { + cat("\n**** Error: there aren't as many intensities listed as Zscores") + } + cat(paste0("\n\n------------\n", nr_contr, " controls \n", nr_pat, " patients\n------------\n\n")) + + # Move the columns HMDB_code and HMDB_name to the beginning. + hmdb_info_cols <- c(which(colnames(dims_xls) == "HMDB_code"), which(colnames(dims_xls) == "HMDB_name")) + other_cols <- seq_along(1:ncol(dims_xls))[-hmdb_info_cols] + dims_xls_copy <- dims_xls[, c(hmdb_info_cols, other_cols)] + # Remove the columns from 'name' to 'pathway' + from_col <- which(colnames(dims_xls_copy) == "name") + to_col <- which(colnames(dims_xls_copy) == "pathway") + dims_xls_copy <- dims_xls_copy[, -c(from_col:to_col)] + # in case the excel had an empty "plots" column, remove it + if ("plots" %in% colnames(dims_xls_copy)) { + dims_xls_copy <- dims_xls_copy[, -grep("plots", colnames(dims_xls_copy))] + } + # Rename columns + names(dims_xls_copy) <- gsub("avg.ctrls", "Mean_controls", names(dims_xls_copy)) + names(dims_xls_copy) <- gsub("sd.ctrls", "SD_controls", names(dims_xls_copy)) + names(dims_xls_copy) <- gsub("HMDB_code", "HMDB.code", names(dims_xls_copy)) + names(dims_xls_copy) <- gsub("HMDB_name", "HMDB.name", names(dims_xls_copy)) + + # intensity columns and mean and standard deviation of controls + numeric_cols <- c(3:ncol(dims_xls_copy)) + # make sure all values are numeric + dims_xls_copy[, numeric_cols] <- sapply(dims_xls_copy[, numeric_cols], as.numeric) + + if (exists("dims_xls_copy") & (length(dims_xls_copy) < length(dims_xls))) { + cat("\n### Step 2 # Edit dims data is done.\n") + } else { + cat("\n**** Error: Could not execute step 2 \n") + } + + #### STEP 3: Calculate ratios of intensities for metabolites #### + # in: ratios, file_ratios_metabolites, dims_xls_copy, nr_contr, nr_pat ||| out: Zscore (+file) + # This script loads the file with Ratios (file_ratios_metabolites) and calculates + # the ratios of the intensities of the given metabolites. It also calculates + # Zs-cores based on the avg and sd of the ratios of the controls. + + # Input: dataframe with intenstities and Zscores of controls and patients: + # [HMDB.code] [HMDB.name] [C] [P] [Mean_controls] [SD_controls] [C_Zscore] [P_Zscore] + + # Output: "_CSV.csv" file that is suited for the algorithm, with format: + # "_Ratios_CSV.csv" file, same file as above, but with ratio rows added. + + if (ratios == 1) { + cat(paste0("\nloading ratios file:\n -> ", file_ratios_metabolites, "\n")) + ratio_input <- read.csv(file_ratios_metabolites, sep = ";", stringsAsFactors = FALSE) + + # Prepare empty data frame to fill with ratios + ratio_list <- setNames(data.frame(matrix( + ncol = ncol(dims_xls_copy), + nrow = nrow(ratio_input) + )), colnames(dims_xls_copy)) + ratio_list <- as.data.frame(ratio_list) + + # put HMDB info into first two columns of ratio_list + ratio_list[, 1:2] <- ratio_input[, 1:2] + + # look for intensity columns (exclude Zscore columns) + control_cols <- grep("C", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) + patient_cols <- grep("P", colnames(ratio_list)[1:which(colnames(ratio_list) == "Mean_controls")]) + intensity_cols <- c(control_cols, patient_cols) + # calculate each of the ratios of intensities + for (ratio_index in 1:nrow(ratio_input)) { + ratio_numerator <- ratio_input[ratio_index, "HMDB_numerator"] + ratio_numerator <- strsplit(ratio_numerator, "plus")[[1]] + ratio_denominator <- ratio_input[ratio_index, "HMDB_denominator"] + ratio_denominator <- strsplit(ratio_denominator, "plus")[[1]] + # find these HMDB IDs in dataset. Could be a sum of multiple metabolites + sel_denominator <- sel_numerator <- c() + for (numerator_index in 1:length(ratio_numerator)) { + sel_numerator <- c(sel_numerator, which(dims_xls_copy[, "HMDB.code"] == ratio_numerator[numerator_index])) + } + for (denominator_index in 1:length(ratio_denominator)) { + # special case for sum of metabolites (dividing by one) + if (ratio_denominator[denominator_index] != "one") { + sel_denominator <- c(sel_denominator, which(dims_xls_copy[, "HMDB.code"] == ratio_denominator[denominator_index])) + } + } + # calculate ratio if (ratio_denominator[denominator_index] != "one") { - sel_denominator <- c(sel_denominator, which(dims_xls_copy[, "HMDB.code"] == ratio_denominator[denominator_index])) + ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) / + apply(dims_xls_copy[sel_denominator, intensity_cols], 2, sum) + } else { + # special case for sum of metabolites (dividing by one) + ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) } + # calculate log of ratio + ratio_list[ratio_index, intensity_cols] <- log2(ratio_list[ratio_index, intensity_cols]) } - # calculate ratio - if (ratio_denominator[denominator_index] != "one") { - ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) / - apply(dims_xls_copy[sel_denominator, intensity_cols], 2, sum) - } else { - # special case for sum of metabolites (dividing by one) - ratio_list[ratio_index, intensity_cols] <- apply(dims_xls_copy[sel_numerator, intensity_cols], 2, sum) - } - # calculate log of ratio - ratio_list[ratio_index, intensity_cols] <- log2(ratio_list[ratio_index, intensity_cols]) - } - # Calculate means and SD's of the calculated ratios for Controls - ratio_list[, "Mean_controls"] <- apply(ratio_list[, control_cols], 1, mean) - ratio_list[, "SD_controls"] <- apply(ratio_list[, control_cols], 1, sd) - - # Calc z-scores with the means and SD's of Controls - zscore_cols <- grep("Zscore", colnames(ratio_list)) - for (sample_index in 1:length(zscore_cols)) { - zscore_col <- zscore_cols[sample_index] - # matching intensity column - int_col <- intensity_cols[sample_index] - # test on column names - if (check_same_samplename(colnames(ratio_list)[int_col], colnames(ratio_list)[zscore_col])) { - # calculate Z-scores - ratio_list[, zscore_col] <- (ratio_list[, int_col] - ratio_list[, "Mean_controls"]) / ratio_list[, "SD_controls"] + # Calculate means and SD's of the calculated ratios for Controls + ratio_list[, "Mean_controls"] <- apply(ratio_list[, control_cols], 1, mean) + ratio_list[, "SD_controls"] <- apply(ratio_list[, control_cols], 1, sd) + + # Calc z-scores with the means and SD's of Controls + zscore_cols <- grep("Zscore", colnames(ratio_list)) + for (sample_index in 1:length(zscore_cols)) { + zscore_col <- zscore_cols[sample_index] + # matching intensity column + int_col <- intensity_cols[sample_index] + # test on column names + if (check_same_samplename(colnames(ratio_list)[int_col], colnames(ratio_list)[zscore_col])) { + # calculate Z-scores + ratio_list[, zscore_col] <- (ratio_list[, int_col] - ratio_list[, "Mean_controls"]) / ratio_list[, "SD_controls"] + } } - } - # Add rows of the ratio hmdb codes to the data of zscores from the pipeline. - dims_xls_ratios <- rbind(ratio_list, dims_xls_copy) - - # Edit the DIMS output Zscores of all patients in format: - # HMDB_code patientname1 patientname2 - names(dims_xls_ratios) <- gsub("HMDB.code", "HMDB_code", names(dims_xls_ratios)) - names(dims_xls_ratios) <- gsub("HMDB.name", "HMDB_name", names(dims_xls_ratios)) - - # for debugging: - write.table(dims_xls_ratios, file = paste0(output_dir, "/ratios.txt"), sep = "\t") - - # Select only the cols with zscores of the patients - zscore_patients <- dims_xls_ratios[, c(1, 2, zscore_cols[grep("P", colnames(dims_xls_ratios)[zscore_cols])])] - # Select only the cols with zscores of the controls - zscore_controls <- dims_xls_ratios[, c(1, 2, zscore_cols[grep("C", colnames(dims_xls_ratios)[zscore_cols])])] - -} - -#### STEP 4: Run the IEM algorithm ######### -# in: algorithm, file_expected_biomarkers_iem, zscore_patients ||| out: prob_score (+file) -# algorithm taken from DOI: 10.3390/ijms21030979 - -if (algorithm == 1) { - # Load data - cat(paste0("\nloading expected file:\n -> ", file_expected_biomarkers_iem, "\n")) - expected_biomarkers <- read.csv(file_expected_biomarkers_iem, sep = ";", stringsAsFactors = FALSE) - # modify column names - names(expected_biomarkers) <- gsub("HMDB.code", "HMDB_code", names(expected_biomarkers)) - names(expected_biomarkers) <- gsub("Metabolite", "HMDB_name", names(expected_biomarkers)) - - # prepare dataframe scaffold rank_patients - rank_patients <- zscore_patients - # Fill df rank_patients with the ranks for each patient - for (patient_index in 3:ncol(zscore_patients)) { - # number of positive zscores in patient - pos <- sum(zscore_patients[, patient_index] > 0) - # sort the column on zscore; NB: this sorts the entire object, not just one column - rank_patients <- rank_patients[order(-rank_patients[patient_index]), ] - # Rank all positive zscores highest to lowest - rank_patients[1:pos, patient_index] <- as.numeric(ordered(-rank_patients[1:pos, patient_index])) - # Rank all negative zscores lowest to highest - rank_patients[(pos + 1):nrow(rank_patients), patient_index] <- as.numeric(ordered(rank_patients[(pos + 1): - nrow(rank_patients), patient_index])) - } + # Add rows of the ratio hmdb codes to the data of zscores from the pipeline. + dims_xls_ratios <- rbind(ratio_list, dims_xls_copy) - # Calculate metabolite score, using the dataframes with only values, and later add the cols without values (1&2). - expected_zscores <- merge(x = expected_biomarkers, y = zscore_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code")) - expected_zscores_original <- expected_zscores - - # determine which columns contain Z-scores and which contain disease info - select_zscore_cols <- grep("_Zscore", colnames(expected_zscores)) - select_info_cols <- 1:(min(select_zscore_cols) - 1) - # set some zscores to zero - select_incr_indisp <- which(expected_zscores$Change == "Increase" & expected_zscores$Dispensability == "Indispensable") - expected_zscores[select_incr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_incr_indisp, - select_zscore_cols], function(x) ifelse (x <= 1.6, 0, x)) - select_decr_indisp <- which(expected_zscores$Change == "Decrease" & expected_zscores$Dispensability == "Indispensable") - expected_zscores[select_decr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_decr_indisp, - select_zscore_cols], function(x) ifelse (x >= -1.2, 0, x)) - - # calculate rank score: - expected_ranks <- merge(x = expected_biomarkers, y = rank_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code")) - rank_scores <- expected_zscores[order(expected_zscores$HMDB_code), select_zscore_cols] / - (expected_ranks[order(expected_ranks$HMDB_code), select_zscore_cols] * 0.9) - # combine disease info with rank scores - expected_metabscore <- cbind(expected_ranks[order(expected_zscores$HMDB_code), select_info_cols], rank_scores) - - # multiply weight score and rank score - weight_score <- expected_zscores - weight_score[, select_zscore_cols] <- expected_metabscore$Total_Weight * expected_metabscore[, select_zscore_cols] - - # sort table on Disease and Absolute_Weight - weight_score <- weight_score[order(weight_score$Disease, weight_score$Absolute_Weight, decreasing = TRUE), ] - - # select columns to check duplicates - dup <- weight_score[, c("Disease", "M.z")] - uni <- weight_score[!duplicated(dup) | !duplicated(dup, fromLast = FALSE), ] - - # calculate probability score - prob_score <- aggregate(uni[, select_zscore_cols], uni["Disease"], sum) - - # list of all diseases that have at least one metabolite Zscore at 0 - for (patient_index in 2:ncol(prob_score)) { - patient_zscore_colname <- colnames(prob_score)[patient_index] - matching_colname_expected <- which(colnames(expected_zscores) == patient_zscore_colname) - # determine which Zscores are 0 for this patient - zscores_zero <- which(expected_zscores[, matching_colname_expected] == 0) - # get Disease for these - disease_zero <- unique(expected_zscores[zscores_zero, "Disease"]) - # set the probability score of these diseases to 0 - prob_score[which(prob_score$Disease %in% disease_zero), patient_index] <- 0 - } + # Edit the DIMS output Zscores of all patients in format: + # HMDB_code patientname1 patientname2 + names(dims_xls_ratios) <- gsub("HMDB.code", "HMDB_code", names(dims_xls_ratios)) + names(dims_xls_ratios) <- gsub("HMDB.name", "HMDB_name", names(dims_xls_ratios)) + + # for debugging: + write.table(dims_xls_ratios, file = paste0(output_dir, "/ratios.txt"), sep = "\t") + + # Select only the cols with zscores of the patients + zscore_patients <- dims_xls_ratios[, c(1, 2, zscore_cols[grep("P", colnames(dims_xls_ratios)[zscore_cols])])] + # Select only the cols with zscores of the controls + zscore_controls <- dims_xls_ratios[, c(1, 2, zscore_cols[grep("C", colnames(dims_xls_ratios)[zscore_cols])])] - # determine disease rank per patient - disease_rank <- prob_score - # rank diseases in decreasing order - disease_rank[2:ncol(disease_rank)] <- lapply(2:ncol(disease_rank), function(x) - as.numeric(ordered(-disease_rank[1:nrow(disease_rank), x]))) - # modify column names, Zscores have now been converted to probability scores - colnames(prob_score) <- gsub("_Zscore", "_prob_score", colnames(prob_score)) - colnames(disease_rank) <- gsub("_Zscore", "", colnames(disease_rank)) - - # Create conditional formatting for output Excel sheet. Colors according to values. - wb <- createWorkbook() - addWorksheet(wb, "Probability Scores") - writeData(wb, "Probability Scores", prob_score) - conditionalFormatting(wb, "Probability Scores", cols = 2:ncol(prob_score), rows = 1:nrow(prob_score), - type = "colourScale", style = c("white", "#FFFDA2", "red"), rule = c(1, 10, 100)) - saveWorkbook(wb, file = paste0(output_dir, "/dIEM_algoritme_output_", run_name, ".xlsx"), overwrite = TRUE) - # check whether prob_score df exists and has expected dimensions. - if (exists("expected_biomarkers") & (length(disease_rank) == length(prob_score))) { - cat("\n### Step 4 # Running the IEM algorithm is done.\n\n") - } else { - cat("\n**** Error: Could not run IEM algorithm. Check if path to expected_biomarkers csv-file is correct. \n") } - rm(wb) -} - -#### STEP 5: Make violin plots ##### -# in: algorithm / zscore_patients, violin, nr_contr, nr_pat, Data, path_textfiles, zscore_cutoff, xaxis_cutoff, -# top_diseases, top_metab, output_dir ||| out: pdf file, Helix csv file - -if (violin == 1) { - - # preparation - zscore_patients_copy <- zscore_patients - # for robust scaler, rename Z-score columns - colnames(zscore_patients) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_patients)) - colnames(zscore_controls) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_controls)) - colnames(zscore_patients) <- gsub("_Zscore", "", colnames(zscore_patients)) - colnames(zscore_controls) <- gsub("_Zscore", "", colnames(zscore_controls)) - - # Make patient list for violin plots - patient_list <- names(zscore_patients)[-c(1, 2)] - - # from table expected_biomarkers, choose selected columns - select_columns <- c("Disease", "HMDB_code", "HMDB_name") - select_col_nrs <- which(colnames(expected_biomarkers) %in% select_columns) - expected_biomarkers_select <- expected_biomarkers[, select_col_nrs] - # remove duplicates - expected_biomarkers_select <- expected_biomarkers_select[!duplicated(expected_biomarkers_select[, c(1, 2)]), ] - - # load file with explanatory information to be included in PDF. - explanation <- readLines(file_explanation) - - # first step: normal violin plots - # Find all text files in the given folder, which contain metabolite lists of which - # each file will be a page in the pdf with violin plots. - # Make a PDF file for each of the categories in metabolite_dirs - metabolite_dirs <- list.files(path = path_metabolite_groups, full.names = FALSE, recursive = FALSE) - for (metabolite_dir in metabolite_dirs) { - # create a directory for the output PDFs - pdf_dir <- paste(output_dir, metabolite_dir, sep = "/") - dir.create(pdf_dir, showWarnings = FALSE) - cat("making plots in category:", metabolite_dir, "\n") - - # get a list of all metabolite files - metabolite_files <- list.files(path = paste(path_metabolite_groups, metabolite_dir, sep = "/"), - pattern = "*.txt", full.names = FALSE, recursive = FALSE) - # put all metabolites into one list - metab_list_all <- list() - metab_list_names <- c() - cat("making plots from the input files:") - # open the text files and add each to a list of dataframes (metab_list_all) - for (file_index in seq_along(metabolite_files)) { - infile <- metabolite_files[file_index] - metab_list <- read.table(paste(path_metabolite_groups, metabolite_dir, infile, sep = "/"), - sep = "\t", header = TRUE, quote = "") - # put into list of all lists - metab_list_all[[file_index]] <- metab_list - metab_list_names <- c(metab_list_names, strsplit(infile, ".txt")[[1]][1]) - cat(paste0("\n", infile)) + #### STEP 4: Run the IEM algorithm ######### + # in: algorithm, file_expected_biomarkers_iem, zscore_patients ||| out: prob_score (+file) + # algorithm taken from DOI: 10.3390/ijms21030979 + + if (algorithm == 1) { + # Load data + cat(paste0("\nloading expected file:\n -> ", file_expected_biomarkers_iem, "\n")) + expected_biomarkers <- read.csv(file_expected_biomarkers_iem, sep = ";", stringsAsFactors = FALSE) + # modify column names + names(expected_biomarkers) <- gsub("HMDB.code", "HMDB_code", names(expected_biomarkers)) + names(expected_biomarkers) <- gsub("Metabolite", "HMDB_name", names(expected_biomarkers)) + + # prepare dataframe scaffold rank_patients + rank_patients <- zscore_patients + # Fill df rank_patients with the ranks for each patient + for (patient_index in 3:ncol(zscore_patients)) { + # number of positive zscores in patient + pos <- sum(zscore_patients[, patient_index] > 0) + # sort the column on zscore; NB: this sorts the entire object, not just one column + rank_patients <- rank_patients[order(-rank_patients[patient_index]), ] + # Rank all positive zscores highest to lowest + rank_patients[1:pos, patient_index] <- as.numeric(ordered(-rank_patients[1:pos, patient_index])) + # Rank all negative zscores lowest to highest + rank_patients[(pos + 1):nrow(rank_patients), patient_index] <- as.numeric(ordered(rank_patients[(pos + 1): + nrow(rank_patients), patient_index])) } - # include list of classes in metabolite list - names(metab_list_all) <- metab_list_names - - # prepare list of metabolites; max nr_plots_perpage on one page - metab_interest_sorted <- prepare_data(metab_list_all, zscore_patients) - metab_interest_controls <- prepare_data(metab_list_all, zscore_controls) - metab_perpage <- prepare_data_perpage(metab_interest_sorted, metab_interest_controls, nr_plots_perpage, nr_pat, nr_contr) - - # for Diagnostics metabolites to be saved in Helix - if(grepl("Diagnost", pdf_dir)) { - # get table that combines DIMS results with stofgroepen/Helix table - dims_helix_table <- get_patient_data_to_helix(metab_interest_sorted, metab_list_all) - - # check if run contains Diagnostics patients (e.g. "P2024M"), not for research runs - if(any(is_diagnostic_patient(dims_helix_table$Patient))){ - # get output file for Helix - output_helix <- output_for_helix(protocol_name, dims_helix_table) - # write output to file - path_helixfile <- paste0(output_dir, "/output_Helix_", run_name,".csv") - write.csv(output_helix, path_helixfile, quote = F, row.names = F) - } - } - - # make violin plots per patient - for (pt_nr in 1:length(patient_list)) { - pt_name <- patient_list[pt_nr] - # for category Diagnostics, make list of metabolites that exceed alarm values for this patient - # for category Other, make list of top highest and lowest Z-scores for this patient - if (grepl("Diagnost", pdf_dir)) { - top_metab_pt <- prepare_alarmvalues(pt_name, dims_helix_table) - } else { - top_metab_pt <- prepare_toplist(pt_name, zscore_patients) - } - # generate normal violin plots - create_violin_plots(pdf_dir, pt_name, metab_perpage, top_metab_pt) + # Calculate metabolite score, using the dataframes with only values, and later add the cols without values (1&2). + expected_zscores <- merge(x = expected_biomarkers, y = zscore_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code")) + expected_zscores_original <- expected_zscores + + # determine which columns contain Z-scores and which contain disease info + select_zscore_cols <- grep("_Zscore", colnames(expected_zscores)) + select_info_cols <- 1:(min(select_zscore_cols) - 1) + # set some zscores to zero + select_incr_indisp <- which(expected_zscores$Change == "Increase" & expected_zscores$Dispensability == "Indispensable") + expected_zscores[select_incr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_incr_indisp, + select_zscore_cols], function(x) ifelse (x <= 1.6, 0, x)) + select_decr_indisp <- which(expected_zscores$Change == "Decrease" & expected_zscores$Dispensability == "Indispensable") + expected_zscores[select_decr_indisp, select_zscore_cols] <- lapply(expected_zscores[select_decr_indisp, + select_zscore_cols], function(x) ifelse (x >= -1.2, 0, x)) + + # calculate rank score: + expected_ranks <- merge(x = expected_biomarkers, y = rank_patients, by.x = c("HMDB_code"), by.y = c("HMDB_code")) + rank_scores <- expected_zscores[order(expected_zscores$HMDB_code), select_zscore_cols] / + (expected_ranks[order(expected_ranks$HMDB_code), select_zscore_cols] * 0.9) + # combine disease info with rank scores + expected_metabscore <- cbind(expected_ranks[order(expected_zscores$HMDB_code), select_info_cols], rank_scores) + + # multiply weight score and rank score + weight_score <- expected_zscores + weight_score[, select_zscore_cols] <- expected_metabscore$Total_Weight * expected_metabscore[, select_zscore_cols] + + # sort table on Disease and Absolute_Weight + weight_score <- weight_score[order(weight_score$Disease, weight_score$Absolute_Weight, decreasing = TRUE), ] + + # select columns to check duplicates + dup <- weight_score[, c("Disease", "M.z")] + uni <- weight_score[!duplicated(dup) | !duplicated(dup, fromLast = FALSE), ] + + # calculate probability score + prob_score <- aggregate(uni[, select_zscore_cols], uni["Disease"], sum) + + # list of all diseases that have at least one metabolite Zscore at 0 + for (patient_index in 2:ncol(prob_score)) { + patient_zscore_colname <- colnames(prob_score)[patient_index] + matching_colname_expected <- which(colnames(expected_zscores) == patient_zscore_colname) + # determine which Zscores are 0 for this patient + zscores_zero <- which(expected_zscores[, matching_colname_expected] == 0) + # get Disease for these + disease_zero <- unique(expected_zscores[zscores_zero, "Disease"]) + # set the probability score of these diseases to 0 + prob_score[which(prob_score$Disease %in% disease_zero), patient_index] <- 0 + } + # determine disease rank per patient + disease_rank <- prob_score + # rank diseases in decreasing order + disease_rank[2:ncol(disease_rank)] <- lapply(2:ncol(disease_rank), function(x) + as.numeric(ordered(-disease_rank[1:nrow(disease_rank), x]))) + # modify column names, Zscores have now been converted to probability scores + colnames(prob_score) <- gsub("_Zscore", "_prob_score", colnames(prob_score)) + colnames(disease_rank) <- gsub("_Zscore", "", colnames(disease_rank)) + + # Create conditional formatting for output Excel sheet. Colors according to values. + wb <- createWorkbook() + addWorksheet(wb, "Probability Scores") + writeData(wb, "Probability Scores", prob_score) + conditionalFormatting(wb, "Probability Scores", cols = 2:ncol(prob_score), rows = 1:nrow(prob_score), + type = "colourScale", style = c("white", "#FFFDA2", "red"), rule = c(1, 10, 100)) + saveWorkbook(wb, file = paste0(output_dir, "/dIEM_algoritme_output_", run_name, ".xlsx"), overwrite = TRUE) + # check whether prob_score df exists and has expected dimensions. + if (exists("expected_biomarkers") & (length(disease_rank) == length(prob_score))) { + cat("\n### Step 4 # Running the IEM algorithm is done.\n\n") + } else { + cat("\n**** Error: Could not run IEM algorithm. Check if path to expected_biomarkers csv-file is correct. \n") } + rm(wb) } - # Second step: dIEM plots in separate directory - diem_plot_dir <- paste(output_dir, "dIEM_plots", sep = "/") - dir.create(diem_plot_dir) - - # Select the metabolites that are associated with the top highest scoring IEM, for each patient - # disease_rank is from step 4: the dIEM algorithm. The lower the value, the more likely. - for (pt_nr in 1:length(patient_list)) { - pt_name <- patient_list[pt_nr] - # get top diseases for this patient - pt_colnr <- which(colnames(disease_rank) == pt_name) - pt_top_indices <- which(disease_rank[, pt_colnr] <= top_nr_iem) - pt_iems <- disease_rank[pt_top_indices, "Disease"] - pt_top_iems <- pt_prob_score_top_iems <- c() - for (single_iem in pt_iems) { - # get the probability score - prob_score_iem <- prob_score[which(prob_score$Disease == single_iem), pt_colnr] - # use only diseases for which probability score is above threshold - if (prob_score_iem >= threshold_iem) { - pt_top_iems <- c(pt_top_iems, single_iem) - pt_prob_score_top_iems <- c(pt_prob_score_top_iems, prob_score_iem) + #### STEP 5: Make violin plots ##### + # in: algorithm / zscore_patients, violin, nr_contr, nr_pat, Data, path_textfiles, zscore_cutoff, xaxis_cutoff, + # top_diseases, top_metab, output_dir ||| out: pdf file, Helix csv file + + if (violin == 1) { + + # preparation + zscore_patients_copy <- zscore_patients + # for robust scaler, rename Z-score columns + colnames(zscore_patients) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_patients)) + colnames(zscore_controls) <- gsub("_RobustZscore", "_Zscore", colnames(zscore_controls)) + colnames(zscore_patients) <- gsub("_Zscore", "", colnames(zscore_patients)) + colnames(zscore_controls) <- gsub("_Zscore", "", colnames(zscore_controls)) + + # Make patient list for violin plots + patient_list <- names(zscore_patients)[-c(1, 2)] + + # from table expected_biomarkers, choose selected columns + select_columns <- c("Disease", "HMDB_code", "HMDB_name") + select_col_nrs <- which(colnames(expected_biomarkers) %in% select_columns) + expected_biomarkers_select <- expected_biomarkers[, select_col_nrs] + # remove duplicates + expected_biomarkers_select <- expected_biomarkers_select[!duplicated(expected_biomarkers_select[, c(1, 2)]), ] + + # load file with explanatory information to be included in PDF. + explanation <- readLines(file_explanation) + + # first step: normal violin plots + # Find all text files in the given folder, which contain metabolite lists of which + # each file will be a page in the pdf with violin plots. + # Make a PDF file for each of the categories in metabolite_dirs + metabolite_dirs <- list.files(path = path_metabolite_groups, full.names = FALSE, recursive = FALSE) + for (metabolite_dir in metabolite_dirs) { + # create a directory for the output PDFs + pdf_dir <- paste(output_dir, metabolite_dir, sep = "/") + dir.create(pdf_dir, showWarnings = FALSE) + cat("making plots in category:", metabolite_dir, "\n") + + # get a list of all metabolite files + metabolite_files <- list.files(path = paste(path_metabolite_groups, metabolite_dir, sep = "/"), + pattern = "*.txt", full.names = FALSE, recursive = FALSE) + # put all metabolites into one list + metab_list_all <- list() + metab_list_names <- c() + cat("making plots from the input files:") + # open the text files and add each to a list of dataframes (metab_list_all) + for (file_index in seq_along(metabolite_files)) { + infile <- metabolite_files[file_index] + metab_list <- read.table(paste(path_metabolite_groups, metabolite_dir, infile, sep = "/"), + sep = "\t", header = TRUE, quote = "") + # put into list of all lists + metab_list_all[[file_index]] <- metab_list + metab_list_names <- c(metab_list_names, strsplit(infile, ".txt")[[1]][1]) + cat(paste0("\n", infile)) + } + # include list of classes in metabolite list + names(metab_list_all) <- metab_list_names + + # prepare list of metabolites; max nr_plots_perpage on one page + metab_interest_sorted <- prepare_data(metab_list_all, zscore_patients) + metab_interest_controls <- prepare_data(metab_list_all, zscore_controls) + metab_perpage <- prepare_data_perpage(metab_interest_sorted, metab_interest_controls, nr_plots_perpage, nr_pat, nr_contr) + + # for Diagnostics metabolites to be saved in Helix + if(grepl("Diagnost", pdf_dir)) { + # get table that combines DIMS results with stofgroepen/Helix table + dims_helix_table <- get_patient_data_to_helix(metab_interest_sorted, metab_list_all) + + # check if run contains Diagnostics patients (e.g. "P2024M"), not for research runs + if(any(is_diagnostic_patient(dims_helix_table$Patient))){ + # get output file for Helix + output_helix <- output_for_helix(protocol_name, dims_helix_table) + # write output to file + path_helixfile <- paste0(output_dir, "/output_Helix_", run_name,".csv") + write.csv(output_helix, path_helixfile, quote = F, row.names = F) + } } + + # make violin plots per patient + for (pt_nr in 1:length(patient_list)) { + pt_name <- patient_list[pt_nr] + # for category Diagnostics, make list of metabolites that exceed alarm values for this patient + # for category Other, make list of top highest and lowest Z-scores for this patient + if (grepl("Diagnost", pdf_dir)) { + top_metab_pt <- prepare_alarmvalues(pt_name, dims_helix_table) + } else { + top_metab_pt <- prepare_toplist(pt_name, zscore_patients) + } + + # generate normal violin plots + create_violin_plots(pdf_dir, pt_name, metab_perpage, top_metab_pt) + + } + } - # prepare data for plotting dIEM violin plots - # If prob_score_top_iem is an empty list, don't make a plot - if (length(pt_top_iems) > 0) { - # Sorting from high to low, both prob_score_top_iems and pt_top_iems. - pt_prob_score_order <- order(-pt_prob_score_top_iems) - pt_prob_score_top_iems <- round(pt_prob_score_top_iems, 1) - pt_prob_score_top_iem_sorted <- pt_prob_score_top_iems[pt_prob_score_order] - pt_top_iem_sorted <- pt_top_iems[pt_prob_score_order] - # getting metabolites for each top_iem disease exactly like in metab_list_all - metab_iem_all <- list() - metab_iem_names <- c() - for (single_iem_index in 1:length(pt_top_iem_sorted)) { - single_iem <- pt_top_iem_sorted[single_iem_index] - single_prob_score <- pt_prob_score_top_iem_sorted[single_iem_index] - select_rows <- which(expected_biomarkers_select$Disease == single_iem) - metab_list <- expected_biomarkers_select[select_rows, ] - metab_iem_names <- c(metab_iem_names, paste0(single_iem, ", probability score ", single_prob_score)) - metab_list <- metab_list[, -1] - metab_iem_all[[single_iem_index]] <- metab_list + # Second step: dIEM plots in separate directory + diem_plot_dir <- paste(output_dir, "dIEM_plots", sep = "/") + dir.create(diem_plot_dir) + + # Select the metabolites that are associated with the top highest scoring IEM, for each patient + # disease_rank is from step 4: the dIEM algorithm. The lower the value, the more likely. + for (pt_nr in 1:length(patient_list)) { + pt_name <- patient_list[pt_nr] + # get top diseases for this patient + pt_colnr <- which(colnames(disease_rank) == pt_name) + pt_top_indices <- which(disease_rank[, pt_colnr] <= top_nr_iem) + pt_iems <- disease_rank[pt_top_indices, "Disease"] + pt_top_iems <- pt_prob_score_top_iems <- c() + for (single_iem in pt_iems) { + # get the probability score + prob_score_iem <- prob_score[which(prob_score$Disease == single_iem), pt_colnr] + # use only diseases for which probability score is above threshold + if (prob_score_iem >= threshold_iem) { + pt_top_iems <- c(pt_top_iems, single_iem) + pt_prob_score_top_iems <- c(pt_prob_score_top_iems, prob_score_iem) + } } - # put all metabolites into one list - names(metab_iem_all) <- metab_iem_names - # get Zscore information from zscore_patients_copy, similar to normal violin plots - metab_iem_sorted <- prepare_data(metab_iem_all, zscore_patients_copy) - metab_iem_controls <- prepare_data(metab_iem_all, zscore_controls) - # make sure every page has 20 metabolites - diem_metab_perpage <- prepare_data_perpage(metab_iem_sorted, metab_iem_controls, nr_plots_perpage, nr_pat) + # prepare data for plotting dIEM violin plots + # If prob_score_top_iem is an empty list, don't make a plot + if (length(pt_top_iems) > 0) { + # Sorting from high to low, both prob_score_top_iems and pt_top_iems. + pt_prob_score_order <- order(-pt_prob_score_top_iems) + pt_prob_score_top_iems <- round(pt_prob_score_top_iems, 1) + pt_prob_score_top_iem_sorted <- pt_prob_score_top_iems[pt_prob_score_order] + pt_top_iem_sorted <- pt_top_iems[pt_prob_score_order] + # getting metabolites for each top_iem disease exactly like in metab_list_all + metab_iem_all <- list() + metab_iem_names <- c() + for (single_iem_index in 1:length(pt_top_iem_sorted)) { + single_iem <- pt_top_iem_sorted[single_iem_index] + single_prob_score <- pt_prob_score_top_iem_sorted[single_iem_index] + select_rows <- which(expected_biomarkers_select$Disease == single_iem) + metab_list <- expected_biomarkers_select[select_rows, ] + metab_iem_names <- c(metab_iem_names, paste0(single_iem, ", probability score ", single_prob_score)) + metab_list <- metab_list[, -1] + metab_iem_all[[single_iem_index]] <- metab_list + } + # put all metabolites into one list + names(metab_iem_all) <- metab_iem_names + + # get Zscore information from zscore_patients_copy, similar to normal violin plots + metab_iem_sorted <- prepare_data(metab_iem_all, zscore_patients_copy) + metab_iem_controls <- prepare_data(metab_iem_all, zscore_controls) + # make sure every page has 20 metabolites + diem_metab_perpage <- prepare_data_perpage(metab_iem_sorted, metab_iem_controls, nr_plots_perpage, nr_pat) + + # generate dIEM violin plots + create_violin_plots(diem_plot_dir, pt_name, diem_metab_perpage, top_metab_pt) - # generate dIEM violin plots - create_violin_plots(diem_plot_dir, pt_name, diem_metab_perpage, top_metab_pt) + } else { + cat(paste0("\n\n**** This patient had no prob_scores higher than ", threshold_iem, ". + Therefore, this pdf was not made:\t ", pt_name, "_iem \n")) + } - } else { - cat(paste0("\n\n**** This patient had no prob_scores higher than ", threshold_iem, ". - Therefore, this pdf was not made:\t ", pt_name, "_iem \n")) } } - -} +} \ No newline at end of file From 3f7b6909d4748d1e20895e8ff3ca5c6306cf54a9 Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Thu, 30 May 2024 15:58:56 +0200 Subject: [PATCH 69/73] Fixed missing negative HMDB part --- DIMS/HMDBparts_main.R | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/DIMS/HMDBparts_main.R b/DIMS/HMDBparts_main.R index 932c9e4..14335bf 100644 --- a/DIMS/HMDBparts_main.R +++ b/DIMS/HMDBparts_main.R @@ -25,10 +25,8 @@ for (scanmode in scanmodes) { HMDB_add_iso[ ,column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] # remove adducts and isotopes, put internal standard at the beginning - outlist_is <- outlist[grep("IS", outlist[ , "CompoundName"], fixed = TRUE), ] outlist <- outlist[grep("HMDB", rownames(outlist), fixed = TRUE), ] outlist <- outlist[-grep("_", rownames(outlist), fixed = TRUE), ] - outlist <- rbind(outlist_is, outlist) # sort on m/z value outlist <- outlist[order(outlist[ , column_label]), ] nr_rows <- dim(outlist)[1] @@ -47,11 +45,12 @@ for (scanmode in scanmodes) { save(outlist_part, file=paste0(scanmode, "_hmdb_main.", i, ".RData")) } } -} -# finish last hmdb part -start <- end + 1 -end <- nr_rows + # finish last hmdb part + start <- end + 1 + end <- nr_rows -outlist_part <- outlist[c(start:end), ] -save(outlist_part, file = paste0(scanmode, "_hmdb_main.", i + 1, ".RData")) + outlist_part <- outlist[c(start:end), ] + save(outlist_part, file = paste0(scanmode, "_hmdb_main.", i + 1, ".RData")) + +} \ No newline at end of file From b7b56d8dc084fc196c6f6f928f1f229b8025df71 Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 31 May 2024 15:09:23 +0200 Subject: [PATCH 70/73] code review changes applied --- DIMS/AssignToBins.R | 12 ++++----- DIMS/GenerateBreaks.R | 2 +- DIMS/GenerateExcel.R | 46 ++++++++++++++++----------------- DIMS/HMDBparts.R | 13 +++++----- DIMS/UnidentifiedCalcZscores.R | 4 +-- DIMS/UnidentifiedPeakGrouping.R | 20 +++++++------- DIMS/Utils/estimate_area.R | 2 +- DIMS/Utils/search_mzrange.R | 10 +++---- 8 files changed, 53 insertions(+), 56 deletions(-) diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index 8479c09..4ca7fce 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -25,7 +25,7 @@ pos_results <- NULL neg_results <- NULL # read in the data for 1 sample -raw_data <- suppressMessages(xcmsRaw(mzml_filepath)) +raw_data <- suppressMessages(xcms::xcmsRaw(mzml_filepath)) # for TIC plots: prepare txt files with data for plots tic_intensity_persample <- cbind(round(raw_data@scantime, 2), raw_data@tic) @@ -38,7 +38,7 @@ pos_bins <- bins neg_bins <- bins # Generate a matrix -raw_data_matrix <- rawMat(raw_data) +raw_data_matrix <- xcms::rawMat(raw_data) # Get time values for positive and negative scans pos_times <- raw_data@scantime[raw_data@polarity == "positive"] @@ -111,13 +111,13 @@ rownames(pos_results_transpose) <- sample_name rownames(neg_results_transpose) <- sample_name # delete the last value of breaks_fwhm_avg to match dimensions of pos_results and neg_results -breaks_fwhm_avg_minus1 <- breaks_fwhm_avg[-length(breaks_fwhm_avg)] +breaks_fwhm_avg_minuslast <- breaks_fwhm_avg[-length(breaks_fwhm_avg)] # Format as string and show precision of float to 5 digits -breaks_fwhm_avg_minus1 <- sprintf("%.5f", breaks_fwhm_avg_minus1) +breaks_fwhm_avg_minuslast <- sprintf("%.5f", breaks_fwhm_avg_minuslast) # Use this as the column names -colnames(pos_results_transpose) <- breaks_fwhm_avg_minus1 -colnames(neg_results_transpose) <- breaks_fwhm_avg_minus1 +colnames(pos_results_transpose) <- breaks_fwhm_avg_minuslast +colnames(neg_results_transpose) <- breaks_fwhm_avg_minuslast # transpose back pos_results_final <- t(pos_results_transpose) diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index e475b34..6aacfe8 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -19,7 +19,7 @@ breaks_fwhm_avg <- NULL bins <- NULL # read in mzML file -raw_data <- suppressMessages(xcmsRaw(filepath)) +raw_data <- suppressMessages(xcms::xcmsRaw(filepath)) # trim (remove) scans at the start and end trim_left <- round(raw_data@scantime[length(raw_data@scantime) * trim]) diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index 7d34204..d36ac29 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -37,10 +37,10 @@ robust_scaler <- function(control_intensities, control_col_ids, perc = 5) { #' @param perc: Percentage of outliers which will be removed from controls (float) #' #' @return trimmed_control_intensities: Intensities trimmed for outliers - nr_toremove <- ceiling(length(control_col_ids) * perc / 100) + nr_to_remove <- ceiling(length(control_col_ids) * perc / 100) sorted_control_intensities <- sort(as.numeric(control_intensities)) - trimmed_control_intensities <- sorted_control_intensities[(nr_toremove + 1) : - (length(sorted_control_intensities) - nr_toremove)] + trimmed_control_intensities <- sorted_control_intensities[(nr_to_remove + 1) : + (length(sorted_control_intensities) - nr_to_remove)] return(trimmed_control_intensities) } @@ -233,37 +233,37 @@ if (z_score == 1) { file_png <- paste0(plot_dir, "/", hmdb_name, "_box.png") if (is.null(temp_png)) { - temp_png <- readPng(file_png) + temp_png <- loder::readPng(file_png) img_dim <- dim(temp_png)[c(1, 2)] cell_dim <- img_dim * imagesize_multiplier setColWidths(wb, filelist, cols = 1, widths = cell_dim[2] / 20) } - insertImage(wb, - filelist, - file_png, - startRow = p + 1, - startCol = 1, - height = cell_dim[1], - width = cell_dim[2], - units = "px") + openxlsx::insertImage(wb, + filelist, + file_png, + startRow = p + 1, + startCol = 1, + height = cell_dim[1], + width = cell_dim[2], + units = "px") if (p %% 100 == 0) { cat("at row: ", p, "\n") } } - setRowHeights(wb, filelist, rows = c(1:nrow(outlist) + 1), heights = cell_dim[1] / 4) - setColWidths(wb, filelist, cols = c(2:ncol(outlist)), widths = 20) + openxlsx::setRowHeights(wb, filelist, rows = c(1:nrow(outlist) + 1), heights = cell_dim[1] / 4) + openxlsx::setColWidths(wb, filelist, cols = c(2:ncol(outlist)), widths = 20) } else { - setRowHeights(wb, filelist, rows = c(1:nrow(outlist)), heights = 18) - setColWidths(wb, filelist, cols = c(1:ncol(outlist)), widths = 20) + openxlsx::setRowHeights(wb, filelist, rows = c(1:nrow(outlist)), heights = 18) + openxlsx::setColWidths(wb, filelist, cols = c(1:ncol(outlist)), widths = 20) } # write Excel file -writeData(wb, sheet = 1, outlist, startCol = 1) +openxlsx::writeData(wb, sheet = 1, outlist, startCol = 1) xlsx_name <- paste0(outdir, "/", project, ".xlsx") -saveWorkbook(wb, xlsx_name, overwrite = TRUE) +openxlsx::saveWorkbook(wb, xlsx_name, overwrite = TRUE) rm(wb) #### INTERNAL STANDARDS #### @@ -324,7 +324,7 @@ sample_count <- length(repl_pattern) # change the order of the x-axis summed plots to a natural sorted one sample_naturalorder <- unique(as.character(is_summed$Sample)) -sample_naturalorder <- str_sort(sample_naturalorder, numeric = TRUE) +sample_naturalorder <- stringr::str_sort(sample_naturalorder, numeric = TRUE) is_summed$Sample_level <- factor(is_summed$Sample, levels = c(sample_naturalorder)) is_pos$Sample_level <- factor(is_pos$Sample, levels = c(sample_naturalorder)) is_neg$Sample_level <- factor(is_neg$Sample, levels = c(sample_naturalorder)) @@ -333,10 +333,10 @@ is_neg$Sample_level <- factor(is_neg$Sample, levels = c(sample_naturalorder)) # theme for all IS bar plots theme_is_bar <- function(my_plot) { my_plot + - scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) + - theme(legend.position = "none", - axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6), - axis.text.y = element_text(size = 6) + ggplot2::scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) + + ggplot2::theme(legend.position = "none", + axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 6), + axis.text.y = element_text(size = 6) ) } diff --git a/DIMS/HMDBparts.R b/DIMS/HMDBparts.R index 520286b..3e7356d 100644 --- a/DIMS/HMDBparts.R +++ b/DIMS/HMDBparts.R @@ -12,7 +12,8 @@ load(breaks_file) min_mz <- round(breaks_fwhm[1]) max_mz <- round(breaks_fwhm[length(breaks_fwhm)]) -# In case of a standard run (m/z 69-606) use external HMDB parts +# In case of a standard run use external HMDB parts +# m/z is approximately 70 to 600: set limits between 68-71 for min and 599-610 for max if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < 610) { # skip generating HMDB parts hmdb_parts_path <- cmd_args[3] @@ -30,18 +31,18 @@ if (standard_run == "yes" & min_mz > 68 & min_mz < 71 & max_mz > 599 & max_mz < for (scanmode in scanmodes) { if (scanmode == "negative") { column_label <- "MNeg" - HMDB_add_iso <- HMDB_add_iso.Neg + hmdb_add_iso <- hmdb_add_iso.Neg } else if (scanmode == "positive") { column_label <- "Mpos" - HMDB_add_iso <- HMDB_add_iso.Pos + hmdb_add_iso <- hmdb_add_iso.Pos } # filter mass range meassured - HMDB_add_iso = HMDB_add_iso[which(HMDB_add_iso[ , column_label] >= breaks_fwhm[1] & - HMDB_add_iso[ , column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] + hmdb_add_iso = hmdb_add_iso[which(hmdb_add_iso[ , column_label] >= breaks_fwhm[1] & + hmdb_add_iso[ , column_label] <= breaks_fwhm[length(breaks_fwhm)]), ] # sort on mass - outlist <- HMDB_add_iso[order(as.numeric(HMDB_add_iso[ , column_label])),] + outlist <- hmdb_add_iso[order(as.numeric(hmdb_add_iso[ , column_label])),] nr_rows <- dim(outlist)[1] # maximum number of rows per file diff --git a/DIMS/UnidentifiedCalcZscores.R b/DIMS/UnidentifiedCalcZscores.R index 0192d0f..e62776c 100755 --- a/DIMS/UnidentifiedCalcZscores.R +++ b/DIMS/UnidentifiedCalcZscores.R @@ -7,8 +7,8 @@ scripts_dir <- cmd_args[1] ppm <- as.numeric(cmd_args[2]) z_score <- as.numeric(cmd_args[3]) -source(paste0(scripts_dir, "mergeDuplicatedRows.R")) -source(paste0(scripts_dir, "statistics_z.R")) +source(paste0(scripts_dir, "merge_duplicate_rows.R")) +source(paste0(scripts_dir, "calculate_zscores.R")) # for each scan mode, collect all filled peak group lists scanmodes <- c("positive", "negative") diff --git a/DIMS/UnidentifiedPeakGrouping.R b/DIMS/UnidentifiedPeakGrouping.R index 08da5d2..edeb2f7 100755 --- a/DIMS/UnidentifiedPeakGrouping.R +++ b/DIMS/UnidentifiedPeakGrouping.R @@ -25,32 +25,32 @@ grouping_rest <- function(outdir, unidentified_peaklist, scanmode, ppm) { # while (dim(outlist_copy)[1] > 0) { while (dim(outlist_copy)[1] > nrow_div) { - sel <- which(as.numeric(outlist_copy[, "height.pkt"]) == max(as.numeric(outlist_copy[, "height.pkt"])))[1] + select_max_height <- which(as.numeric(outlist_copy[, "height.pkt"]) == max(as.numeric(outlist_copy[, "height.pkt"])))[1] # ppm range around max - mzref <- as.numeric(outlist_copy[sel, "mzmed.pkt"]) + mzref <- as.numeric(outlist_copy[select_max_height, "mzmed.pkt"]) pkmin <- -(range * mzref - mzref) pkmax <- 2 * mzref - pkmin - selp <- as.numeric(outlist_copy[, "mzmed.pkt"]) > pkmin & as.numeric(outlist_copy[, "mzmed.pkt"]) < pkmax - tmplist <- outlist_copy[selp, , drop = FALSE] + select_peaks_within_range <- as.numeric(outlist_copy[, "mzmed.pkt"]) > pkmin & as.numeric(outlist_copy[, "mzmed.pkt"]) < pkmax + tmplist <- outlist_copy[select_peaks_within_range, , drop = FALSE] nrsamples <- length(unique(tmplist[, "samplenr"])) if (nrsamples > 0) { - mzmed_pgrp <- mean(as.numeric(outlist_copy[selp, "mzmed.pkt"])) + mzmed_pgrp <- mean(as.numeric(outlist_copy[select_peaks_within_range, "mzmed.pkt"])) mzmin_pgrp <- -(range * mzmed_pgrp - mzmed_pgrp) mzmax_pgrp <- 2 * mzmed_pgrp - mzmin_pgrp # select peaks within mz range - selp <- as.numeric(outlist_copy[, "mzmed.pkt"]) > mzmin_pgrp & as.numeric(outlist_copy[, "mzmed.pkt"]) < mzmax_pgrp - tmplist <- outlist_copy[selp, , drop = FALSE] + select_peaks_within_range <- as.numeric(outlist_copy[, "mzmed.pkt"]) > mzmin_pgrp & as.numeric(outlist_copy[, "mzmed.pkt"]) < mzmax_pgrp + tmplist <- outlist_copy[select_peaks_within_range, , drop = FALSE] # remove used peaks tmp <- as.vector(which(tmplist[, "height.pkt"] == -1)) if (length(tmp) > 0) tmplist <- tmplist[-tmp, , drop = FALSE] nrsamples <- length(unique(tmplist[, "samplenr"])) - fq_worst_pgrp <- as.numeric(max(outlist_copy[selp, "fq"])) - fq_best_pgrp <- as.numeric(min(outlist_copy[selp, "fq"])) + fq_worst_pgrp <- as.numeric(max(outlist_copy[select_peaks_within_range, "fq"])) + fq_best_pgrp <- as.numeric(min(outlist_copy[select_peaks_within_range, "fq"])) ints_allsamps <- rep(0, length(names(repl_pattern_filtered))) names(ints_allsamps) <- names(repl_pattern_filtered) @@ -65,7 +65,7 @@ grouping_rest <- function(outdir, unidentified_peaklist, scanmode, ppm) { mzmin_pgrp, mzmax_pgrp, ints_allsamps, NA, NA, NA, NA)) } - outlist_copy <- outlist_copy[-which(selp == TRUE), , drop = FALSE] + outlist_copy <- outlist_copy[-which(select_peaks_within_range == TRUE), , drop = FALSE] } outpgrlist <- as.data.frame(outpgrlist) diff --git a/DIMS/Utils/estimate_area.R b/DIMS/Utils/estimate_area.R index 1d24de2..b191f55 100644 --- a/DIMS/Utils/estimate_area.R +++ b/DIMS/Utils/estimate_area.R @@ -20,7 +20,7 @@ estimate_area <- function(mass_max, resol, scale, sigma, int_factor) { mz_min <- mass_max - 2 * fwhm mz_max <- mass_max + 2 * fwhm mz_range <- mz_max - mz_min - mass_vector2 <- seq(mz_min, mz_max, length = mz_range * int.factor) + mass_vector2 <- seq(mz_min, mz_max, length = mz_range * int_factor) # estimate area under the curve area_curve <- sum(scale * dnorm(mass_vector2, mass_max, sigma)) / 100 diff --git a/DIMS/Utils/search_mzrange.R b/DIMS/Utils/search_mzrange.R index ba6920e..dcdc42a 100644 --- a/DIMS/Utils/search_mzrange.R +++ b/DIMS/Utils/search_mzrange.R @@ -69,7 +69,7 @@ search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, r roi_values <- fit_init(mass_vector, int_vector, int_factor, scale, resol, outdir, sample_name, scanmode, plot, width, height, mz_index, start_index, end_index) - + print(roi_values) if (roi_values$qual[1] == 1) { # get optimized fit values roi_values <- fit_optim(mass_vector, int_vector, resol, plot, @@ -98,7 +98,6 @@ search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, r roi_values <- fit_optim(mass_vector, int_vector, resol, plot, scanmode, int_factor, width, height) - allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) @@ -138,11 +137,9 @@ search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, r # Check only zeros if (sum(int_vector) == 0) next - roi_values <- fit_init(mass_vector, int_vector, int_factor, scale, - resol, outdir, sample_name, scanmode, - plot, width, height, + roi_values <- fit_init(mass_vector, int_vector, int_factor, scale, resol, + outdir, sample_name, scanmode, plot, width, height, mz_index, start_index, end_index) - if (roi_values$qual[1] == 1) { roi_values <- fit_optim(mass_vector, int_vector, resol, plot, scanmode, int_factor, width, height) @@ -168,7 +165,6 @@ search_mzrange <- function(ints_fullrange, allpeaks_values, int_factor, scale, r } else { roi_values <- fit_optim(mass_vector, int_vector, resol, plot, scanmode, int_factor, width, height) - allpeaks_values$mean <- c(allpeaks_values$mean, roi_values$mean) allpeaks_values$area <- c(allpeaks_values$area, roi_values$area) allpeaks_values$nr <- c(allpeaks_values$nr, sample_name) From 72e53d5d265b9c53e063db77c45dc1a055f65d3a Mon Sep 17 00:00:00 2001 From: Mia Pras-Raves Date: Fri, 31 May 2024 15:11:25 +0200 Subject: [PATCH 71/73] code review changes applied --- DIMS/Utils/do_peakfinding.R | 5 +--- DIMS/Utils/fit_gaussian.R | 47 +++++++++++++++++++++++++++---------- DIMS/Utils/fit_gaussians.R | 30 +++++------------------ 3 files changed, 42 insertions(+), 40 deletions(-) diff --git a/DIMS/Utils/do_peakfinding.R b/DIMS/Utils/do_peakfinding.R index 004d96f..fff6afe 100644 --- a/DIMS/Utils/do_peakfinding.R +++ b/DIMS/Utils/do_peakfinding.R @@ -43,10 +43,7 @@ do_peakfinding <- function(sample_avgtechrepl, int_factor, scale, resol, outdir, "height.pkt" = allpeaks_values$area) # remove peaks with height = 0 - index <- which(outlist_persample[, "height.pkt"] == 0) - if (length(index) > 0) { - outlist_persample <- outlist_persample[-index, ] - } + outlist_persample <- outlist_persample[outlist_persample[, "height.pkt"] != 0, ] # save output to file save(outlist_persample, file = paste0(sample_name, "_", scanmode, ".RData")) diff --git a/DIMS/Utils/fit_gaussian.R b/DIMS/Utils/fit_gaussian.R index b22ba01..ed3cd0c 100644 --- a/DIMS/Utils/fit_gaussian.R +++ b/DIMS/Utils/fit_gaussian.R @@ -40,19 +40,20 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale # determine fit values for 1 Gaussian peak (mean, scale, sigma, qual) fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol, plot, fit_quality1, use_bounds) - print("nu in fit_gaussian, if force == 1") - print(fit_values) + print("fit_gaussian, force == 1, values fit_values:") + print(fit_values) # set initial value for scale factor scale <- 2 # test if the mean is outside the m/z range if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)]) { # run this function again with fixed boundaries + print("nu in fit_gaussian, mean outside m/z range") return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, outdir, force = 1, use_bounds = TRUE, plot, scanmode, int_factor, width, height)) - print("nu in fit_gaussian, mean outside m/z range") } else { # test if the fit is bad if (fit_values$qual > fit_quality1) { + print("fit_gaussian, force == 1, qual > fit_quality") # Try to fit two curves; find two local maxima new_index <- which(diff(sign(diff(int_vector))) == -2) + 1 # test if there are two indices in new_index @@ -64,9 +65,9 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale return(fit_gaussian(mass_vector2, mass_vector, int_vector, new_index, scale, resol, outdir, force = 2, use_bounds = FALSE, plot, scanmode, int_factor, width, height)) - print("nu in fit_gaussian, bad fit ") # good fit } else { + print("fit_gaussian, force == 1, qual < fit_quality") peak_mean <- c(peak_mean, fit_values$mean) peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale, fit_values$sigma, int_factor)) @@ -81,18 +82,21 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale # determine fit values for 2 Gaussian peaks (mean, scale, sigma, qual) fit_values <- fit_2peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol, use_bounds, plot, fit_quality, int_factor) - print("nu in fit_gaussian force == 2") - print(fit_values) + print("fit_gaussian force == 2, values fit_values:") + print(fit_values) # test if one of the means is outside the m/z range if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)]) { + print("fit_gaussian, force == 2, mean outside m/z range") # check if fit quality is bad if (fit_values$qual > fit_quality) { + print("fit_gaussian, force == 2, qual > fit_quality") # run this function again with fixed boundaries return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, outdir, force = 2, use_bounds = TRUE, plot, scanmode, int_factor, width, height)) } else { + print("fit_gaussian, fource == 2, qual < fit_quality") # check which mean is outside range and remove it from the list of means # NB: peak_mean and other variables have not been given values from 2-peak fit yet! for (i in 1:length(fit_values$mean)){ @@ -110,8 +114,10 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale } # if all means are within range } else { + print("fit_gaussian, force == 2, means within m/z range") # check for bad fit if (fit_values$qual > fit_quality) { + print("fit_gaussian, force == 2, qual > fit_quality") # Try to fit three curves; find three local maxima new_index <- which(diff(sign(diff(int_vector))) == -2) + 1 # test if there are three indices in new_index @@ -125,6 +131,7 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale plot, scanmode, int_factor, width, height)) # good fit, all means are within m/z range } else { + print("fit_gaussian, force == 2, qual < fit_quality") # check if means are within 3 ppm and sum if so tmp <- fit_values$qual nr_means_new <- -1 @@ -145,20 +152,22 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale # determine fit values for 3 Gaussian peaks (mean, scale, sigma, qual) fit_values <- fit_3peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol, use_bounds, plot, fit_quality, int_factor) - print("nu in fit_gaussian force == 3") - print(fit_values) + print("fit_gaussian force == 3, values fit_values:") + print(fit_values) # test if one of the means is outside the m/z range if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] || fit_values$mean[3] < mass_vector[1] || fit_values$mean[3] > mass_vector[length(mass_vector)]) { - + print("fit_gaussian, force == 3, mean outside mz range") # check if fit quality is bad if (fit_values$qual > fit_quality) { + print("fit_gaussian, force == 3, qual > fit_quality") # run this function again with fixed boundaries return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, outdir, force, use_bounds = TRUE, plot, scanmode, int_factor, width, height)) } else { + print("fit_gaussian, force == 3, qual < fit_quality") # check which mean is outside range and remove it from the list of means # NB: peak_mean and other variables have not been given values from 2-peak fit yet! for (i in 1:length(fit_values$mean)) { @@ -176,8 +185,10 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale } # if all means are within range } else { + print("fit_gaussian, force == 3, means within range") # check for bad fit if (fit_values$qual > fit_quality) { + print("fit_gaussian, force == 3, qual > fit_quality") # Try to fit four curves; find four local maxima new_index <- which(diff(sign(diff(int_vector))) == -2) + 1 # test if there are four indices in new_index @@ -191,6 +202,7 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale int_factor, width, height)) # good fit, all means are within m/z range } else { + print("fit_gaussian, force == 3, qual < fit_quality") # check if means are within 4 ppm and sum if so tmp <- fit_values$qual nr_means_new <- -1 @@ -211,22 +223,24 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale # determine fit values for 4 Gaussian peaks (mean, scale, sigma, qual) fit_values <- fit_4peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol, use_bounds, plot, fit_quality, int_factor) - print("nu in fit_gaussian force == 4") - print(fit_values) + print("fit_gaussian force == 4, values fit_values:") + print(fit_values) # test if one of the means is outside the m/z range if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] || fit_values$mean[3] < mass_vector[1] || fit_values$mean[3] > mass_vector[length(mass_vector)] || fit_values$mean[4] < mass_vector[1] || fit_values$mean[4] > mass_vector[length(mass_vector)]) { - + print("fit_gaussian, force == 4, mean outside mz range") # check if quality of fit is bad if (fit_values$qual > fit_quality) { + print("fit_gaussian, force == 4, qual > fit_quality") # run this function again with fixed boundaries return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, outdir, force, use_bounds = TRUE, plot, scanmode, int_factor, width, height)) } else { + print("fit_gaussian, force == 4, qual < fit_quality") # check which mean is outside range and remove it from the list of means # NB: peak_mean and other variables have not been given values from 2-peak fit yet! for (i in 1:length(fit_values$mean)) { @@ -244,14 +258,17 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale } # if all means are within range } else { + print("fit_gaussian, force == 4, means within range") # check for bad fit if (fit_values$qual > fit_quality) { + print("fit_gaussian, force == 4, qual > fit_quality") # Try to fit 1 curve, force = 5 return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, outdir, force = 5, use_bounds = FALSE, plot, scanmode, int_factor, width, height)) # good fit, all means are within m/z range } else { + print("fit_gaussian, force == 4, qual < fit_quality") # check if means are within 4 ppm and sum if so tmp <- fit_values$qual nr_means_new <- -1 @@ -275,8 +292,11 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale max_index <- which(int_vector == max(int_vector)) fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol, plot, fit_quality1, use_bounds) + print("fit_gaussian, force > 4, values fit_values:") + print(fit_values) # check for bad fit if (fit_values$qual > fit_quality1) { + print("fit_gaussian, force > 4, qual > fit_quality1") # remove if (plot) dev.off() # get fit values from fit_optim @@ -287,6 +307,7 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale peak_max <- fit_values$max peak_qual <- 0 } else { + print("fit_gaussian, force > 4, qual < fit_quality1") peak_mean <- c(peak_mean, fit_values$mean) peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale, fit_values$sigma, int_factor)) peak_qual <- fit_values$qual @@ -301,6 +322,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale "qual" = peak_qual, "min" = peak_min, "max" = peak_max) + print("fit_gaussian, end script, values roi_value_list:") + print(roi_value_list) return(roi_value_list) } diff --git a/DIMS/Utils/fit_gaussians.R b/DIMS/Utils/fit_gaussians.R index 1a4c34e..2e811fd 100644 --- a/DIMS/Utils/fit_gaussians.R +++ b/DIMS/Utils/fit_gaussians.R @@ -23,15 +23,9 @@ print("nu in fit_1gaussian") lower <- c(mass_vector[1], 0, mass_vector[1], 0) upper <- c(mass_vector[length(mass_vector)], Inf, mass_vector[length(mass_vector)], Inf) # get optimal value for fitted Gaussian curve - tryCatch(opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)), + opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)), opt_f, control = list(maxit = 10000), method = "L-BFGS-B", - lower = lower, upper = upper), - error = function(e) { - # in case of error, use regular optim without boundaries - opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)), - opt_f, control = list(maxit = 10000)) - write.table(opt_fit, file = paste0("tryCatch_error_", query_mass, ".txt"), row.names = FALSE) - } ) + lower = lower, upper = upper) } else { opt_fit <- optim(c(as.numeric(query_mass), as.numeric(scale)), opt_f, control = list(maxit = 10000)) @@ -72,27 +66,15 @@ print("nu in fit_2gaussians") # get optimal value for 2 fitted Gaussian curves if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) { sigma2 <- sigma1 - tryCatch(opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), as.numeric(query_mass1), as.numeric(scale1)), opt_f, control = list(maxit = 10000), - method = "L-BFGS-B", lower = lower, upper = upper), - error = function(e) { - # in case of error, use regular optim without boundaries - opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), - as.numeric(query_mass1), as.numeric(scale1)), - opt_f, control = list(maxit = 10000)) - write.table(res, file = paste0("tryCatch_error_2gauss_", query_mass, ".txt"), row.names = FALSE) } ) + method = "L-BFGS-B", lower = lower, upper = upper) } else { - tryCatch(opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), + opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), as.numeric(query_mass2), as.numeric(scale2)), opt_f, control = list(maxit = 10000), - method = "L-BFGS-B", lower = lower, upper = upper), - error = function(e) { - # in case of error, use regular optim without boundaries - opt_fit <- optim(c(as.numeric(query_mass1), as.numeric(scale1), - as.numeric(query_mass1), as.numeric(scale1)), - opt_f, control = list(maxit = 10000)) - write.table(res, file = paste0("tryCatch_error_2gauss_else_", query_mass, ".txt"), row.names = FALSE) } ) + method = "L-BFGS-B", lower = lower, upper = upper) } } else { if (is.null(query_mass2) && is.null(scale2) && is.null(sigma2)) { From c7b44f6c6421cff96437c8268cc327de9843fa11 Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Tue, 4 Jun 2024 10:59:39 +0200 Subject: [PATCH 72/73] Changes for using Utils --- DIMS/CollectFilled.R | 4 +- DIMS/GenerateViolinPlots.nf | 10 ++-- DIMS/SpectrumPeakFinding.R | 6 +- DIMS/SumAdducts.R | 2 +- DIMS/UnidentifiedCalcZscores.R | 4 +- DIMS/UnidentifiedCollectPeaks.R | 9 --- DIMS/UnidentifiedFillMissing.R | 20 +++---- DIMS/Utils/calculate_zscores.R | 6 +- DIMS/Utils/fit_gaussian.R | 100 ++++++++++++++----------------- DIMS/Utils/fit_gaussians.R | 4 -- DIMS/Utils/fit_init.R | 1 - DIMS/Utils/fit_optim.R | 2 - DIMS/Utils/get_element_info.R | 4 +- DIMS/Utils/identify_noisepeaks.R | 2 +- DIMS/Utils/replace_zeros.R | 4 +- 15 files changed, 76 insertions(+), 102 deletions(-) diff --git a/DIMS/CollectFilled.R b/DIMS/CollectFilled.R index 48fba8c..a0ad358 100755 --- a/DIMS/CollectFilled.R +++ b/DIMS/CollectFilled.R @@ -22,7 +22,7 @@ for (scanmode in scanmodes) { outlist_total <- rbind(outlist_total, peakgrouplist_filled) } # remove duplicates; peak groups with exactly the same m/z - outlist_total <- mergeDuplicatedRows(outlist_total) + outlist_total <- merge_duplicate_rows(outlist_total) # sort on mass outlist_total <- outlist_total[order(outlist_total[, "mzmed.pgrp"]), ] # load replication pattern @@ -30,7 +30,7 @@ for (scanmode in scanmodes) { repl_pattern <- get(load(pattern_file)) # calculate Z-scores if (z_score == 1) { - outlist_stats <- statistics_z(outlist_total, sortCol = NULL, adducts = FALSE) + outlist_stats <- calculate_zscores(outlist_total, adducts = FALSE) nr_removed_samples <- length(which(repl_pattern[] == "character(0)")) order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)]) outlist_stats_more <- cbind( diff --git a/DIMS/GenerateViolinPlots.nf b/DIMS/GenerateViolinPlots.nf index 641d27a..44323ac 100644 --- a/DIMS/GenerateViolinPlots.nf +++ b/DIMS/GenerateViolinPlots.nf @@ -9,11 +9,11 @@ process GenerateViolinPlots { val(analysis_id) output: - path('Diagnostics/*.pdf'), emit: diag_plot_files - path('Other/*.pdf'), emit: other_plot_files - path('dIEM_plots/*.pdf'), emit: diem_plot_files - path('*.xlsx'), emit: excel_file - path('*.csv'), emit: helix_file + path('Diagnostics/*.pdf'), emit: diag_plot_files, optional: true + path('Other/*.pdf'), emit: other_plot_files, optional: true + path('dIEM_plots/*.pdf'), emit: diem_plot_files, optional: true + path('*.xlsx'), emit: excel_file, optional: true + path('*.csv'), emit: helix_file, optional: true script: """ diff --git a/DIMS/SpectrumPeakFinding.R b/DIMS/SpectrumPeakFinding.R index 143299e..8b249e2 100644 --- a/DIMS/SpectrumPeakFinding.R +++ b/DIMS/SpectrumPeakFinding.R @@ -26,7 +26,7 @@ for (scanmode in scanmodes) { for (file_nr in 1:length(peaklist_files)) { cat("\n", peaklist_files[file_nr]) load(peaklist_files[file_nr]) - if (is.null(outlist.persample) || (dim(outlist.persample)[1] == 0)) { + if (is.null(outlist_persample) || (dim(outlist_persample)[1] == 0)) { tmp <- strsplit(peaklist_files[file_nr], "/")[[1]] fname <- tmp[length(tmp)] fname <- strsplit(fname, ".RData")[[1]][1] @@ -38,9 +38,9 @@ for (scanmode in scanmodes) { } } else { if (file_nr == 1) { - outlist_total <- outlist.persample + outlist_total <- outlist_persample } else { - outlist_total <- rbind(outlist_total, outlist.persample) + outlist_total <- rbind(outlist_total, outlist_persample) } } } diff --git a/DIMS/SumAdducts.R b/DIMS/SumAdducts.R index 877b3c8..699da4a 100755 --- a/DIMS/SumAdducts.R +++ b/DIMS/SumAdducts.R @@ -43,7 +43,7 @@ sum_adducts <- function(peaklist, theor_mz, grpnames_long, adducts, batch_number int_cols <- c(int_cols_C, int_cols_P) ints <- peaklist[metab, int_cols] } else { - ints <- peaklist[metab, c(7:(length(grpnames_long) + 6))] + ints <- peaklist[metab, c(3:(length(grpnames_long) + 2))] } total <- apply(ints, 2, sum) diff --git a/DIMS/UnidentifiedCalcZscores.R b/DIMS/UnidentifiedCalcZscores.R index e62776c..ec31a3f 100755 --- a/DIMS/UnidentifiedCalcZscores.R +++ b/DIMS/UnidentifiedCalcZscores.R @@ -23,7 +23,7 @@ for (scanmode in scanmodes) { } # remove duplicates; peak groups with exactly the same m/z - outlist_total <- mergeDuplicatedRows(outlist_total) + outlist_total <- merge_duplicate_rows(outlist_total) # sort on mass outlist_total <- outlist_total[order(outlist_total[, "mzmed.pgrp"]), ] @@ -34,7 +34,7 @@ for (scanmode in scanmodes) { if (z_score == 1) { # calculate Z-scores - outlist_stats <- statistics_z(outlist_total, sortCol = NULL, adducts = FALSE) + outlist_stats <- calculate_zscores(outlist_total, adducts = FALSE) nr_removed_samples <- length(which(repl_pattern[] == "character(0)")) order_index_int <- order(colnames(outlist_stats)[8:(length(repl_pattern) - nr_removed_samples + 7)]) outlist_stats_more <- cbind(outlist_stats[, 1:7], diff --git a/DIMS/UnidentifiedCollectPeaks.R b/DIMS/UnidentifiedCollectPeaks.R index 7376eb6..1536213 100755 --- a/DIMS/UnidentifiedCollectPeaks.R +++ b/DIMS/UnidentifiedCollectPeaks.R @@ -52,15 +52,6 @@ for (scanmode in scanmodes) { } outlist_part <- outlist[c(start_part:end_part), ] - # # add ppm extra before start - # if (part != 1) { - # mz_start <- outlist_part[1, "mzmed.pkt"] - # mz_ppm_range <- ppm * as.numeric(mz_start) / 1e+06 - # mz_start_min_ppm <- mz_start - mz_ppm_range - # outlist_before_part <- outlist %>% filter(mzmed.pkt >= mz_start_min_ppm & mzmed.pkt < mz_start) - - # outlist_part <- rbind(outlist_before_part, outlist_part) - # } # add ppm extra after end if (part != num_parts) { diff --git a/DIMS/UnidentifiedFillMissing.R b/DIMS/UnidentifiedFillMissing.R index 1611986..78b7951 100755 --- a/DIMS/UnidentifiedFillMissing.R +++ b/DIMS/UnidentifiedFillMissing.R @@ -12,15 +12,15 @@ ppm <- as.numeric(cmd_args[5]) outdir <- "./" # load in function scripts -source(paste0(scripts_dir, "replaceZeros.R")) -source(paste0(scripts_dir, "generateGaussian.R")) -source(paste0(scripts_dir, "getFwhm.R")) -source(paste0(scripts_dir, "getSD.R")) -source(paste0(scripts_dir, "getArea.R")) -source(paste0(scripts_dir, "optimizeGauss.R")) -source(paste0(scripts_dir, "ident.hires.noise.HPC.R")) -source(paste0(scripts_dir, "elementInfo.R")) -source(paste0(scripts_dir, "globalAssignments.HPC.R")) +source(paste0(scripts_dir, "replace_zeros.R")) +source(paste0(scripts_dir, "fit_optim.R")) +source(paste0(scripts_dir, "get_fwhm.R")) +source(paste0(scripts_dir, "get_stdev.R")) +source(paste0(scripts_dir, "estimate_area.R")) +source(paste0(scripts_dir, "optimize_gaussfit.R")) +source(paste0(scripts_dir, "identify_noisepeaks.R")) +source(paste0(scripts_dir, "get_element_info.R")) +source(paste0(scripts_dir, "atomic_info.R")) # peakgrouplist_files <- c(peakgrouplist_file1, peakgrouplist_file2) # for (peakgrouplist_file in peakgrouplist_files) { @@ -41,7 +41,7 @@ outpgrlist_identified <- get(load(peakgrouplist_file)) outputfile_name <- gsub(".RData", "_filled.RData", peakgrouplist_file) # replace missing values (zeros) with random noise -peakgrouplist_filled <- replaceZeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) +peakgrouplist_filled <- replace_zeros(outpgrlist_identified, repl_pattern, scanmode, resol, outdir, thresh, ppm) # save output save(peakgrouplist_filled, file = outputfile_name) diff --git a/DIMS/Utils/calculate_zscores.R b/DIMS/Utils/calculate_zscores.R index 9791e45..d687376 100644 --- a/DIMS/Utils/calculate_zscores.R +++ b/DIMS/Utils/calculate_zscores.R @@ -1,8 +1,7 @@ ## adapted from statistics_z.R # refactor: change column names from avg.ctrls to avg_ctrls, sd.ctrls to sd_ctrls -# remove parameter sort_col # check logic of parameter adducts -calculate_zscores <- function(peakgroup_list, sort_col, adducts) { +calculate_zscores <- function(peakgroup_list, adducts) { #' Calculate Z-scores for peak groups based on average and standard deviation of controls #' #' @param peakgroup_list: Peak group list (matrix) @@ -13,6 +12,8 @@ calculate_zscores <- function(peakgroup_list, sort_col, adducts) { case_label <- "P" control_label <- "C" + # get index for new column names + startcol <- ncol(peakgroup_list) + 3 # calculate mean and standard deviation for Control group ctrl_cols <- grep(control_label, colnames(peakgroup_list), fixed = TRUE) @@ -35,7 +36,6 @@ calculate_zscores <- function(peakgroup_list, sort_col, adducts) { } # apply new column names to columns at end plus avg and sd columns - startcol <- ncol(peakgroup_list) + 3 colnames(peakgroup_list)[startcol:ncol(peakgroup_list)] <- colnames_zscores # add ppm deviation column diff --git a/DIMS/Utils/fit_gaussian.R b/DIMS/Utils/fit_gaussian.R index ed3cd0c..6be9660 100644 --- a/DIMS/Utils/fit_gaussian.R +++ b/DIMS/Utils/fit_gaussian.R @@ -5,8 +5,8 @@ # remove plot sections (commented out) # several functions need to be loaded before this function can run fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale, resol, - outdir, force, use_bounds, plot, scanmode, - int_factor, width, height) { + outdir, force, use_bounds, plot, scanmode, + int_factor, width, height) { #' Fit 1, 2, 3 or 4 Gaussian peaks in small region of m/z #' #' @param mass_vector2: Vector of equally spaced m/z values (float) @@ -40,20 +40,16 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale # determine fit values for 1 Gaussian peak (mean, scale, sigma, qual) fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol, plot, fit_quality1, use_bounds) - print("fit_gaussian, force == 1, values fit_values:") - print(fit_values) # set initial value for scale factor scale <- 2 # test if the mean is outside the m/z range if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)]) { # run this function again with fixed boundaries - print("nu in fit_gaussian, mean outside m/z range") return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, outdir, force = 1, use_bounds = TRUE, plot, scanmode, int_factor, width, height)) } else { # test if the fit is bad if (fit_values$qual > fit_quality1) { - print("fit_gaussian, force == 1, qual > fit_quality") # Try to fit two curves; find two local maxima new_index <- which(diff(sign(diff(int_vector))) == -2) + 1 # test if there are two indices in new_index @@ -65,12 +61,11 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale return(fit_gaussian(mass_vector2, mass_vector, int_vector, new_index, scale, resol, outdir, force = 2, use_bounds = FALSE, plot, scanmode, int_factor, width, height)) - # good fit + # good fit } else { - print("fit_gaussian, force == 1, qual < fit_quality") peak_mean <- c(peak_mean, fit_values$mean) peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale, - fit_values$sigma, int_factor)) + fit_values$sigma, int_factor)) peak_qual <- fit_values$qual peak_min <- mass_vector[1] peak_max <- mass_vector[length(mass_vector)] @@ -82,21 +77,16 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale # determine fit values for 2 Gaussian peaks (mean, scale, sigma, qual) fit_values <- fit_2peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol, use_bounds, plot, fit_quality, int_factor) - print("fit_gaussian force == 2, values fit_values:") - print(fit_values) # test if one of the means is outside the m/z range if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)]) { - print("fit_gaussian, force == 2, mean outside m/z range") # check if fit quality is bad if (fit_values$qual > fit_quality) { - print("fit_gaussian, force == 2, qual > fit_quality") # run this function again with fixed boundaries return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, outdir, force = 2, use_bounds = TRUE, plot, scanmode, int_factor, width, height)) } else { - print("fit_gaussian, fource == 2, qual < fit_quality") # check which mean is outside range and remove it from the list of means # NB: peak_mean and other variables have not been given values from 2-peak fit yet! for (i in 1:length(fit_values$mean)){ @@ -112,12 +102,10 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale peak_min <- mass_vector[1] peak_max <- mass_vector[length(mass_vector)] } - # if all means are within range + # if all means are within range } else { - print("fit_gaussian, force == 2, means within m/z range") # check for bad fit if (fit_values$qual > fit_quality) { - print("fit_gaussian, force == 2, qual > fit_quality") # Try to fit three curves; find three local maxima new_index <- which(diff(sign(diff(int_vector))) == -2) + 1 # test if there are three indices in new_index @@ -129,9 +117,8 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale return(fit_gaussian(mass_vector2, mass_vector, int_vector, new_index, scale, resol, outdir, force = 3, use_bounds = FALSE, plot, scanmode, int_factor, width, height)) - # good fit, all means are within m/z range + # good fit, all means are within m/z range } else { - print("fit_gaussian, force == 2, qual < fit_quality") # check if means are within 3 ppm and sum if so tmp <- fit_values$qual nr_means_new <- -1 @@ -144,30 +131,33 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale } # restore original quality score fit_values$qual <- tmp - } + + for (i in 1:length(fit_values$mean)){ + peak_mean <- c(peak_mean, fit_values$mean[i]) + peak_area <- c(peak_area, fit_values$area[i]) + } + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + } } - # Three local maxima; need at least 6 data points for this + # Three local maxima; need at least 6 data points for this } else if (force == 3 && (length(mass_vector) > 6)) { # determine fit values for 3 Gaussian peaks (mean, scale, sigma, qual) fit_values <- fit_3peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol, use_bounds, plot, fit_quality, int_factor) - print("fit_gaussian force == 3, values fit_values:") - print(fit_values) # test if one of the means is outside the m/z range if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] || fit_values$mean[3] < mass_vector[1] || fit_values$mean[3] > mass_vector[length(mass_vector)]) { - print("fit_gaussian, force == 3, mean outside mz range") # check if fit quality is bad if (fit_values$qual > fit_quality) { - print("fit_gaussian, force == 3, qual > fit_quality") # run this function again with fixed boundaries return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, outdir, force, use_bounds = TRUE, plot, scanmode, int_factor, width, height)) } else { - print("fit_gaussian, force == 3, qual < fit_quality") # check which mean is outside range and remove it from the list of means # NB: peak_mean and other variables have not been given values from 2-peak fit yet! for (i in 1:length(fit_values$mean)) { @@ -183,12 +173,10 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale peak_min <- mass_vector[1] peak_max <- mass_vector[length(mass_vector)] } - # if all means are within range + # if all means are within range } else { - print("fit_gaussian, force == 3, means within range") # check for bad fit if (fit_values$qual > fit_quality) { - print("fit_gaussian, force == 3, qual > fit_quality") # Try to fit four curves; find four local maxima new_index <- which(diff(sign(diff(int_vector))) == -2) + 1 # test if there are four indices in new_index @@ -200,10 +188,9 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale return(fit_gaussian(mass_vector2, mass_vector, int_vector, new_index, scale, resol, outdir, force = 4, use_bounds = FALSE, plot, scanmode, int_factor, width, height)) - # good fit, all means are within m/z range + # good fit, all means are within m/z range } else { - print("fit_gaussian, force == 3, qual < fit_quality") - # check if means are within 4 ppm and sum if so + # check if means are within 4 ppm and sum if so tmp <- fit_values$qual nr_means_new <- -1 nr_means <- length(fit_values$mean) @@ -215,32 +202,36 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale } # restore original quality score fit_values$qual <- tmp + + for (i in 1:length(fit_values$mean)){ + peak_mean <- c(peak_mean, fit_values$mean[i]) + peak_area <- c(peak_area, fit_values$area[i]) + } + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + } } - - #### Four local maxima; need at least 6 data points for this #### + + #### Four local maxima; need at least 6 data points for this #### } else if (force == 4 && (length(mass_vector) > 6)) { # determine fit values for 4 Gaussian peaks (mean, scale, sigma, qual) fit_values <- fit_4peaks(mass_vector2, mass_vector, int_vector, max_index, scale, resol, use_bounds, plot, fit_quality, int_factor) - print("fit_gaussian force == 4, values fit_values:") - print(fit_values) # test if one of the means is outside the m/z range if (fit_values$mean[1] < mass_vector[1] || fit_values$mean[1] > mass_vector[length(mass_vector)] || fit_values$mean[2] < mass_vector[1] || fit_values$mean[2] > mass_vector[length(mass_vector)] || fit_values$mean[3] < mass_vector[1] || fit_values$mean[3] > mass_vector[length(mass_vector)] || fit_values$mean[4] < mass_vector[1] || fit_values$mean[4] > mass_vector[length(mass_vector)]) { - print("fit_gaussian, force == 4, mean outside mz range") # check if quality of fit is bad if (fit_values$qual > fit_quality) { - print("fit_gaussian, force == 4, qual > fit_quality") # run this function again with fixed boundaries return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, outdir, force, use_bounds = TRUE, plot, scanmode, int_factor, width, height)) } else { - print("fit_gaussian, force == 4, qual < fit_quality") # check which mean is outside range and remove it from the list of means # NB: peak_mean and other variables have not been given values from 2-peak fit yet! for (i in 1:length(fit_values$mean)) { @@ -256,19 +247,16 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale peak_min <- mass_vector[1] peak_max <- mass_vector[length(mass_vector)] } - # if all means are within range + # if all means are within range } else { - print("fit_gaussian, force == 4, means within range") # check for bad fit if (fit_values$qual > fit_quality) { - print("fit_gaussian, force == 4, qual > fit_quality") # Try to fit 1 curve, force = 5 return(fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, outdir, force = 5, use_bounds = FALSE, plot, scanmode, int_factor, width, height)) - # good fit, all means are within m/z range + # good fit, all means are within m/z range } else { - print("fit_gaussian, force == 4, qual < fit_quality") # check if means are within 4 ppm and sum if so tmp <- fit_values$qual nr_means_new <- -1 @@ -281,10 +269,19 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale } # restore original quality score fit_values$qual <- tmp - } + + for (i in 1:length(fit_values$mean)){ + peak_mean <- c(peak_mean, fit_values$mean[i]) + peak_area <- c(peak_area, fit_values$area[i]) + } + peak_qual <- fit_values$qual + peak_min <- mass_vector[1] + peak_max <- mass_vector[length(mass_vector)] + + } } - - # More than four local maxima; fit 1 peak. + + # More than four local maxima; fit 1 peak. } else { scale <- 2 fit_quality1 <- 0.40 @@ -292,13 +289,10 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale max_index <- which(int_vector == max(int_vector)) fit_values <- fit_1peak(mass_vector2, mass_vector, int_vector, max_index, scale, resol, plot, fit_quality1, use_bounds) - print("fit_gaussian, force > 4, values fit_values:") - print(fit_values) # check for bad fit if (fit_values$qual > fit_quality1) { - print("fit_gaussian, force > 4, qual > fit_quality1") # remove - if (plot) dev.off() + if (plot) dev.off() # get fit values from fit_optim fit_values <- fit_optim(mass_vector, int_vector, resol, plot, scanmode, int_factor, width, height) peak_mean <- c(peak_mean, fit_values$mean) @@ -307,7 +301,6 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale peak_max <- fit_values$max peak_qual <- 0 } else { - print("fit_gaussian, force > 4, qual < fit_quality1") peak_mean <- c(peak_mean, fit_values$mean) peak_area <- c(peak_area, estimate_area(fit_values$mean, resol, fit_values$scale, fit_values$sigma, int_factor)) peak_qual <- fit_values$qual @@ -322,8 +315,5 @@ fit_gaussian <- function(mass_vector2, mass_vector, int_vector, max_index, scale "qual" = peak_qual, "min" = peak_min, "max" = peak_max) - print("fit_gaussian, end script, values roi_value_list:") - print(roi_value_list) return(roi_value_list) } - diff --git a/DIMS/Utils/fit_gaussians.R b/DIMS/Utils/fit_gaussians.R index 2e811fd..05bff86 100644 --- a/DIMS/Utils/fit_gaussians.R +++ b/DIMS/Utils/fit_gaussians.R @@ -17,7 +17,6 @@ fit_1gaussian <- function(mass_vector, int_vector, sigma, query_mass, scale, use d <- params[2] * dnorm(mass_vector, mean = params[1], sd = sigma) sum((d - int_vector) ^ 2) } -print("nu in fit_1gaussian") if (use_bounds) { # determine lower and upper boundaries lower <- c(mass_vector[1], 0, mass_vector[1], 0) @@ -58,7 +57,6 @@ fit_2gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sum((d - int_vector) ^ 2) } -print("nu in fit_2gaussians") if (use_bounds) { # determine lower and upper boundaries lower <- c(mass_vector[1], 0, mass_vector[1], 0) @@ -121,7 +119,6 @@ fit_3gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sigma3, sum((d - int_vector) ^ 2) } -print("nu in fit_3gaussians") if (use_bounds) { # determine lower and upper boundaries lower <- c(mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0) @@ -176,7 +173,6 @@ fit_4gaussians <- function(mass_vector, int_vector, sigma1, sigma2, sigma3, sigm sum((d - int_vector) ^ 2) } -print("nu in fit_4gaussians") if (use_bounds) { # determine lower and upper boundaries lower <- c(mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0, mass_vector[1], 0) diff --git a/DIMS/Utils/fit_init.R b/DIMS/Utils/fit_init.R index 0c4e513..ecae337 100644 --- a/DIMS/Utils/fit_init.R +++ b/DIMS/Utils/fit_init.R @@ -33,7 +33,6 @@ fit_init <- function(mass_vector, int_vector, int_factor, scale, resol, # Find the index in int_vector with the highest intensity max_index <- which(int_vector == max(int_vector)) -print("nu in fit_init") roi_values <- fit_gaussian(mass_vector2, mass_vector, int_vector, max_index, scale, resol, outdir, force = length(max_index), use_bounds = FALSE, plot, scanmode, int_factor, width, height) diff --git a/DIMS/Utils/fit_optim.R b/DIMS/Utils/fit_optim.R index ed71622..404cc7a 100644 --- a/DIMS/Utils/fit_optim.R +++ b/DIMS/Utils/fit_optim.R @@ -38,12 +38,10 @@ fit_optim <- function(mass_vector, int_vector, resol, # get an estimate of the area under the peak area <- estimate_area(mass_max, resol, scale, sigma, int_factor) -print("nu in fit_optim") # put all values for this region of interest into a list roi_value_list <- list("mean" = mass_max, "area" = area, "min" = mass_vector2[1], "max" = mass_vector2[length(mass_vector2)]) -print(roi_value_list) return(roi_value_list) } diff --git a/DIMS/Utils/get_element_info.R b/DIMS/Utils/get_element_info.R index c4bdd92..a619f6a 100644 --- a/DIMS/Utils/get_element_info.R +++ b/DIMS/Utils/get_element_info.R @@ -10,7 +10,7 @@ get_element_info <- function(name, elements = NULL) { # get information on all elements if (!is.list(elements) || length(elements) == 0 ) { - all_elements <- initializePSE() + elements <- initializePSE() } # extract information for a particular adduct if (name == "CH3OH+H") { @@ -19,6 +19,6 @@ get_element_info <- function(name, elements = NULL) { } else { regular_expr <- paste0("^", name, "$") } - element_info <- all_elements[[grep(regular_expr, sapply(all_elements, function(x) { x$name }))]] + element_info <- elements[[grep(regular_expr, sapply(elements, function(x) { x$name }))]] return(element_info) } \ No newline at end of file diff --git a/DIMS/Utils/identify_noisepeaks.R b/DIMS/Utils/identify_noisepeaks.R index 5e557cb..cf79279 100644 --- a/DIMS/Utils/identify_noisepeaks.R +++ b/DIMS/Utils/identify_noisepeaks.R @@ -43,7 +43,7 @@ identify_noisepeaks <- function(peakgroup_list, all_adducts, scanmode = "Negativ add2label <- paste0("[M+", look4[adduct_index], "]", adduct_scanmode) } - noise_mz_adduct[, "CompoundName"] <- paste0(noise_mz_adduct[, "CompoundName "], add2label) + noise_mz_adduct[, "CompoundName"] <- paste0(noise_mz_adduct[, "CompoundName"], add2label) adduct_info <- get_element_info(look4[adduct_index], all_adducts) if (scanmode == "Positive") { adduct_mass <- adduct_info$mass[1] + adduct_info$isotope$mass[1] - hydrogen_mass diff --git a/DIMS/Utils/replace_zeros.R b/DIMS/Utils/replace_zeros.R index cf65340..0ef06a3 100644 --- a/DIMS/Utils/replace_zeros.R +++ b/DIMS/Utils/replace_zeros.R @@ -26,8 +26,8 @@ replace_zeros <- function(peakgroup_list, repl_pattern, scanmode, resol, outdir, next } for (zero_index in 1:length(zero_intensity)) { - area <- generate_gaussian(peakgroup_list[zero_intensity[zero_index], "mzmed.pgrp"], thresh, - resol, FALSE, scanmode, int_factor = 1 * 10^5, 1, 1)$area + area <- fit_optim(peakgroup_list[zero_intensity[zero_index], "mzmed.pgrp"], thresh, + resol, FALSE, scanmode, int_factor = 1 * 10^5, 1, 1)$area peakgroup_list[zero_intensity[zero_index], names(repl_pattern)[sample_index]] <- rnorm(n = 1, mean = area, sd = 0.25 * area) } From 3a0fcbb467edeaca541375b978ff16bd2276ad48 Mon Sep 17 00:00:00 2001 From: Robert Ernst Date: Tue, 18 Jun 2024 10:18:29 +0200 Subject: [PATCH 73/73] Revert "Feature/giabeval" --- CheckQC/CheckQC.nf | 6 +- CheckQC/check_qc.py | 194 +----------------- CheckQC/test_check_qc.py | 194 ++++++------------ ExomeDepth/CallCNV.nf | 2 +- ExomeDepth/GetRefset.nf | 2 +- ExomeDepth/Summary.nf | 2 +- ExonCov/ImportBam.nf | 2 +- ExonCov/SampleQC.nf | 2 +- Kinship/Kinship.nf | 2 +- .../1.0.0/test_get_gender_from_bam_chrx.py | 4 +- TrendAnalysis/TrendAnalysis.nf | 2 +- UPD/IGV.nf | 2 +- Utils/CreateHSmetricsSummary.nf | 2 +- Utils/EditSummaryFileHappy.nf | 30 --- Utils/GetStatsFromFlagstat.nf | 2 +- Utils/ParseChildFromFullTrio.nf | 2 +- Utils/SavePedFile.nf | 2 +- Utils/VersionLog.nf | 5 +- Utils/create_hsmetrics_summary.py | 2 +- Utils/get_stats_from_flagstat.py | 24 +-- 20 files changed, 92 insertions(+), 391 deletions(-) delete mode 100644 Utils/EditSummaryFileHappy.nf diff --git a/CheckQC/CheckQC.nf b/CheckQC/CheckQC.nf index 1cc6ee6..0dbf4d7 100644 --- a/CheckQC/CheckQC.nf +++ b/CheckQC/CheckQC.nf @@ -6,13 +6,13 @@ process CheckQC { input: val(identifier) - path(input_files, stageAs: "?/*") + path(input_files) output: - path("${identifier}_checkqc_summary.csv", emit: qc_output) + path("${identifier}_summary.csv", emit: qc_output) script: """ - python ${projectDir}/CustomModules/CheckQC/check_qc.py ${params.qc_settings_path} '.' ${identifier}_checkqc ${input_files} + python ${projectDir}/CustomModules/CheckQC/check_qc.py ${params.qc_settings_path} '.' ${identifier} ${input_files} """ } diff --git a/CheckQC/check_qc.py b/CheckQC/check_qc.py index d0b86f7..a47b14a 100644 --- a/CheckQC/check_qc.py +++ b/CheckQC/check_qc.py @@ -5,7 +5,6 @@ from os import strerror as os_strerror from pathlib import Path import re -from string import punctuation import sys import warnings @@ -15,19 +14,6 @@ def non_empty_existing_path(file_or_dir): - """ - This function checks whether the provided file or dir exists and is not empty. - - Args: - file_or_dir (string): Input file or directory - - Raises: - FileNotFoundError: If input string file_or_dir is neither a file nor a dir. - OSError: If input is not a dir and file is empty. - - Returns: - string: Provided input file or directory. If dir, suffix '/' might be added. - """ input_path = Path(file_or_dir) if not input_path.is_file() and not input_path.is_dir(): raise FileNotFoundError(errno_ENOENT, os_strerror(errno_ENOENT), file_or_dir) @@ -39,15 +25,6 @@ def non_empty_existing_path(file_or_dir): def parse_arguments_and_check(args_in): - """ - Parses arguments and validates / checks format of input. - - Args: - args_in (list of strings): Commandline input arguments. - - Returns: - Namespace: Convert argument strings to objects and assign them as attributes of the namespace. - """ parser = argparse.ArgumentParser( description="Check and summarize sample quality using qc metrics and their thresholds." ) @@ -75,18 +52,6 @@ def parse_arguments_and_check(args_in): def read_yaml(yaml_file): - """ - Read input yaml - - Args: - yaml_file (string): String with path to yaml file - - Raises: - ValueError: If reading the file returns None (not recognized as YAML format). - - Returns: - Object: Content of the YAML file. - """ yaml_loaded = yaml.safe_load(open(yaml_file)) if not yaml_loaded: raise ValueError("Could not load YAML.") @@ -94,52 +59,19 @@ def read_yaml(yaml_file): def check_allowed_operators(qc_operator): - """ - Check if provided qc_operator is allowed. - - Args: - qc_operator (string): (Custom / math) operator - - Raises: - ValueError: If provided qc_operator is invalid / unsupported. - """ operators = ["<", "<=", ">", ">=", "==", "!=", "match"] if qc_operator not in operators: raise ValueError(f"Unsupported operator provided: {qc_operator}. Please select from: {operators}") -def check_required_keys_metrics(qc_metrics): - """ - Check if all required settings are included in the qc_settings - - Args: - qc_metrics (list with dicts): qc settings per qc metric - - Raises: - KeyError: Required key is not provided for the qc metric settings - """ +def check_required_keys_metrics(qc_settings): for req_key in ["filename", "qc_col", "threshold", "operator", "report_cols"]: - if any([req_key not in setting.keys() for setting in qc_metrics]): + if any([req_key not in setting.keys() for setting in qc_settings["metrics"]]): raise KeyError(f"Required key {req_key} not in all metrics settings.") def select_metrics(filename, input_files): - """ - Using regular expression to match the qc metric filename with the input files - - Args: - filename (string): Filename of qc metric, could be a regex. - input_files (list): All qc metrics input files. - - Returns: - list: Input files matching the given filename. - """ - # Change filename without special characters (except for '_', '-' and '.') into - # a regex pattern to match absolute and relative paths in input_files. - special_symbols = set(punctuation) - set(".", "_", "-") - if not any([char in special_symbols for char in set(filename)]): - filename = ".*" + filename - metrics = list(filter(re.compile(f"{filename}").match, input_files)) + metrics = list(filter(re.compile(f".*{filename}").match, input_files)) if not metrics: warnings.warn(UserWarning(f"No input file provided with filename pattern {filename}")) return None @@ -147,21 +79,6 @@ def select_metrics(filename, input_files): def get_columns_to_report(qc_report_cols, qc_metric_cols, qc_col): - """ - Get valid columns to include in final output report - - Args: - qc_report_cols (list, string): column name(s) to include in report. - qc_metric_cols (list, string): column name(s) in qc metric. - qc_col (string): column name that contains the qc value/score. - - Raises: - TypeError: qc_report_cols is neither a string nor list. - ValueError: Provided column names in qc_report_cols does not exist in the qc metric. - - Returns: - list of strings: Valid column names to include in report. - """ not_existing_cols = list(set(qc_report_cols) - set(qc_metric_cols)) if qc_report_cols == "@all": qc_report_cols = qc_metric_cols @@ -177,19 +94,6 @@ def get_columns_to_report(qc_report_cols, qc_metric_cols, qc_col): def add_and_rename_columns(qc_metric, qc_title, qc_col, qc_operator, qc_threshold): - """ - Add and rename columns in qc_metric. - - Args: - qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values - qc_title (string): Title of the qc check - qc_col (string): qc column name with qc value/score - qc_operator (string): (Custom / math) operator - qc_threshold (string, int or float): qc threshold - - Returns: - Pandas DataFrame: DataFrame with qc metric. - """ qc_metric_assigned = qc_metric.assign( qc_title=qc_title.lower(), qc_status="PASS", @@ -201,21 +105,6 @@ def add_and_rename_columns(qc_metric, qc_title, qc_col, qc_operator, qc_threshol def get_failed_rows(qc_metric, qc_col, qc_operator, qc_threshold): - """ - Get rows that fail provided qc threshold - - Args: - qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values - qc_col (string): column name that contains the qc value/score. - qc_operator (string): (Custom / math) operator - qc_threshold (string, int or float): qc threshold - - Raises: - TypeError: If qc threshold is neither 'match', str, int or float. - - Returns: - object: The indexes of failed rows in qc_metric - """ # Select failed rows using qc_threshold regex pattern and qc_operator 'match' if qc_operator == "match" and isinstance(qc_threshold, str): return qc_metric[qc_col].str.match(qc_threshold) @@ -232,19 +121,6 @@ def get_failed_rows(qc_metric, qc_col, qc_operator, qc_threshold): def add_failed_samples_metric(qc_metric, failed_rows, report_cols, sample_cols): - """ - Failed samples are added to the output metric, and removed from qc_metric. - - Args: - qc_metric (pandas DataFrame): DataFrame with columns required to judge qc values - failed_rows (object): Object with indexes of failed rows in qc_metric - report_cols (list): Valid column names (strings) to include in report. - sample_cols (list): Columnames (strings) of sample names. - - Returns: - qc_metric (DataFrame): DataFrame of qc metric without failed rows - qc_metric_out (DataFrame): DataFrame of qc metric to report with failed rows - """ qc_metric_out = DataFrame(columns=["sample", "qc_check", "qc_status", "qc_msg", "qc_value"]) failed_samples = [] if failed_rows.to_list(): @@ -276,17 +152,6 @@ def add_failed_samples_metric(qc_metric, failed_rows, report_cols, sample_cols): def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols): - """ - Passed samples are added to the output metric - - Args: - qc_metric (DataFrame): DataFrame of qc metric without failed rows - qc_metric_out (DataFrame): DataFrame of qc metric to report with failed rows - sample_cols (list): Columnames (strings) of sample names. - - Returns: - pandas DataFrame: Sorted qc metric for both passed and failed samples, without duplicates. - """ # Add passed samples to output for sample_col in sample_cols: qc_metric_out = concat([ @@ -297,28 +162,12 @@ def add_passed_samples_metric(qc_metric, qc_metric_out, sample_cols): .loc[:, qc_metric_out.columns] ) ]) - # Some qc metrics did result in errors when merging the tables. - # The merge failed when the column qc_value has floats stored as strings. - # Try to convert column qc_value to float. - # If ValueError is raised, probably because column is a string, continue. - try: - qc_metric_out["qc_value"] = qc_metric_out["qc_value"].astype("float") - except ValueError: - pass # In case 'multiple sample qc check', # output could contain duplicate rows for individual samples used in multiple comparisons. return qc_metric_out.sort_values(by=["qc_check", "qc_status"]).drop_duplicates(keep="first") def create_and_write_output(qc_output, output_path, output_prefix): - """ - Joined qc metrics is created and written to output file. - - Args: - qc_output (pandas DataFrame): Sorted judged qc metric for both passed and failed samples, without duplicates. - output_path (string): Relative or absolute path where output should be saved. - output_prefix (string): Output prefix for output file. - """ # Add qc_summary qc_output.insert(1, "qc_summary", "PASS") qc_output.loc[qc_output.isin(["FAIL"]).any(axis=1), "qc_summary"] = "FAIL" @@ -327,20 +176,8 @@ def create_and_write_output(qc_output, output_path, output_prefix): def read_and_judge_metrics(qc, metrics): - """ - Read and judge each single qc metric and join results. - - Args: - qc (dict): qc settings of the metric - metrics (list): List of input files specific for single qc metric - - Returns: - pandas DataFrame: Joined and judged qc metrics. - """ for qc_file in metrics: - qc_metric_raw = read_csv( - qc_file, comment=qc.get("comment", None), delimiter=qc.get("delim", "\t"), quotechar=qc.get("quotechar", '"') - ) + qc_metric_raw = read_csv(qc_file, comment=qc.get("comment", None), delimiter="\t", quotechar='"') report_cols = get_columns_to_report(qc["report_cols"], qc_metric_raw.columns.to_list(), qc["qc_col"]) qc_metric_edit = add_and_rename_columns(qc_metric_raw, qc["title"], qc["qc_col"], qc["operator"], qc["threshold"]) failed_rows = get_failed_rows(qc_metric_edit, "qc_value", qc["operator"], qc["threshold"]) @@ -374,30 +211,9 @@ def read_and_judge_metrics(qc, metrics): def check_qc(input_files, settings, output_path, output_prefix): - """ - Main function to judge input files on configured qc settings. - It creates a single results table, each row representing - sample (string): sample name - qc_summary: Summarized status of all qcs for sample (pass or fail) - qc columns (5 per each qc metric); - qc_check (string): QC check consiting of qc title, operator and threshold - qc_status (string): Status of performed qc check (pass or fail) - qc_msg (string): String with human readable message if sample failed qc check, empty if passed. - qc_value (string, int, float): qc value/score to check. - - Args: - input_files (list): All qc metrics input files. - settings (string): Path to yaml file - output_path (string): Relative or absolute path where output should be saved. - output_prefix (string): Output prefix for output file. - - Raises: - ValueError: No input files found to match any qc metric patterns defined in settings. - ValueError: Duplicated samples with different values found in some of the input files. - """ # A single qc metric file can be used multiple times, by defining a metric section for each check in the qc settings. qc_settings = read_yaml(settings) - check_required_keys_metrics(qc_settings["metrics"]) + check_required_keys_metrics(qc_settings) duplicated_sample_file = [] for qc_metric_settings in qc_settings["metrics"]: check_allowed_operators(qc_metric_settings["operator"]) diff --git a/CheckQC/test_check_qc.py b/CheckQC/test_check_qc.py index baedf3d..d6ef6ae 100644 --- a/CheckQC/test_check_qc.py +++ b/CheckQC/test_check_qc.py @@ -15,7 +15,7 @@ @pytest.fixture(scope="module", autouse=True) def setup_test_path(tmp_path_factory): test_tmp_path = str(tmp_path_factory.mktemp("test")) + "/" - # Create empty files + # create empty files open(str(test_tmp_path) + "/empty.txt", "a").close() open(str(test_tmp_path) + "/empty.yaml", "a").close() return test_tmp_path @@ -66,95 +66,44 @@ def test_not_existing_operator(self): class TestCheckRequiredKeysMetrics(): def test_required_keys_present(self): - qc_metrics = [ + qc_settings = {"metrics": [ {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake", "report_cols": "fake"}, - ] - check_qc.check_required_keys_metrics(qc_metrics) + ]} + check_qc.check_required_keys_metrics(qc_settings) assert True @pytest.mark.parametrize( - "incomplete_qc_metrics", + "incomplete_qc_settings", [ - [{"filename": "fakename"}], - [ + {"metrics": [{"filename": "fakename"}]}, + {"metrics": [ {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake", "report_cols": "fake"}, - # Missing report_cols - {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"}, - ], - [ - # Missing report_cols - {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"}, - # Missing report_cols - {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"}, - ] + {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"}, # missing report_cols + ]}, + {"metrics": [ + {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"}, # missing report_cols + {"filename": "fake", "qc_col": "fake", "threshold": "fake", "operator": "fake"}, # missing report_cols + ]} ] ) - def test_missing_keys(self, incomplete_qc_metrics): + def test_missing_keys(self, incomplete_qc_settings): with pytest.raises(KeyError) as required_error: - check_qc.check_required_keys_metrics(incomplete_qc_metrics) + check_qc.check_required_keys_metrics(incomplete_qc_settings) error_val = str(required_error.value) assert "not in all metrics settings." in error_val assert error_val.split(" ")[2] in ["filename", "qc_col", "threshold", "operator", "report_cols"] class TestSelectMetrics(): - @pytest.mark.parametrize("filename_or_regex,input_files,expected", [ - # Multi match - ("test", ["test1.txt", "test2.txt"], ["test1.txt", "test2.txt"]), - # Single match - ("test", ["test1.txt", "fake2.txt"], ["test1.txt"]), - # Match with relative path - ("test", ["./random/path/to/test1.txt"], ["./random/path/to/test1.txt"]), - # Match with absolute path - ("test", ["/random/path/to/test1.txt"], ["/random/path/to/test1.txt"]), - # Match regex: kinship file suffix - ( - ".*.kinship_check.out$", - ["240101_A00295_0001_AHWCFKDSX7_CREv4_1.kinship_check.out"], - ["240101_A00295_0001_AHWCFKDSX7_CREv4_1.kinship_check.out"] - ), - # Match on word truth and SNP - ( - ".*truth.*SNP", - [ - '2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv', - '12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv' - ], - ['2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv'] - ), - # Match on word truth and SNP - ( - ".*truth.*SNP", - [ - 'U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv', - 'U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv' - ], - ['U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv'] - ), - # Match if 'truth' is absent and contains 'SNP' - # ?: Match expression but do not capture it - # ?! Match if 'truth' is absent. - ( - "(?:(?!truth).)*SNP.*$", - [ - '2/U000000CFGIAB12878a_GIAB12878_nist2.19_truth_fix_header.vcf.gz_SNP_ALL.csv', - '12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv' - ], - ['12/U000000CFGIAB12878b_U000000CFGIAB12878c_SNP_ALL.csv'] - ), - + @pytest.mark.parametrize("input_files,expected", [ + (["test1.txt", "test2.txt"], ["test1.txt", "test2.txt"]), # multi match + (["test1.txt", "fake2.txt"], ["test1.txt"]), # single match ]) - def test_select_metric(self, filename_or_regex, input_files, expected): - metrics = check_qc.select_metrics(filename_or_regex, input_files) + def test_select_metric(self, input_files, expected): + metrics = check_qc.select_metrics("test", input_files) assert metrics == expected - @pytest.mark.parametrize("filename_or_regex,input_files", [ - # No match - ("test", ["fake1.txt", "fake2.txt"]), - # No match, filename with specialchar assumed to be a regex - ("specialchar_@", ["12/specialchar_@.txt"]), - ]) - def test_no_match(self, filename_or_regex, input_files): + def test_no_match(self): with pytest.warns(UserWarning) as match_warning: return_val = check_qc.select_metrics("test", ["fake1.txt", "fake2.txt"]) warn_msg = match_warning[0].message.args[0] @@ -166,14 +115,10 @@ def test_no_match(self, filename_or_regex, input_files): class TestGetColumnsToReport(): @pytest.mark.parametrize("report_cols,metric_cols,qc_col,expected", [ (["col1"], ["col1"], "col1", ["qc_title", "qc_value"]), - # Additional report col - (["col1", "col2"], ["col1", "col2"], "col1", ["qc_title", "qc_value", "col2"]), - # Different order output - (["col1", "col2"], ["col1", "col2"], "col2", ["qc_title", "qc_value", "col1"]), - # Additional metric col - (["col1"], ["col1", "col3"], "col1", ["qc_title", "qc_value"]), - # Special @all option - ("@all", ["col1", "col2"], "col1", ["qc_title", "qc_value", "col2"]), + (["col1", "col2"], ["col1", "col2"], "col1", ["qc_title", "qc_value", "col2"]), # additional report col + (["col1", "col2"], ["col1", "col2"], "col2", ["qc_title", "qc_value", "col1"]), # different order output + (["col1"], ["col1", "col3"], "col1", ["qc_title", "qc_value"]), # additional metric col + ("@all", ["col1", "col2"], "col1", ["qc_title", "qc_value", "col2"]), # special @all option ]) def test_get_columns_to_report(self, report_cols, metric_cols, qc_col, expected): qc_report_cols = check_qc.get_columns_to_report(report_cols, metric_cols, qc_col) @@ -195,11 +140,11 @@ class TestAddAndRenameColumns(): def test_add_and_rename_columns(self): fake_qc_metric = DataFrame({"sample": ["sample1"], "fake_qc_col": ["0.01"]}) qc_metric_out = check_qc.add_and_rename_columns(fake_qc_metric, "FAKE_title", "fake_qc_col", "fake_op", "fake_thres") - # Assert expected column values + # assert expected column values assert qc_metric_out["qc_title"].values == "fake_title" assert qc_metric_out["qc_status"].values == "PASS" assert qc_metric_out["qc_check"].values == "fake_thres fake_op fake_qc_col" - # Assert all expected columns exist + # assert all expected columns exist assert not list( set(['sample', 'qc_value', 'qc_title', 'qc_status', 'qc_check', 'qc_msg']) - set(qc_metric_out.columns) ) @@ -208,14 +153,10 @@ def test_add_and_rename_columns(self): class TestGetFailedRows(): @pytest.mark.parametrize("qc_op,qc_thres", [ - # Test match - ("match", "fake_thres"), - # Test string - ("==", "FAIL"), - # Test int - ("==", 1), - # Test float - ("==", 0.1), + ("match", "fake_thres"), # test match + ("==", "FAIL"), # test string + ("==", 1), # test int + ("==", 0.1), # test float ]) def test_correct(self, qc_op, qc_thres): fake_qc_metric = DataFrame({"sample": ["sample1"], "fake_qc_col": [qc_thres]}) @@ -240,12 +181,9 @@ def test_add_failed_samples_single_sample_col(self): failed_rows = fake_qc_metric.loc[fake_qc_metric["sample_col"] == "sample2"].index qc_metric, qc_metric_out = check_qc.add_failed_samples_metric( fake_qc_metric, failed_rows, fake_qc_metric.columns.to_list(), ["sample_col"]) - # Test rename column - assert "sample" in qc_metric_out.columns.to_list() - # Test removal failed sample - assert "sample2" not in qc_metric["sample_col"].to_list() - # Test added failed sample - assert "sample2" in qc_metric_out["sample"].to_list() + assert "sample" in qc_metric_out.columns.to_list() # test rename column + assert "sample2" not in qc_metric["sample_col"].to_list() # test removal failed sample + assert "sample2" in qc_metric_out["sample"].to_list() # test added failed sample assert len(qc_metric) == 1 and len(qc_metric_out) == 1 assert qc_metric_out["qc_status"].values == "FAIL" @@ -255,21 +193,18 @@ def test_add_failed_samples_multi_sample_col(self): columns=["sample_col1", "sample_col2"] ) fake_kinship_metric = fake_kinship_metric.assign(qc_check="checks", qc_value="wrong") - # Define sample1 vs sample2 and sample1 vs sample3 as failed - failed_rows = fake_kinship_metric.iloc[0:2].index + failed_rows = fake_kinship_metric.iloc[0:2].index # define sample1 vs sample2 and sample1 vs sample3 as failed qc_metric, qc_metric_out = check_qc.add_failed_samples_metric( fake_kinship_metric, failed_rows, fake_kinship_metric.columns.to_list(), ["sample_col1", "sample_col2"]) for failed_sample in ["sample1", "sample2", "sample3"]: - # Test removal failed sample + # test removal failed sample assert failed_sample not in list(qc_metric[["sample_col1", "sample_col2"]].values.ravel()) - # Test added failed sample - assert failed_sample in qc_metric_out["sample"].to_list() + assert failed_sample in qc_metric_out["sample"].to_list() # test added failed sample assert qc_metric_out["qc_status"].values.all() == "FAIL" assert len(qc_metric) == 1 twice_failed = qc_metric_out.loc[qc_metric_out["sample"] == "sample1"] - # Assert join with ';' on column qc_value - assert "wrong;wrong" == twice_failed["qc_value"].item() - # Assert join with ';' on column qc_msg + assert "wrong;wrong" == twice_failed["qc_value"].item() # assert join with ';' on column qc_value + # assert join with ';' on column qc_msg assert "sample1 sample2 checks wrong;sample1 sample3 checks wrong" == twice_failed["qc_msg"].item() for passed_sample in ["sample4", "sample5"]: assert passed_sample in list(qc_metric[["sample_col1", "sample_col2"]].values.ravel()) @@ -299,23 +234,20 @@ def test_add_passed_samples_multi_sample_col(self): ) qc_metric_out = check_qc.add_passed_samples_metric( fake_qc_metric, fake_sample_qc, ["sample_col1", "sample_col2"]) - # Test rename column - assert "sample" in qc_metric_out.columns.to_list() - # Test removal duplicates - assert qc_metric_out["sample"].to_list().count("s4") == 1 - # Test additional columns ignored - assert "new_col" not in qc_metric_out.columns.to_list() + assert "sample" in qc_metric_out.columns.to_list() # test rename column + assert qc_metric_out["sample"].to_list().count("s4") == 1 # test removal duplicates + assert "new_col" not in qc_metric_out.columns.to_list() # test additional columns ignored class TestCreateAndWriteOutput(): @pytest.mark.parametrize("exp_summary,qc_output", [ - # All qc checks passed + # all qc checks passed ("PASS", DataFrame({"sample": ["s1"], "qc_status_cov": ["PASS"], "qc_status_kinship": ["PASS"]})), - # Single qc check failed + # single qc check failed ("FAIL", DataFrame({"sample": ["s1"], "qc_status_cov": ["PASS"], "qc_status_kinship": ["FAIL"]})), - # All qc check failed + # all qc check failed ("FAIL", DataFrame({"sample": ["s1"], "qc_status_cov": ["FAIL"], "qc_status_kinship": ["FAIL"]})), - # Not restricted to qc_status_ column name + # not restricted to qc_status_ column name ("PASS", DataFrame({"sample": ["s1"], "random_col1": ["PASS"], "random_col2": ["PASS"]})), ]) def test_create_and_write_output(self, setup_test_path, exp_summary, qc_output): @@ -330,61 +262,61 @@ def test_create_and_write_output(self, setup_test_path, exp_summary, qc_output): class TestGetOutputMetrics(): @pytest.mark.parametrize("data_in,nr_rows", [ - # Single sample + # single sample (["sample1_fake_check.txt"], 1), - # Multiple single samples + # multiple single samples (["sample1_fake_check.txt", "sample2_fake_check.txt"], 2), - # Single multi samples + # single multi samples (["240101_fake_check.txt"], 2), - # Multiple multi samples + # multiple multi samples (["240101_fake_check.txt", "240102_fake_check.txt"], 4), - # Multi and single sample + # multi and single sample (["sample1_fake_check.txt", "240101_fake_check.txt"], 3), ]) def test_input_ok(self, data_in, nr_rows, dataset, datadir): datadir_files = [f"{datadir}/{filename}" for filename in data_in] + # input1 = datadir / "sample1_fake_check.txt" df_output = check_qc.read_and_judge_metrics(dataset["settings_single_metric"]["metrics"][0], datadir_files) assert not df_output.empty observed_cols = df_output.columns.to_list() - # Shape results in tuple with no. rows and no. cols - assert df_output.shape[0] == nr_rows + assert df_output.shape[0] == nr_rows # shape results in tuple with no. rows and no. cols assert len(observed_cols) == 5 assert observed_cols == ['sample', 'qc_check_fc', 'qc_status_fc', 'qc_msg_fc', 'qc_value_fc'] @pytest.mark.parametrize("data_in,nr_rows,exp_warn_msg", [ - # Single sample duplicate + # single sample duplicate (["sample1_fake_check.txt"]*2, 1, "Sample IDs occur multiple times in input:"), - # Single multi samples duplicate + # single multi samples duplicate (["240101_fake_check.txt"]*2, 2, "Sample IDs occur multiple times in input:"), - # Multiple multi samples, duplicate samples + # multiple multi samples, duplicate samples (["240101_fake_check.txt", "240101_v2_fake_check.txt"], 4, "Different qc values for duplicated sample IDs in input:"), ]) def test_input_warn(self, data_in, nr_rows, exp_warn_msg, dataset, datadir): datadir_files = [f"{datadir}/{filename}" for filename in data_in] + # input1 = datadir / "sample1_fake_check.txt" with pytest.warns(UserWarning) as match_warning: df_output = check_qc.read_and_judge_metrics(dataset["settings_single_metric"]["metrics"][0], datadir_files) warn_msg = match_warning[0].message.args[0] assert exp_warn_msg in warn_msg assert not df_output.empty observed_cols = df_output.columns.to_list() - # Shape: tuple with no. rows and no. cols - assert df_output.shape[0] == nr_rows + assert df_output.shape[0] == nr_rows # Shape: tuple with no. rows and no. cols assert len(observed_cols) == 5 assert observed_cols == ['sample', 'qc_check_fc', 'qc_status_fc', 'qc_msg_fc', 'qc_value_fc'] class TestCheckQc(): @pytest.mark.parametrize("settings,data_in,exp_shape", [ - # Single metric, single sample input + # single metric, single sample input ("settings_single_metric", ["sample1_fake_check.txt"], (1, 5)), - # Two metrics, single sample input + # two metrics, single sample input ("settings_two_metrics", ["sample1_fake_check.txt"], (1, 9)), - # Single metric, multiple samples input + # single metric, multiple samples input ("settings_single_metric", ["240101_fake_check.txt"], (2, 5)), ("settings_single_metric", ["240101_fake_check.txt", "240102_fake_check.txt"], (4, 5)), - # Two metrics, multiple sample input + # two metrics, multiple sample input ("settings_two_metrics", ["240101_fake_check.txt", "240102_fake_check.txt"], (4, 9)), - # Two metricS, multi and single sample input + # two metric, multi and single sample input ("settings_two_metrics", ["sample1_fake_check.txt", "240101_fake_check.txt"], (3, 9)), ]) def test_ok(self, settings, data_in, exp_shape, datadir, dataset, mocker, ): diff --git a/ExomeDepth/CallCNV.nf b/ExomeDepth/CallCNV.nf index fda8ef1..9a3197e 100644 --- a/ExomeDepth/CallCNV.nf +++ b/ExomeDepth/CallCNV.nf @@ -22,4 +22,4 @@ process CallCNV { source ${params.dx_resources_path}/${params.exomedepth_path}/venv/bin/activate python ${params.dx_resources_path}/${params.exomedepth_path}/run_ExomeDepth.py callcnv ./ ${bam_file} ${analysis_id} ${sample_id} """ -} +} \ No newline at end of file diff --git a/ExomeDepth/GetRefset.nf b/ExomeDepth/GetRefset.nf index fbe9d7b..efff299 100644 --- a/ExomeDepth/GetRefset.nf +++ b/ExomeDepth/GetRefset.nf @@ -18,4 +18,4 @@ process GetRefset { python ${params.dx_resources_path}/${params.exomedepth_path}/exomedepth_db.py add_sample_return_refset_bam ${bam_file} --print_refset_stdout | \ tr -d '\n' """ -} +} \ No newline at end of file diff --git a/ExomeDepth/Summary.nf b/ExomeDepth/Summary.nf index a883376..3244d8c 100644 --- a/ExomeDepth/Summary.nf +++ b/ExomeDepth/Summary.nf @@ -17,4 +17,4 @@ process Summary { source ${params.dx_resources_path}/${params.exomedepth_path}/venv/bin/activate python ${params.dx_resources_path}/${params.exomedepth_path}/run_ExomeDepth.py summary ${exomedepth_logs} > ${analysis_id}_exomedepth_summary.txt """ -} +} \ No newline at end of file diff --git a/ExonCov/ImportBam.nf b/ExonCov/ImportBam.nf index dc26f50..6afbfe8 100644 --- a/ExonCov/ImportBam.nf +++ b/ExonCov/ImportBam.nf @@ -21,4 +21,4 @@ process ImportBam { --exon_bed ${params.dxtracks_path}/${params.exoncov_bed} \ ${analysis_id} WES ${bam_file} | tr -d '\n' """ -} +} \ No newline at end of file diff --git a/ExonCov/SampleQC.nf b/ExonCov/SampleQC.nf index cde757f..b0a2b95 100644 --- a/ExonCov/SampleQC.nf +++ b/ExonCov/SampleQC.nf @@ -19,4 +19,4 @@ process SampleQC { flask --app ${params.exoncov_path}/ExonCov sample_qc \ -s ${samples} -p ${panels} > ${analysis_id}.ExonCovQC_check.out """ -} +} \ No newline at end of file diff --git a/Kinship/Kinship.nf b/Kinship/Kinship.nf index fd051fe..9241dbf 100644 --- a/Kinship/Kinship.nf +++ b/Kinship/Kinship.nf @@ -19,4 +19,4 @@ process Kinship { cp king.kin0 ${analysis_id}.kinship python ${projectDir}/CustomModules/Kinship/check_kinship.py ${analysis_id}.kinship ${ped_file} > ${analysis_id}.kinship_check.out """ -} +} \ No newline at end of file diff --git a/MosaicHunter/1.0.0/test_get_gender_from_bam_chrx.py b/MosaicHunter/1.0.0/test_get_gender_from_bam_chrx.py index 7674f45..0a494ea 100644 --- a/MosaicHunter/1.0.0/test_get_gender_from_bam_chrx.py +++ b/MosaicHunter/1.0.0/test_get_gender_from_bam_chrx.py @@ -26,9 +26,7 @@ class TestGetGenderFromBam: ("./test_bam.bam", 20, "X:2699520-154931044", 5.5, 7.5, ("M", False)), ("./test_bam.bam", 20, "X:2699520-154931044", 4.5, 6.5, ("F", True)), ]) - def test_get_gender_from_bam( - self, bam, mapping_qual, locus_x, ratio_x_threshold_male, ratio_x_threshold_female, expected_outcome - ): + def test_get_gender_from_bam(self, bam, mapping_qual, locus_x, ratio_x_threshold_male, ratio_x_threshold_female, expected_outcome): assert expected_outcome == get_gender_from_bam_chrx.get_gender_from_bam_chrx( bam, mapping_qual, locus_x, ratio_x_threshold_male, ratio_x_threshold_female) diff --git a/TrendAnalysis/TrendAnalysis.nf b/TrendAnalysis/TrendAnalysis.nf index 5009861..c8968ab 100644 --- a/TrendAnalysis/TrendAnalysis.nf +++ b/TrendAnalysis/TrendAnalysis.nf @@ -12,4 +12,4 @@ process TrendAnalysis { source ${params.trend_analysis_path}/venv/bin/activate python ${params.trend_analysis_path}/trend_analysis.py upload processed_data ${analysis_id} . """ -} +} \ No newline at end of file diff --git a/UPD/IGV.nf b/UPD/IGV.nf index f2974c7..2ea72af 100644 --- a/UPD/IGV.nf +++ b/UPD/IGV.nf @@ -19,4 +19,4 @@ process IGV { source ${params.upd_path}/venv/bin/activate python ${params.upd_path}/make_UPD_igv.py ${ped_file} ${analysis_id} $trio_sample ${vcf_files} -c """ -} +} \ No newline at end of file diff --git a/Utils/CreateHSmetricsSummary.nf b/Utils/CreateHSmetricsSummary.nf index 45553cb..2ca443f 100644 --- a/Utils/CreateHSmetricsSummary.nf +++ b/Utils/CreateHSmetricsSummary.nf @@ -14,4 +14,4 @@ process CreateHSmetricsSummary { """ python2 ${projectDir}/CustomModules/Utils/create_hsmetrics_summary.py ${hsmetrics_files} > HSMetrics_summary.txt """ -} +} \ No newline at end of file diff --git a/Utils/EditSummaryFileHappy.nf b/Utils/EditSummaryFileHappy.nf deleted file mode 100644 index a4aba58..0000000 --- a/Utils/EditSummaryFileHappy.nf +++ /dev/null @@ -1,30 +0,0 @@ -process EditSummaryFileHappy { - tag "$meta.id" - label 'EditSummaryFileHappy' - shell = ['/bin/bash', '-euo', 'pipefail'] - - input: - // meta should have the keys 'id', 'query' and 'truth' - tuple(val(meta), path(summary_csv)) - - output: - path("${meta.truth}_${meta.query}_INDEL_PASS.csv"), emit: indel_pass_csv - path("${meta.truth}_${meta.query}_INDEL_ALL.csv"), emit: indel_all_csv - path("${meta.truth}_${meta.query}_SNP_PASS.csv"), emit: snp_pass_csv - path("${meta.truth}_${meta.query}_SNP_ALL.csv"), emit: snp_all_csv - - script: - """ - # Add samplenames as columns (header and row values) at start of line - sed '1s/^/samples,sample_truth,sample_query,/; 2,\$s/^/${meta.truth}_${meta.query},${meta.truth},${meta.query},/' ${summary_csv} > ${summary_csv}.tmp - - # Split file including header (first line) - awk -F',' 'FNR==1{hdr=\$0;next} { - print hdr>"${meta.truth}_${meta.query}_"\$4"_"\$5".csv"; - print \$0>>"${meta.truth}_${meta.query}_"\$4"_"\$5".csv" - }' ${summary_csv}.tmp - - # Remove tmp files - rm ${summary_csv}.tmp - """ -} diff --git a/Utils/GetStatsFromFlagstat.nf b/Utils/GetStatsFromFlagstat.nf index 27e6bbe..99018ed 100644 --- a/Utils/GetStatsFromFlagstat.nf +++ b/Utils/GetStatsFromFlagstat.nf @@ -14,4 +14,4 @@ process GetStatsFromFlagstat { """ python2 ${projectDir}/CustomModules/Utils/get_stats_from_flagstat.py ${flagstat_files} > run_stats.txt """ -} +} \ No newline at end of file diff --git a/Utils/ParseChildFromFullTrio.nf b/Utils/ParseChildFromFullTrio.nf index b9377bd..9963277 100644 --- a/Utils/ParseChildFromFullTrio.nf +++ b/Utils/ParseChildFromFullTrio.nf @@ -17,4 +17,4 @@ process ParseChildFromFullTrio { """ python2 ${projectDir}/CustomModules/Utils/parse_child_from_fulltrio.py ${ped_file} ${sample_ids} | tr -d '\n' """ -} +} \ No newline at end of file diff --git a/Utils/SavePedFile.nf b/Utils/SavePedFile.nf index 00f45a3..1c49c07 100644 --- a/Utils/SavePedFile.nf +++ b/Utils/SavePedFile.nf @@ -14,4 +14,4 @@ process SavePedFile { """ cp --remove-destination "\$(readlink ${ped_file})" ./ """ -} +} \ No newline at end of file diff --git a/Utils/VersionLog.nf b/Utils/VersionLog.nf index cff4f49..4a295b1 100644 --- a/Utils/VersionLog.nf +++ b/Utils/VersionLog.nf @@ -10,7 +10,6 @@ process VersionLog { output: path('repository_version.log') - path("versions.yml", emit: versions) script: """ @@ -18,8 +17,6 @@ process VersionLog { do echo "\${git_dir}" >> repository_version.log git --git-dir=\${git_dir}/.git log --pretty=oneline --decorate -n 2 >> repository_version.log - described_tags=\$(git --git-dir=\${git_dir}/.git describe --tags) - echo "\${git_dir}: \"\${described_tags}\"" >> versions.yml done """ -} +} \ No newline at end of file diff --git a/Utils/create_hsmetrics_summary.py b/Utils/create_hsmetrics_summary.py index 582b61e..405e1ce 100644 --- a/Utils/create_hsmetrics_summary.py +++ b/Utils/create_hsmetrics_summary.py @@ -8,7 +8,7 @@ parser.add_argument('hsmetrics_files', type=argparse.FileType('r'), nargs='*', help='HSMetric file') arguments = parser.parse_args() - interval_files_pattern = re.compile(r"BAIT_INTERVALS=\[(\S*)\].TARGET_INTERVALS=\[(\S*)\]") + interval_files_pattern = re.compile("BAIT_INTERVALS=\[(\S*)\].TARGET_INTERVALS=\[(\S*)\]") summary_header = [] summary_data = {} for hsmetrics_file in arguments.hsmetrics_files: diff --git a/Utils/get_stats_from_flagstat.py b/Utils/get_stats_from_flagstat.py index 52560bc..2f46abb 100644 --- a/Utils/get_stats_from_flagstat.py +++ b/Utils/get_stats_from_flagstat.py @@ -43,24 +43,12 @@ print("\n\t{0} %duplication\n".format(100*sample_dups/sample_mapped)) - print( - "Total raw reads: {total:,} reads (Total throughput, 75bp={total_75bp:,} bp, 100bp={total_100bp:,} bp, " - "150bp={total_150bp:,} bp)".format( - total=counts['total'], - total_75bp=counts['total']*75, - total_100bp=counts['total']*100, - total_150bp=counts['total']*150 - ) - ) - print( - "Total mapped reads: {total:,} reads (Total throughput, 75bp={total_75bp:,} bp, 100bp={total_100bp:,} bp, " - "150bp={total_150bp:,} bp)".format( - total=counts['mapped'], - total_75bp=counts['mapped']*75, - total_100bp=counts['mapped']*100, - total_150bp=counts['mapped']*150 - ) - ) + print("Total raw reads: {total:,} reads (Total throughput, 75bp={total_75bp:,} bp, 100bp={total_100bp:,} bp, 150bp={total_150bp:,} bp)".format( + total=counts['total'], total_75bp=counts['total']*75, total_100bp=counts['total']*100, total_150bp=counts['total']*150 + )) + print("Total mapped reads: {total:,} reads (Total throughput, 75bp={total_75bp:,} bp, 100bp={total_100bp:,} bp, 150bp={total_150bp:,} bp)".format( + total=counts['mapped'], total_75bp=counts['mapped']*75, total_100bp=counts['mapped']*100, total_150bp=counts['mapped']*150 + )) print("Average mapped per lib: {:,} reads".format(int(round(float(counts['mapped'])/float(counts['files']))))) print("Average dups per lib: {:,} reads".format(int(round(float(counts['dups'])/float(counts['files']))))) print("Average dups % per lib: {:.2f} %".format(100*float(counts['dups'])/float(counts['mapped'])))