From c4ca7eb348810f4c60c2ba982e0d35a818f469fd Mon Sep 17 00:00:00 2001 From: ALuesink Date: Fri, 2 Aug 2024 09:28:51 +0200 Subject: [PATCH 01/18] Removed AddOnFunctions folder --- .../10-collectSamplesFilled_extraoutput.R | 94 ----- .../add_lab_id_and_onderzoeksnummer.R | 10 - DIMS/AddOnFunctions/checkOverlap.R | 16 - DIMS/AddOnFunctions/check_same_samplename.R | 4 - DIMS/AddOnFunctions/create_violin_plots.R | 116 ------- DIMS/AddOnFunctions/elementInfo.R | 6 - DIMS/AddOnFunctions/export.R | 13 - DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R | 24 -- DIMS/AddOnFunctions/fit1Peak.R | 120 ------- DIMS/AddOnFunctions/fit2G.R | 43 --- DIMS/AddOnFunctions/fit2peaks.R | 100 ------ DIMS/AddOnFunctions/fit3G.R | 18 - DIMS/AddOnFunctions/fit3peaks.R | 122 ------- DIMS/AddOnFunctions/fit4G.R | 18 - DIMS/AddOnFunctions/fit4peaks.R | 149 -------- DIMS/AddOnFunctions/fitG.R | 18 - DIMS/AddOnFunctions/fitGaussian.R | 327 ------------------ DIMS/AddOnFunctions/fitGaussianInit.R | 60 ---- DIMS/AddOnFunctions/generateBreaksFwhm.R | 26 -- DIMS/AddOnFunctions/generateExcelFile.R | 24 -- DIMS/AddOnFunctions/generateGaussian.R | 48 --- DIMS/AddOnFunctions/getArea.R | 23 -- DIMS/AddOnFunctions/getDeltaMZ.R | 5 - DIMS/AddOnFunctions/getFitQuality.R | 65 ---- DIMS/AddOnFunctions/getFwhm.R | 24 -- DIMS/AddOnFunctions/getPatients.R | 9 - DIMS/AddOnFunctions/getSD.R | 9 - .../get_patient_data_to_helix.R | 31 -- DIMS/AddOnFunctions/globalAssignments.HPC.R | 114 ------ DIMS/AddOnFunctions/iden.code.R | 45 --- DIMS/AddOnFunctions/ident.hires.noise.HPC.R | 242 ------------- DIMS/AddOnFunctions/isWithinXppm.R | 51 --- DIMS/AddOnFunctions/is_diagnostic_patient.R | 6 - DIMS/AddOnFunctions/mergeDuplicatedRows.R | 49 --- DIMS/AddOnFunctions/normalization_2.1.R | 75 ---- DIMS/AddOnFunctions/optimizeGauss.R | 11 - DIMS/AddOnFunctions/output_helix.R | 31 -- DIMS/AddOnFunctions/peak.grouping.Gauss.HPC.R | 88 ----- DIMS/AddOnFunctions/prepare_alarmvalues.R | 52 --- DIMS/AddOnFunctions/prepare_data.R | 42 --- DIMS/AddOnFunctions/prepare_data_perpage.R | 47 --- DIMS/AddOnFunctions/prepare_toplist.R | 29 -- DIMS/AddOnFunctions/remove.dupl.2.1.R | 32 -- DIMS/AddOnFunctions/remove.dupl.R | 39 --- DIMS/AddOnFunctions/removeFromRepl.pat.R | 31 -- DIMS/AddOnFunctions/replaceZeros.R | 90 ----- DIMS/AddOnFunctions/replaceZeros_setseed.R | 108 ------ DIMS/AddOnFunctions/run.vbs | 22 -- DIMS/AddOnFunctions/runVBAMacro.R | 48 --- DIMS/AddOnFunctions/searchMZRange.R | 187 ---------- DIMS/AddOnFunctions/sourceDir.R | 8 - DIMS/AddOnFunctions/statistics_z.R | 69 ---- DIMS/AddOnFunctions/sumCurves.R | 39 --- DIMS/AddOnFunctions/trimZeros.R | 8 - 54 files changed, 3085 deletions(-) delete mode 100644 DIMS/AddOnFunctions/10-collectSamplesFilled_extraoutput.R delete mode 100644 DIMS/AddOnFunctions/add_lab_id_and_onderzoeksnummer.R delete mode 100644 DIMS/AddOnFunctions/checkOverlap.R delete mode 100644 DIMS/AddOnFunctions/check_same_samplename.R delete mode 100644 DIMS/AddOnFunctions/create_violin_plots.R delete mode 100644 DIMS/AddOnFunctions/elementInfo.R delete mode 100644 DIMS/AddOnFunctions/export.R delete mode 100644 DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R delete mode 100644 DIMS/AddOnFunctions/fit1Peak.R delete mode 100644 DIMS/AddOnFunctions/fit2G.R delete mode 100644 DIMS/AddOnFunctions/fit2peaks.R delete mode 100644 DIMS/AddOnFunctions/fit3G.R delete mode 100644 DIMS/AddOnFunctions/fit3peaks.R delete mode 100644 DIMS/AddOnFunctions/fit4G.R delete mode 100644 DIMS/AddOnFunctions/fit4peaks.R delete mode 100644 DIMS/AddOnFunctions/fitG.R delete mode 100644 DIMS/AddOnFunctions/fitGaussian.R delete mode 100644 DIMS/AddOnFunctions/fitGaussianInit.R delete mode 100644 DIMS/AddOnFunctions/generateBreaksFwhm.R delete mode 100644 DIMS/AddOnFunctions/generateExcelFile.R delete mode 100644 DIMS/AddOnFunctions/generateGaussian.R delete mode 100644 DIMS/AddOnFunctions/getArea.R delete mode 100644 DIMS/AddOnFunctions/getDeltaMZ.R delete mode 100644 DIMS/AddOnFunctions/getFitQuality.R delete mode 100644 DIMS/AddOnFunctions/getFwhm.R delete mode 100644 DIMS/AddOnFunctions/getPatients.R delete mode 100644 DIMS/AddOnFunctions/getSD.R delete mode 100644 DIMS/AddOnFunctions/get_patient_data_to_helix.R delete mode 100644 DIMS/AddOnFunctions/globalAssignments.HPC.R delete mode 100644 DIMS/AddOnFunctions/iden.code.R delete mode 100644 DIMS/AddOnFunctions/ident.hires.noise.HPC.R delete mode 100644 DIMS/AddOnFunctions/isWithinXppm.R delete mode 100644 DIMS/AddOnFunctions/is_diagnostic_patient.R delete mode 100644 DIMS/AddOnFunctions/mergeDuplicatedRows.R delete mode 100644 DIMS/AddOnFunctions/normalization_2.1.R delete mode 100644 DIMS/AddOnFunctions/optimizeGauss.R delete mode 100644 DIMS/AddOnFunctions/output_helix.R delete mode 100644 DIMS/AddOnFunctions/peak.grouping.Gauss.HPC.R delete mode 100644 DIMS/AddOnFunctions/prepare_alarmvalues.R delete mode 100644 DIMS/AddOnFunctions/prepare_data.R delete mode 100644 DIMS/AddOnFunctions/prepare_data_perpage.R delete mode 100644 DIMS/AddOnFunctions/prepare_toplist.R delete mode 100644 DIMS/AddOnFunctions/remove.dupl.2.1.R delete mode 100644 DIMS/AddOnFunctions/remove.dupl.R delete mode 100644 DIMS/AddOnFunctions/removeFromRepl.pat.R delete mode 100644 DIMS/AddOnFunctions/replaceZeros.R delete mode 100644 DIMS/AddOnFunctions/replaceZeros_setseed.R delete mode 100644 DIMS/AddOnFunctions/run.vbs delete mode 100644 DIMS/AddOnFunctions/runVBAMacro.R delete mode 100644 DIMS/AddOnFunctions/searchMZRange.R delete mode 100644 DIMS/AddOnFunctions/sourceDir.R delete mode 100644 DIMS/AddOnFunctions/statistics_z.R delete mode 100644 DIMS/AddOnFunctions/sumCurves.R delete mode 100644 DIMS/AddOnFunctions/trimZeros.R diff --git a/DIMS/AddOnFunctions/10-collectSamplesFilled_extraoutput.R b/DIMS/AddOnFunctions/10-collectSamplesFilled_extraoutput.R deleted file mode 100644 index 21ccdce..0000000 --- a/DIMS/AddOnFunctions/10-collectSamplesFilled_extraoutput.R +++ /dev/null @@ -1,94 +0,0 @@ -#!/usr/bin/Rscript - -.libPaths(new = "/hpc/local/CentOS7/dbg_mz/R_libs/3.2.2") - -# load required packages -# none - -# define parameters -cmd_args <- commandArgs(trailingOnly = TRUE) -for (arg in cmd_args) cat(" ", arg, "\n") - -outdir <- cmd_args[1] -scanmode <- cmd_args[2] -normalization <- cmd_args[3] -scripts <- cmd_args[4] -z_score <- as.numeric(cmd_args[5]) -ppm <- as.numeric(cmd_args[6]) - -#outdir <- "/Users/nunen/Documents/Metab/processed/test_old" -#scanmode <- "negative" -#normalization <- "disabled" -#scripts <- "/Users/nunen/Documents/Metab/DIMS/scripts" -#db <- "/Users/nunen/Documents/Metab/DIMS/db/HMDB_add_iso_corrNaCl_withIS_withC5OH.RData" -#z_score <- 0 - -object.files = list.files(paste(outdir, "9-samplePeaksFilled", sep="/"), full.names=TRUE, pattern=scanmode) -outlist.tot=NULL -for (i in 1:length(object.files)) { - load(object.files[i]) - print(print(object.files[i])) - outlist.tot = rbind(outlist.tot, final.outlist.idpat3) -} - -source(paste(scripts, "AddOnFunctions/sourceDir.R", sep="/")) -sourceDir(paste(scripts, "AddOnFunctions", sep="/")) - -# remove duplicates -outlist.tot = mergeDuplicatedRows(outlist.tot) - -# sort on mass -outlist.tot = outlist.tot[order(outlist.tot[,"mzmed.pgrp"]),] - -# normalization -load(paste0(outdir, "/repl.pattern.",scanmode,".RData")) - -if (normalization != "disabled") { - outlist.tot = normalization_2.1(outlist.tot, fileName, names(repl.pattern.filtered), on=normalization, assi_label="assi_HMDB") -} - -if (z_score == 1) { - outlist.stats = statistics_z(outlist.tot, sortCol=NULL, adducts=FALSE) - nr.removed.samples=length(which(repl.pattern.filtered[]=="character(0)")) - order.index.int=order(colnames(outlist.stats)[8:(length(repl.pattern.filtered)-nr.removed.samples+7)]) - outlist.stats.more = cbind(outlist.stats[,1:7], - outlist.stats[,(length(repl.pattern.filtered)-nr.removed.samples+8):(length(repl.pattern.filtered)-nr.removed.samples+8+6)], - outlist.stats[,8:(length(repl.pattern.filtered)-nr.removed.samples+7)][order.index.int], - outlist.stats[,(length(repl.pattern.filtered)-nr.removed.samples+5+10):ncol(outlist.stats)]) - - tmp.index=grep("_Zscore", colnames(outlist.stats.more), fixed = TRUE) - tmp.index.order=order(colnames(outlist.stats.more[,tmp.index])) - tmp = outlist.stats.more[,tmp.index[tmp.index.order]] - outlist.stats.more=outlist.stats.more[,-tmp.index] - outlist.stats.more=cbind(outlist.stats.more,tmp) - outlist.tot = outlist.stats.more -} - -# filter identified compounds -index.1=which((outlist.tot[,"assi_HMDB"]!="") & (!is.na(outlist.tot[,"assi_HMDB"]))) -index.2=which((outlist.tot[,"iso_HMDB"]!="") & (!is.na(outlist.tot[,"iso_HMDB"]))) -index=union(index.1,index.2) -outlist.ident = outlist.tot[index,] -outlist.not.ident = outlist.tot[-index,] - -if (z_score == 1) { - outlist.ident$ppmdev=as.numeric(outlist.ident$ppmdev) - outlist.ident <- outlist.ident[which(outlist.ident["ppmdev"] >= -ppm & outlist.ident["ppmdev"] <= ppm),] -} -# NAs in theormz_noise <======================================================================= uitzoeken!!! -outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] = 0 -outlist.ident$theormz_noise=as.numeric(outlist.ident$theormz_noise) -outlist.ident$theormz_noise[which(is.na(outlist.ident$theormz_noise))] = 0 -outlist.ident$theormz_noise=as.numeric(outlist.ident$theormz_noise) - -save(outlist.not.ident, outlist.ident, file=paste(outdir, "/outlist_identified_", scanmode, ".RData", sep="")) - -# Extra output in Excel-readable format: -remove_columns <- c("fq.best", "fq.worst", "mzmin.pgrp", "mzmax.pgrp") -remove_colindex <- which(colnames(outlist.ident) %in% remove_columns) -outlist.ident <- outlist.ident[ , -remove_colindex] -write.table(outlist.ident, file=paste0(outdir, "/outlist_identified_", scanmode, ".txt"), sep="\t", row.names = FALSE) -remove_colindex <- which(colnames(outlist.not.ident) %in% remove_columns) -outlist.not.ident <- outlist.not.ident[ , -remove_colindex] -write.table(outlist.not.ident, file=paste0(outdir, "/outlist_not_identified_", scanmode, ".txt"), sep="\t", row.names = FALSE) - diff --git a/DIMS/AddOnFunctions/add_lab_id_and_onderzoeksnummer.R b/DIMS/AddOnFunctions/add_lab_id_and_onderzoeksnummer.R deleted file mode 100644 index 9d91a55..0000000 --- a/DIMS/AddOnFunctions/add_lab_id_and_onderzoeksnummer.R +++ /dev/null @@ -1,10 +0,0 @@ -add_lab_id_and_onderzoeksnummer <- function(df_metabs_helix) { - # Split patient number into labnummer and Onderzoeksnummer - for (row in 1:nrow(df_metabs_helix)) { - df_metabs_helix[row,"labnummer"] <- gsub("^P|\\.[0-9]*", "", df_metabs_helix[row,"Patient"]) - labnummer_split <- strsplit(as.character(df_metabs_helix[row, "labnummer"]), "M")[[1]] - df_metabs_helix[row, "Onderzoeksnummer"] <- paste0("MB", labnummer_split[1], "/", labnummer_split[2]) - } - - return(df_metabs_helix) -} diff --git a/DIMS/AddOnFunctions/checkOverlap.R b/DIMS/AddOnFunctions/checkOverlap.R deleted file mode 100644 index f7ef818..0000000 --- a/DIMS/AddOnFunctions/checkOverlap.R +++ /dev/null @@ -1,16 +0,0 @@ -checkOverlap <- function(range1,range2){ - if (length(intersect(range1,range2))==2) { - # Overlap - # message("Overlap, smaller range is used") - if (length(range1) >= length(range2)){ - range1=range1[-length(range1)] - } else { - range2=range2[-1] - } - } else if (length(intersect(range1,range2))==3){ - # message("Overlap, smaller range is used") - range1=range1[-length(range1)] - range2=range2[-1] - } - return(list("range1"=range1,"range2"=range2)) -} diff --git a/DIMS/AddOnFunctions/check_same_samplename.R b/DIMS/AddOnFunctions/check_same_samplename.R deleted file mode 100644 index 16ae9cf..0000000 --- a/DIMS/AddOnFunctions/check_same_samplename.R +++ /dev/null @@ -1,4 +0,0 @@ -# function to test whether intensity and Z-score columns match -check_same_samplename <- function(int_col_name, zscore_col_name) { - paste0(int_col_name, "_Zscore") == zscore_col_name -} diff --git a/DIMS/AddOnFunctions/create_violin_plots.R b/DIMS/AddOnFunctions/create_violin_plots.R deleted file mode 100644 index df398be..0000000 --- a/DIMS/AddOnFunctions/create_violin_plots.R +++ /dev/null @@ -1,116 +0,0 @@ -create_violin_plots <- function(pdf_dir, pt_name, metab_perpage, top_metab_pt=NULL) { - - # set parameters for plots - plot_height <- 9.6 - plot_width <- 6 - fontsize <- 1 - nr_plots_perpage <- 20 - circlesize <- 0.8 - colors_4plot <- c("#22E4AC", "#00B0F0", "#504FFF","#A704FD","#F36265","#DA0641") - # green blue blue/purple purple orange red - - # patient plots, create the PDF device - pt_name_sub <- pt_name - suffix <- "" - if (grepl("Diagnostics", pdf_dir) & is_diagnostic_patient(pt_name)) { - prefix <- "MB" - suffix <- "_DIMS_PL_DIAG" - # substitute P and M in P2020M00001 into right format for Helix - pt_name_sub <- gsub("[PM]", "", pt_name) - pt_name_sub <- gsub("\\..*", "", pt_name_sub) - } else if (grepl("Diagnostics", pdf_dir)) { - prefix <- "Dx_" - } else if (grepl("IEM", pdf_dir)) { - prefix <- "IEM_" - } else { - prefix <- "R_" - } - - pdf(paste0(pdf_dir, "/", prefix, pt_name_sub, suffix, ".pdf"), - onefile = TRUE, - width = plot_width, - height = plot_height) - - # page headers: - page_headers <- names(metab_perpage) - - # put table into PDF file, if not empty - if (!is.null(dim(top_metab_pt))) { - plot.new() - # get the names and numbers in the table aligned - table_theme <- ttheme_default(core = list(fg_params = list(hjust=0, x=0.05, fontsize=6)), - colhead = list(fg_params = list(fontsize=8, fontface="bold"))) - grid.table(top_metab_pt, theme = table_theme, rows = NULL) - # g <- tableGrob(top_metab_pt) - # grid.draw(g) - text(x=0.45, y=1.02, paste0("Top deviating metabolites for patient: ", pt_name), font=1, cex=1) - } - - # violin plots - for (page_index in 1:length(metab_perpage)) { - # extract list of metabolites to plot on a page - metab_list_2plot <- metab_perpage[[page_index]] - # extract original data for patient of interest (pt_name) before cut-offs - pt_list_2plot_orig <- metab_list_2plot[which(metab_list_2plot$variable == pt_name), ] - # cut off Z-scores higher than 20 or lower than -5 (for nicer plots) - metab_list_2plot$value[metab_list_2plot$value > 20] <- 20 - metab_list_2plot$value[metab_list_2plot$value < -5] <- -5 - # extract data for patient of interest (pt_name) - pt_list_2plot <- metab_list_2plot[which(metab_list_2plot$variable == pt_name), ] - # restore original Z-score before cut-off, for showing Z-scores in PDF - pt_list_2plot$value_orig <- pt_list_2plot_orig$value - # remove patient of interest (pt_name) from list; violins will be made up of controls and other patients - metab_list_2plot <- metab_list_2plot[-which(metab_list_2plot$variable == pt_name), ] - # subtitle per page - sub_perpage <- gsub("_", " ", page_headers[page_index]) - # for IEM plots, put subtitle on two lines - sub_perpage <- gsub("probability", "\nprobability", sub_perpage) - # add size parameter for showing Z-score of patient per metabolite - Z_size <- rep(3, nrow(pt_list_2plot)) - # set size to 0 if row is empty - Z_size[is.na(pt_list_2plot$value)] <- 0 - - # draw violin plot. shape=22 gives square for patient of interest - ggplot_object <- ggplot(metab_list_2plot, aes(x=value, y=HMDB_name)) + - theme(axis.text.y=element_text(size=rel(fontsize)), plot.caption = element_text(size=rel(fontsize))) + - xlim(-5, 20) + - geom_violin(scale="width") + - geom_point(data = pt_list_2plot, aes(color=value), size = 3.5*circlesize, shape=22, fill="white") + - scale_fill_gradientn(colors = colors_4plot, values = NULL, space = "Lab", na.value = "grey50", guide = "colourbar", aesthetics = "colour") + - # add Z-score value for patient of interest at x=16 - geom_text(data = pt_list_2plot, aes(16, label = paste0("Z=", round(value_orig, 2))), hjust = "left", vjust = +0.2, size = Z_size) + - # add labels. Use font Courier to get all the plots in the same location. - labs(x = "Z-scores", y = "Metabolites", subtitle = sub_perpage, color = "z-score") + - theme(axis.text.y = element_text(family = "Courier", size=6)) + - # do not show legend - theme(legend.position="none") + - # add title - ggtitle(label = paste0("Results for patient ", pt_name)) + - # labs(x = "Z-scores", y = "Metabolites", title = paste0("Results for patient ", pt_name), subtitle = sub_perpage, color = "z-score") + - # add vertical lines - geom_vline(xintercept = 2, col = "grey", lwd = 0.5, lty=2) + - geom_vline(xintercept = -2, col = "grey", lwd = 0.5, lty=2) - - suppressWarnings(print(ggplot_object)) - - } - - # add explanation of violin plots, version number etc. - # plot.new() - plot(NA, xlim=c(0,5), ylim=c(0,5), bty='n', xaxt='n', yaxt='n', xlab='', ylab='') - if (length(explanation) > 0) { - text(0.2, 5, explanation[1], pos=4, cex=0.8) - for (line_index in 2:length(explanation)) { - text_y_position <- 5 - (line_index*0.2) - text(-0.2, text_y_position, explanation[line_index], pos=4, cex=0.5) - } - # full_explanation <- paste(explanation[2:length(explanation)], sep=" \n") - # text(0.2, 4, full_explanation, pos=4, cex=0.6) - #explanation_grob=textGrob(apply(full_explanation, 2, paste, collapse="\n")) - #grid.arrange(explanation_grob) - } - - # close the PDF file - dev.off() - -} diff --git a/DIMS/AddOnFunctions/elementInfo.R b/DIMS/AddOnFunctions/elementInfo.R deleted file mode 100644 index 18eaeff..0000000 --- a/DIMS/AddOnFunctions/elementInfo.R +++ /dev/null @@ -1,6 +0,0 @@ -elementInfo <- function(name, elements = NULL) { # from Rdisop function .getElement - if (!is.list(elements) || length(elements)==0 ) { - elements <- initializePSE() } - if (name=="CH3OH+H"){rex<-"^CH3OH\\+H$"}else{rex <- paste ("^",name,"$", sep="")} - elements [[grep (rex, sapply (elements, function(x) {x$name}))]] -} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/export.R b/DIMS/AddOnFunctions/export.R deleted file mode 100644 index cab79a0..0000000 --- a/DIMS/AddOnFunctions/export.R +++ /dev/null @@ -1,13 +0,0 @@ -export <- function(peaklist, plotdir, adducts, control_label, case_label, patients, sub, fileName){ - # peaklist = outlist.adducts - # adducts=TRUE - # control_label="C" - # case_label="P" - # patients = getPatients(outlist.adducts) - # sub=3000 - - # peaklist = statistics_z_4export(as.data.frame(peaklist), plotdir, patients, adducts, control_label, case_label) - - # generateExcelFile(peaklist, file.path(plotdir), imageNum=2, fileName, subName=c("","_box"), sub, adducts) - -} diff --git a/DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R b/DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R deleted file mode 100644 index c00aa92..0000000 --- a/DIMS/AddOnFunctions/findPeaks.Gauss.HPC.R +++ /dev/null @@ -1,24 +0,0 @@ -### fit Gaussian estimate mean and integrate to obtain intensity -findPeaks.Gauss.HPC <- function(plist, breaks.fwhm, int.factor, scale, resol, outdir, scanmode, plot, thresh, width, height) { - sampname <- colnames(plist)[1] - - range <- as.vector(plist) - names(range) <- rownames(plist) - - values <- list("mean"=NULL, "area"=NULL, "nr"=NULL, "min"=NULL, "max"=NULL, "qual"=NULL, "spikes"=0) - - values <- searchMZRange(range, values, int.factor, scale, resol, outdir, sampname, scanmode, plot, width, height, thresh) - - outlist.persample <- NULL - outlist.persample <- cbind("samplenr"=values$nr, "mzmed.pkt"=values$mean, "fq"=values$qual, "mzmin.pkt"=values$min, "mzmax.pkt"=values$max, "height.pkt"=values$area) - - index <- which(outlist.persample[ ,"height.pkt"]==0) - if (length(index) > 0) { - outlist.persample <- outlist.persample[-index,] - } - - # save(outlist.persample, file=paste(outdir, paste(sampname, "_", scanmode, ".RData", sep=""), sep="/")) - save(outlist.persample, file=paste("./", sampname, "_", scanmode, ".RData", sep="")) - - cat(paste("There were", values$spikes, "spikes!")) -} diff --git a/DIMS/AddOnFunctions/fit1Peak.R b/DIMS/AddOnFunctions/fit1Peak.R deleted file mode 100644 index b09ce20..0000000 --- a/DIMS/AddOnFunctions/fit1Peak.R +++ /dev/null @@ -1,120 +0,0 @@ -fit1Peak <- function(x2,x,y,index,scale,resol,plot,FQ,useBounds) { - #FQ=FQ1 - - if (length(y)<3){ - message("Range to small, no fit possible!") - } else { - - if ((length(y)==4)) { - mu = weighted.mean(x,y) - sigma = getSD(x,y) - fitP = fitG_2(x,y,sigma,mu,scale,useBounds) - } else { - - if ((length(x) - length(index)) < 2) { - range1=c((length(x)-4):length(x)) - } else if (length(index) < 2) { - range1=c(1:5) - } else { - range1=c(index[1]-2,index[1]-1,index[1],index[1]+1,index[1]+2) - } - - if (range1[1]==0) range1=range1[-1] - - # remove NA - if (length(which(is.na(y[range1])))!=0) range1=range1[-which(is.na(y[range1]))] - - mu = weighted.mean(x[range1],y[range1]) - sigma = getSD(x[range1],y[range1]) - fitP = fitG_2(x,y,sigma,mu,scale,useBounds) - } - - p2 = fitP$par - - #fq_new = abs(sum(y) - sum(p2[2]*dnorm(x,p2[1],sigma)))/sum(y) - fq_new = getFitQuality(x,y,p2[1],p2[1],resol,p2[2],sigma)$fq_new - - if (plot & (fq_new < FQ)) lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="green") - - scale_new = 1.2*scale - # cat(fq_new) - - if (fq_new > FQ) { # <=== bad fit? - # optimize scaling factor - fq = 0 - scale = 0 - - if (sum(y)>sum(p2[2]*dnorm(x,p2[1],sigma))){ - while ((round(fq, digits = 3) != round(fq_new, digits = 3)) & (scale_new<10000)) { - fq = fq_new - scale = scale_new - - #cat(scale) - fitP = fitG_2(x,y,sigma,mu,scale,useBounds) - p2 = fitP$par - - #fq_new = abs(sum(y) - sum(p2[2]*dnorm(x,p2[1],sigma)))/sum(y) - fq_new = getFitQuality(x,y,p2[1],p2[1],resol,p2[2],sigma)$fq_new - scale_new=1.2*scale - - if (plot & (fq_new < FQ)) lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="green") - # cat(paste("fq_new: ", fq_new)) - # cat(paste("scale_new: ", scale_new)) - # (round(fq, digits = 4) != round(fq_new, digits = 4)) - } - } else { - while ((round(fq, digits = 3) != round(fq_new, digits = 3)) & (scale_new<10000)) { - fq = fq_new - scale = scale_new - - # cat(scale) - fitP = fitG_2(x,y,sigma,mu,scale,useBounds) - p2 = fitP$par - - #fq_new = abs(sum(y) - sum(p2[2]*dnorm(x,p2[1],sigma)))/sum(y) - fq_new = getFitQuality(x,y,p2[1],p2[1],resol,p2[2],sigma)$fq_new - scale_new=0.8*scale - - if (plot & (fq_new < FQ)) lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="green") - # cat(paste("fq_new: ", fq_new)) - } - } - - if (fq < fq_new) { - # cat(paste("fq_new: ", fq_new)) - # cat(paste("fq: ", fq)) - # cat(paste("scale_new: ", scale_new)) - # cat(paste("scale: ", scale)) - - fitP = fitG_2(x,y,sigma,mu,scale,useBounds) - p2 = fitP$par - fq_new = fq - # cat(paste("==> fq_new: ", fq_new)) - if (plot & (fq_new < FQ)) lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="dark green") - - } - } - - if (plot & (fq_new < FQ)) { - # plot ################### - #lines(x2,p2[2]*dnorm(x2,p2[1],sigma), col="green") - fwhm = getFwhm(p2[1],resol) - half_max = max(p2[2]*dnorm(x2,p2[1],sigma))*0.5 - lines(c(p2[1] - 0.5*fwhm, p2[1] + 0.5*fwhm),c(half_max,half_max),col="orange") - abline(v = p2[1], col="green") - h=c(paste("mean =", p2[1], sep=" "), - paste("fq =", fq_new, sep=" ")) - legend("topright", legend=h) - ########################## - - - # abline(v = x[6], col="red") - # fwhm = getFwhm(x[6]) - # abline(v =x[6] + 0.6*fwhm, col="red") - # abline(v =x[6] - 0.6*fwhm, col="red") - - } - } - - return(list("mean"=p2[1], "scale"=p2[2], "sigma"=sigma, "qual"=fq_new)) -} diff --git a/DIMS/AddOnFunctions/fit2G.R b/DIMS/AddOnFunctions/fit2G.R deleted file mode 100644 index 9dc76dd..0000000 --- a/DIMS/AddOnFunctions/fit2G.R +++ /dev/null @@ -1,43 +0,0 @@ -fit2G_2 <- function(x,y,sig1,sig2,mu1,scale1,mu2,scale2,useBounds){ - - f = function(p){ - d = p[2]*dnorm(x,mean=p[1],sd=sig1) + p[4]*dnorm(x,mean=p[3],sd=sig2) - sum((d-y)^2) - } - - if (useBounds){ - lower = c(x[1],0,x[1],0) - upper = c(x[length(x)],Inf,x[length(x)],Inf) - - if (is.null(mu2) && is.null(scale2) && is.null(sig2)){ - sig2=sig1 - optim(c(as.numeric(mu1), - as.numeric(scale1), - as.numeric(mu1), - as.numeric(scale1)), - f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper) - } else { - optim(c(as.numeric(mu1), - as.numeric(scale1), - as.numeric(mu2), - as.numeric(scale2)), - f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper) - } - - } else { - if (is.null(mu2) && is.null(scale2) && is.null(sig2)){ - sig2=sig1 - optim(c(as.numeric(mu1), - as.numeric(scale1), - as.numeric(mu1), - as.numeric(scale1)), - f,control=list(maxit=10000)) - } else{ - optim(c(as.numeric(mu1), - as.numeric(scale1), - as.numeric(mu2), - as.numeric(scale2)), - f,control=list(maxit=10000)) - } - } -} diff --git a/DIMS/AddOnFunctions/fit2peaks.R b/DIMS/AddOnFunctions/fit2peaks.R deleted file mode 100644 index 4d9c64d..0000000 --- a/DIMS/AddOnFunctions/fit2peaks.R +++ /dev/null @@ -1,100 +0,0 @@ -fit2peaks <- function(x2,x,y,index,scale,resol,useBounds=FALSE,plot=FALSE,FQ,int.factor){ - - peak.mean = NULL - peak.area = NULL - peak.scale = NULL - peak.sigma = NULL - - range1=c(index[1]-2,index[1]-1,index[1],index[1]+1,index[1]+2) - if (range1[1]==0) range1=range1[-1] - - range2=c(index[2]-2,index[2]-1,index[2],index[2]+1,index[2]+2) - - if (length(x)0) range1=range1[-remove] - remove=which(range2<1) - if (length(remove)>0) range2=range2[-remove] - - # remove NA - if (length(which(is.na(y[range1])))!=0) range1=range1[-which(is.na(y[range1]))] - if (length(which(is.na(y[range2])))!=0) range2=range2[-which(is.na(y[range2]))] - - mu1 = weighted.mean(x[range1],y[range1]) - sigma1 = getSD(x[range1],y[range1]) - - # message(paste("fit2peaks mu =>", mu1)) - # message(paste("fit2peaks sigma =>", sigma1)) - # message(paste("fit2peaks scale =>", scale)) - - fitP = fitG_2(x[range1],y[range1],sigma1,mu1,scale,useBounds) - p = fitP$par - - mu2 = weighted.mean(x[range2],y[range2]) - sigma2 = getSD(x[range2],y[range2]) - fitP = fitG_2(x[range2],y[range2],sigma2,mu2,scale,useBounds) - p2 = fitP$par - - fit2P = fit2G_2(x, y, sigma1, sigma2, p[1], p[2], p2[1], p2[2],useBounds) - p3 = fit2P$par - - if (is.null(sigma2)) sigma2=sigma1 - - - # plot ################### - sumFit2 = (p3[2]*dnorm(x2,p3[1],sigma1))+(p3[4]*dnorm(x2,p3[3],sigma2)) - sumFit = (p3[2]*dnorm(x,p3[1],sigma1))+(p3[4]*dnorm(x,p3[3],sigma2)) - fq=getFitQuality(x,y,sort(c(p3[1],p3[3]))[1],sort(c(p3[1],p3[3]))[2],resol,sumFit=sumFit)$fq_new - - fwhm = getFwhm(p3[1],resol) - half_max = max(p3[2]*dnorm(x2,p3[1],sigma1))*0.5 - if (plot & (fq < FQ)) lines(c(p3[1] - 0.5*fwhm, p3[1] + 0.5*fwhm),c(half_max,half_max),col="orange") - if (plot & (fq < FQ)) lines(x2,p3[4]*dnorm(x2,p3[3],sigma2),col="grey") - if (plot & (fq < FQ)) abline(v = p3[3], col="grey") - - fwhm = getFwhm(p3[3],resol) - half_max = max(p3[4]*dnorm(x2,p3[3],sigma2))*0.5 - if (plot & (fq < FQ)) lines(c(p3[3] - 0.5*fwhm, p3[3] + 0.5*fwhm),c(half_max,half_max),col="orange") - - if (plot & (fq < FQ)) lines(x2,sumFit2,col="black") - - if (plot & (fq < FQ)) lines(x2,p3[2]*dnorm(x2,p3[1],sigma1),col="grey") - if (plot & (fq < FQ)) abline(v = p3[1], col="grey") - - - h2=c(paste("mean =", p3[1], sep=" "), - paste("mean =", p3[3], sep=" "), - paste("fq =", fq, sep=" ")) - - if (plot & (fq < FQ)) legend("topright", legend=h2) - ########################## - - # lines(x2,p3[4]*dnorm(x2,p3[3],sigma2),col="red") - # area1 = sum(p3[2]*dnorm(x2,p3[1],sigma1)) - # area2 = sum(p3[4]*dnorm(x2,p3[3],sigma2)) - - # area1 = max(p3[2]*dnorm(x2,p3[1],sigma1)) - # area2 = max(p3[4]*dnorm(x2,p3[3],sigma2)) - - area1 = getArea(p3[1],resol,p3[2],sigma1,int.factor) - area2 = getArea(p3[3],resol,p3[4],sigma2,int.factor) - - peak.area = c(peak.area, area1) - peak.area = c(peak.area, area2) - - peak.mean = c(peak.mean, p3[1]) - peak.mean = c(peak.mean, p3[3]) - - peak.scale = c(peak.scale, p3[2]) - peak.scale = c(peak.scale, p3[4]) - - peak.sigma = c(peak.sigma, sigma1) - peak.sigma = c(peak.sigma, sigma2) - - return(list("mean"=peak.mean, "scale"=peak.scale, "sigma"=peak.sigma, "area"=peak.area, "qual"=fq)) -} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/fit3G.R b/DIMS/AddOnFunctions/fit3G.R deleted file mode 100644 index 05543c5..0000000 --- a/DIMS/AddOnFunctions/fit3G.R +++ /dev/null @@ -1,18 +0,0 @@ -fit3G_2 <- function(x,y,sig1,sig2,sig3,mu1,scale1,mu2,scale2,mu3,scale3,useBounds){ - - f = function(p){ - d = p[2]*dnorm(x,mean=p[1],sd=sig1) + p[4]*dnorm(x,mean=p[3],sd=sig2) + p[6]*dnorm(x,mean=p[5],sd=sig3) - sum((d-y)^2) - } - - if (useBounds){ - lower = c(x[1],0,x[1],0,x[1],0) - upper = c(x[length(x)],Inf,x[length(x)],Inf,x[length(x)],Inf) - - optim(c(mu1,scale1,mu2,scale2,mu3,scale3),f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper) - - } else { - optim(c(mu1,scale1,mu2,scale2,mu3,scale3),f,control=list(maxit=10000)) - } - -} diff --git a/DIMS/AddOnFunctions/fit3peaks.R b/DIMS/AddOnFunctions/fit3peaks.R deleted file mode 100644 index e373f14..0000000 --- a/DIMS/AddOnFunctions/fit3peaks.R +++ /dev/null @@ -1,122 +0,0 @@ -fit3peaks <- function(x2,x,y,index,scale,resol,useBounds=FALSE,plot=FALSE,FQ,int.factor){ - - peak.mean = NULL - peak.area = NULL - peak.scale = NULL - peak.sigma = NULL - - range1=c(index[1]-2,index[1]-1,index[1],index[1]+1,index[1]+2) - range2=c(index[2]-2,index[2]-1,index[2],index[2]+1,index[2]+2) - range3=c(index[3]-2,index[3]-1,index[3],index[3]+1,index[3]+2) - - remove=which(range1<1) - if (length(remove)>0) { - range1=range1[-remove] - } - remove=which(range2<1) - if (length(remove)>0) { - range2=range2[-remove] - } - - if (length(x)0) range1=range1[-remove] - remove=which(range2<1) - if (length(remove)>0) range2=range2[-remove] - remove=which(range3<1) - if (length(remove)>0) range3=range3[-remove] - - # remove NA - if (length(which(is.na(y[range1])))!=0) range1=range1[-which(is.na(y[range1]))] - if (length(which(is.na(y[range2])))!=0) range2=range2[-which(is.na(y[range2]))] - if (length(which(is.na(y[range3])))!=0) range3=range3[-which(is.na(y[range3]))] - - mu1 = weighted.mean(x[range1],y[range1]) - sigma1 = getSD(x[range1],y[range1]) - fitP = fitG_2(x[range1],y[range1],sigma1,mu1,scale,useBounds) - p = fitP$par - - mu2 = weighted.mean(x[range2],y[range2]) - sigma2 = getSD(x[range2],y[range2]) - fitP = fitG_2(x[range2],y[range2],sigma2,mu2,scale,useBounds) - p2 = fitP$par - - mu3 = weighted.mean(x[range3],y[range3]) - sigma3 = getSD(x[range3],y[range3]) - fitP = fitG_2(x[range3],y[range3],sigma3,mu3,scale,useBounds) - p3 = fitP$par - - fit3P = fit3G_2(x, y, sigma1, sigma2, sigma3, p[1], p[2], p2[1], p2[2], p3[1], p3[2], useBounds) - p4 = fit3P$par - - # plot ############################## - sumFit2 = (p4[2]*dnorm(x2,p4[1],sigma1))+(p4[4]*dnorm(x2,p4[3],sigma2))+(p4[6]*dnorm(x2,p4[5],sigma3)) - sumFit = (p4[2]*dnorm(x,p4[1],sigma1))+(p4[4]*dnorm(x,p4[3],sigma2))+(p4[6]*dnorm(x,p4[5],sigma3)) - fq=getFitQuality(x,y,sort(c(p4[1],p4[3],p4[5]))[1],sort(c(p4[1],p4[3],p4[5]))[3],resol,sumFit=sumFit)$fq_new - - if (plot & (fq < FQ)) lines(x2,p4[2]*dnorm(x2,p4[1],sigma1),col="yellow") - if (plot & (fq < FQ)) abline(v = p4[1], col="yellow") - fwhm = getFwhm(p4[1],resol) - half_max = max(p4[2]*dnorm(x2,p4[1],sigma1))*0.5 - if (plot & (fq < FQ)) lines(c(p4[1] - 0.5*fwhm, p4[1] + 0.5*fwhm),c(half_max,half_max),col="orange") - - if (plot & (fq < FQ)) lines(x2,p4[4]*dnorm(x2,p4[3],sigma2),col="yellow") - if (plot & (fq < FQ)) abline(v = p4[3], col="yellow") - fwhm = getFwhm(p4[3],resol) - half_max = max(p4[4]*dnorm(x2,p4[3],sigma2))*0.5 - if (plot & (fq < FQ)) lines(c(p4[3] - 0.5*fwhm, p4[3] + 0.5*fwhm),c(half_max,half_max),col="orange") - - if (plot & (fq < FQ)) lines(x2,p4[6]*dnorm(x2,p4[5],sigma3),col="yellow") - if (plot & (fq < FQ)) abline(v = p4[5], col="yellow") - fwhm = getFwhm(p4[5],resol) - half_max = max(p4[6]*dnorm(x2,p4[5],sigma3))*0.5 - if (plot & (fq < FQ)) lines(c(p4[5] - 0.5*fwhm, p4[5] + 0.5*fwhm),c(half_max,half_max),col="orange") - - if (plot & (fq < FQ)) lines(x2,sumFit2,col="red") - - h2=c(paste("mean =", p4[1], sep=" "), - paste("mean =", p4[3], sep=" "), - paste("mean =", p4[5], sep=" "), - paste("fq =", fq, sep=" ")) - - if (plot & (fq < FQ)) legend("topright", legend=h2) - ######################################### - - # area1 = sum(p4[2]*dnorm(x2,p4[1],sigma1)) - # area2 = sum(p4[4]*dnorm(x2,p4[3],sigma2)) - # area3 = sum(p4[6]*dnorm(x2,p4[5],sigma3)) - - # area1 = max(p4[2]*dnorm(x2,p4[1],sigma1)) - # area2 = max(p4[4]*dnorm(x2,p4[3],sigma2)) - # area3 = max(p4[6]*dnorm(x2,p4[5],sigma3)) - - area1 = getArea(p4[1],resol,p4[2],sigma1,int.factor) - area2 = getArea(p4[3],resol,p4[4],sigma2,int.factor) - area3 = getArea(p4[5],resol,p4[6],sigma3,int.factor) - - peak.area = c(peak.area, area1) - peak.area = c(peak.area, area2) - peak.area = c(peak.area, area3) - - peak.mean = c(peak.mean, p4[1]) - peak.mean = c(peak.mean, p4[3]) - peak.mean = c(peak.mean, p4[5]) - - peak.scale = c(peak.scale, p4[2]) - peak.scale = c(peak.scale, p4[4]) - peak.scale = c(peak.scale, p4[6]) - - peak.sigma = c(peak.sigma, sigma1) - peak.sigma = c(peak.sigma, sigma2) - peak.sigma = c(peak.sigma, sigma3) - - return(list("mean"=peak.mean, "scale"=peak.scale, "sigma"=peak.sigma, "area"=peak.area, "qual"=fq)) - -} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/fit4G.R b/DIMS/AddOnFunctions/fit4G.R deleted file mode 100644 index 6cad63d..0000000 --- a/DIMS/AddOnFunctions/fit4G.R +++ /dev/null @@ -1,18 +0,0 @@ -fit4G_2 <- function(x,y,sig1,sig2,sig3,sig4,mu1,scale1,mu2,scale2,mu3,scale3,mu4,scale4,useBounds){ - - f = function(p){ - d = p[2]*dnorm(x,mean=p[1],sd=sig1) + p[4]*dnorm(x,mean=p[3],sd=sig2) + p[6]*dnorm(x,mean=p[5],sd=sig3) + p[8]*dnorm(x,mean=p[7],sd=sig4) - sum((d-y)^2) - } - - if (useBounds){ - lower = c(x[1],0,x[1],0,x[1],0,x[1],0) - upper = c(x[length(x)],Inf,x[length(x)],Inf,x[length(x)],Inf,x[length(x)],Inf) - - optim(c(mu1,scale1,mu2,scale2,mu3,scale3,mu4,scale4),f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper) - - } else { - optim(c(mu1,scale1,mu2,scale2,mu3,scale3,mu4,scale4),f,control=list(maxit=10000)) - } - -} diff --git a/DIMS/AddOnFunctions/fit4peaks.R b/DIMS/AddOnFunctions/fit4peaks.R deleted file mode 100644 index a506541..0000000 --- a/DIMS/AddOnFunctions/fit4peaks.R +++ /dev/null @@ -1,149 +0,0 @@ -fit4peaks <- function(x2,x,y,index,scale,resol,useBounds=FALSE,plot=FALSE,FQ,int.factor) { - - peak.mean = NULL - peak.area = NULL - peak.scale = NULL - peak.sigma = NULL - - range1=c(index[1]-2,index[1]-1,index[1],index[1]+1,index[1]+2) - range2=c(index[2]-2,index[2]-1,index[2],index[2]+1,index[2]+2) - range3=c(index[3]-2,index[3]-1,index[3],index[3]+1,index[3]+2) - range4=c(index[4]-2,index[4]-1,index[4],index[4]+1,index[4]+2) - if (range1[1]==0) range1=range1[-1] - if (length(x)length(x)) - if (length(remove)>0) { - range4=range4[-remove] - # message(length(range4)) - } - - # check for negative or 0 - remove=which(range1<1) - if (length(remove)>0) range1=range1[-remove] - remove=which(range2<1) - if (length(remove)>0) range2=range2[-remove] - remove=which(range3<1) - if (length(remove)>0) range3=range3[-remove] - remove=which(range4<1) - if (length(remove)>0) range4=range4[-remove] - - # remove NA - if (length(which(is.na(y[range1])))!=0) range1=range1[-which(is.na(y[range1]))] - if (length(which(is.na(y[range2])))!=0) range2=range2[-which(is.na(y[range2]))] - if (length(which(is.na(y[range3])))!=0) range3=range3[-which(is.na(y[range3]))] - if (length(which(is.na(y[range4])))!=0) range4=range4[-which(is.na(y[range4]))] - - mu1 = weighted.mean(x[range1],y[range1]) - sigma1 = getSD(x[range1],y[range1]) - fitP = fitG_2(x[range1],y[range1],sigma1,mu1,scale,useBounds) - p = fitP$par - - mu2 = weighted.mean(x[range2],y[range2]) - sigma2 = getSD(x[range2],y[range2]) - fitP = fitG_2(x[range2],y[range2],sigma2,mu2,scale,useBounds) - p2 = fitP$par - - mu3 = weighted.mean(x[range3],y[range3]) - sigma3 = getSD(x[range3],y[range3]) - fitP = fitG_2(x[range3],y[range3],sigma3,mu3,scale,useBounds) - p3 = fitP$par - - mu4 = weighted.mean(x[range4],y[range4]) - sigma4 = getSD(x[range4],y[range4]) - fitP = fitG_2(x[range4],y[range4],sigma4,mu4,scale,useBounds) - p4 = fitP$par - - fit4P = fit4G_2(x, y, sigma1, sigma2, sigma3, sigma3, p[1], p[2], p2[1], p2[2], p3[1], p3[2], p4[1], p4[2], useBounds) - p5 = fit4P$par - - # plot ##################################### - sumFit2 = (p5[2]*dnorm(x2,p5[1],sigma1))+(p5[4]*dnorm(x2,p5[3],sigma2))+(p5[6]*dnorm(x2,p5[5],sigma3))+(p5[8]*dnorm(x2,p5[7],sigma3)) - sumFit = (p5[2]*dnorm(x,p5[1],sigma1))+(p5[4]*dnorm(x,p5[3],sigma2))+(p5[6]*dnorm(x,p5[5],sigma3))+(p5[8]*dnorm(x,p5[7],sigma3)) - fq=getFitQuality(x,y,sort(c(p5[1],p5[3],p5[5],p5[7]))[1],sort(c(p5[1],p5[3],p5[5],p5[7]))[4],resol,sumFit=sumFit)$fq_new - - if (plot & (fq < FQ)) lines(x2,p5[2]*dnorm(x2,p5[1],sigma1),col="purple") - if (plot & (fq < FQ)) abline(v = p5[1], col="purple") - fwhm = getFwhm(p5[1],resol) - half_max = max(p5[2]*dnorm(x2,p5[1],sigma1))*0.5 - if (plot & (fq < FQ)) lines(c(p5[1] - 0.5*fwhm, p5[1] + 0.5*fwhm),c(half_max,half_max),col="orange") - - if (plot & (fq < FQ)) lines(x2,p5[4]*dnorm(x2,p5[3],sigma2),col="purple") - if (plot & (fq < FQ)) abline(v = p5[3], col="purple") - fwhm = getFwhm(p5[3],resol) - half_max = max(p5[4]*dnorm(x2,p5[3],sigma2))*0.5 - if (plot & (fq < FQ)) lines(c(p5[3] - 0.5*fwhm, p5[3] + 0.5*fwhm),c(half_max,half_max),col="orange") - - if (plot & (fq < FQ)) lines(x2,p5[6]*dnorm(x2,p5[5],sigma3),col="purple") - if (plot & (fq < FQ)) abline(v = p5[5], col="purple") - fwhm = getFwhm(p5[5],resol) - half_max = max(p5[6]*dnorm(x2,p5[5],sigma3))*0.5 - if (plot & (fq < FQ)) lines(c(p5[5] - 0.5*fwhm, p5[5] + 0.5*fwhm),c(half_max,half_max),col="orange") - - if (plot & (fq < FQ)) lines(x2,p5[8]*dnorm(x2,p5[7],sigma3),col="purple") - if (plot & (fq < FQ)) abline(v = p5[7], col="purple") - fwhm = getFwhm(p5[7],resol) - half_max = max(p5[8]*dnorm(x2,p5[7],sigma3))*0.5 - if (plot & (fq < FQ)) lines(c(p5[7] - 0.5*fwhm, p5[7] + 0.5*fwhm),c(half_max,half_max),col="orange") - - if (plot & (fq < FQ)) lines(x2,sumFit2,col="blue") - - #fq = abs(sum(y) - sum(sumFit))/sum(y) - #fq=abs(sum(y) - sum(sumFit))/sum(y) - #fq=mean(abs(sumFit - y)/sumFit) - - - h2=c(paste("mean =", p5[1], sep=" "), - paste("mean =", p5[3], sep=" "), - paste("mean =", p5[5], sep=" "), - paste("mean =", p5[7], sep=" "), - paste("fq =", fq, sep=" ")) - - if (plot & (fq < FQ)) legend("topright", legend=h2) - ############################################# - - # area1 = sum(p5[2]*dnorm(x2,p5[1],sigma1)) - # area2 = sum(p5[4]*dnorm(x2,p5[3],sigma2)) - # area3 = sum(p5[6]*dnorm(x2,p5[5],sigma3)) - # area4 = sum(p5[8]*dnorm(x2,p5[7],sigma4)) - - # area1 = max(p5[2]*dnorm(x2,p5[1],sigma1)) - # area2 = max(p5[4]*dnorm(x2,p5[3],sigma2)) - # area3 = max(p5[6]*dnorm(x2,p5[5],sigma3)) - # area4 = max(p5[8]*dnorm(x2,p5[7],sigma4)) - - area1 = getArea(p5[1],resol,p5[2],sigma1,int.factor) - area2 = getArea(p5[3],resol,p5[4],sigma2,int.factor) - area3 = getArea(p5[5],resol,p5[6],sigma3,int.factor) - area4 = getArea(p5[7],resol,p5[8],sigma4,int.factor) - - peak.area = c(peak.area, area1) - peak.area = c(peak.area, area2) - peak.area = c(peak.area, area3) - peak.area = c(peak.area, area4) - - peak.mean = c(peak.mean, p5[1]) - peak.mean = c(peak.mean, p5[3]) - peak.mean = c(peak.mean, p5[5]) - peak.mean = c(peak.mean, p5[7]) - - peak.scale = c(peak.scale, p5[2]) - peak.scale = c(peak.scale, p5[4]) - peak.scale = c(peak.scale, p5[6]) - peak.scale = c(peak.scale, p5[8]) - - peak.sigma = c(peak.sigma, sigma1) - peak.sigma = c(peak.sigma, sigma2) - peak.sigma = c(peak.sigma, sigma3) - peak.sigma = c(peak.sigma, sigma4) - - return(list("mean"=peak.mean, "scale"=peak.scale, "sigma"=peak.sigma, "area"=peak.area, "qual"=fq)) - -} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/fitG.R b/DIMS/AddOnFunctions/fitG.R deleted file mode 100644 index b1b595d..0000000 --- a/DIMS/AddOnFunctions/fitG.R +++ /dev/null @@ -1,18 +0,0 @@ -fitG_2 <- function(x,y,sig,mu,scale,useBounds) { - - f = function(p) { - d = p[2]*dnorm(x,mean=p[1],sd=sig) - sum((d-y)^2) - } - - if (useBounds){ - lower = c(x[1],0,x[1],0) - upper = c(x[length(x)],Inf,x[length(x)],Inf) - - optim(c(as.numeric(mu), as.numeric(scale)), - f,control=list(maxit=10000),method="L-BFGS-B",lower=lower,upper=upper) - } else { - #optim(c(mu,scale),f) - optim(c(as.numeric(mu),as.numeric(scale)),f,control=list(maxit=10000)) - } -} diff --git a/DIMS/AddOnFunctions/fitGaussian.R b/DIMS/AddOnFunctions/fitGaussian.R deleted file mode 100644 index 3c5027d..0000000 --- a/DIMS/AddOnFunctions/fitGaussian.R +++ /dev/null @@ -1,327 +0,0 @@ -fitGaussian <- function(x2,x,y,index,scale,resol,outdir,force,useBounds,plot,scanmode,int.factor,width,height) { - # force=length(index) - # useBounds=FALSE - - peak.mean = NULL - peak.area = NULL - peak.qual = NULL - peak.min = NULL - peak.max = NULL - - FQ1 = 0.15 - FQ = 0.2 - - # One local max - if (force==1){ - - retVal = fit1Peak(x2,x,y,index,scale,resol,plot,FQ1,useBounds) - - scale = 2 - - if (retVal$mean[1]x[length(x)]) { # <=== mean outside range - - # do it again - return(fitGaussian(x2,x,y,index,scale,resol,outdir,force=1,useBounds=TRUE,plot,scanmode,int.factor,width,height)) - - } else { # <=== mean within range - - if (retVal$qual > FQ1){ # <=== bad fit - - # Try to fit two curves - - # diff(diff(x)) essentially computes the discrete analogue of the second derivative, so should be negative at local maxima. - # The +1 below takes care of the fact that the result of diff is shorter than the input vector. - new_index=which(diff(sign(diff(y)))==-2)+1 - - if (length(new_index)!=2) { - new_index = round(length(x)/3) - new_index = c(new_index, 2*new_index) - } - - #length(new_index) - return(fitGaussian(x2,x,y,new_index,scale,resol,outdir,force=2,useBounds=FALSE,plot,scanmode,int.factor,width,height)) - - } else { # <=== good fit - - peak.mean = c(peak.mean, retVal$mean) - #peak.area = c(peak.area, sum(retVal$scale*dnorm(x2,retVal$mean,retVal$sigma))) - # "Centroid mode" - # peak.area = c(peak.area, max(retVal$scale*dnorm(x2,retVal$mean,retVal$sigma))) - peak.area = c(peak.area, getArea(retVal$mean,resol,retVal$scale,retVal$sigma,int.factor)) - peak.qual = retVal$qual - peak.min = x[1] - peak.max = x[length(x)] - } - } - - # Two local max - } else if (force==2 & (length(x)>6)) { - - # fit two curves - retVal = fit2peaks(x2,x,y,index,scale,resol,useBounds,plot,FQ,int.factor) # <=== fit 2 curves - - if (retVal$mean[1]x[length(x)] | # <=== one of means outside range - retVal$mean[2]x[length(x)]) { - - # check quality - if (retVal$qual > FQ) { # <=== bad fit - - # do it again - return(fitGaussian(x2,x,y,index,scale,resol,outdir,force=2,useBounds=TRUE,plot,scanmode,int.factor,width,height)) - - # good fit - } else { - - # check which mean is outside range - # Todo ========> check this!!!!!!!!!!!!!!! - for (i in 1:length(retVal$mean)){ - if (retVal$mean[i]x[length(x)] ) { - peak.mean = c(peak.mean, -i) - peak.area = c(peak.area, -i) - } else { - peak.mean = c(peak.mean, retVal$mean[i]) - peak.area = c(peak.area, retVal$area[i]) - } - } - peak.qual = retVal$qual - peak.min = x[1] - peak.max = x[length(x)] - } - } else { # <=== all means within range - - if (retVal$qual > FQ) { # <=== bad fit - - # Try to fit three curves - new_index=which(diff(sign(diff(y)))==-2)+1 - - if (length(new_index)!=3) { - new_index = round(length(x)/4) - new_index = c(new_index, 2*new_index, 3*new_index) - } - - #length(new_index) - return(fitGaussian(x2,x,y,new_index,scale,resol,outdir,force=3,useBounds=FALSE,plot,scanmode,int.factor,width,height)) - - } else { # <======== good fit - - # check if means are within 3 ppm and sum if so - tmp = retVal$qual - - nMeanNew = -1 - nMean = length(retVal$mean) - while (nMean!=nMeanNew){ - nMean = length(retVal$mean) - retVal = isWithinXppm(retVal$mean, retVal$scale, retVal$sigma, retVal$area, x2, x, ppm=4, resol, plot) - nMeanNew = length(retVal$mean) - } - - retVal$qual = tmp - - h2=NULL - - for (i in 1:length(retVal$mean)){ - h2 = c(h2, paste("mean =", retVal$mean[i], sep=" ")) - - peak.mean = c(peak.mean, retVal$mean[i]) - peak.area = c(peak.area, retVal$area[i]) - } - peak.qual = retVal$qual - peak.min = x[1] - peak.max = x[length(x)] - - h2 = c(h2, paste("fq =", retVal$qual, sep=" ")) - if (plot) legend("topright", legend=h2) - } - } - - # Three local max - } else if (force==3 & (length(x)>6)){ - - retVal = fit3peaks(x2,x,y,index,scale,resol,useBounds,plot,FQ,int.factor) - - # outside range - if (retVal$mean[1]x[length(x)] | # <=== one of means outside range - retVal$mean[2]x[length(x)] | - retVal$mean[3]x[length(x)]) { - - # check quality - if (retVal$qual > FQ) { # <=== bad fit - - # do it again - return(fitGaussian(x2,x,y,index,scale,resol,outdir,force,useBounds=TRUE,plot,scanmode,int.factor,width,height)) - - # good fit - } else { - - # check which mean is outside range - # Todo ========> check this!!!!!!!!!!!!!!! - for (i in 1:length(retVal$mean)){ - if (retVal$mean[i]x[length(x)] ) { - peak.mean = c(peak.mean, -i) - peak.area = c(peak.area, -i) - } else { - peak.mean = c(peak.mean, retVal$mean[i]) - peak.area = c(peak.area, retVal$area[i]) - } - } - peak.qual = retVal$qual - peak.min = x[1] - peak.max = x[length(x)] - } - - } else { # <=== all means within range - - if (retVal$qual > FQ) { # <=== bad fit - - # Try to fit four curves - new_index=which(diff(sign(diff(y)))==-2)+1 - - if (length(new_index)!=4) { - new_index = round(length(x)/5) - new_index = c(new_index, 2*new_index, 3*new_index, 4*new_index) - } - - #length(new_index) - return(fitGaussian(x2,x,y,new_index,scale,resol,outdir,force=4,useBounds=FALSE,plot,scanmode,int.factor,width,height)) - - } else { # <======== good fit - - # check if means are within 3 ppm and sum if so - tmp = retVal$qual - - nMeanNew = -1 - nMean = length(retVal$mean) - while (nMean!=nMeanNew){ - nMean = length(retVal$mean) - retVal = isWithinXppm(retVal$mean, retVal$scale, retVal$sigma, retVal$area, x2, x, ppm=4, resol, plot) - nMeanNew = length(retVal$mean) - } - - retVal$qual = tmp - - h2=NULL - # peak.mean=NULL - # peak.area=NULL - - for (i in 1:length(retVal$mean)){ - h2 = c(h2, paste("mean =", retVal$mean[i], sep=" ")) - - peak.mean = c(peak.mean, retVal$mean[i]) - peak.area = c(peak.area, retVal$area[i]) - } - peak.qual = retVal$qual - peak.min = x[1] - peak.max = x[length(x)] - - h2 = c(h2, paste("fq =", retVal$qual, sep=" ")) - if (plot) legend("topright", legend=h2) - - } - } - - # Four local max - } else if (force==4 & (length(x)>6)){ - - retVal = fit4peaks(x2,x,y,index,scale,resol,useBounds,plot,FQ,int.factor) - - if (retVal$mean[1]x[length(x)] | - retVal$mean[2]x[length(x)] | - retVal$mean[3]x[length(x)] | - retVal$mean[4]x[length(x)]) { - - # check quality - if (retVal$qual > FQ) { # <=== bad fit - - # do it again - return(fitGaussian(x2,x,y,index,scale,resol,outdir,force,useBounds=TRUE,plot,scanmode,int.factor,width,height)) - - # good fit - } else { - - # check which mean is outside range - # Todo ========> check this!!!!!!!!!!!!!!! - for (i in 1:length(retVal$mean)){ - if (retVal$mean[i]x[length(x)] ) { - peak.mean = c(peak.mean, -i) - peak.area = c(peak.area, -i) - } else { - peak.mean = c(peak.mean, retVal$mean[i]) - peak.area = c(peak.area, retVal$area[i]) - } - } - peak.qual = retVal$qual - peak.min = x[1] - peak.max = x[length(x)] - } - - } else { - - if (retVal$qual > FQ) { # <=== bad fit - - # Generate 1 curve - return(fitGaussian(x2,x,y,index,scale,resol,outdir,force=5,useBounds=FALSE,plot,scanmode,int.factor,width,height)) - - } else { # <======== good fit - - # check if means are within 3 ppm and sum if so - tmp = retVal$qual - - nMeanNew = -1 - nMean = length(retVal$mean) - while (nMean!=nMeanNew){ - nMean = length(retVal$mean) - retVal = isWithinXppm(retVal$mean, retVal$scale, retVal$sigma, retVal$area, x2, x, ppm=4, resol, plot) - nMeanNew = length(retVal$mean) - } - - retVal$qual = tmp - - h2=NULL - - for (i in 1:length(retVal$mean)){ - h2 = c(h2, paste("mean =", retVal$mean[i], sep=" ")) - - peak.mean = c(peak.mean, retVal$mean[i]) - peak.area = c(peak.area, retVal$area[i]) - } - peak.qual = retVal$qual - peak.min = x[1] - peak.max = x[length(x)] - - h2 = c(h2, paste("fq =", retVal$qual, sep=" ")) - if (plot) legend("topright", legend=h2) - } - } - - # More then four local max - } else { - - scale=2 - FQ1=0.40 - useBounds=TRUE - index=which(y==max(y)) - retVal = fit1Peak(x2,x,y,index,scale,resol,plot,FQ1,useBounds) - - if (retVal$qual > FQ1){ # <=== bad fit - - if (plot) dev.off() - - rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height) - peak.mean = c(peak.mean, rval$mean) - peak.area = c(peak.area, rval$area) - peak.min = rval$min - peak.max = rval$max - peak.qual = 0 - - } else { # <=== good fit - - peak.mean = c(peak.mean, retVal$mean) - peak.area = c(peak.area, getArea(retVal$mean,resol,retVal$scale,retVal$sigma,int.factor)) - peak.qual = retVal$qual - peak.min = x[1] - peak.max = x[length(x)] - } - } - - return(list("mean"=peak.mean, "area"=peak.area, "qual" = peak.qual, "min"=peak.min, "max"=peak.max)) -} diff --git a/DIMS/AddOnFunctions/fitGaussianInit.R b/DIMS/AddOnFunctions/fitGaussianInit.R deleted file mode 100644 index 1c92bce..0000000 --- a/DIMS/AddOnFunctions/fitGaussianInit.R +++ /dev/null @@ -1,60 +0,0 @@ -fitGaussianInit <- function(x,y,int.factor,scale,resol,outdir,sampname,scanmode,plot,width,height,i,start,end) { - # scanmode="negative" - - mz.range = x[length(x)] - x[1] - x2 = seq(x[1],x[length(x)],length=mz.range*int.factor) - - # # diff(diff(x)) essentially computes the discrete analogue of the second derivative, so should be negative at local maxima. - # # The +1 below takes care of the fact that the result of diff is shorter than the input vector. - # index=which(diff(sign(diff(y)))==-2)+1 - - # Alway try to fit one curve first - index = which(y==max(y)) - - if (scanmode=="positive"){ - plot_label="pos_fit.png" - } else { - plot_label="neg_fit.png" - } - - if (plot) { - CairoPNG(filename=paste(outdir,"Gaussian_fit",paste(sampname, x[1], plot_label, sep="_"), sep="/"), width, height) - plot(x,y,xlab="m/z",ylab="I", ylim=c(0,1.5*max(y)),main=paste(i,start,end, sep=" ")) #, ylim=c(0,1e+05) - } - - retVal = fitGaussian(x2,x,y,index,scale,resol,outdir,force=length(index),useBounds=FALSE,plot,scanmode,int.factor,width,height) - - if (plot) { - if (length(retVal$mean)==1) { - - result = tryCatch(dev.off(), warning = function(w){}, - error=function(e){}, - finally = {}) - - # file.rename(paste(outdir,"Gaussian_fit", paste(sampname, x[1], plot_label, sep="_"), sep="/"), - # paste(outdir,"Gaussian_fit", paste(sampname, round(retVal$mean,digits = 6), plot_label, sep="_"), sep="/")) - } else { - - - h2=NULL - for (i in 1:length(retVal$mean)){ - h2=c(h2, paste("mean =", retVal$mean[i], sep=" ")) - } - h2=c(h2, paste("fq =", retVal$qual, sep=" ")) - legend("topright", legend=h2) - - dev.off() - - for (i in 1:length(retVal$mean)){ - if (retVal$mean[i]!=-1){ - file.copy(paste(outdir,"Gaussian_fit", paste(sampname, x[1], plot_label, sep="_"), sep="/"), - paste(outdir,"Gaussian_fit", paste(sampname, round(retVal$mean[i],digits = 6), plot_label, sep="_"), sep="/")) - } - } - file.remove(paste(outdir,"Gaussian_fit", paste(sampname, x[1], plot_label, sep="_"), sep="/")) - } - } - - return(list("mean"=retVal$mean, "area"=retVal$area, "qual"=retVal$qual, "min"=retVal$min, "max"=retVal$max)) - -} diff --git a/DIMS/AddOnFunctions/generateBreaksFwhm.R b/DIMS/AddOnFunctions/generateBreaksFwhm.R deleted file mode 100644 index 1a42631..0000000 --- a/DIMS/AddOnFunctions/generateBreaksFwhm.R +++ /dev/null @@ -1,26 +0,0 @@ -# options(digits=16) -# resol = 140000 -# breaks.fwhm = 1 -# i = 1 -# breaks.fwhm.avg = NULL -# -# Sys.time() -# -# while (breaks.fwhm[length(breaks.fwhm)]<1000){ -# -# resol.mz = resol*(1/sqrt(2)^(log2(breaks.fwhm[i]/200))) -# fwhm.0.1 = (breaks.fwhm[i]/resol.mz)/10 -# breaks.fwhm = c(breaks.fwhm, breaks.fwhm[i] + fwhm.0.1) -# breaks.fwhm.avg = c(breaks.fwhm.avg,(breaks.fwhm[i] +breaks.fwhm[i+1])/2) -# -# if (i %% 10000 == 0){ -# cat(paste("i =", i)) -# cat(paste("breaks.fwhm =", breaks.fwhm[length(breaks.fwhm)])) -# } -# -# i = i + 1 -# } -# -# Sys.time() -# save(list=c("breaks.fwhm", "breaks.fwhm.avg"), file="breaks.RData") -# cat("Breaks saved!") diff --git a/DIMS/AddOnFunctions/generateExcelFile.R b/DIMS/AddOnFunctions/generateExcelFile.R deleted file mode 100644 index 31474d9..0000000 --- a/DIMS/AddOnFunctions/generateExcelFile.R +++ /dev/null @@ -1,24 +0,0 @@ -generateExcelFile <- function(peaklist, fileName, sub, plot = TRUE) { - # plotdir=file.path(plotdir) - # imageNum=2 - # fileName="./results/xls/Pos_allpgrps_identified" - # subName=c("","_box") - - end <- 0 - i <- 0 - - if (dim(peaklist)[1]>=sub & (sub>0)){ - for (i in 1:floor(dim(peaklist)[1]/sub)){ - start=-(sub-1)+i*sub - end=i*sub - message(paste0(start, ":", end)) - - genExcelFileV3(peaklist[c(start:end),], paste(fileName, sep="_"), plot) - } - } - start = end + 1 - end = dim(peaklist)[1] - message(start) - message(end) - genExcelFileV3(peaklist[c(start:end),], paste(fileName, i+1, sep="_"), plot) -} diff --git a/DIMS/AddOnFunctions/generateGaussian.R b/DIMS/AddOnFunctions/generateGaussian.R deleted file mode 100644 index da6dec2..0000000 --- a/DIMS/AddOnFunctions/generateGaussian.R +++ /dev/null @@ -1,48 +0,0 @@ -generateGaussian <- function(x,y,resol,plot,scanmode,int.factor,width,height) { - - factor=1.5 - index = which(y==max(y))[1] - x=x[index] - y=y[index] - mu = x - fwhm = getFwhm(mu,resol) - x.p = c(mu-factor*fwhm, x, mu+factor*fwhm) - y.p = c(0, y, 0) - - # if (plot) dir.create("./results/plots",showWarnings = FALSE) - # if (plot) dir.create("./results/plots/Gaussian_fit",showWarnings = FALSE) - - if (plot) { - if (scanmode=="positive"){ - plot_label="pos_fit.png" - } else { - plot_label="neg_fit.png" - } - } - - mz.range = x.p[length(x.p)] - x.p[1] - x2 = seq(x.p[1],x.p[length(x.p)],length=mz.range*int.factor) - sigma = getSD(x.p,y.p) - scale = optimizeGauss(x.p,y.p,sigma,mu) - - if (plot) { - CairoPNG(filename=paste("./results/Gaussian_fit",paste(sampname, mu, plot_label, sep="_"), sep="/"), width, height) - - plot(x.p,y.p,xlab="m/z",ylab="I", ylim=c(0,1.5*max(y))) - lines(x2,scale*dnorm(x2,mu,sigma), col="green") - half_max = max(scale*dnorm(x2,mu,sigma))*0.5 - lines(c(mu - 0.5*fwhm, mu + 0.5*fwhm),c(half_max,half_max),col="orange") - abline(v = mu, col="green") - h=c(paste("mean =", mu, sep=" ")) - legend("topright", legend=h) - - dev.off() - } - - # area = sum(scale*dnorm(x2,mu,sigma)) - # area = max(scale*dnorm(x2,mu,sigma)) - area = getArea(mu,resol,scale,sigma,int.factor) - - return(list("mean"=mu, "area"=area, "min"=x2[1] , "max"=x2[length(x2)])) - -} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/getArea.R b/DIMS/AddOnFunctions/getArea.R deleted file mode 100644 index 415b050..0000000 --- a/DIMS/AddOnFunctions/getArea.R +++ /dev/null @@ -1,23 +0,0 @@ -getArea <- function(mu,resol,scale,sigma,int.factor){ - - # mu=p3[1] - # scale=p3[2] - # sigma=sigma1 - - # mu=p3[1] - # scale=p3[2] - # sigma=sigma1 - - # avoid too big vector (cannot allocate vector of size ...) - if (mu>1200) return(0) - - fwhm = getFwhm(mu,resol) - mzMin = mu - 2*fwhm - mzMax = mu + 2*fwhm - mz.range = mzMax - mzMin - x_int = seq(mzMin,mzMax,length=mz.range*int.factor) - - #plot(x_int,scale*dnorm(x_int,mu,sigma),col="red",type="l") - - return(sum(scale*dnorm(x_int,mu,sigma))/100) -} diff --git a/DIMS/AddOnFunctions/getDeltaMZ.R b/DIMS/AddOnFunctions/getDeltaMZ.R deleted file mode 100644 index f09a0a4..0000000 --- a/DIMS/AddOnFunctions/getDeltaMZ.R +++ /dev/null @@ -1,5 +0,0 @@ -getDeltaMZ <- function(mass, breaks.fwhm){ - index = which(breaks.fwhmintRangeMin) & (xintRangeMin) & (x mu: ",mu)) - # message(paste("====================> resol.mz: ",resol.mz)) - # }) - - # , error = function(e) { - # error-handler-code - # }, finally = { - # cleanup-code - # }) - - return(fwhm) -} diff --git a/DIMS/AddOnFunctions/getPatients.R b/DIMS/AddOnFunctions/getPatients.R deleted file mode 100644 index 3b03f0a..0000000 --- a/DIMS/AddOnFunctions/getPatients.R +++ /dev/null @@ -1,9 +0,0 @@ -getPatients <- function(peaklist){ - patients=colnames(peaklist)[grep("P", colnames(peaklist), fixed = TRUE)] - patients=unique(as.vector(unlist(lapply(strsplit(patients, ".", fixed = TRUE), function(x) x[1])))) - # ToDo: If 2 P's in sample names!!!!!!!!!!!!! - # patients=sort(as.numeric(unique(as.vector(unlist(lapply(strsplit(patients, "_P", fixed = TRUE), function(x) x[2])))))) - patients=sort(as.numeric(unique(as.vector(unlist(lapply(strsplit(patients, "P", fixed = TRUE), function(x) x[2])))))) - - return(patients) -} diff --git a/DIMS/AddOnFunctions/getSD.R b/DIMS/AddOnFunctions/getSD.R deleted file mode 100644 index 197a002..0000000 --- a/DIMS/AddOnFunctions/getSD.R +++ /dev/null @@ -1,9 +0,0 @@ -getSD <- function(x,y,resol=140000) { - - index = which(y==max(y)) - mean = x[index] - resol.mz = resol*(1/sqrt(2)^(log2(mean/200))) - fwhm = mean/resol.mz - sd = (fwhm/2)*0.85 - return(sd) -} diff --git a/DIMS/AddOnFunctions/get_patient_data_to_helix.R b/DIMS/AddOnFunctions/get_patient_data_to_helix.R deleted file mode 100644 index c582177..0000000 --- a/DIMS/AddOnFunctions/get_patient_data_to_helix.R +++ /dev/null @@ -1,31 +0,0 @@ -get_patient_data_to_helix <- function(metab_interest_sorted, metab_list_all){ - # Combine Z-scores of metab groups together - df_all_metabs_zscores <- bind_rows(metab_interest_sorted) - # Change columnnames - colnames(df_all_metabs_zscores) <- c("HMDB_name", "Patient", "Z_score") - # Change Patient column to character instead of factor - df_all_metabs_zscores$Patient <- as.character(df_all_metabs_zscores$Patient) - - # Delete whitespaces HMDB_name - df_all_metabs_zscores$HMDB_name <- str_trim(df_all_metabs_zscores$HMDB_name, "right") - - # Split HMDB_name column on "nitine;" for match dims_helix_table - df_all_metabs_zscores$HMDB_name_split <- str_split_fixed(df_all_metabs_zscores$HMDB_name, "nitine;", 2)[,1] - - # Combine stofgroepen - dims_helix_table <- bind_rows(metab_list_all) - # Filter table for metabolites for Helix - dims_helix_table <- dims_helix_table %>% filter(Helix == "ja") - # Split HMDB_name column on "nitine;" for match df_all_metabs_zscores - dims_helix_table$HMDB_name_split <- str_split_fixed(dims_helix_table$HMDB_name, "nitine;", 2)[,1] - dims_helix_table <- dims_helix_table %>% select(HMDB_name_split, Helix_naam, high_zscore, low_zscore) - - # Filter DIMS results for metabolites for Helix - df_metabs_helix <- df_all_metabs_zscores %>% filter(HMDB_name_split %in% dims_helix_table$HMDB_name_split) - # Combine dims_helix_table and df_metabs_helix, adding Helix codes etc. - df_metabs_helix <- df_metabs_helix %>% left_join(dims_helix_table, by = join_by(HMDB_name_split)) - - df_metabs_helix <- df_metabs_helix %>% select(HMDB_name, Patient, Z_score, Helix_naam, high_zscore, low_zscore) - - return(df_metabs_helix) -} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/globalAssignments.HPC.R b/DIMS/AddOnFunctions/globalAssignments.HPC.R deleted file mode 100644 index 9207661..0000000 --- a/DIMS/AddOnFunctions/globalAssignments.HPC.R +++ /dev/null @@ -1,114 +0,0 @@ -#.libPaths(new="/hpc/local/CentOS7/dbg_mz/R_libs/3.2.2") -## calculate relative abundancies from theoretical mass and composition - -#library(Rdisop) - -#theor.table <- read.table(file="C:/Users/mraves/Metabolomics/TheoreticalMZ_Negative_composition.txt", sep="\t", header=TRUE) -options(digits=16) - -#library(OrgMassSpecR) - -suppressPackageStartupMessages(library(lattice)) -# The following list was copied from Rdisop elements.R and corrected for C, H, O, Cl, S according to NIST -Ba <- list(name= 'Ba', mass=130, isotope=list(mass=c(-0.093718, 0, -0.094958, 0, -0.095514, -0.094335, -0.095447, -0.094188, -0.094768), abundance=c(0.00106, 0, 0.00101, 0, 0.02417, 0.06592, 0.07854, 0.1123, 0.717))) -Br <- list(name= 'Br', mass=79, isotope=list(mass=c(-0.0816639, 0, -0.083711), abundance=c(0.5069, 0, 0.4931))) -C <- list(name= 'C', mass=12, isotope=list(mass=c(0, 0.003354838, 0.003241989), abundance=c(0.9893, 0.0107, 0))) -Ca <- list(name= 'Ca', mass=40, isotope=list(mass=c(-0.0374094, 0, -0.0413824, -0.0412338, -0.0445194, 0, -0.046311, 0, -0.047467), abundance=c(0.96941, 0, 0.00647, 0.00135, 0.02086, 0, 4e-05, 0, 0.00187))) -Cl <- list(name= 'Cl', mass=35, isotope=list(mass=c(-0.03114732, 0, -0.03409741), abundance=c(0.7576, 0, 0.2424))) -Cr <- list(name= 'Cr', mass=50, isotope=list(mass=c(-0.0539536, 0, -0.0594902, -0.0593487, -0.0611175), abundance=c(0.04345, 0, 0.83789, 0.09501, 0.02365))) -Cu <- list(name= 'Cu', mass=63, isotope=list(mass=c(-0.0704011, 0, -0.0722071), abundance=c(0.6917, 0, 0.3083))) -F <- list(name= 'F', mass=19, isotope=list(mass=c(-0.00159678), abundance=c(1))) -Fe <- list(name= 'Fe', mass=54, isotope=list(mass=c(-0.0603873, 0, -0.0650607, -0.0646042, -0.0667227), abundance=c(0.058, 0, 0.9172, 0.022, 0.0028))) -H <- list(name= 'H', mass=1, isotope=list(mass=c(0.00782503207, 0.014101778, 0.01604928), abundance=c(0.999885, 0.000115, 0))) -Hg <- list(name= 'Hg', mass=196, isotope=list(mass=c(-0.034193, 0, -0.033257, -0.031746, -0.0317, -0.029723, -0.029383, 0, -0.026533), abundance=c(0.0015, 0, 0.0997, 0.1687, 0.231, 0.1318, 0.2986, 0, 0.0687))) -I <- list(name= 'I', mass=127, isotope=list(mass=c(-0.095527), abundance=c(1))) -K <- list(name= 'K', mass=39, isotope=list(mass=c(-0.0362926, -0.0360008, -0.0381746), abundance=c(0.932581, 0.000117, 0.067302))) -Li <- list(name= 'Li', mass=6, isotope=list(mass=c(0.0151214, 0.016003), abundance=c(0.075, 0.925))) -Mg <- list(name= 'Mg', mass=24, isotope=list(mass=c(-0.0149577, -0.0141626, -0.0174063), abundance=c(0.7899, 0.1, 0.1101))) -Mn <- list(name= 'Mn', mass=55, isotope=list(mass=c(-0.0619529), abundance=c(1))) -N <- list(name= 'N', mass=14, isotope=list(mass=c(0.003074002, 0.00010897), abundance=c(0.99634, 0.00366))) -Na <- list(name= 'Na', mass=23, isotope=list(mass=c(-0.0102323), abundance=c(1))) -Ni <- list(name= 'Ni', mass=58, isotope=list(mass=c(-0.0646538, 0, -0.0692116, -0.0689421, -0.0716539, 0, -0.0720321), abundance=c(0.68077, 0, 0.26223, 0.0114, 0.03634, 0, 0.00926))) -O <- list(name= 'O', mass=16, isotope=list(mass=c(-0.00508538044, -0.0008683, -0.0008397), abundance=c(0.99757, 0.000381, 0.00205))) -P <- list(name= 'P', mass=31, isotope=list(mass=c(-0.026238), abundance=c(1))) -S <- list(name= 'S', mass=32, isotope=list(mass=c(-0.027929, -0.02854124, -0.0321331, 0, -0.03291924), abundance=c(0.9499, 0.0075, 0.0425, 0, 1e-04))) -Se <- list(name= 'Se', mass=74, isotope=list(mass=c(-0.0775254, 0, -0.080788, -0.0800875, -0.0826924, 0, -0.0834804, 0, -0.0833022), abundance=c(0.0089, 0, 0.0936, 0.0763, 0.2378, 0, 0.4961, 0, 0.0873))) -Si <- list(name= 'Si', mass=28, isotope=list(mass=c(-0.0230729, -0.0235051, -0.0262293), abundance=c(0.9223, 0.0467, 0.031))) -Sn <- list(name= 'Sn', mass=112, isotope=list(mass=c(-0.095174, 0, -0.097216, -0.096652, -0.098253, -0.097044, -0.098391, -0.09669, -0.0978009, 0, -0.0965596, 0, -0.0947257),abundance=c(0.0097, 0, 0.0065, 0.0034, 0.1453, 0.0768, 0.2423, 0.0859, 0.3259, 0, 0.0463, 0, 0.0579))) -Zn <- list(name= 'Zn', mass=64, isotope=list(mass=c(-0.0708552, 0, -0.0739653, -0.0728709, -0.0751541, 0, -0.074675), abundance=c(0.486, 0, 0.279, 0.041, 0.188, 0, 0.006))) - -NH4 <- list(name= "NH4", mass=18, isotope=list(mass=c(0.03437, 0.03141, -0.95935)), abundance=c(0.995, 0.004, 0.001)) # SISweb: 18.03437 100 19.03141 0.4 19.04065 0.1 -Ac <- list(name= "Ac", mass=60, isotope=list(mass=c(0.02114, 0.02450, 0.02538)), abundance=c(0.975, 0.021, 0.004)) # SISweb: 60.02114 100 61.02450 2.2 62.02538 0.4 -NaCl <- list(name= "NaCl", mass=58, isotope=list(mass=c(-0.04137, 0, -0.04433)), abundance=c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 -NaCl2 <- list(name= "NaCl2", mass=116, isotope=list(mass=c(-0.08274, 0, -0.08866)), abundance=c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 -NaCl3 <- list(name= "NaCl3", mass=174, isotope=list(mass=c(-0.12411, 0, -0.13299)), abundance=c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 -NaCl4 <- list(name= "NaCl4", mass=232, isotope=list(mass=c(-0.16548, 0, -0.17732)), abundance=c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 -NaCl5 <- list(name= "NaCl5", mass=290, isotope=list(mass=c(-0.20685, 0, -0.22165)), abundance=c(0.755, 0, 0.245)) # SISweb: 57.95862 100 59.95567 32.4 -For <- list(name= "For", mass=45, isotope=list(mass=c(-0.00233, 0.00103)), abundance=c(0.989, 0.011)) # SISweb: 46.00549 100 47.00885 1.1 (47.0097 0.1) 48.00973 0.4 -Na2 <- list(name= "2Na-H", mass=46, isotope=list(mass=c(-1.0282896)), abundance=c(1)) # SISweb for Na2: 45.97954 100 # minus 1 H ! -Met <- list(name= "CH3OH", mass=32, isotope=list(mass=c(1.034045,1.037405)), abundance=c(0.989,0.011)) # SISweb: 32.02622 100 33.02958 1.1 33.0325 0.1 34.03046 0.2 -CH3OH <- list(name= "CH3OH", mass=32, isotope=list(mass=c(1.034045,1.037405)), abundance=c(0.989,0.011)) # SISweb: 32.02622 100 33.02958 1.1 33.0325 0.1 34.03046 0.2 -Na3 <- list(name= "3Na-2H", mass=69, isotope=list(mass=c(-2.0463469)), abundance=c(1)) # SISweb for Na2: 45.97954 100 # minus 1 H ! -KCl <- list(name= "KCl", mass=74, isotope=list(mass=c(-0.06744,0.92961,-0.06744,0.92772)), abundance=c(0.7047,0.2283,0.0507,0.0162)) # SISweb: 73.93256 100 75.92961 32.4 75.93067 7.2 77.92772 2.3 -H2PO4 <- list(name= "H2PO4", mass=97, isotope=list(mass=c(-0.03091)), abundance=c(1)) -HSO4 <- list(name= "HSO4", mass=97, isotope=list(mass=c(-0.04042,0,-0.04462)), abundance=c(0.96,0,0.04)) -Met2 <- list(name= "Met2", mass=64, isotope=list(mass=c(1.060265,1.013405)), abundance=c(0.978,0.022)) -Met3 <- list(name= "Met3", mass=96, isotope=list(mass=c(1.086485,1.089845)), abundance=c(0.969,0.031)) -Met4 <- list(name= "Met4", mass=128, isotope=list(mass=c(1.112705,1.116065)), abundance=c(0.959,0.041)) -Met5 <- list(name= "Met5", mass=160, isotope=list(mass=c(1.20935,1.142285)), abundance=c(0.949,0.051)) -NaminH <- list(name= "Na-H", mass=21, isotope=list(mass=c(-0.02571416)), abundance=c(1)) -KminH <- list(name= "K-H", mass=37, isotope=list(mass=c(-0.05194,0.94617)), abundance=c(0.9328,0.0672)) -H2O <- list(name= "H2O", mass=-19, isotope=list(mass=c(-0.01894358)), abundance=c(1)) -NaK <- list(name= "NaK-H", mass=61, isotope=list(mass=c(-0.054345,0.943765)), abundance=c(0.9328,0.0672)) -min2H <- list(name= "min2H", mass=-2, isotope=list(mass=c(-0.0151014)), abundance=c(1)) -plus2H <- list(name= "plus2H", mass=2, isotope=list(mass=c(0.0151014)), abundance=c(1)) -plus2Na <- list(name= "plus2Na", mass=46, isotope=list(mass=c(-0.02046), abundance=c(1))) -plusNaH <- list(name= "plusNaH", mass=24, isotope=list(mass=c(-0.00295588), abundance=c(1))) -plusKH <- list(name= "plusKH", mass=40, isotope=list(mass=c(-0.029008,0.969101)), abundance=c(0.9328,0.0672)) -plusHNH <- list(name= "plusHNH", mass=19, isotope=list(mass=c(0.04164642)), abundance=c(1)) -min3H <- list(name= "min3H", mass=-3, isotope=list(mass=c(-0.02182926)), abundance=c(1)) -plus3H <- list(name= "plus3H", mass=3, isotope=list(mass=c(0.02182926)), abundance=c(1)) -plus3Na <- list(name= "plus3Na", mass=68, isotope=list(mass=c(0.96931)), abundance=c(1)) -plus2NaH <- list(name= "plus2NaH", mass=47, isotope=list(mass=c(0.2985712)), abundance=c(1)) -plusNa2H <- list(name= "plusNa2H", mass=25, isotope=list(mass=c(0.00432284)), abundance=c(1)) - -Cl37 <- list(name= "Cl37", mass=37, isotope=list(mass=c(-0.03409741)), abundance=c(1)) - -allelements <- list(Ba, Br, C, Ca, Cl, Cr, Cu, F, Fe, H, Hg, I, K, Li, Mg, Mn, N, Na, Ni, O, P, S, Se, Si, Sn, Zn) -allAdducts <- list(Ba, Br, Ca, Cl, Cl37, Cr, Cu, Fe, Hg, I, K, Li, Mg, Mn, Na, Ni, Se, Si, Sn, Zn, NH4, Ac, NaCl, For,Na2,CH3OH,NaCl2,NaCl3,NaCl4,NaCl5,Na3,KCl,H2PO4,HSO4,Met2,Met3,Met4,Met5,NaminH,KminH,H2O,NaK,min2H,plus2H,plus2Na,plusNaH,plusKH,min3H,plus3H,plusHNH,plus3Na,plus2NaH,plusNa2H) - -#atomsinuse <- c("P", "O", "N", "C", "H", "S", "Cl") -#atomicWts <- c(30.97376163, 15.99491463, 14.0030740052, 12.0000, 1.0078250321, 31.9720707, 34.968852721) -#electron <- 0.00054858 -#Mol.comp <- c(0,4,0,0,1,1,0) -#Mol.exact <- sum(Mol.comp * atomicWts) + electron -#Mol.corr <- Mol.exact - 0.0022 + 0.000007*Mol.exact # mass as found in peak group list -#getMass(Mol) + electron - 0.0022 + 0.000007*getMass(Mol) - -#atomsinuse <- c("P", "O", "N", "C", "H", "S", "Cl", "D", "34S", "18O") -#atomicWts <- c(30.97376163, 15.99491463, 14.0030740052, 12.0000, 1.0078250321, 31.9720707, 34.968852721, 2.0141017778, 33.96786690, 17.9991610) -#electron <- 0.00054858 -#Mol.comp <- c(0,4,0,0,1,1,0,0,0,0) # main peak HSO4 -#Mol.comp <- c(0,4,0,0,0,1,0,1,0,0) # deuterated -#Mol.comp <- c(0,4,0,0,1,0,0,0,1,0) # 34S -#Mol.comp <- c(0,3,0,0,1,1,0,0,0,1) # 18O -#Mol.exact <- sum(Mol.comp * atomicWts) + electron -#Mol.corr <- Mol.exact - 0.0022 + 0.000007*Mol.exact # mass as found in peak group list - -atomsinuse <- c("P", "O", "N", "C", "H", "S", "Cl", "D", "13C", "34S", "18O", "37Cl") -atomicWts <- c(30.97376163, 15.99491463, 14.0030740052, 12.0000, 1.0078250321, 31.9720707, 34.968852721, 2.0141017778, 13.0033548378, 33.96786690, 17.9991610, 36.96590259) -electron <- 0.00054858 -############# P O N C H S Cl D 13C 34S 18O 37Cl -#Mol.comp <- c(0,6,0,6,12,0,1, 0, 0, 0, 0, 0) # main peak Galactose HCl negative ion -#Mol.comp <- c(0,6,0,5,12,0,1, 0, 1, 0, 0, 0) # 13C Galactose HCl negative ion -#Mol.comp <- c(0,6,0,6,11,0,1, 1, 0, 0, 0, 0) # D Galactose HCl negative ion -#Mol.comp <- c(0,5,0,6,12,0,1, 0, 0, 0, 1, 0) # 18O Galactose HCl negative ion -#Mol.comp <- c(0,6,0,6,12,0,0, 0, 0, 0, 0, 1) # 18O Galactose HCl negative ion -#Mol.exact <- sum(Mol.comp * atomicWts) + electron -#Mol.corr <- Mol.exact - 0.0022 + 0.000007*Mol.exact # mass as found in peak group list - -Hmass <- H$mass + H$isotope$mass[1] -Dmass <- H$mass + 1 + H$isotope$mass[2] -Tmass <- H$mass + 2 + H$isotope$mass[3] -C13mass <- C$mass + 1 + C$isotope$mass[2] -N15mass <- N$mass + 1 + N$isotope$mass[2] diff --git a/DIMS/AddOnFunctions/iden.code.R b/DIMS/AddOnFunctions/iden.code.R deleted file mode 100644 index 4853647..0000000 --- a/DIMS/AddOnFunctions/iden.code.R +++ /dev/null @@ -1,45 +0,0 @@ -iden.code <- function(peaklist, db, ppm, theor_mass_label) { - # theor_mass_label = {"MNeg", "Mpos"} - - mcol <- peaklist[ , "mzmed.pgrp"] - theor.mcol <- db[,theor_mass_label] - assi_HMDB <- iso_HMDB <- HMDB_code <- c(rep("",length(mcol))) - theormz_HMDB <- c(rep(0,length(mcol))) - - for(t in 1:length(mcol)){ - mz<-mcol[t] - mtol <- mz*ppm/1000000 - selp <- which(theor.mcol > (mz - mtol) & theor.mcol < (mz + mtol)) - assinames <- isonames <- HMDBcodes <- "" - - if(length(selp)!=0){ - for(i in 1:length(selp)){ - - if(grepl(" iso ", db[selp[i], "CompoundName"])){ - mainpeak <- strsplit(db[selp[i],"CompoundName"]," iso ")[[1]][1] - - # Check if peak without isotope occurs, this assumes that peaklist is ordered on mass!!! - if(length(grep(mainpeak, assi_HMDB, fixed=TRUE)) > 0){ - isonames <- as.character(paste(isonames,as.character(db[selp[i],"CompoundName"]), sep=";")) - } - - } else { - assinames <- as.character(paste(assinames,as.character(db[selp[i], "CompoundName"]), sep=";")) - HMDBcodes <- as.character(paste(HMDBcodes, as.character(rownames(db)[selp[i]]), sep=";")) - } - } - } - - assi_HMDB[t] <- as.character(assinames) - iso_HMDB[t] <- as.character(isonames) - if ((assinames=="") & (isonames=="")){ - theormz_HMDB[t] <- "" - } else { - theormz_HMDB[t] <- theor.mcol[selp[1]] - } - HMDB_code[t] <- as.character(HMDBcodes) - } - - return(cbind(peaklist, assi_HMDB, iso_HMDB, theormz_HMDB, HMDB_code)) -} - diff --git a/DIMS/AddOnFunctions/ident.hires.noise.HPC.R b/DIMS/AddOnFunctions/ident.hires.noise.HPC.R deleted file mode 100644 index 0be43d1..0000000 --- a/DIMS/AddOnFunctions/ident.hires.noise.HPC.R +++ /dev/null @@ -1,242 +0,0 @@ -# modified identify function to also look for adducts and their isotopes -ident.hires.noise.HPC <- function(peaklist, allAdducts, scanmode="Negative", look4=c("Cl", "Ac"), identlist=NULL, resol=280000, slope=0, incpt=0, ppm.fixed, ppm.iso.fixed) { - - # peaklist=outlist_Neg - # scanmode="Negative" - # identlist=noise.Neg.MZ - # look4=look4.addN - # resol=resol - # slope=0 - # incpt=0 - # ppm.fixed=3 - # ppm.iso.fixed=2 - - metlin <- assi <- iso <- rep("", nrow(peaklist)) - theormz <- nisos <- expint <- conf <- rep(0, nrow(peaklist)) - # nrH <- nrD <- nrC <- nr13C <- nrN <- nr15N <- nrO <- nrP <- nrS <- nrCl <- rep(0, nrow(peaklist)) - - # add adducts to identification list - if (scanmode == "Positive") { add.mode <- "+" } else { add.mode <- "-" } - # identlist <- theorMZ - identlist.orig <- identlist - - for (p in 1:length(look4)) { # loop over type of adduct - #print(p) - identlist.Adduct <- identlist.orig - identlist.Adduct[ , "CompoundName"] <- as.character(identlist.orig[ , "CompoundName"]) - - #add2label <- paste("[M+", look4[p], "]", add.mode, sep="") - if (look4[p]=="H2O") { - add2label <- paste("[M-", look4[p], "]", add.mode, sep="") - } else { - add2label <- paste("[M+", look4[p], "]", add.mode, sep="") - } - - identlist.Adduct[ , "CompoundName"] <- paste(identlist.Adduct[ , "CompoundName"], add2label, sep=" ") - adductInfo <- elementInfo(look4[p], allAdducts) - if (scanmode == "Positive") { - adductMass <- adductInfo$mass[1] + adductInfo$isotope$mass[1] - Hmass } else { - adductMass <- adductInfo$mass[1] + adductInfo$isotope$mass[1] + Hmass - } - for (q in 1:nrow(identlist.Adduct)) { # loop over compounds in database - # construct information for compound + adduct: - if (scanmode == "Positive") { - identlist.Adduct[q, "Mpos"] <- as.numeric(identlist.Adduct[q, "Mpos"]) + adductMass # + Na - H - identlist.Adduct[q, "MNeg"] <- 0 - } else { - identlist.Adduct[q, "Mpos"] <- 0 - identlist.Adduct[q, "MNeg"] <- as.numeric(identlist.Adduct[q, "MNeg"]) + adductMass # + Cl + H - } - } - - # # modify columns with info for mol. formula: - # if (look4[p] == "2Na-H") { - # identlist.Adduct[, "nrH"] <- as.numeric(identlist.Adduct[, "nrH"]) - 1 - # } else if (look4[p] == "NH4") { - # identlist.Adduct[, "nrH"] <- as.numeric(identlist.Adduct[, "nrH"]) + 3 - # identlist.Adduct[, "nrN"] <- as.numeric(identlist.Adduct[, "nrN"]) + 1 - # } else if (look4[p] == "Cl") { - # identlist.Adduct[, "nrCl"] <- as.numeric(identlist.Adduct[, "nrCl"]) + 1 - # identlist.Adduct[, "nrH"] <- as.numeric(identlist.Adduct[, "nrH"]) + 1 - # } else if (look4[p] == "Ac") { - # identlist.Adduct[ , "nrC"] <- as.numeric(identlist.Adduct[ , "nrC"]) + 2 - # identlist.Adduct[ , "nrH"] <- as.numeric(identlist.Adduct[ , "nrH"]) + 3 - # identlist.Adduct[ , "nrO"] <- as.numeric(identlist.Adduct[ , "nrO"]) + 2 - # } else if (look4[p] == "CH3OH+H") { - # identlist.Adduct[ , "nrC"] <- as.numeric(identlist.Adduct[ , "nrC"]) + 1 - # identlist.Adduct[ , "nrH"] <- as.numeric(identlist.Adduct[ , "nrH"]) + 4 - # identlist.Adduct[ , "nrO"] <- as.numeric(identlist.Adduct[ , "nrO"]) + 1 - # } else { - # identlist.Adduct[, "nrH"] <- as.numeric(identlist.Adduct[, "nrH"]) - 1 - # } - identlist <- rbind(identlist, identlist.Adduct) - } # end for p adducts in look4 - - - if (scanmode == "Positive") { theor.mcol <- as.numeric(identlist[ , "Mpos"]) } else { - theor.mcol <- as.numeric(identlist[ , "MNeg"]) } - # apply correction using regression line obtained with ISses - theor.mcol <- (1+slope)*theor.mcol + incpt - - # get mz information from peaklist - mcol <- peaklist[ , "mzmed.pgrp"] - # if column with average intensities is missing, calculate it: - if (!("avg.int" %in% colnames(peaklist))){ - mzmaxcol <- which(colnames(peaklist) == "mzmax.pgrp") - endcol <- ncol(peaklist) - peaklist[ , "avg.int"] <- apply(peaklist[ ,(mzmaxcol+1):(endcol)], 1, mean) - } - - # # generate URL for Metlin: - # for (p in 1:nrow(peaklist)) { # for each peak # p <- 1 - # mzpeak <- as.numeric(mcol[p]) - # # resolution as function of mz: - # resol.mz <- resol*(1/sqrt(2)^(log2(mzpeak/200))) - # fwhm <- mzpeak/resol.mz - # # massmin <- mzpeak - (0.00003 + 0.000003*mzpeak) - (1.0078250321 - 0.00054858) - fwhm # for Metlin db - # # massmax <- mzpeak - (0.00003 + 0.000003*mzpeak) - (1.0078250321 - 0.00054858) + fwhm # for Metlin db - # if (scanmode == "Positive") { - # massmin <- mzpeak - (1.0078250321 - 0.00054858) - fwhm # for Metlin db - # massmax <- mzpeak - (1.0078250321 - 0.00054858) + fwhm # for Metlin db - # } else { - # massmin <- mzpeak + (1.0078250321 - 0.00054858) - fwhm # for Metlin db - # massmax <- mzpeak + (1.0078250321 - 0.00054858) + fwhm # for Metlin db - # } - # metlin[p] <- paste("http://metlin.scripps.edu/metabo_list.php?mass_min=", massmin, "&mass_max=", massmax, sep="") - # } - - - # do indentification using own database: - for (t in 1:nrow(identlist)) { # theoretical mass # t <- 45 - # for (t in 1:169) { - #print(as.character(identlist[t,"CompoundName"])) - theor.mz <- theor.mcol[t] - - theor.comp <- as.character(identlist[t, "Composition"]) - #theor.comp <- mol.formula(identlist[t, ]) - - # # if there's Deuterium, Tritium, 13C or 15N in the composition: - # mass.incr <- 0 + (as.numeric(identlist[t,"nrD"])*Dmass) + - # (as.numeric(identlist[t,"nrT"])*Tmass) + - # (as.numeric(identlist[t,"nrC13"])*C13mass) + - # (as.numeric(identlist[t,"nrN15"])*N15mass) - # theor.comp <- strsplit(theor.comp, "iso")[[1]][1] - mass.incr <- 0 - - # resolution as function of mz: - resol.mz <- resol*(1/sqrt(2)^(log2(theor.mz/200))) - # calculate fine-grained isotopic distribution using MIDAs - fwhm <- round(theor.mz/resol.mz,6) - - #system(paste("C:/Users/awillem7/tools/MIDAs_New/MIDAs_Example ", theor.comp, " 2 C00 \"\" \"\" ", fwhm, " 1 0 1e-50 2 tmp", sep=""),ignore.stderr=TRUE) - #system(paste("C:/Users/mraves/Metabolomics/MIDAs_New/MIDAs_Example ", theor.comp, " 2 C00 \"\" \"\" ", fwhm, " 1 0 1e-50 2 tmp", sep=""),ignore.stderr=TRUE) - #system(paste(path2MIDAS, theor.comp, " 2 C00 \"\" \"\" ", fwhm, " 1 0 1e-50 2 tmp", sep=""),ignore.stderr=TRUE) - - #system(paste("/data/home/luyf/Metabolomics/MIDAs/MIDAs_Example ", theor.comp, " 2 C00 \"\" \"\" ", fwhm, " 1 0 1e-50 2 tmp", sep=""),ignore.stderr=TRUE) - options(stringsAsFactors = FALSE) - #fgid <- read.table(file="tmp_Fine_Grained_Isotopic_Distribution", header=FALSE) - - # res <- try(fgid <- read.table(file="tmp_Fine_Grained_Isotopic_Distribution", header=FALSE)) - # if(inherits(res, "try-error")) - # { - # #error handling code, maybe just skip this iteration using - # message("Skipped") - # next - # } - # - # # correct mass for D, T, 13C and 15N - # fgid[ ,1] <- as.numeric(fgid[ ,1]) + mass.incr - # # calculate percentage intensities from relative intensities - # firstone <- as.numeric(fgid[1,2]) - # fgid[ ,3] <- as.numeric(fgid[ , 2]) / firstone - # #fgid <- as.matrix(fgid, ncol=3) - # # the mz in the MIDAs file are of the neutral molecule - # if (scanmode == "Positive") { mz.iso <- as.numeric(fgid[ , 1]) + Hmass - electron } - # if (scanmode == "Negative") { mz.iso <- as.numeric(fgid[ , 1]) - Hmass + electron } - # fgid <- cbind(fgid, mz.iso) - # colnames(fgid) <- c("mz","rel.int", "perc.int", "mz.iso") - - # compensate mz for presence of adduct - # Adduct.mass <- theor.mz - mz.iso[1] - # fgid[ , "mz.iso"] <- as.numeric(fgid[ , "mz.iso"]) + Adduct.mass - - # set tolerance for mz accuracy of main peak - mtol <- theor.mz*ppm.fixed/1000000 - # find main peak - selp <- which(mcol > (theor.mz - mtol) & mcol < (theor.mz + mtol)) - # selp <- which(mcol > (theor.mz - 0.01) & mcol < (theor.mz + 0.01)) - # peaklist[selp, c(1:4,(endcol+1))] - - # set tolerance for mz accuracy of isotope peaks - itol <- theor.mz*ppm.iso.fixed/1000000 - - # if (length(selp) > 1) { # more than one candidate peak for main; select best one based on isotope pattern - # #cat(as.character(identlist[t, "CompoundName"])); print(" has >1 candidate peaks") - # conf.local <- rep(0, length(selp)) - # for (p in 1:length(selp)) { # p <- 2 - # # determine isotope pattern for each candidate peak - # # obs.mz <- peaklist[selp[p],"mzmed.pgrp"] - # conf.local[p] <- match.isotope.pattern(peaklist, scanmode, selp[p], fgid, ppm.iso.fixed) - # } - # # selp <- selp[abs(mcol[selp] - theor.mz) == min(abs(mcol[selp] - theor.mz))] } - # selp <- na.exclude(selp[conf.local == max(conf.local)]) - # } - if (length(selp) > 1) { # more than one candidate peak for main; select best one based on mz.diff - selp <- selp[abs(mcol[selp] - theor.mz) == min(abs(mcol[selp] - theor.mz))] - } - if (length(selp) == 1) { # match for main - assi[selp] <- paste(assi[selp], as.character(identlist[t,"CompoundName"]), sep=";") - theormz[selp] <- theor.mz - # conf[selp] <- match.isotope.pattern(peaklist, scanmode, selp, fgid, ppm.iso.fixed) - - # nrH[selp] <- identlist[t,"nrH"] - # nrD[selp] <- identlist[t,"nrD"] - # nrC[selp] <- identlist[t,"nrC"] - # nr13C[selp] <- identlist[t,"nrC13"] - # nrN[selp] <- identlist[t,"nrN"] - # nr15N[selp] <- identlist[t,"nrN15"] - # nrO[selp] <- identlist[t,"nrO"] - # nrP[selp] <- identlist[t,"nrP"] - # nrS[selp] <- identlist[t,"nrS"] - # nrCl[selp] <- identlist[t,"nrCl"] - - # assign isotope peaks - # mz.main <- peaklist[selp, "mzmed.pgrp"] # mz of main peak - # int.main <- peaklist[selp, "avg.int"] # intensity of main peak (= 100%) - # # deviation from theoretical mass: - # diff <- theor.mz - mz.main - # # calculate expected intensities and select isotopes with exp.int > threshold - # fgid[ , "exp.int"] <- fgid[ , "perc.int"] * int.main - # fgid.subset <- fgid[(fgid[ , "exp.int"] > thresh), ] - # nisos[selp] <- nrow(fgid.subset) - 1 - # if (nrow(fgid.subset) > 1) { # avoid error message if fgid.subset has only 1 line - # for (f in 2:nrow(fgid.subset)) { # f <- 2 - # mz.target <- fgid.subset[f, "mz.iso"] - diff - # int.target <- fgid.subset[f, "exp.int"] - # # print(itarget) - # sel.iso <- peaklist[ , "mzmed.pgrp"] > (mz.target - itol) & peaklist[ , "mzmed.pgrp"] < (mz.target + itol) - # # sum(sel.iso) - # # sel.iso <- peaklist[ , "mzmed.pgrp"] > (mz.target - 0.01) & peaklist[ , "mzmed.pgrp"] < (mz.target + 0.01) - # # peaklist[sel.iso, c(1:4,23)] - # if (sum(sel.iso) == 1) { # 2 separate if-statements because of error if sum(sel.iso) = 0 - # if (peaklist[sel.iso, "avg.int"] > (int.target/2)) { # match - # iso[sel.iso] <- paste(paste(iso[sel.iso], as.character(identlist[t,"CompoundName"]), "iso", f, sep=" "),";", sep="") - # # peaklist[sel.iso, ] - # expint[sel.iso] <- fgid.subset[f, "exp.int"] - # } - # } else if (sum(sel.iso) > 1) { - # nrs.iso <- which(sel.iso) - # nr.iso <- nrs.iso[which(abs(peaklist[sel.iso, "avg.int"] - int.target) == min(abs(peaklist[sel.iso, "avg.int"] - int.target)))] - # if (peaklist[nr.iso, "avg.int"] > (int.target/2)) { # match - # # print(peaklist[nr.iso, "avg.int"]) - # iso[nr.iso] <- paste(paste(iso[nr.iso], as.character(identlist[t,"CompoundName"]), "iso", f, sep=" "),";", sep="") - # expint[nr.iso] <- fgid.subset[f, "exp.int"] - # } # end if - # } # end else if - # } # end for f - # } # end if - } # end if - } # end for t - # cbind(peaklist, nrH, nrD, nrC, nr13C, nrN, nr15N, nrO, nrP, nrS, nrCl, assi, theormz, conf, nisos, iso, expint, metlin) - cbind(peaklist, assi, theormz, conf, nisos, iso, expint, metlin) -} diff --git a/DIMS/AddOnFunctions/isWithinXppm.R b/DIMS/AddOnFunctions/isWithinXppm.R deleted file mode 100644 index 5da43c6..0000000 --- a/DIMS/AddOnFunctions/isWithinXppm.R +++ /dev/null @@ -1,51 +0,0 @@ -isWithinXppm <- function(mean, scale, sigma, area, x2, x, ppm=4, resol, plot) { - # mean=retVal$mean - # scale=retVal$scale - # sigma=retVal$sigma - # area=retVal$area - # ppm=3 - - # sort!!!!!!!!!!!!!!!! - index = order(mean) - mean = mean[index] - scale = scale[index] - sigma = sigma[index] - area = area[index] - - summed = NULL - remove = NULL - - if (length(mean)>1){ - for (i in 2:length(mean)){ - if ((abs(mean[i-1]-mean[i])/mean[i-1])*10^6 < ppm) { - - # avoid double occurance in sum - if ((i-1) %in% summed) next - - retVal = sumCurves(mean[i-1], mean[i], scale[i-1], scale[i], sigma[i-1], sigma[i], x2, x, resol, plot) - summed = c(summed, i-1, i) - if (is.nan(retVal$mean)) retVal$mean=0 - mean[i-1] = retVal$mean - mean[i] = retVal$mean - area[i-1] = retVal$area - area[i] = retVal$area - scale[i-1] = retVal$scale - scale[i] = retVal$scale - sigma[i-1] = retVal$sigma - sigma[i] = retVal$sigma - - remove = c(remove, i) - } - } - } - - if (length(remove)!=0){ - mean=mean[-c(remove)] - area=area[-c(remove)] - scale=scale[-c(remove)] - sigma=sigma[-c(remove)] - } - - return(list("mean"=mean, "area"=area, "scale"=scale, "sigma"=sigma, "qual"=NULL)) - -} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/is_diagnostic_patient.R b/DIMS/AddOnFunctions/is_diagnostic_patient.R deleted file mode 100644 index 839f41d..0000000 --- a/DIMS/AddOnFunctions/is_diagnostic_patient.R +++ /dev/null @@ -1,6 +0,0 @@ -is_diagnostic_patient <- function(patient_column){ - # Check for Diagnostics patients with correct patientnumber (e.g. starting with "P2024M") - diagnostic_patients <- grepl("^P[0-9]{4}M",patient_column) - - return(diagnostic_patients) -} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/mergeDuplicatedRows.R b/DIMS/AddOnFunctions/mergeDuplicatedRows.R deleted file mode 100644 index 39cb8b4..0000000 --- a/DIMS/AddOnFunctions/mergeDuplicatedRows.R +++ /dev/null @@ -1,49 +0,0 @@ -mergeDuplicatedRows <- function(peaklist) { - # peaklist = outlist.tot - # resultDir = "./results" - # scanmode = "positive" - - # peaklist_index=which(peaklist[,"mzmed.pgrp"]=="94.9984524624111") - # peaklist[peaklist_index,] - - collapse <- function(label,pklst,index){ - # label = "iso_HMDB" - # pklst = peaklist - # index = peaklist_index - tmp2=as.vector(pklst[index,label]) - if (length(which(is.na(tmp2)))>0) tmp2=tmp2[-which(is.na(tmp2))] - return(paste(tmp2,collapse = ";")) - } - - options(digits=16) - - collect=NULL - remove=NULL - - index = which(duplicated(peaklist[, "mzmed.pgrp"])) - - while (length(index) > 0){ - - peaklist_index = which(peaklist[, "mzmed.pgrp"] == peaklist[index[1], "mzmed.pgrp"]) - # peaklist[peaklist_index,"iso_HMDB",drop=FALSE] - tmp=peaklist[peaklist_index[1],,drop=FALSE] - - tmp[,"assi_HMDB"]=collapse("assi_HMDB",peaklist,peaklist_index) - tmp[,"iso_HMDB"]=collapse("iso_HMDB",peaklist,peaklist_index) - tmp[,"HMDB_code"]=collapse("HMDB_code",peaklist,peaklist_index) - tmp[,"assi_noise"]=collapse("assi_noise",peaklist,peaklist_index) - if (tmp[,"assi_noise"]==";") tmp[,"assi_noise"]=NA - tmp[,"theormz_noise"]=collapse("theormz_noise",peaklist,peaklist_index) - if (tmp[,"theormz_noise"]=="0;0") tmp[,"theormz_noise"]=NA - - collect = rbind(collect, tmp) - remove = c(remove, peaklist_index) - - index=index[-which(peaklist[index, "mzmed.pgrp"] == peaklist[index[1], "mzmed.pgrp"])] - } - - if (!is.null(remove)) peaklist = peaklist[-remove,] - peaklist = rbind(peaklist,collect) - - return(peaklist) -} diff --git a/DIMS/AddOnFunctions/normalization_2.1.R b/DIMS/AddOnFunctions/normalization_2.1.R deleted file mode 100644 index ed99964..0000000 --- a/DIMS/AddOnFunctions/normalization_2.1.R +++ /dev/null @@ -1,75 +0,0 @@ -# normalization_2.1(outlist.pos.id, "Intensity_all_peaks_positive_norm", groupNames.pos, on="total", assi_label="assi_HMDB") - -normalization_2.1 <- function(data, filename, groupNames, on="total", assi_label="assi"){ - # data=outlist.pos.id - # filename = "Intensity_all_peaks_positive_norm" - # groupNames = groupNames.pos - # on="total" - # assi_label="assi_HMDB" - - lastcol = length(groupNames) + 6 - before = data[ ,c(7:lastcol)] - - if (on=="total_IS") { - - assi = which(colnames(data)==assi_label) - data.int <- data[ ,c(assi,7:lastcol)] # assi and samples columns - data.assi = data.int[grep("(IS", data.int[,1], ignore.case=FALSE, fixed = TRUE),] - - } else if (on=="total_ident"){ - - assi = which(colnames(data)==assi_label) - assi.hmdb = which(colnames(data)=="assi.hmdb") - index = sort(union(which(data[,assi]!=""), which(data[,assi.hmdb]!=""))) - - data.int = data[ ,c(assi,7:lastcol)] # assi and samples columns - data.assi = data.int[index,] - - } else if (on=="total") { - - assi = which(colnames(data)==assi_label) - data.int = data[ ,c(assi,7:lastcol)] # assi and samples columns - data.assi = data.int - - } - - sum <- 0 - for (c in 2:ncol(data.assi)) { - sum <- sum + sum(as.numeric(data.assi[,c])) - } - average <- sum/(ncol(data.assi)-1) - for (c in 2:ncol(data.int)) { - factor <- sum(as.numeric(data.assi[,c]))/average - if (factor==0) { - data.int[ ,c]=0 - cat(colnames(data.int)[c]) - cat("factor==0 !!!") - } else { - data.int[ ,c] <- as.numeric(data.int[ ,c])/factor - } - } - - # colnames(data.int[,2:ncol(data.int)]) - - if (dim(data)[2]==lastcol){ - final.outlist.Pos.idpat.norm <- cbind(data[,1:6],data.int[,2:ncol(data.int)]) - } else { - final.outlist.Pos.idpat.norm <- cbind(data[,1:6],data.int[,2:ncol(data.int)],data[,(lastcol + 1):ncol(data)]) - } - - #outdir="./results/normalization" - #dir.create(outdir, showWarnings = FALSE) - - #CairoPNG(filename=paste(outdir, paste(filename, "_before.png", sep=""), sep="/"), width, height) - #sub=apply(before,2, function(x) sum(as.numeric(x))) - #barplot(as.vector(unlist(sub)), main="Not normalized",names.arg = colnames(before),las=2) - #dev.off() - - #CairoPNG(filename=paste(outdir, paste(filename, "_", on, ".png", sep=""), sep="/"), width, height) - #sub=apply(final.outlist.Pos.idpat.norm[,c(7:lastcol)],2, function(x) sum(as.numeric(x))) - #barplot(as.vector(unlist(sub)), main=filename,names.arg = colnames(final.outlist.Pos.idpat.norm[,c(7:lastcol)]),las=2) - #dev.off() - - return(final.outlist.Pos.idpat.norm) - -} diff --git a/DIMS/AddOnFunctions/optimizeGauss.R b/DIMS/AddOnFunctions/optimizeGauss.R deleted file mode 100644 index 0fbce75..0000000 --- a/DIMS/AddOnFunctions/optimizeGauss.R +++ /dev/null @@ -1,11 +0,0 @@ -optimizeGauss <- function(x,y,sigma,mu) { - - f = function(p,x,y,sigma,mu) { - curve = p*dnorm(x,mu,sigma) - return((max(curve)-max(y))^2) - } - - rval = optimize(f, c(0, 100000), tol = 0.0001,x,y,sigma,mu) - - return(rval$minimum) -} diff --git a/DIMS/AddOnFunctions/output_helix.R b/DIMS/AddOnFunctions/output_helix.R deleted file mode 100644 index c098675..0000000 --- a/DIMS/AddOnFunctions/output_helix.R +++ /dev/null @@ -1,31 +0,0 @@ -output_for_helix <- function(protocol_name, df_metabs_helix){ - - # Remove positive controls - df_metabs_helix <- df_metabs_helix %>% filter(is_diagnostic_patient(Patient)) - - # Add 'Vial' column, each patient has unique ID - df_metabs_helix <- df_metabs_helix %>% - group_by(Patient) %>% - mutate(Vial = cur_group_id()) %>% - ungroup() - - # Split patient number into labnummer and Onderzoeksnummer - df_metabs_helix <- add_lab_id_and_onderzoeksnummer(df_metabs_helix) - - # Add column with protocol name - df_metabs_helix$Protocol <- protocol_name - - # Change name Z_score and Helix_naam columns to Amount and Name - change_columns <- c(Amount = "Z_score", Name = "Helix_naam") - df_metabs_helix <- df_metabs_helix %>% rename(all_of(change_columns)) - - # Select only necessary columns and set them in correct order - df_metabs_helix <- df_metabs_helix %>% - select(c(Vial, labnummer, Onderzoeksnummer, Protocol, Name, Amount)) - - # Remove duplicate patient-metabolite combinations ("leucine + isoleucine + allo-isoleucin_Z-score" is added 3 times) - df_metabs_helix <- df_metabs_helix %>% - group_by(Onderzoeksnummer, Name) %>% distinct() %>% ungroup() - - return(df_metabs_helix) -} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/peak.grouping.Gauss.HPC.R b/DIMS/AddOnFunctions/peak.grouping.Gauss.HPC.R deleted file mode 100644 index c36259a..0000000 --- a/DIMS/AddOnFunctions/peak.grouping.Gauss.HPC.R +++ /dev/null @@ -1,88 +0,0 @@ -peak.grouping.Gauss.HPC <- function(outdir, fileIn, scanmode, resol, groupNames) { - # fileIn="./results/specpks_all/positive_outlist_i_min_1.197.RData" - - # ppm/2 - #range = 1.5e-06 - range = 2e-06 - startcol=7 - - # outlist.copy <- read.table(file=fileIn, sep="\t", header=TRUE) - load(fileIn) - outlist.copy = outlist_i_min_1 - batch = strsplit(fileIn, ".",fixed = TRUE)[[1]][3] - - outpgrlist = NULL - - while (max(as.numeric(outlist.copy[ , "height.pkt"])) > 0 ) { - - sel = which(as.numeric(outlist.copy[ , "height.pkt"]) == max(as.numeric(outlist.copy[ , "height.pkt"])))[1] - - # 3ppm range around max - mzref = as.numeric(outlist.copy[sel, "mzmed.pkt"]) - pkmin = -(range*mzref - mzref) - pkmax = 2*mzref-pkmin - - selp = as.numeric(outlist.copy[ , "mzmed.pkt"]) > pkmin & as.numeric(outlist.copy[ , "mzmed.pkt"]) < pkmax - tmplist = outlist.copy[selp, ] - - nrsamples = sum(selp) - if (nrsamples > 1) { - # 3ppm range around mean - mzmed.pgrp = mean(as.numeric(outlist.copy[selp, "mzmed.pkt"])) - mzmin.pgrp = -(range*mzmed.pgrp - mzmed.pgrp) - mzmax.pgrp = 2*mzmed.pgrp - mzmin.pgrp - - selp = as.numeric(outlist.copy[ , "mzmed.pkt"]) > mzmin.pgrp & as.numeric(outlist.copy[ , "mzmed.pkt"]) < mzmax.pgrp - tmplist = outlist.copy[selp, ] - - fq.worst.pgrp = as.numeric(max(outlist.copy[selp, "fq"])) - fq.best.pgrp = as.numeric(min(outlist.copy[selp, "fq"])) - ints.allsamps = rep(0, length(groupNames)) - names(ints.allsamps) = groupNames # same order as sample list!!! - - # if (length(unique(as.vector(tmplist[,"samplenr"]))) != length(as.vector(tmplist[,"samplenr"]))) { - # message(paste("bingo", sel)) - # break - # } - - # Check for each sample if multiple peaks exists, if so take the sum! - labels=unique(tmplist[,"samplenr"]) - ints.allsamps[labels] = as.vector(unlist(lapply(labels, function(x) {sum(as.numeric(tmplist[which(tmplist[ , "samplenr"]==x), "height.pkt"]))}))) - - outpgrlist = rbind(outpgrlist, c(mzmed.pgrp, fq.best.pgrp, fq.worst.pgrp, nrsamples, mzmin.pgrp, mzmax.pgrp, ints.allsamps)) - } - outlist.copy[selp, "height.pkt"] = -1 - } - - outpgrlist = as.data.frame(outpgrlist) # ignore warnings of duplicate row names - colnames(outpgrlist)[1:6] = c("mzmed.pgrp", "fq.best", "fq.worst", "nrsamples", "mzmin.pgrp", "mzmax.pgrp") - - # # filtering ################################################################################################################## - # final.outlist=outpgrlist[,c("mzmed.pgrp", "fq.best", "fq.worst","nrsamples","mzmin.pgrp","mzmax.pgrp", sampleNames)] - # # NB: in centroided mode, data files contains many "-1.000" values, from peak finding. Set these to zero. - # final.outlist[final.outlist == -1] = 0 - # - # # keep only peaks which occur in 3 out of 3 technical replicates in at least one sample in peak group list - # # peakFiltering(repl.pattern, final.outlist, nsampgrps, outdir, scanmode, startcol=7) - # # peakFiltering <- function(repl.pattern, final.outlist, nsampgrps, resultDir, scanmode, startcol){ - # nsamp = length(repl.pattern) - # nsampgrps = length(repl.pattern[[1]]) - # - # keep <- rep(0, nrow(final.outlist)) - # for (p in 1:nrow(final.outlist)) { - # for (g in 1:nsampgrps) { # g <- 31 - # if (keep[p] == 0 & sum(final.outlist[p, repl.pattern[[g]]] > 0) == length(repl.pattern[[g]]) ) { keep[p] <- 1 } - # } - # } - # - # tmp <- cbind(final.outlist, keep) - # final.outlist.filt <- tmp[keep == 1, ] - # - # # omit keep column - # final.outlist.filt <- final.outlist.filt[ , -ncol(final.outlist.filt)] - - #save(outpgrlist_part, file=paste(outdir, paste(scanmode, "_", mzstart, "_", mzend, ".RData", sep=""), sep="/")) - # save(final.outlist.filt, file=paste(outdir, "peak_grouping", paste(scanmode, "_",batch,".RData", sep=""), sep="/")) - save(outpgrlist, file=paste(outdir, "peak_grouping", paste(scanmode, "_",batch,".RData", sep=""), sep="/")) - -} \ No newline at end of file diff --git a/DIMS/AddOnFunctions/prepare_alarmvalues.R b/DIMS/AddOnFunctions/prepare_alarmvalues.R deleted file mode 100644 index c09c536..0000000 --- a/DIMS/AddOnFunctions/prepare_alarmvalues.R +++ /dev/null @@ -1,52 +0,0 @@ -prepare_alarmvalues <- function(pt_name, dims_helix_table) { - # extract data for patient of interest (pt_name) - pt_metabs_helix <- dims_helix_table %>% filter(Patient == pt_name) - pt_metabs_helix$Z_score <- round(pt_metabs_helix$Z_score, 2) - - # Make empty dataframes for metabolites above or below alarmvalues - pt_list_high <- data.frame(HMDB_name = character(), Z_score = numeric()) - pt_list_low <- data.frame(HMDB_name = character(), Z_score = numeric()) - - # Loop over individual metabolites - for (metab in unique(pt_metabs_helix$HMDB_name)){ - # Get data for individual metabolite - pt_metab <- pt_metabs_helix %>% filter(HMDB_name == metab) - # print(pt_metab) - - # Check if zscore is positive of negative - if(pt_metab$Z_score > 0) { - # Get specific alarmvalue for metabolite - high_zscore_cutoff_metab <- pt_metabs_helix %>% filter(HMDB_name == metab) %>% pull(high_zscore) - - # If zscore is above the alarmvalue, add to pt_list_high table - if(pt_metab$Z_score > high_zscore_cutoff_metab) { - pt_metab_high <- pt_metab %>% select(HMDB_name, Z_score) - pt_list_high <- rbind(pt_list_high, pt_metab_high) - } - } else { - # Get specific alarmvalue for metabolite - low_zscore_cutoff_metab <- pt_metabs_helix %>% filter(HMDB_name == metab) %>% pull(low_zscore) - - # If zscore is below the alarmvalue, add to pt_list_low table - if(pt_metab$Z_score < low_zscore_cutoff_metab) { - pt_metab_low <- pt_metab %>% select(HMDB_name, Z_score) - pt_list_low <- rbind(pt_list_low, pt_metab_low) - } - } - } - - # sort tables on zscore - pt_list_high <- pt_list_high %>% arrange(desc(Z_score)) - pt_list_low <- pt_list_low %>% arrange(Z_score) - # add lines for increased, decreased - extra_line1 <- c("Increased", "") - extra_line2 <- c("Decreased", "") - # combine the two lists - top_metab_pt <- rbind(extra_line1, pt_list_high, extra_line2, pt_list_low) - # remove row names - rownames(top_metab_pt) <- NULL - # change column names for display - colnames(top_metab_pt) <- c("Metabolite", "Z-score") - - return(top_metab_pt) -} diff --git a/DIMS/AddOnFunctions/prepare_data.R b/DIMS/AddOnFunctions/prepare_data.R deleted file mode 100644 index 07a8805..0000000 --- a/DIMS/AddOnFunctions/prepare_data.R +++ /dev/null @@ -1,42 +0,0 @@ -prepare_data <- function(metab_list_all, zscore_patients_local) { - # remove "_Zscore" from column (patient) names - colnames(zscore_patients_local) <- gsub("_Zscore", "", colnames(zscore_patients_local)) - # put data into pages, max 20 violin plots per page in PDF - metab_interest_sorted <- list() - metab_category <- c() - for (metab_class_index in 1:length(metab_list_all)) { # "acyl_carnitines" "amino_acids" "crea_gua" - metab_class <- names(metab_list_all)[metab_class_index] - metab_list <- metab_list_all[[metab_class_index]] - if (ncol(metab_list) > 2) { - # third column are the alarm values, so reduce the data frame to 2 columns and save list - metab_list_alarm <- metab_list - metab_list <- metab_list[ , c(1,2)] - } - # make sure that all HMDB_names have 45 characters - for (metab_index in 1:length(metab_list$HMDB_name)) { - if (is.character(metab_list$HMDB_name[metab_index])) { - HMDB_name_separated <- strsplit(metab_list$HMDB_name[metab_index], "")[[1]] - } else { HMDB_name_separated <- "strspliterror" } - if (length(HMDB_name_separated) <= 45) { - HMDB_name_separated <- c(HMDB_name_separated, rep(" ", 45-length(HMDB_name_separated))) - } else { - HMDB_name_separated <- c(HMDB_name_separated[1:42], "...") - } - metab_list$HMDB_name[metab_index] <- paste0(HMDB_name_separated, collapse = "") - } - # find metabolites and ratios in data frame zscore_patients_local - metab_interest <- inner_join(metab_list, zscore_patients_local[-2], by = "HMDB_code") - # remove column "HMDB_code" - metab_interest <- metab_interest[ , -which(colnames(metab_interest) == "HMDB_code")] - # put the data frame in long format - metab_interest_melt <- reshape2::melt(metab_interest, id.vars = "HMDB_name") - # sort on metabolite names (HMDB_name) - sort_order <- order(metab_interest_melt$HMDB_name) - metab_interest_sorted[[metab_class_index]] <- metab_interest_melt[sort_order, ] - metab_category <- c(metab_category, metab_class) - } - names(metab_interest_sorted) <- metab_category - - return(metab_interest_sorted) - -} diff --git a/DIMS/AddOnFunctions/prepare_data_perpage.R b/DIMS/AddOnFunctions/prepare_data_perpage.R deleted file mode 100644 index 9f112f3..0000000 --- a/DIMS/AddOnFunctions/prepare_data_perpage.R +++ /dev/null @@ -1,47 +0,0 @@ -prepare_data_perpage <- function(metab_interest_sorted, metab_interest_contr, nr_plots_perpage, nr_pat=20, nr_contr=30) { - total_nr_pages <- 0 - metab_perpage <- list() - metab_category <- c() - for (metab_class_index in 1:length(metab_interest_sorted)) { # "acyl_carnitines" "amino_acids" "crea_gua" - # split list into pages, each page containing max nr_plots_perpage (20) compounds - metab_interest_perclass <- metab_interest_sorted[[metab_class_index]] - metab_class <- names(metab_interest_sorted)[metab_class_index] - # add controls - metab_interest_contr_perclass <- metab_interest_contr[[metab_class_index]] - # number of pages for this class - nr_pages <- ceiling(length(unique(metab_interest_perclass$HMDB_name)) / nr_plots_perpage) - for (page_nr in 1:nr_pages) { - total_nr_pages <- total_nr_pages + 1 - select_rows_start <- (nr_pat * nr_plots_perpage * (page_nr-1)) + 1 - select_rows_end <- nr_pat * nr_plots_perpage * page_nr - metab_onepage_pat <- metab_interest_perclass[select_rows_start:select_rows_end, ] - # same for controls - select_rows_start_contr <- (nr_contr * nr_plots_perpage * (page_nr-1)) + 1 - select_rows_end_contr <- nr_contr * nr_plots_perpage * page_nr - metab_onepage_pcontr <- metab_interest_contr_perclass[select_rows_start_contr:select_rows_end_contr, ] - # add controls - metab_onepage <- rbind(metab_onepage_pat, metab_onepage_pcontr) - # if a page has fewer than nr_plots_perpage plots, fill page with empty plots - NA_rows <- which(is.na(metab_onepage$HMDB_name)) - if (length(NA_rows) > 0) { - # repeat the patient and control variables - metab_onepage$variable[NA_rows] <- metab_onepage$variable[1:(nr_pat + nr_contr)] - # for HMDB name, substitute a number of spaces - for (row_nr in NA_rows) { - metab_onepage$HMDB_name[row_nr] <- paste0(rep("_", ceiling(row_nr/(nr_pat + nr_contr))), collapse = "") - } - metab_onepage$HMDB_name <- gsub("_", " ", metab_onepage$HMDB_name) - # leave the values at NA - } - # put data for one page into object with data for all pages - metab_perpage[[total_nr_pages]] <- metab_onepage - # create list of page headers - metab_category <- c(metab_category, paste(metab_class, page_nr, sep="_")) - } - } - # add page headers to list - names(metab_perpage) <- metab_category - - return(metab_perpage) - -} diff --git a/DIMS/AddOnFunctions/prepare_toplist.R b/DIMS/AddOnFunctions/prepare_toplist.R deleted file mode 100644 index 369f887..0000000 --- a/DIMS/AddOnFunctions/prepare_toplist.R +++ /dev/null @@ -1,29 +0,0 @@ -prepare_toplist <- function(pt_name, zscore_patients_copy) { - # set parameters for table - top_highest <- 20 - top_lowest <- 10 - - # extract data for patient of interest (pt_name) - pt_list <- zscore_patients_copy[ , c(1,2, which(colnames(zscore_patients_copy) == pt_name))] - # sort metabolites on Z-scores for this patient - pt_list_sort <- sort(pt_list[ , 3], index.return=TRUE) - pt_list_sorted <- pt_list[pt_list_sort$ix, ] - # determine top highest and lowest Z-scores for this patient - pt_list_sort <- sort(pt_list[ , 3], index.return=TRUE) - pt_list_low <- pt_list[pt_list_sort$ix[1:top_lowest], ] - pt_list_high <- pt_list[pt_list_sort$ix[length(pt_list_sort$ix):(length(pt_list_sort$ix)-top_highest+1)], ] - # round off Z-scores - pt_list_low[ , 3] <- round(as.numeric(pt_list_low[ , 3]), 2) - pt_list_high[ , 3] <- round(as.numeric(pt_list_high[ , 3]), 2) - # add lines for increased, decreased - extra_line1 <- c("Increased", "", "") - extra_line2 <- c("Decreased", "", "") - top_metab_pt <- rbind(extra_line1, pt_list_high, extra_line2, pt_list_low) - # remove row names - rownames(top_metab_pt) <- NULL - - # change column names for display - colnames(top_metab_pt) <- c("HMDB_ID", "Metabolite", "Z-score") - - return(top_metab_pt) -} diff --git a/DIMS/AddOnFunctions/remove.dupl.2.1.R b/DIMS/AddOnFunctions/remove.dupl.2.1.R deleted file mode 100644 index 89e307d..0000000 --- a/DIMS/AddOnFunctions/remove.dupl.2.1.R +++ /dev/null @@ -1,32 +0,0 @@ -remove.dupl.2.1 <- function(peaklist) { - # peaklist = outlist.tot - # resultDir = "./results" - # scanmode = "positive" - - # peaklist=peaklist[1:100000,] - - collect=NULL - remove=NULL - - index = which(duplicated(peaklist[, "mzmed.pgrp"])) - - while (length(index) > 0){ - - peaklist_index = which(peaklist[, "mzmed.pgrp"] == peaklist[index[1], "mzmed.pgrp"]) - tmp=peaklist[peaklist_index[1],,drop=FALSE] - if (!is.na(peaklist[peaklist_index[1],"assi_HMDB"])) tmp[,"assi_HMDB"]=paste(peaklist[peaklist_index,"assi_HMDB"],collapse = ";") else tmp[,"assi_HMDB"]=NA - if (!is.na(peaklist[peaklist_index[1],"iso_HMDB"])) tmp[,"iso_HMDB"]=paste(peaklist[peaklist_index,"iso_HMDB"],collapse = ";") else tmp[,"iso_HMDB"]=NA - if (!is.na(peaklist[peaklist_index[1],"HMDB_code"])) tmp[,"HMDB_code"]=paste(peaklist[peaklist_index,"HMDB_code"],collapse = ";") else tmp[,"HMDB_code"]=NA - if (peaklist[peaklist_index[1],"assi_noise"]!="") tmp[,"assi_noise"]=paste(peaklist[peaklist_index,"assi_noise"],collapse = ";") else tmp[,"assi_noise"]="" - - collect = rbind(collect, tmp) - remove = c(remove, peaklist_index) - - index=index[-which(index==index[1])] - } - - peaklist = peaklist[-remove,] - peaklist = rbind(peaklist,collect) - - return(peaklist) -} diff --git a/DIMS/AddOnFunctions/remove.dupl.R b/DIMS/AddOnFunctions/remove.dupl.R deleted file mode 100644 index 227f207..0000000 --- a/DIMS/AddOnFunctions/remove.dupl.R +++ /dev/null @@ -1,39 +0,0 @@ -# remove duplicates, peak groups with mz within a few ppm -# ppmdef should be 2 times bigger than during identification!!! -remove.dupl <- function(peaklist, ppmdef=4, tolint=0.01) { - - # peaklist = outpgrlist - # ppmdef = 2 - # tolint = 0.01 - - int.cols = 7:ncol(peaklist) - - p <- 1 - while (p < nrow(peaklist)) { - mzref <- peaklist[p, "mzmed.pgrp"] - # print(mzref) - dist.ppm <- ppmdef * mzref / 1000000 - sel <- peaklist[ , "mzmed.pgrp"] > (mzref - dist.ppm) & peaklist[ , "mzmed.pgrp"] < (mzref + dist.ppm) - subset <- peaklist[sel, ] - if (nrow(subset) > 1) { - avi <- rep(1, max(int.cols)) - for (c in int.cols) { avi[c] <- max(subset[ ,c])/mean(subset[ ,c]) } - - # remove NaN - avi[which(is.nan(avi))] = 1 - - if (mean(avi) > (1-tolint) & mean(avi < (1+tolint))) { - peaklist <- peaklist[-which(sel), ] - newline <- subset[1, ] - newline[ , "mzmed.pgrp"] <- mean(subset[ , "mzmed.pgrp"]) - newline[ , "mzmin.pgrp"] <- min(subset[ , "mzmin.pgrp"]) - newline[ , "mzmax.pgrp"] <- max(subset[ , "mzmax.pgrp"]) - newline[ , int.cols] <- apply(subset[ , int.cols], 2, max) - # newline[ , "avg.int"] <- mean(as.numeric(newline[ , int.cols])) - peaklist <- rbind(peaklist, newline) - } - } - p <- p + 1 - } - peaklist -} diff --git a/DIMS/AddOnFunctions/removeFromRepl.pat.R b/DIMS/AddOnFunctions/removeFromRepl.pat.R deleted file mode 100644 index ad5b1c8..0000000 --- a/DIMS/AddOnFunctions/removeFromRepl.pat.R +++ /dev/null @@ -1,31 +0,0 @@ -removeFromRepl.pat <- function(bad_samples, repl_pattern, nr_replicates) { - - tmp = repl_pattern - - removeFromGroup=NULL - - for (i in 1:length(tmp)){ - tmp2 = repl_pattern[[i]] - - remove=NULL - - for (j in 1:length(tmp2)){ - if (tmp2[j] %in% bad_samples){ - #cat(tmp2[j]) - #cat(paste("remove",tmp2[j])) - #cat(paste("remove i",i)) - #cat(paste("remove j",j)) - remove = c(remove, j) - } - } - - if (length(remove)==nr_replicates) removeFromGroup=c(removeFromGroup,i) - if (!is.null(remove)) repl_pattern[[i]]=repl_pattern[[i]][-remove] - } - - if (length(removeFromGroup)!=0) { - repl_pattern=repl_pattern[-removeFromGroup] - } - - return(list("pattern"=repl_pattern)) -} diff --git a/DIMS/AddOnFunctions/replaceZeros.R b/DIMS/AddOnFunctions/replaceZeros.R deleted file mode 100644 index 234d961..0000000 --- a/DIMS/AddOnFunctions/replaceZeros.R +++ /dev/null @@ -1,90 +0,0 @@ -replaceZeros <- function(outpgrlist, repl_pattern, scanmode, resol, outdir, thresh, ppm) { - # file="./results/grouping_rest/negative_1.RData" - # scanmode= "negative" - # scriptDir="./scripts" - # resol=140000 - # thresh=2000 - # outdir="./results" - - # control_label="C" - - # source(paste(scriptDir, "AddOnFunctions/sourceDir.R", sep="/")) - # sourceDir(paste(scriptDir, "AddOnFunctions", sep="/")) - - # dir.create(paste(outdir, "9-samplePeaksFilled", sep="/"), showWarnings = FALSE) - - # int.factor=1*10^5 # Number of x used to calc area under Gaussian (is not analytic) - # scale=2 # Initial value used to estimate scaling parameter - # width=1024 - # height=768 - - # load(paste0(outdir, "/repl.pattern.",scanmode, ".RData")) - - # batch_number = strsplit(basename(HMDB_part_file), ".",fixed = TRUE)[[1]][2] - # name = as.vector(unlist(strsplit(file, "/", fixed=TRUE))) - # name = name[length(name)] - # message(paste("File name: ", name)) - - # load samplePeaks - # load outpgrlist - # load(file) - - # # filter on at least signal in two control samples - # int.cols = grep(control_label, colnames(outpgrlist),fixed = TRUE) - # # barplot(as.numeric(outpgrlist[753, int.cols])) - # keep = NULL - # keep = apply(outpgrlist, 1, function(x) if (length(which(as.numeric(x[int.cols]) > 0)) > 1) keep=c(keep,TRUE) else keep=c(keep,FALSE)) - # outpgrlist = outpgrlist[keep,] - - # replace zeros - if (!is.null(outpgrlist)) { - print(dim(outpgrlist)) - print(colnames(outpgrlist)) - for (i in 1:length(names(repl_pattern))){ - print(names(repl_pattern)[i]) - samplePeaks=outpgrlist[,names(repl_pattern)[i]] - index=which(samplePeaks<=0) - if (!length(index)){ - next - } - for (j in 1:length(index)){ - area = generateGaussian(outpgrlist[index[j],"mzmed.pgrp"],thresh,resol,FALSE,scanmode,int.factor=1*10^5,1,1)$area - # for testing purposes, add a fixed random seed - # set.seed(123) - outpgrlist[index[j], names(repl_pattern)[i]] = rnorm(n=1, mean=area, sd=0.25*area) - } - } - - - # Identification - - # Add average column - outpgrlist = cbind(outpgrlist, "avg.int"=apply(outpgrlist[, 7:(ncol(outpgrlist)-4)], 1, mean)) - - if (scanmode=="negative"){ - label = "MNeg" - label2 = "Negative" - # take out multiple NaCl adducts - look4.add2 <- c("Cl", "Cl37", "For", "NaCl","KCl","H2PO4","HSO4","Na-H","K-H","H2O","I") # ,"min2H","min3H" - # HMDB_add_iso=HMDB_add_iso.Neg - } else { - label = "Mpos" - label2 = "Positive" - # take out NaCl adducts - look4.add2 <- c("Na", "K", "NaCl", "NH4","2Na-H","CH3OH","KCl","NaK-H") # ,"NaCl2","NaCl3","NaCl4","NaCl5") - # HMDB_add_iso=HMDB_add_iso.Pos - } - - # Identify noise peaks - noise.MZ <- read.table(file="/hpc/dbg_mz/tools/db/TheoreticalMZ_NegPos_incNaCl.txt", sep="\t", header=TRUE, quote = "") - noise.MZ <- noise.MZ[(noise.MZ[ , label] != 0), 1:4] - final.outlist.idpat2 = ident.hires.noise.HPC(outpgrlist, allAdducts, scanmode=label2, noise.MZ, look4=look4.add2, resol=resol, slope=0, incpt=0, ppm.fixed=ppm, ppm.iso.fixed=ppm) - tmp <- final.outlist.idpat2[ , c("assi", "theormz")] - colnames(tmp) <- c("assi_noise", "theormz_noise") - - final.outlist.idpat3 <- cbind(outpgrlist, tmp) - - return(final.outlist.idpat3) - # save(final.outlist.idpat3, file=paste("./", name, sep="")) - } -} diff --git a/DIMS/AddOnFunctions/replaceZeros_setseed.R b/DIMS/AddOnFunctions/replaceZeros_setseed.R deleted file mode 100644 index 4fafc9a..0000000 --- a/DIMS/AddOnFunctions/replaceZeros_setseed.R +++ /dev/null @@ -1,108 +0,0 @@ -replaceZeros <- function(file,scanmode,resol,outdir,thresh,scriptDir,ppm){ - # file="./results/grouping_rest/negative_1.RData" - # scanmode= "negative" - # scriptDir="./scripts" - # resol=140000 - # thresh=2000 - # outdir="./results" - - control_label="C" - - source(paste(scriptDir, "AddOnFunctions/sourceDir.R", sep="/")) - sourceDir(paste(scriptDir, "AddOnFunctions", sep="/")) - - dir.create(paste(outdir, "9-samplePeaksFilled", sep="/"), showWarnings = FALSE) - - # int.factor=1*10^5 # Number of x used to calc area under Gaussian (is not analytic) - # scale=2 # Initial value used to estimate scaling parameter - # width=1024 - # height=768 - - # message(paste("file", file)) - # message(paste("scanmode", scanmode)) - # message(paste("resol", resol)) - # message(paste("outdir", outdir)) - # message(paste("thresh", thresh)) - # message(paste("scriptDir", scriptDir)) - - load(paste0(outdir, "/repl.pattern.",scanmode, ".RData")) - - name = as.vector(unlist(strsplit(file, "/", fixed=TRUE))) - name = name[length(name)] - # message(paste("File name: ", name)) - - # load samplePeaks - # load outpgrlist - load(file) - - # ################################################################################# - # # filter on at least signal in two control samples - # int.cols = grep(control_label, colnames(outpgrlist),fixed = TRUE) - # # barplot(as.numeric(outpgrlist[753, int.cols])) - # keep = NULL - # keep = apply(outpgrlist, 1, function(x) if (length(which(as.numeric(x[int.cols]) > 0)) > 1) keep=c(keep,TRUE) else keep=c(keep,FALSE)) - # outpgrlist = outpgrlist[keep,] - # ################################################################################# - - ################################################################################ - # For now only replace zeros - if (!is.null(outpgrlist)) { - for (i in 1:length(names(repl.pattern.filtered))){ - samplePeaks=outpgrlist[,names(repl.pattern.filtered)[i]] - index=which(samplePeaks<=0) - if (!length(index)){ - next - } - for (j in 1:length(index)){ - area = generateGaussian(outpgrlist[index[j],"mzmed.pgrp"],thresh,resol,FALSE,scanmode,int.factor=1*10^5,1,1)$area - # for testing purposes, add a fixed random seed - set.seed(123) - outpgrlist[index[j], names(repl.pattern.filtered)[i]] = rnorm(n=1, mean=area, sd=0.25*area) - } - } - - ################################################################################ - - - #################### identification ######################################################### - # load(paste(scriptDir, "../db/HMDB_add_iso_corrNaCl.RData", sep="/")) # E:\Metabolomics\LargeDataBase\Apr25_2016 - - # Add average column - outpgrlist = cbind(outpgrlist, "avg.int"=apply(outpgrlist[, 7:(ncol(outpgrlist)-4)], 1, mean)) - - if (scanmode=="negative"){ - label = "MNeg" - label2 = "Negative" - # take out multiple NaCl adducts - look4.add2 <- c("Cl", "Cl37", "For", "NaCl","KCl","H2PO4","HSO4","Na-H","K-H","H2O","I") # ,"min2H","min3H" - # HMDB_add_iso=HMDB_add_iso.Neg - } else { - label = "Mpos" - label2 = "Positive" - # take out NaCl adducts - look4.add2 <- c("Na", "K", "NaCl", "NH4","2Na-H","CH3OH","KCl","NaK-H") # ,"NaCl2","NaCl3","NaCl4","NaCl5") - # HMDB_add_iso=HMDB_add_iso.Pos - } - - # # Identification using large database - # final.outlist.idpat = iden.code(outpgrlist, HMDB_add_iso, ppm=2, label) - # message(paste(sum(final.outlist.idpat[ , "assi_HMDB"] != ""), "assigned peakgroups")) - # message(paste(sum(final.outlist.idpat[ , "iso_HMDB"] != ""), "assigned isomeres")) - - # Identify noise peaks - noise.MZ <- read.table(file="/hpc/dbg_mz/tools/db/TheoreticalMZ_NegPos_incNaCl.txt", sep="\t", header=TRUE, quote = "") - noise.MZ <- noise.MZ[(noise.MZ[ , label] != 0), 1:4] - - # Replace "Negative" by "negative" in ident.hires.noise - final.outlist.idpat2 = ident.hires.noise.HPC(outpgrlist, allAdducts, scanmode=label2, noise.MZ, look4=look4.add2, resol=resol, slope=0, incpt=0, ppm.fixed=ppm, ppm.iso.fixed=ppm) - # message(paste(sum(final.outlist.idpat2[ , "assi"] != ""), "assigned noise peaks")) - tmp <- final.outlist.idpat2[ , c("assi", "theormz")] - colnames(tmp) <- c("assi_noise", "theormz_noise") - - final.outlist.idpat3 <- cbind(outpgrlist, tmp) - ############################################################################################# - - # message(paste("File saved: ", paste(outdir, "/samplePeaksFilled/", name, sep=""))) - save(final.outlist.idpat3, file=paste(outdir, "/9-samplePeaksFilled/", name, sep="")) - } -} diff --git a/DIMS/AddOnFunctions/run.vbs b/DIMS/AddOnFunctions/run.vbs deleted file mode 100644 index 640116a..0000000 --- a/DIMS/AddOnFunctions/run.vbs +++ /dev/null @@ -1,22 +0,0 @@ -Option Explicit -On Error Resume Next -RunExcelMacro - -Sub RunExcelMacro() - -Dim xlApp -Dim xlBook -Dim xlBook_persenal - -Set xlApp = CreateObject("Excel.Application") -xlApp.DisplayAlerts = FALSE -Set xlBook_persenal = xlApp.Workbooks.Open("C:\Users\awillem8\AppData\Roaming\Microsoft\Excel\XLSTART\PERSONAL.XLSB", 0, True) -Set xlBook = xlApp.Workbooks.Open("C:\Users\mkerkho7\testmap\xls\P181_Aberant_HMDB.xlsx", 0, True) -xlApp.Run "PERSONAL.XLSB!FixAndResize" -xlBook.SaveAs "C:\Users\mkerkho7\testmap\results\P181_Aberant_HMDB.xlsx" -xlBook.Close False -xlApp.Quit -Set xlBook = Nothing -Set xlBook_persenal = Nothing -Set xlApp = Nothing -End Sub diff --git a/DIMS/AddOnFunctions/runVBAMacro.R b/DIMS/AddOnFunctions/runVBAMacro.R deleted file mode 100644 index 3836b2f..0000000 --- a/DIMS/AddOnFunctions/runVBAMacro.R +++ /dev/null @@ -1,48 +0,0 @@ -runVBAMacro <- function(dir, dir2, vb_script) { - -# dir="E:\\Metabolomics\\Lynne_BSP-2015-08-05\\results\\xls\\" -# dir2="E:\\Metabolomics\\Lynne_BSP-2015-08-05\\results\\xls_fixed\\" - -dir.create(dir2) -files=list.files(dir) - -script_1 = paste("Option Explicit -On Error Resume Next -RunExcelMacro - -Sub RunExcelMacro() - -Dim xlApp -Dim xlBook -Dim xlBook_persenal - -Set xlApp = CreateObject(\"Excel.Application\") -xlApp.DisplayAlerts = FALSE -Set xlBook_persenal = xlApp.Workbooks.Open(\"C:\\Users\\awillem8\\AppData\\Roaming\\Microsoft\\Excel\\XLSTART\\PERSONAL.XLSB\", 0, True) -Set xlBook = xlApp.Workbooks.Open(\"", dir, sep="") - -script_2 = "\", 0, True) -xlApp.Run \"PERSONAL.XLSB!FixAndResize\" -xlBook.SaveAs \"" - -script_3 = "\" -xlBook.Close False -xlApp.Quit -Set xlBook = Nothing -Set xlBook_persenal = Nothing -Set xlApp = Nothing -End Sub" - -for (i in 1:length(files)){ - script = paste(script_1, files[i], script_2, dir2, files[i], script_3, sep="") # dir2, files[i], - - message(script) - - fileConn = file("./src/run.vbs") - writeLines(script, fileConn) - close(fileConn) - - system(vb_script) -} -} - diff --git a/DIMS/AddOnFunctions/searchMZRange.R b/DIMS/AddOnFunctions/searchMZRange.R deleted file mode 100644 index 43a7171..0000000 --- a/DIMS/AddOnFunctions/searchMZRange.R +++ /dev/null @@ -1,187 +0,0 @@ -searchMZRange <- function(range,values,int.factor,scale,resol,outdir,sampname,scanmode,plot,width,height,thresh){ - # range=sub_range - - end=NULL - index=as.vector(which(range!=0)) - - # bad infusion - if (length(index)==0) return(values) - - start=index[1] - subRangeLength = 15 - - for (i in 1:length(index)){ - # for (i in 1:129000){ - - if (i 1){ - - end=index[i] - - # start=395626 - # end=395640 - # i=25824 - - # gaan met de banaan - # 128836 - # start 1272853 - # mz start 316.83302 - # end 1272870 - # mz end 316.84269 - - x = as.numeric(names(range)[c(start:end)]) - y = as.vector(range[c(start:end)]) - # # Trim zeros - # x = as.vector(trimZeros(x,y)[[1]]) - # y = as.vector(trimZeros(x,y)[[2]]) - - if (length(y)!=0) { - - # check if intensity above thresh - if (max(y) < thresh | is.nan(max(y))) { - start=index[i+1] - next - } - - # cat("gaan met de banaan") - # cat(i) - # cat(paste("start", start, sep=" ")) - # cat(paste("mz start", x[1], sep=" ")) - # cat(paste("end", end, sep=" ")) - # cat(paste("mz end", x[length(x)], sep=" ")) - - if (length(y)>subRangeLength) { - - y[which(y3) { - # Check only zeros - if (sum(y)==0) next - - rval = fitGaussianInit(x,y,int.factor,scale,resol,outdir,sampname, scanmode, plot,width,height,i,start,end) - - if (rval$qual[1]==1) { - - rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height) - - values$mean = c(values$mean, rval$mean) - values$area = c(values$area, rval$area) - values$nr = c(values$nr, sampname) - values$min = c(values$min, rval$min) - values$max = c(values$max, rval$max) - values$qual = c(values$qual, 0) - - values$spikes = values$spikes + 1 - - } else { - for (j in 1:length(rval$mean)){ - values$mean = c(values$mean, rval$mean[j]) - values$area = c(values$area, rval$area[j]) - values$nr = c(values$nr, sampname) - values$min = c(values$min, rval$min[1]) - values$max = c(values$max, rval$max[1]) - values$qual = c(values$qual, rval$qual[1]) - } - } - - } else { - - rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height) - - values$mean = c(values$mean, rval$mean) - values$area = c(values$area, rval$area) - values$nr = c(values$nr, sampname) - values$min = c(values$min, rval$min) - values$max = c(values$max, rval$max) - values$qual = c(values$qual, 0) - - values$spikes = values$spikes + 1 - } - } - start=index[i+1] - } - } - - # last little range - end = index[length(index)] - x = as.numeric(names(range)[c(start:end)]) - y = as.vector(range[c(start:end)]) - - # x = as.vector(trimZeros(x,y)[[1]]) - # y = as.vector(trimZeros(x,y)[[2]]) - - if (length(y)!=0) { - - # check if intensity above thresh - if (max(y) < thresh | is.nan(max(y))) { - #start=index[i+1] - # do nothing!! - } else { - - # cat("gaan met de banaan") - # cat(paste("start", start, sep=" ")) - # cat(paste("mz start", x[1], sep=" ")) - # cat(paste("end", end, sep=" ")) - # cat(paste("mz end", x[length(x)], sep=" ")) - - if (length(y)>subRangeLength) { - - y[which(y3) { - - # Check only zeros - if (sum(y)==0) next - - rval = fitGaussianInit(x,y,int.factor,scale,resol,outdir,sampname,scanmode,plot,width,height,i,start,end) - - if (rval$qual[1]==1) { - #cat("Quality = 1!!!") - rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height) - - values$mean = c(values$mean, rval$mean) - values$area = c(values$area, rval$area) - values$nr = c(values$nr, sampname) - values$min = c(values$min, rval$min) - values$max = c(values$max, rval$max) - values$qual = c(values$qual, 0) - - values$spikes = values$spikes + 1 - - } else { - for (j in 1:length(rval$mean)){ - values$mean = c(values$mean, rval$mean[j]) - values$area = c(values$area, rval$area[j]) - values$nr = c(values$nr, sampname) - values$min = c(values$min, rval$min[1]) - values$max = c(values$max, rval$max[1]) - values$qual = c(values$qual, rval$qual[1]) - } - } - } else { - rval = generateGaussian(x,y,resol,plot,scanmode,int.factor, width, height) - - values$mean = c(values$mean, rval$mean) - values$area = c(values$area, rval$area) - values$nr = c(values$nr, sampname) - values$min = c(values$min, rval$min) - values$max = c(values$max, rval$max) - values$qual = c(values$qual, 0) - - values$spikes = values$spikes + 1 - } - } - } - - return(values) - #return(list("mean"=retVal$mean, "area"=retVal$area, "qual"=retVal$qual, "min"=retVal$min, "max"=retVal$max, "nr"=sample.nr, "spikes"=spikes)) -} diff --git a/DIMS/AddOnFunctions/sourceDir.R b/DIMS/AddOnFunctions/sourceDir.R deleted file mode 100644 index abf4c9e..0000000 --- a/DIMS/AddOnFunctions/sourceDir.R +++ /dev/null @@ -1,8 +0,0 @@ -# source add on functions -sourceDir <- function(path, trace = TRUE, ...) { - for (nm in list.files(path, pattern = "[.][RrSsQq]$")) { - #if(trace) cat(nm,":") - source(file.path(path, nm), ...) - #if(trace) cat("\n") - } -} diff --git a/DIMS/AddOnFunctions/statistics_z.R b/DIMS/AddOnFunctions/statistics_z.R deleted file mode 100644 index 0a92d3d..0000000 --- a/DIMS/AddOnFunctions/statistics_z.R +++ /dev/null @@ -1,69 +0,0 @@ -statistics_z <- function(peaklist, sortCol, adducts){ - # peaklist=as.data.frame(outlist.adducts.HMDB) - # plotdir="./results/plots/adducts" - # filename="./results/allpgrps_stats.txt" - # adducts=TRUE - - # peaklist=outlist.tot - # sortCol=NULL - # adducts=FALSE - - case_label = "P" - control_label = "C" - startcol = dim(peaklist)[2]+3 - - # calculate mean and sd for Control group - ctrl.cols <- grep(control_label, colnames(peaklist),fixed = TRUE) # 5:41 - int.cols <- c(grep(control_label, colnames(peaklist),fixed = TRUE), grep(case_label, colnames(peaklist),fixed = TRUE)) - peaklist[,int.cols][peaklist[,int.cols]==0] = NA - - # tmp = data.matrix(peaklist[ , ctrl.cols], rownames.force = TRUE) - tmp = peaklist[ , ctrl.cols, drop=FALSE] - - peaklist$avg.ctrls = apply(tmp, 1, function(x) mean(as.numeric(x),na.rm = TRUE)) - peaklist$sd.ctrls = apply(tmp, 1, function(x) sd(as.numeric(x),na.rm = TRUE)) - - cnames.z = NULL - - for (i in int.cols) { - # message(i) - cname = colnames(peaklist)[i] - cnames.z = c(cnames.z, paste(cname, "Zscore", sep="_")) - zscores.1col = (as.numeric(as.vector(unlist(peaklist[ , i]))) - peaklist$avg.ctrls) / peaklist$sd.ctrls - peaklist = cbind(peaklist, zscores.1col) - } - - colnames(peaklist)[startcol:ncol(peaklist)] = cnames.z - - z.cols = grep("Zscore", colnames(peaklist),fixed = TRUE) - - if (!adducts){ - if ((dim(peaklist[, z.cols])[2]+6)!=(startcol-1)){ - ppmdev=array(1:dim(peaklist)[1], dim=c(dim(peaklist)[1])) - - # calculate ppm deviation - for (i in 1:dim(peaklist)[1]){ - if (!is.na(peaklist$theormz_HMDB[i]) & !is.null(peaklist$theormz_HMDB[i]) & (peaklist$theormz_HMDB[i]!="")){ - ppmdev[i] = 10^6*(as.numeric(as.vector(peaklist$mzmed.pgrp[i]))-as.numeric(as.vector(peaklist$theormz_HMDB[i])))/as.numeric(as.vector(peaklist$theormz_HMDB[i])) - } else { - ppmdev[i]=NA - } - } - - peaklist = cbind(peaklist[, 1:6], ppmdev=ppmdev, peaklist[ , 7:ncol(peaklist)]) - } - } - - #peaklist = peaklist[order(peaklist[,sortCol]),] - - # # Order on average Z-score - # tmp = peaklist[,grep("Zscore", colnames(peaklist))] - # tmp.p = tmp[,grep("P", colnames(tmp)),drop=FALSE] - # tmp.p.avg = apply(tmp.p, 1, mean) - # - # peaklist = cbind(peaklist, "avg.z.score"=tmp.p.avg) - # peaklist = peaklist[order(abs(tmp.p.avg), decreasing = TRUE),] - - return(peaklist) - -} diff --git a/DIMS/AddOnFunctions/sumCurves.R b/DIMS/AddOnFunctions/sumCurves.R deleted file mode 100644 index bcc1485..0000000 --- a/DIMS/AddOnFunctions/sumCurves.R +++ /dev/null @@ -1,39 +0,0 @@ -sumCurves <- function(mean1, mean2, scale1, scale2, sigma1, sigma2, x2, x, resol, plot) { - # mean1=mean[i-1] - # mean2=mean[i] - # scale1=scale[i-1] - # scale2=scale[i] - # sigma1=sigma[i-1] - # sigma2=sigma[i] - - # message("=============================================================> sum 2 curves!") - - sumFit=(scale1*dnorm(x2,mean1,sigma1))+(scale2*dnorm(x2,mean2,sigma2)) - if (plot) lines(x2,sumFit,col="brown") - - #mean1plus2 = mean(c(mean1,mean2)) - mean1plus2 = weighted.mean(c(mean1,mean2),c(max(scale1*dnorm(x2,mean1,sigma1)),max(scale2*dnorm(x2,mean2,sigma2)))) - - if (plot) abline(v = mean1plus2, col="brown") - fwhm = getFwhm(mean1plus2, resol) - half_max = max(sumFit)*0.5 - if (plot) lines(c(mean1plus2 - 0.5*fwhm, mean1plus2 + 0.5*fwhm),c(half_max,half_max),col="orange") - - # sumFit=(scale1*dnorm(x,mean1,sigma1))+(scale2*dnorm(x,mean2,sigma2)) - # fq=abs(sum(y) - sum(sumFit))/sum(y) - - # h2=c(paste("mean =", mean1plus2, sep=" "), - # paste("fq =", fq, sep=" ")) - # - # legend("topright", legend=h2) - - # I assume that the sum of the distributions if also normal, which is not - #area = sum(scale1*dnorm(x2,mean1,sigma1))+sum(scale2*dnorm(x2,mean2,sigma2)) - #area = max(scale1*dnorm(x2,mean1,sigma1))+max(scale2*dnorm(x2,mean2,sigma2)) - area = max(sumFit) - scale = scale1 + scale2 - sigma = (fwhm/2)*0.85 - - return(list("mean"= mean1plus2,"area"=area, "scale"=scale, "sigma"=sigma)) # "qual"=fq - -} diff --git a/DIMS/AddOnFunctions/trimZeros.R b/DIMS/AddOnFunctions/trimZeros.R deleted file mode 100644 index 0fc5ea3..0000000 --- a/DIMS/AddOnFunctions/trimZeros.R +++ /dev/null @@ -1,8 +0,0 @@ -trimZeros <- function(x, y) { - tmp = which(y==0) - if (length(tmp)!=0){ - y = y[-tmp] - x = x[-tmp] - } - return(list(x,y)) -} From d586b345f95b5de5b630d4395437646817af2de9 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Tue, 27 Aug 2024 11:01:20 +0200 Subject: [PATCH 02/18] Changes for HMDB V5 update --- DIMS/GenerateExcel.R | 21 +++++++++-- DIMS/PeakGrouping.R | 62 ++++++++++++++++++++++--------- DIMS/SumAdducts.R | 2 +- DIMS/Utils/merge_duplicate_rows.R | 5 ++- 4 files changed, 67 insertions(+), 23 deletions(-) diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index d36ac29..e6d6b70 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -102,7 +102,8 @@ tmp <- cbind(tmp, "HMDB_name" = tmp_hmdb_name_pos) outlist <- rbind(tmp, tmp_pos_left, tmp_neg_left) # Filter for biological relevance -peaks_in_list <- which(rownames(outlist) %in% rownames(rlvnc)) +# peaks_in_list <- which(rownames(outlist) %in% rownames(rlvnc)) +peaks_in_list <- which(rownames(outlist) %in% rlvnc$HMDB_key) outlist <- cbind(outlist[peaks_in_list, ], as.data.frame(rlvnc[rownames(outlist)[peaks_in_list], ])) # filter out all irrelevant HMDBs outlist <- outlist %>% @@ -568,13 +569,25 @@ if (z_score == 1) { pku_sample_name <- positive_control_list[grepl("P1003", positive_control_list)] lpi_sample_name <- positive_control_list[grepl("P1005", positive_control_list)] - pa_codes <- c("HMDB00824", "HMDB00783", "HMDB00123") - pku_codes <- c("HMDB00159") - lpi_codes <- c("HMDB00904", "HMDB00641", "HMDB00182") + # pa_codes <- c("HMDB0000824", "HMDB0000783", "HMDB0000123") + # pku_codes <- c("HMDB0000159") + # lpi_codes <- c("HMDB0000904", "HMDB0000641", "HMDB0000182") + + pa_codes <- c("HMDB0000824", "HMDB0000725", "HMDB0000123") + pku_codes <- c("HMDB0000159") + lpi_codes <- c("HMDB0000904", "HMDB0000641", "HMDB0000182") + + pa_names <- c("Propionylcarnitine", "Propionylglycine", "Glycine") + pku_names <- c("L-Phenylalanine") + lpi_names <- c("Citrulline", "L-Glutamine", "L-Lysine") pa_data <- outlist[pa_codes, c("HMDB_code", "name", pa_sample_name)] pa_data <- reshape2::melt(pa_data, id.vars = c("HMDB_code", "name")) colnames(pa_data) <- c("HMDB.code", "HMDB.name", "Sample", "Zscore") + # change HMDB names because propionylglycine doesn't have its own line, rowname is HMDB0000725 (4-hydroxyproline) + pa_data$HMDB.name <- pa_codes + # change HMDB codes so the propionylglycine is the correct HMDB ID + pa_data$HMDB.code <- c("HMDB0000824", "HMDB0000783", "HMDB0000123") pku_data <- outlist[pku_codes, c("HMDB_code", "name", pku_sample_name)] pku_data <- reshape2::melt(pku_data, id.vars = c("HMDB_code", "name")) diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R index dc29e76..a801500 100644 --- a/DIMS/PeakGrouping.R +++ b/DIMS/PeakGrouping.R @@ -105,9 +105,15 @@ while (dim(hmdb_add_iso)[1] > 0) { # Compose a list of compounds, adducts or isotopes with corresponding m/z if (dim(tmplist_mass)[1] > 0) { # metabolites - assi_hmdb <- as.character(paste(as.character(tmplist_mass[, "CompoundName"]), + assi_hmdb <- as.character(paste(as.character(tmplist_mass[, "CompoundName"]), collapse = ";")) - hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass)), + all_hmdb_names <- as.character(paste(as.character(tmplist_mass[, "HMDB_name_all"]), + collapse = ";")) + hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass)), + collapse = ";")) + all_hmdb_ids <- as.character(paste(as.character(tmplist_mass[, "HMDB_ID_all"]), + collapse = ";")) + all_sec_hmdb_ids <- as.character(paste(as.character(tmplist_mass[, "sec_HMDB_ID"]), collapse = ";")) theormz_hmdb <- as.numeric(tmplist_mass[1, column_label]) @@ -115,17 +121,27 @@ while (dim(hmdb_add_iso)[1] > 0) { if (!is.null(tmplist_mass_adduct)) { if (dim(tmplist_mass_adduct)[1] > 0) { if (is.na(assi_hmdb)) { - assi_hmdb <- as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), + assi_hmdb <- as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), + collapse = ";")) + all_hmdb_names <- as.character(paste(as.character(tmplist_mass_adduct[, "HMDB_name_all"]), + collapse = ";")) + hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass_adduct)), + collapse = ";")) + all_hmdb_ids <- as.character(paste(as.character(tmplist_mass_adduct[, "HMDB_ID_all"]), collapse = ";")) - hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass_adduct)), + all_sec_hmdb_ids <- as.character(paste(as.character(tmplist_mass[, "sec_HMDB_ID"]), collapse = ";")) } else { - assi_hmdb <- paste(assi_hmdb, - as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), + assi_hmdb <- paste(assi_hmdb, as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), collapse = ";")), sep = ";") - hmdb_code <- paste(hmdb_code, - as.character(paste(as.character(rownames(tmplist_mass_adduct)), + all_hmdb_names <- paste(all_hmdb_names, as.character(paste(as.character(tmplist_mass_adduct[, "HMDB_name_all"]), + collapse = ";")), sep=";") + hmdb_code <- paste(hmdb_code, as.character(paste(as.character(rownames(tmplist_mass_adduct)), collapse = ";")), sep = ";") + all_hmdb_ids <- paste(all_hmdb_ids, as.character(paste(as.character(tmplist_mass_adduct[, "HMDB_ID_all"]), + collapse = ";")), sep = ";") + all_sec_hmdb_ids <- as.character(paste(as.character(tmplist_mass[, "sec_HMDB_ID"]), + collapse = ";")) } } } @@ -146,17 +162,27 @@ while (dim(hmdb_add_iso)[1] > 0) { if (!is.null(tmplist_mass_adduct)) { if (dim(tmplist_mass_adduct)[1] > 0) { if (is.na(assi_hmdb)) { - assi_hmdb <- as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), + assi_hmdb <- as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), + collapse = ";")) + all_hmdb_names <- as.character(paste(as.character(tmplist_mass_adduct[, "HMDB_name_all"]), + collapse = ";")) + hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass_adduct)), collapse = ";")) - hmdb_code <- as.character(paste(as.character(rownames(tmplist_mass_adduct)), + all_hmdb_ids <- as.character(paste(as.character(tmplist_mass_adduct[, "HMDB_ID_all"]), collapse = ";")) + all_sec_hmdb_ids <- as.character(paste(as.character(tmplist_mass[, "sec_HMDB_ID"]), + collapse = ";")) } else { - assi_hmdb <- paste(assi_hmdb, - as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), + assi_hmdb <- paste(assi_hmdb, as.character(paste(as.character(tmplist_mass_adduct[, "CompoundName"]), collapse = ";")), sep = ";") - hmdb_code <- paste(hmdb_code, - as.character(paste(as.character(rownames(tmplist_mass_adduct)), + all_hmdb_names <- paste(all_hmdb_names, as.character(paste(as.character(tmplist_mass_adduct[, "HMDB_name_all"]), + collapse = ";")), sep=";") + hmdb_code <- paste(hmdb_code, as.character(paste(as.character(rownames(tmplist_mass_adduct)), collapse = ";")), sep = ";") + all_hmdb_ids <- paste(all_hmdb_ids, as.character(paste(as.character(tmplist_mass_adduct[, "HMDB_ID_all"]), + collapse = ";")), sep = ";") + all_sec_hmdb_ids <- as.character(paste(as.character(tmplist_mass[, "sec_HMDB_ID"]), + collapse = ";")) } } } @@ -164,7 +190,7 @@ while (dim(hmdb_add_iso)[1] > 0) { # isotopes of metabolites if (!is.null(tmplist_mass_iso)) { if (dim(tmplist_mass_iso)[1] > 0) { - iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), + iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), collapse = ";")) } } @@ -173,7 +199,7 @@ while (dim(hmdb_add_iso)[1] > 0) { } else if (!is.null(tmplist_mass_iso)) { if (dim(tmplist_mass_iso)[1] > 0) { theormz_hmdb <- as.numeric(tmplist_mass_iso[1, column_label]) - iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), + iso_hmdb <- as.character(paste(as.character(tmplist_mass_iso[, "CompoundName"]), collapse = ";")) } } @@ -184,7 +210,9 @@ while (dim(hmdb_add_iso)[1] > 0) { data.frame("mzmed.pgrp" = mzmed_pgrp, "fq.best" = fq_best_pgrp, "fq.worst" = fq_worst_pgrp, nrsamples, "mzmin.pgrp" = mzmin_pgrp, "mzmax.pgrp" = mzmax_pgrp), t(as.matrix(ints_allsamps)), - data.frame("assi_HMDB" = assi_hmdb, "iso_HMDB" = iso_hmdb, "HMDB_code" = hmdb_code, "theormz_HMDB" = theormz_hmdb) + # data.frame("assi_HMDB" = assi_hmdb, "iso_HMDB" = iso_hmdb, "HMDB_code" = hmdb_code, "theormz_HMDB" = theormz_hmdb) + data.frame("assi_HMDB" = assi_hmdb, "all_hmdb_names" = all_hmdb_names, "iso_HMDB" = iso_hmdb, + "HMDB_code" = hmdb_code, "all_hmdb_ids" = all_hmdb_ids, "sec_hmdb_ids" = all_sec_hmdb_ids, "theormz_HMDB" = theormz_hmdb) )) } diff --git a/DIMS/SumAdducts.R b/DIMS/SumAdducts.R index 699da4a..4670e88 100755 --- a/DIMS/SumAdducts.R +++ b/DIMS/SumAdducts.R @@ -9,7 +9,7 @@ z_score <- as.numeric(cmd_args[3]) sum_adducts <- function(peaklist, theor_mz, grpnames_long, adducts, batch_number, scanmode, outdir, z_score) { hmdb_codes <- rownames(theor_mz) - hmdb_names <- theor_mz[, 1, drop = FALSE] + hmdb_names <- theor_mz[, "CompoundName", drop = FALSE] hmdb_names[] <- lapply(hmdb_names, as.character) # remove isotopes diff --git a/DIMS/Utils/merge_duplicate_rows.R b/DIMS/Utils/merge_duplicate_rows.R index da3bc52..901f116 100644 --- a/DIMS/Utils/merge_duplicate_rows.R +++ b/DIMS/Utils/merge_duplicate_rows.R @@ -41,7 +41,10 @@ merge_duplicate_rows <- function(peakgroup_list) { single_peakgroup[, "assi_noise"] <- collapse("assi_noise", peakgroup_list, peaklist_index) if (single_peakgroup[, "assi_noise"] == ";") single_peakgroup[, "assi_noise"] <- NA single_peakgroup[, "theormz_noise"] <- collapse("theormz_noise", peakgroup_list, peaklist_index) - if (single_peakgroup[,"theormz_noise"] == "0;0") single_peakgroup[, "theormz_noise"] <- NA + if (single_peakgroup[, "theormz_noise"] == "0;0") single_peakgroup[, "theormz_noise"] <- NA + single_peakgroup[, "all_hmdb_ids"] <- collapse("all_hmdb_ids", peakgroup_list, peaklist_index) + single_peakgroup[, "sec_hmdb_ids"] <- collapse("sec_hmdb_ids", peakgroup_list, peaklist_index) + if (single_peakgroup[, "sec_hmdb_ids"] == ";") single_peakgroup[, "sec_hmdb_ids"] < NA # keep track of deduplicated entries collect <- rbind(collect, single_peakgroup) From c12f7e83e27e88cc2617ea3e7358674591604d55 Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Fri, 6 Sep 2024 16:03:56 +0200 Subject: [PATCH 03/18] Fixed missing colums error --- DIMS/PeakGrouping.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DIMS/PeakGrouping.R b/DIMS/PeakGrouping.R index a801500..53e42de 100644 --- a/DIMS/PeakGrouping.R +++ b/DIMS/PeakGrouping.R @@ -77,7 +77,7 @@ while (dim(hmdb_add_iso)[1] > 0) { }))) # Initialize - assi_hmdb <- iso_hmdb <- hmdb_code <- NA + assi_hmdb <- iso_hmdb <- hmdb_code <- all_hmdb_names <- all_hmdb_ids <- all_sec_hmdb_ids <- NA tmplist_mass_iso <- tmplist_mass_adduct <- NULL # find all entries in HMDB part with mass within ppm range From 38df3fc1a552cabd0204ebc414a9507c3928e5e8 Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Fri, 6 Sep 2024 16:04:51 +0200 Subject: [PATCH 04/18] Added extra HMDB info columns --- DIMS/SumAdducts.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/DIMS/SumAdducts.R b/DIMS/SumAdducts.R index 4670e88..6b7d076 100755 --- a/DIMS/SumAdducts.R +++ b/DIMS/SumAdducts.R @@ -57,6 +57,11 @@ sum_adducts <- function(peaklist, theor_mz, grpnames_long, adducts, batch_number if (!is.null(adductsum)) { rownames(adductsum) <- names adductsum <- cbind(adductsum, "HMDB_name" = names_long) + # Add HMDB info + cols_hmdb_info <- c("HMDB_ID_all", "sec_HMDB_ID", "HMDB_name_all") + hmdb_info <- theor_mz[names, cols_hmdb_info] + adductsum <- cbind(adductsum, hmdb_info) + save(adductsum, file = paste(scanmode, "_", batch_number, "_SummedAdducts.RData", sep = "")) } } From cc164edf955cdfb1abb48cacce6e6c864ff0599a Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Fri, 6 Sep 2024 16:05:38 +0200 Subject: [PATCH 05/18] Changes for extra HMDB info columns --- DIMS/GenerateExcel.R | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index e6d6b70..01f3c24 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -88,28 +88,33 @@ index_neg <- which(rownames(outlist_neg_adducts_hmdb) %in% rownames(outlist_pos_ index_pos <- which(rownames(outlist_pos_adducts_hmdb) %in% rownames(outlist_neg_adducts_hmdb)) # Only continue with HMDB codes (rows) that were found in both positive mode and remove last column (hmdb_name) -tmp_pos <- outlist_pos_adducts_hmdb[rownames(outlist_pos_adducts_hmdb)[index_pos], 1:(dim(outlist_pos_adducts_hmdb)[2] - 1)] -tmp_hmdb_name_pos <- outlist_pos_adducts_hmdb[rownames(outlist_pos_adducts_hmdb)[index_pos], dim(outlist_pos_adducts_hmdb)[2]] +tmp_pos <- outlist_pos_adducts_hmdb[rownames(outlist_pos_adducts_hmdb)[index_pos], ] %>% select(-c(HMDB_name, HMDB_name_all, HMDB_ID_all, sec_HMDB_ID)) +tmp_pos_info <- outlist_pos_adducts_hmdb[rownames(outlist_pos_adducts_hmdb)[index_pos], ] %>% select(HMDB_name, HMDB_name_all, HMDB_ID_all, sec_HMDB_ID) +# tmp_hmdb_name_pos <- outlist_pos_adducts_hmdb[rownames(outlist_pos_adducts_hmdb)[index_pos], dim(outlist_pos_adducts_hmdb)[2]] tmp_pos_left <- outlist_pos_adducts_hmdb[-index_pos, ] # same for negative mode -tmp_neg <- outlist_neg_adducts_hmdb[rownames(outlist_pos_adducts_hmdb)[index_pos], 1:(dim(outlist_neg_adducts_hmdb)[2] - 1)] +tmp_neg <- outlist_neg_adducts_hmdb[rownames(outlist_pos_adducts_hmdb)[index_pos], ] %>% select(-c(HMDB_name, HMDB_name_all, HMDB_ID_all, sec_HMDB_ID)) tmp_neg_left <- outlist_neg_adducts_hmdb[-index_neg, ] - +print("combine pos + neg") # Combine positive and negative numbers and paste back HMDB column tmp <- apply(tmp_pos, 2, as.numeric) + apply(tmp_neg, 2, as.numeric) rownames(tmp) <- rownames(tmp_pos) -tmp <- cbind(tmp, "HMDB_name" = tmp_hmdb_name_pos) +tmp <- cbind(tmp, tmp_pos_info) outlist <- rbind(tmp, tmp_pos_left, tmp_neg_left) +outlist <- outlist %>% arrange(rownames(outlist)) +print("Add biological relevance") # Filter for biological relevance # peaks_in_list <- which(rownames(outlist) %in% rownames(rlvnc)) peaks_in_list <- which(rownames(outlist) %in% rlvnc$HMDB_key) -outlist <- cbind(outlist[peaks_in_list, ], as.data.frame(rlvnc[rownames(outlist)[peaks_in_list], ])) +rlvnc_in_list <- rlvnc %>% filter(HMDB_key %in% rownames(outlist)[peaks_in_list]) +outlist <- cbind(outlist[peaks_in_list, ], as.data.frame(rlvnc_in_list)) + # filter out all irrelevant HMDBs outlist <- outlist %>% tibble::rownames_to_column("rowname") %>% - filter(!grepl("Exogenous|Drug|exogenous", relevance)) %>% - tibble::column_to_rownames("rowname") + filter(grepl("relevant|Onbekend|Internal", relevance)) %>% + tibble::column_to_rownames("rowname")print("Add HMDB_code column") # Add HMDB_code column with all the HMDB ID and sort on it outlist <- cbind(outlist, "HMDB_code" = rownames(outlist)) @@ -260,8 +265,9 @@ if (z_score == 1) { openxlsx::setRowHeights(wb, filelist, rows = c(1:nrow(outlist)), heights = 18) openxlsx::setColWidths(wb, filelist, cols = c(1:ncol(outlist)), widths = 20) } - +print("Write Excel file") # write Excel file +outlist$name <- stringi::stri_enc_toutf8(outlist$name) openxlsx::writeData(wb, sheet = 1, outlist, startCol = 1) xlsx_name <- paste0(outdir, "/", project, ".xlsx") openxlsx::saveWorkbook(wb, xlsx_name, overwrite = TRUE) @@ -297,6 +303,7 @@ is_summed$Intensity <- as.numeric(as.character(is_summed$Intensity)) # Retrieve IS positive mode is_pos <- as.data.frame(subset(outlist_pos_adducts_hmdb, rownames(outlist_pos_adducts_hmdb) %in% is_codes)) +is_pos <- is_pos[c(names(repl_pattern))] is_pos$HMDB_name <- is_list[match(row.names(is_pos), is_list$HMDB_code, nomatch = NA), "name"] is_pos$HMDB.code <- row.names(is_pos) is_pos <- reshape2::melt(is_pos, id.vars = c("HMDB.code", "HMDB_name")) @@ -308,6 +315,7 @@ is_pos$Intensity <- as.numeric(as.character(is_pos$Intensity)) # Retrieve IS negative mode is_neg <- as.data.frame(subset(outlist_neg_adducts_hmdb, rownames(outlist_neg_adducts_hmdb) %in% is_codes)) +is_neg <- is_neg[c(names(repl_pattern))] is_neg$HMDB_name <- is_list[match(row.names(is_neg), is_list$HMDB_code, nomatch = NA), "name"] is_neg$HMDB.code <- row.names(is_neg) is_neg <- reshape2::melt(is_neg, id.vars = c("HMDB.code", "HMDB_name")) @@ -316,7 +324,6 @@ is_neg$Matrix <- dims_matrix is_neg$Rundate <- rundate is_neg$Project <- project is_neg$Intensity <- as.numeric(as.character(is_neg$Intensity)) - # Save results save(is_pos, is_neg, is_summed, file = paste0(outdir, "/", project, "_IS_results.RData")) From b072e3a1598146b55bc1b13d109ea362cb4dd096 Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Fri, 6 Sep 2024 16:50:51 +0200 Subject: [PATCH 06/18] Changes for ordering dataframe --- DIMS/GenerateViolinPlots.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DIMS/GenerateViolinPlots.R b/DIMS/GenerateViolinPlots.R index 54bd193..a0af859 100644 --- a/DIMS/GenerateViolinPlots.R +++ b/DIMS/GenerateViolinPlots.R @@ -344,8 +344,8 @@ if (z_score == 1){ # from table expected_biomarkers, choose selected columns select_columns <- c("Disease", "HMDB_code", "HMDB_name") - select_col_nrs <- which(colnames(expected_biomarkers) %in% select_columns) - expected_biomarkers_select <- expected_biomarkers[, select_col_nrs] + #select_col_nrs <- which(colnames(expected_biomarkers) %in% select_columns) + expected_biomarkers_select <- expected_biomarkers %>% select(all_of(select_columns)) # remove duplicates expected_biomarkers_select <- expected_biomarkers_select[!duplicated(expected_biomarkers_select[, c(1, 2)]), ] From b4b0b1c14a3365b22a2056066fc82e468743cd8b Mon Sep 17 00:00:00 2001 From: ALuesink Date: Fri, 11 Oct 2024 11:58:21 +0200 Subject: [PATCH 07/18] Added trimming per scanmodus --- DIMS/AssignToBins.R | 4 ++-- DIMS/GenerateBreaks.R | 22 ++++++++++++++++------ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index 4ca7fce..5c588cb 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -44,8 +44,8 @@ raw_data_matrix <- xcms::rawMat(raw_data) pos_times <- raw_data@scantime[raw_data@polarity == "positive"] neg_times <- raw_data@scantime[raw_data@polarity == "negative"] # Select scans between trim_left and trim_right -pos_times <- pos_times[pos_times > trim_left & pos_times < trim_right] -neg_times <- neg_times[neg_times > trim_left & neg_times < trim_right] +pos_times <- pos_times[pos_times > trim_left_pos & pos_times < trim_right_pos] +neg_times <- neg_times[neg_times > trim_left_neg & neg_times < trim_right_neg] # Generate an index with which to select values for each mode pos_index <- which(raw_data_matrix[, "time"] %in% pos_times) diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index 6aacfe8..ca7040d 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -12,8 +12,10 @@ trim <- as.numeric(cmd_args[3]) resol <- as.numeric(cmd_args[4]) # initialize -trim_left <- NULL -trim_right <- NULL +trim_left_pos <- NULL +trim_right_pos <- NULL +trim_left_neg <- NULL +trim_right_neg <- NULL breaks_fwhm <- NULL breaks_fwhm_avg <- NULL bins <- NULL @@ -21,9 +23,17 @@ bins <- NULL # read in mzML file raw_data <- suppressMessages(xcms::xcmsRaw(filepath)) -# trim (remove) scans at the start and end -trim_left <- round(raw_data@scantime[length(raw_data@scantime) * trim]) -trim_right <- round(raw_data@scantime[length(raw_data@scantime) * (1 - trim)]) +# Get time values for positive and negative scans +pos_times <- raw_data@scantime[raw_data@polarity == "positive"] +neg_times <- raw_data@scantime[raw_data@polarity == "negative"] + +# trim (remove) scans at the start and end for positive +trim_left_pos <- round(pos_times[length(pos_times) * trim]) +trim_right_pos <- round(pos_times[length(pos_times) * (1 - trim)]) + +# trim (remove) scans at the start and end for negative +trim_left_neg <- round(neg_times[length(neg_times) * trim]) +trim_right_neg <- round(neg_times[length(neg_times) * (1 - trim)]) # Mass range m/z low_mz <- raw_data@mzrange[1] @@ -47,5 +57,5 @@ for (i in 1:nr_segments) { } # generate output file -save(breaks_fwhm, breaks_fwhm_avg, trim_left, trim_right, file = "breaks.fwhm.RData") +save(breaks_fwhm, breaks_fwhm_avg, trim_left_pos, trim_right_pos, trim_left_neg, trim_right_neg, file = "breaks.fwhm.RData") save(high_mz, file = "highest_mz.RData") From f550d930f09ff76232aafa9d8f4a4eed61f2d147 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 14 Oct 2024 10:56:48 +0200 Subject: [PATCH 08/18] Add trim lines to TIC plots --- DIMS/AverageTechReplicates.R | 13 +++++++++++-- DIMS/AverageTechReplicates.nf | 4 +++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index 02279fb..ddda4be 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -13,6 +13,7 @@ run_name <- cmd_args[3] dims_matrix <- cmd_args[4] highest_mz_file <- cmd_args[5] highest_mz <- get(load(highest_mz_file)) +breaks_filepath <- cmd_args[6] thresh2remove <- 1000000000 dims_thresh <- 100 @@ -43,14 +44,18 @@ remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { # get repl_pattern load(init_file) +# get trim parameters +load(breaks_filepath) + # lower the threshold below which a sample will be removed for DBS and for high m/z if (dims_matrix == "DBS") { thresh2remove <- 500000000 } if (highest_mz > 700) { - thresh2remove <- 1000000 + thresh2remove <- 10000000 } + # remove technical replicates which are below the threshold remove_neg <- NULL remove_pos <- NULL @@ -64,7 +69,7 @@ for (sample_nr in 1:length(repl_pattern)) { nr_pos <- 0 nr_neg <- 0 for (file_nr in 1:length(tech_reps)) { - load(paste(tech_reps[file_nr], ".RData", sep = "")) + load(paste("AverageTechReplicates/", tech_reps[file_nr], ".RData", sep = "")) cat("\n\nParsing", tech_reps[file_nr]) # negative scanmode cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1])) @@ -162,6 +167,10 @@ for (sample_nr in c(1:length(repl_pattern))) { tic_plot <- ggplot(repl1_nr, aes(retention_time, tic_intensity)) + geom_line(linewidth = 0.3) + geom_hline(yintercept = highest_tic_max, col = "grey", linetype = 2, linewidth = 0.3) + + geom_vline(xintercept = trim_left_pos, col = "red", linetype = 2, linewidth = 0.3) + + geom_vline(xintercept = trim_right_pos, col = "red", linetype = 2, linewidth = 0.3) + + geom_vline(xintercept = trim_left_neg, col = "red", linetype = 2, linewidth = 0.3) + + geom_vline(xintercept = trim_right_neg, col = "red", linetype = 2, linewidth = 0.3) + labs(x = "t (s)", y = "tic_intensity", title = paste0(tech_reps[file_nr], " || ", sample_name)) + theme(plot.background = element_rect(fill = plot_color), axis.text = element_text(size = 4), diff --git a/DIMS/AverageTechReplicates.nf b/DIMS/AverageTechReplicates.nf index 5387e61..f4b85b9 100644 --- a/DIMS/AverageTechReplicates.nf +++ b/DIMS/AverageTechReplicates.nf @@ -12,6 +12,7 @@ process AverageTechReplicates { val(analysis_id) val(matrix) path(highest_mz_file) + path(breaks_file) output: path('*_repl_pattern.RData'), emit: pattern_files @@ -26,7 +27,8 @@ process AverageTechReplicates { $params.nr_replicates \ $analysis_id \ $matrix \ - $highest_mz_file + $highest_mz_file \ + $breaks_file """ } From 958c48cd5f784c8a9c33d03894010958c09e18cd Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Fri, 18 Oct 2024 13:32:03 +0200 Subject: [PATCH 09/18] fixed print error --- DIMS/GenerateExcel.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index 01f3c24..1e5d4c4 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -114,7 +114,7 @@ outlist <- cbind(outlist[peaks_in_list, ], as.data.frame(rlvnc_in_list)) outlist <- outlist %>% tibble::rownames_to_column("rowname") %>% filter(grepl("relevant|Onbekend|Internal", relevance)) %>% - tibble::column_to_rownames("rowname")print("Add HMDB_code column") + tibble::column_to_rownames("rowname") # Add HMDB_code column with all the HMDB ID and sort on it outlist <- cbind(outlist, "HMDB_code" = rownames(outlist)) From 34b3c608f18dcd1d90cf8f7916608a0d8888d908 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 21 Oct 2024 10:26:35 +0200 Subject: [PATCH 10/18] removed print statements --- DIMS/GenerateExcel.R | 3 --- 1 file changed, 3 deletions(-) diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index 1e5d4c4..bd90015 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -95,14 +95,12 @@ tmp_pos_left <- outlist_pos_adducts_hmdb[-index_pos, ] # same for negative mode tmp_neg <- outlist_neg_adducts_hmdb[rownames(outlist_pos_adducts_hmdb)[index_pos], ] %>% select(-c(HMDB_name, HMDB_name_all, HMDB_ID_all, sec_HMDB_ID)) tmp_neg_left <- outlist_neg_adducts_hmdb[-index_neg, ] -print("combine pos + neg") # Combine positive and negative numbers and paste back HMDB column tmp <- apply(tmp_pos, 2, as.numeric) + apply(tmp_neg, 2, as.numeric) rownames(tmp) <- rownames(tmp_pos) tmp <- cbind(tmp, tmp_pos_info) outlist <- rbind(tmp, tmp_pos_left, tmp_neg_left) outlist <- outlist %>% arrange(rownames(outlist)) -print("Add biological relevance") # Filter for biological relevance # peaks_in_list <- which(rownames(outlist) %in% rownames(rlvnc)) @@ -265,7 +263,6 @@ if (z_score == 1) { openxlsx::setRowHeights(wb, filelist, rows = c(1:nrow(outlist)), heights = 18) openxlsx::setColWidths(wb, filelist, cols = c(1:ncol(outlist)), widths = 20) } -print("Write Excel file") # write Excel file outlist$name <- stringi::stri_enc_toutf8(outlist$name) openxlsx::writeData(wb, sheet = 1, outlist, startCol = 1) From 2a0d85287aeb5b63f368b9071945ff57c3637e52 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 21 Oct 2024 11:33:06 +0200 Subject: [PATCH 11/18] Added outlier removal --- DIMS/GenerateExcel.R | 68 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 55 insertions(+), 13 deletions(-) diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index bd90015..476ed7f 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -16,6 +16,7 @@ project <- cmd_args[2] dims_matrix <- cmd_args[3] hmdb_file <- cmd_args[4] z_score <- as.numeric(cmd_args[5]) +outlier_threshold <- 2 round_df <- function(df, digits) { #' Round numbers to a set number of digits for numeric values @@ -44,6 +45,25 @@ robust_scaler <- function(control_intensities, control_col_ids, perc = 5) { return(trimmed_control_intensities) } +remove_outliers_for_Zscore <- function(control_intensities, outlier_threshold = 2) { + #' Remove outliers per metabolite before calculating Z-scores + #' + #' @param control_intensities: Vector with intensities for control samples + #' @param outlier_threshold: Threshold for outliers which will be removed from controls (float) + #' + #' @return trimmed_control_intensities: Intensities trimmed for outliers + mean_permetabolite <- mean(as.numeric(control_intensities)) + stdev_permetabolite <- sd(as.numeric(control_intensities)) + zscores_permetabolite <- (control_intensities - mean_permetabolite) / stdev_permetabolite + # remove intensities with a zscore_permetabolite greater than outlier_threshold + if (sum(zscores_permetabolite > outlier_threshold) > 0) { + trimmed_control_intensities <- control_intensities[-which(zscores_permetabolite > outlier_threshold)] + } else { + trimmed_control_intensities <- control_intensities + } + return(trimmed_control_intensities) +} + # Initialise plot <- TRUE export <- TRUE @@ -95,6 +115,7 @@ tmp_pos_left <- outlist_pos_adducts_hmdb[-index_pos, ] # same for negative mode tmp_neg <- outlist_neg_adducts_hmdb[rownames(outlist_pos_adducts_hmdb)[index_pos], ] %>% select(-c(HMDB_name, HMDB_name_all, HMDB_ID_all, sec_HMDB_ID)) tmp_neg_left <- outlist_neg_adducts_hmdb[-index_neg, ] + # Combine positive and negative numbers and paste back HMDB column tmp <- apply(tmp_pos, 2, as.numeric) + apply(tmp_neg, 2, as.numeric) rownames(tmp) <- rownames(tmp_pos) @@ -146,7 +167,7 @@ if (z_score == 1) { outlist[, intensity_col_ids][outlist[, intensity_col_ids] == 0] <- NA # save outlist as it is and use it to calculate robust scaler - outlist_noZ <- outlist + outlist_robustZ <- outlist_nooutliers <- outlist # calculate mean and sd for Control group outlist$avg.ctrls <- apply(control_columns, 1, function(x) mean(as.numeric(x), na.rm = TRUE)) @@ -163,15 +184,15 @@ if (z_score == 1) { colnames(outlist)[startcol:ncol(outlist)] <- colnames_z # calculate robust scaler (Zscores minus outliers in Controls) - outlist_noZ$avg.ctrls <- 0 - outlist_noZ$sd.ctrls <- 0 + outlist_robustZ$avg.ctrls <- 0 + outlist_robustZ$sd.ctrls <- 0 # only calculate robust Z-scores if there are enough Controls if (length(control_col_ids) > 10) { for (metabolite_index in 1:nrow(outlist)) { - outlist_noZ$avg.ctrls[metabolite_index] <- mean(robust_scaler(outlist_noZ[metabolite_index, control_col_ids], + outlist_robustZ$avg.ctrls[metabolite_index] <- mean(robust_scaler(outlist_robustZ[metabolite_index, control_col_ids], control_col_ids, perc)) - outlist_noZ$sd.ctrls[metabolite_index] <- sd(robust_scaler(outlist_noZ[metabolite_index, control_col_ids], + outlist_robustZ$sd.ctrls[metabolite_index] <- sd(robust_scaler(outlist_robustZ[metabolite_index, control_col_ids], control_col_ids, perc)) } } @@ -179,17 +200,39 @@ if (z_score == 1) { # Make and add columns with robust zscores cnames_robust <- gsub("_Zscore", "_RobustZscore", colnames_z) for (i in intensity_col_ids) { - zscores_1col <- (as.numeric(as.vector(unlist(outlist_noZ[, i]))) - outlist_noZ$avg.ctrls) / outlist_noZ$sd.ctrls - outlist_noZ <- cbind(outlist_noZ, zscores_1col) + zscores_1col <- (as.numeric(as.vector(unlist(outlist_robustZ[, i]))) - outlist_robustZ$avg.ctrls) / outlist_robustZ$sd.ctrls + outlist_robustZ <- cbind(outlist_robustZ, zscores_1col) } - colnames(outlist_noZ)[startcol:ncol(outlist_noZ)] <- cnames_robust + colnames(outlist_robustZ)[startcol:ncol(outlist_robustZ)] <- cnames_robust + + # calculate Z-scores after removal of outliers in Control samples + if (length(control_col_ids) > 10) { + for (metabolite_index in 1:nrow(outlist_nooutliers)) { + intensities_without_outliers <- remove_outliers_for_Zscore(as.numeric(outlist_nooutliers[metabolite_index, control_col_ids]), outlier_threshold) + outlist_nooutliers$avg.ctrls[metabolite_index] <- mean(intensities_without_outliers) + outlist_nooutliers$sd.ctrls[metabolite_index] <- sd(intensities_without_outliers) + outlist_nooutliers$nr.ctrls[metabolite_index] <- length(intensities_without_outliers) + } + } + + cnames_nooutliers <- gsub("_Zscore", "_OutlierRemovedZscore", colnames_z) + for (i in intensity_col_ids) { + zscores_1col <- (as.numeric(as.vector(unlist(outlist_nooutliers[, i]))) - outlist_nooutliers$avg.ctrls) / outlist_nooutliers$sd.ctrls + outlist_nooutliers <- cbind(outlist_nooutliers, zscores_1col) + } + # add column names for Z-scores without outliers. NB: 1 extra column so shift to +1 + colnames(outlist_nooutliers)[(startcol + 1):ncol(outlist_nooutliers)] <- cnames_nooutliers + # output metabolites filtered on relevance save(outlist, file = paste0("AdductSums_filtered_Zscores.RData")) write.table(outlist, file = paste0("AdductSums_filtered_Zscores.txt"), sep = "\t", row.names = FALSE) # output filtered metabolites with robust scaled Zscores - save(outlist_noZ, file = paste0("AdductSums_filtered_robustZ.RData")) - write.table(outlist_noZ, file = paste0("AdductSums_filtered_robustZ.txt"), sep = "\t", row.names = FALSE) + save(outlist_robustZ, file = paste0("AdductSums_filtered_robustZ.RData")) + write.table(outlist_robustZ, file = paste0("AdductSums_filtered_robustZ.txt"), sep = "\t", row.names = FALSE) + # output filtered metabolites after removal of outliers + save(outlist_nooutliers, file = paste0("AdductSums_filtered_outliersremovedZ.RData")) + write.table(outlist_nooutliers, file = paste0("AdductSums_filtered_outliersremovedZ.txt"), sep = "\t", row.names = FALSE) # get the IDs of the patients and sort patient_ids <- unique(as.vector(unlist(lapply(strsplit(colnames(patient_columns), ".", fixed = TRUE), function(x) x[1])))) @@ -263,8 +306,8 @@ if (z_score == 1) { openxlsx::setRowHeights(wb, filelist, rows = c(1:nrow(outlist)), heights = 18) openxlsx::setColWidths(wb, filelist, cols = c(1:ncol(outlist)), widths = 20) } + # write Excel file -outlist$name <- stringi::stri_enc_toutf8(outlist$name) openxlsx::writeData(wb, sheet = 1, outlist, startCol = 1) xlsx_name <- paste0(outdir, "/", project, ".xlsx") openxlsx::saveWorkbook(wb, xlsx_name, overwrite = TRUE) @@ -300,7 +343,6 @@ is_summed$Intensity <- as.numeric(as.character(is_summed$Intensity)) # Retrieve IS positive mode is_pos <- as.data.frame(subset(outlist_pos_adducts_hmdb, rownames(outlist_pos_adducts_hmdb) %in% is_codes)) -is_pos <- is_pos[c(names(repl_pattern))] is_pos$HMDB_name <- is_list[match(row.names(is_pos), is_list$HMDB_code, nomatch = NA), "name"] is_pos$HMDB.code <- row.names(is_pos) is_pos <- reshape2::melt(is_pos, id.vars = c("HMDB.code", "HMDB_name")) @@ -312,7 +354,6 @@ is_pos$Intensity <- as.numeric(as.character(is_pos$Intensity)) # Retrieve IS negative mode is_neg <- as.data.frame(subset(outlist_neg_adducts_hmdb, rownames(outlist_neg_adducts_hmdb) %in% is_codes)) -is_neg <- is_neg[c(names(repl_pattern))] is_neg$HMDB_name <- is_list[match(row.names(is_neg), is_list$HMDB_code, nomatch = NA), "name"] is_neg$HMDB.code <- row.names(is_neg) is_neg <- reshape2::melt(is_neg, id.vars = c("HMDB.code", "HMDB_name")) @@ -321,6 +362,7 @@ is_neg$Matrix <- dims_matrix is_neg$Rundate <- rundate is_neg$Project <- project is_neg$Intensity <- as.numeric(as.character(is_neg$Intensity)) + # Save results save(is_pos, is_neg, is_summed, file = paste0(outdir, "/", project, "_IS_results.RData")) From 9304507277ae1c33352218dd7d447737b12f3a47 Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Mon, 21 Oct 2024 11:55:56 +0200 Subject: [PATCH 12/18] Added SST output --- DIMS/AverageTechReplicates.R | 2 +- DIMS/GenerateExcel.R | 78 ++++++++++++++++++++++++++++++++++++ DIMS/GenerateExcel.nf | 2 +- 3 files changed, 80 insertions(+), 2 deletions(-) diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index ddda4be..9d3671a 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -69,7 +69,7 @@ for (sample_nr in 1:length(repl_pattern)) { nr_pos <- 0 nr_neg <- 0 for (file_nr in 1:length(tech_reps)) { - load(paste("AverageTechReplicates/", tech_reps[file_nr], ".RData", sep = "")) + load(paste(tech_reps[file_nr], ".RData", sep = "")) cat("\n\nParsing", tech_reps[file_nr]) # negative scanmode cat("\n\tNegative peak_list sum", sum(peak_list$neg[, 1])) diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index d36ac29..1fa1eac 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -16,6 +16,8 @@ project <- cmd_args[2] dims_matrix <- cmd_args[3] hmdb_file <- cmd_args[4] z_score <- as.numeric(cmd_args[5]) +sst_components_file <- cmd_args[6] +print(sst_components_file) round_df <- function(df, digits) { #' Round numbers to a set number of digits for numeric values @@ -603,6 +605,82 @@ if (z_score == 1) { } } +### SST components output #### +calculate_coefficient_of_variation <- function(intensity_list) { + for (col_nr in 1:ncol(intensity_list)) { + intensity_list[, col_nr] <- as.numeric(intensity_list[, col_nr]) + intensity_list[, col_nr] <- round(intensity_list[, col_nr], 0) + } + sd_allsamples <- round(apply(intensity_list, 1, sd), 0) + mean_allsamples <- round(apply(intensity_list, 1, mean), 0) + cv_allsamples <- round(100 * sd_allsamples / mean_allsamples, 1) + intensity_list_with_cv <- cbind(CV_perc = cv_allsamples, + mean = mean_allsamples, + sd = sd_allsamples, + intensity_list) + return(intensity_list_with_cv) +} + +# Internal standards lists, calculate coefficients of variation +if ("plots" %in% colnames(is_list)) { + intensity_col_ids <- 2:(which(colnames(is_list) == "HMDB_name") - 1) +} else { + intensity_col_ids <- 1:(which(colnames(is_list) == "HMDB_name") - 1) +} +# previous intensity_col_ids is based on the assumption that there are Controls and Patients +is_list_intensities <- is_list[ , intensity_col_ids] +is_list_intensities <- calculate_coefficient_of_variation(is_list_intensities) +is_list_intensities <- cbind(IS_name = is_list$HMDB_name, is_list_intensities) + +# separate adducts of IS +is_pos <- as.data.frame(subset(outlist_pos_adducts_hmdb, rownames(outlist_pos_adducts_hmdb) %in% is_codes)) +is_neg <- as.data.frame(subset(outlist_neg_adducts_hmdb, rownames(outlist_neg_adducts_hmdb) %in% is_codes)) +is_pos_intensities <- is_pos[ , -which(colnames(is_pos) == "HMDB_name")] +is_neg_intensities <- is_neg[ , -which(colnames(is_neg) == "HMDB_name")] +is_pos_intensities <- calculate_coefficient_of_variation(is_pos_intensities) +is_neg_intensities <- calculate_coefficient_of_variation(is_neg_intensities) +is_pos_intensities <- cbind(IS_name = is_pos$HMDB_name, is_pos_intensities) +is_neg_intensities <- cbind(IS_name = is_neg$HMDB_name, is_neg_intensities) + +# SST components. +sst_comp <- read.csv(sst_components_file, header = TRUE, sep = "\t") +sst_rows <- which(outlist$HMDB_code %in% sst_comp$HMDB_ID) +sst_list <- outlist[sst_rows, ] +sst_colnrs <- grep("P1001", colnames(sst_list)) +if (length(sst_colnrs) > 0) { + sst_list_intensities <- sst_list[ , sst_colnrs] +} else { + sst_list_intensities <- sst_list[ , intensity_col_ids] +} +for (col_nr in 1:ncol(sst_list_intensities)) { + sst_list_intensities[, col_nr] <- as.numeric(sst_list_intensities[, col_nr]) + if (grepl("Zscore", colnames(sst_list_intensities)[col_nr])) { + sst_list_intensities[, col_nr] <- round(sst_list_intensities[, col_nr], 2) + } else { + sst_list_intensities[, col_nr] <- round(sst_list_intensities[, col_nr]) + } +} +sst_list_intensities <- cbind(SST_comp_name = sst_list$HMDB_name, sst_list_intensities) + +# Create Excel file +wb <- createWorkbook("IS_SST") +addWorksheet(wb, "Internal Standards") +openxlsx::writeData(wb, sheet = 1, is_list_intensities) +setColWidths(wb, 1, cols = 1, widths = 24) +addWorksheet(wb, "IS pos") +openxlsx::writeData(wb, sheet = 2, is_pos_intensities) +setColWidths(wb, 2, cols = 1, widths = 24) +addWorksheet(wb, "IS neg") +openxlsx::writeData(wb, sheet = 3, is_neg_intensities) +setColWidths(wb, 3, cols = 1, widths = 24) +addWorksheet(wb, "SST components") +openxlsx::writeData(wb, sheet = 4, sst_list_intensities) +setColWidths(wb, 4, cols = 1:3, widths = 24) +xlsx_name <- paste0(outdir, "/", project, "_IS_SST.xlsx") +openxlsx::saveWorkbook(wb, xlsx_name, overwrite = TRUE) +rm(wb) + + ### MISSING M/Z CHECK # check the outlist_identified_(negative/positive).RData files for missing m/z values and mention in the results mail # Load the outlist_identified files + remove the loaded files diff --git a/DIMS/GenerateExcel.nf b/DIMS/GenerateExcel.nf index 6134766..2979444 100644 --- a/DIMS/GenerateExcel.nf +++ b/DIMS/GenerateExcel.nf @@ -19,6 +19,6 @@ process GenerateExcel { script: """ - Rscript ${baseDir}/CustomModules/DIMS/GenerateExcel.R $init_file $analysis_id $params.matrix $relevance_file $params.zscore + Rscript ${baseDir}/CustomModules/DIMS/GenerateExcel.R $init_file $analysis_id $params.matrix $relevance_file $params.zscore $params.sst_components_file """ } From 0d30b63108343e167be9a4d48db64be9dd077b5b Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 21 Oct 2024 16:01:38 +0200 Subject: [PATCH 13/18] Added initialization of no outliers variables --- DIMS/GenerateExcel.R | 3 +++ 1 file changed, 3 insertions(+) diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index 56b5d37..83c7f28 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -187,6 +187,9 @@ if (z_score == 1) { # calculate robust scaler (Zscores minus outliers in Controls) outlist_robustZ$avg.ctrls <- 0 outlist_robustZ$sd.ctrls <- 0 + outlist_nooutliers$avg.ctrls <- 0 + outlist_nooutliers$sd.ctrls <- 0 + outlist_nooutliers$nr.ctrls <- 0 # only calculate robust Z-scores if there are enough Controls if (length(control_col_ids) > 10) { From ce07e4901b75bc8be8b3684927adc8936301ca57 Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Mon, 28 Oct 2024 09:23:00 +0100 Subject: [PATCH 14/18] Changed trimming positive mode --- DIMS/GenerateBreaks.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DIMS/GenerateBreaks.R b/DIMS/GenerateBreaks.R index ca7040d..d07fd20 100644 --- a/DIMS/GenerateBreaks.R +++ b/DIMS/GenerateBreaks.R @@ -28,8 +28,8 @@ pos_times <- raw_data@scantime[raw_data@polarity == "positive"] neg_times <- raw_data@scantime[raw_data@polarity == "negative"] # trim (remove) scans at the start and end for positive -trim_left_pos <- round(pos_times[length(pos_times) * trim]) -trim_right_pos <- round(pos_times[length(pos_times) * (1 - trim)]) +trim_left_pos <- round(pos_times[length(pos_times) * (trim * 1.5)]) # 15% aan het begin +trim_right_pos <- round(pos_times[length(pos_times) * (1 - (trim * 0.5))]) # 5% aan het eind # trim (remove) scans at the start and end for negative trim_left_neg <- round(neg_times[length(neg_times) * trim]) From 4703c39fa3509cd7020893f4a65087bfb69aa1c1 Mon Sep 17 00:00:00 2001 From: Anne Luesink Date: Mon, 28 Oct 2024 09:23:41 +0100 Subject: [PATCH 15/18] Added CV to SST output --- DIMS/GenerateExcel.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index 83c7f28..cd16507 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -709,7 +709,10 @@ sst_rows <- which(outlist$HMDB_code %in% sst_comp$HMDB_ID) sst_list <- outlist[sst_rows, ] sst_colnrs <- grep("P1001", colnames(sst_list)) if (length(sst_colnrs) > 0) { - sst_list_intensities <- sst_list[ , sst_colnrs] + sst_list_intensities <- sst_list[, sst_colnrs] + control_list_intensities <- sst_list[, control_col_ids] + control_list_cv <- calculate_coefficient_of_variation(control_list_intensities) + sst_list_intensities <- cbind(sst_list_intensities, CV_controls = control_list_cv[ , "CV_perc"]) } else { sst_list_intensities <- sst_list[ , intensity_col_ids] } From c361f1d3ad006acedc238a30a2f99684983812e8 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Mon, 11 Nov 2024 11:18:25 +0100 Subject: [PATCH 16/18] Changes as a result of the code review --- DIMS/AssignToBins.R | 3 +- DIMS/AverageTechReplicates.R | 5 +- DIMS/GenerateExcel.R | 232 +++++++++++++++++------------------ 3 files changed, 121 insertions(+), 119 deletions(-) diff --git a/DIMS/AssignToBins.R b/DIMS/AssignToBins.R index 5c588cb..e49d6c6 100644 --- a/DIMS/AssignToBins.R +++ b/DIMS/AssignToBins.R @@ -12,7 +12,8 @@ resol <- as.numeric(cmd_args[3]) trim <- 0.1 dims_thresh <- 100 -# load breaks_fwhm +# load breaks_file: contains breaks_fwhm, breaks_fwhm_avg, +# trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos load(breaks_filepath) # get sample name diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index 9d3671a..0c57ad7 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -41,10 +41,11 @@ remove_from_repl_pattern <- function(bad_samples, repl_pattern, nr_replicates) { return(list("pattern" = repl_pattern)) } -# get repl_pattern +# load init_file: contains repl_pattern load(init_file) -# get trim parameters +# load breaks_file: contains breaks_fwhm, breaks_fwhm_avg, +# trim_left_neg, trim_left_pos, trim_right_neg & trim_right_pos load(breaks_filepath) # lower the threshold below which a sample will be removed for DBS and for high m/z diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index cd16507..28b2707 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -118,16 +118,14 @@ tmp_neg <- outlist_neg_adducts_hmdb[rownames(outlist_pos_adducts_hmdb)[index_pos tmp_neg_left <- outlist_neg_adducts_hmdb[-index_neg, ] # Combine positive and negative numbers and paste back HMDB column -tmp <- apply(tmp_pos, 2, as.numeric) + apply(tmp_neg, 2, as.numeric) -rownames(tmp) <- rownames(tmp_pos) -tmp <- cbind(tmp, tmp_pos_info) -outlist <- rbind(tmp, tmp_pos_left, tmp_neg_left) +combi_pos_neg <- apply(tmp_pos, 2, as.numeric) + apply(tmp_neg, 2, as.numeric) +rownames(combi_pos_neg) <- rownames(tmp_pos) +combi_pos_neg <- cbind(combi_pos_neg, tmp_pos_info) +outlist <- rbind(combi_pos_neg, tmp_pos_left, tmp_neg_left) outlist <- outlist %>% arrange(rownames(outlist)) # Filter for biological relevance -# peaks_in_list <- which(rownames(outlist) %in% rownames(rlvnc)) -peaks_in_list <- which(rownames(outlist) %in% rlvnc$HMDB_key) -rlvnc_in_list <- rlvnc %>% filter(HMDB_key %in% rownames(outlist)[peaks_in_list]) +rlvnc_in_list <- rlvnc %>% filter(HMDB_key %in% rownames(outlist)) outlist <- cbind(outlist[peaks_in_list, ], as.data.frame(rlvnc_in_list)) # filter out all irrelevant HMDBs @@ -168,7 +166,8 @@ if (z_score == 1) { outlist[, intensity_col_ids][outlist[, intensity_col_ids] == 0] <- NA # save outlist as it is and use it to calculate robust scaler - outlist_robustZ <- outlist_nooutliers <- outlist + outlist_robustZ <- outlist + outlist_nooutliers <- outlist # calculate mean and sd for Control group outlist$avg.ctrls <- apply(control_columns, 1, function(x) mean(as.numeric(x), na.rm = TRUE)) @@ -213,17 +212,18 @@ if (z_score == 1) { if (length(control_col_ids) > 10) { for (metabolite_index in 1:nrow(outlist_nooutliers)) { intensities_without_outliers <- remove_outliers_for_Zscore(as.numeric(outlist_nooutliers[metabolite_index, control_col_ids]), outlier_threshold) - outlist_nooutliers$avg.ctrls[metabolite_index] <- mean(intensities_without_outliers) - outlist_nooutliers$sd.ctrls[metabolite_index] <- sd(intensities_without_outliers) + outlist_nooutliers$avg.ctrls[metabolite_index] <- mean(intensities_without_outliers) + outlist_nooutliers$sd.ctrls[metabolite_index] <- sd(intensities_without_outliers) outlist_nooutliers$nr.ctrls[metabolite_index] <- length(intensities_without_outliers) } } - + cnames_nooutliers <- gsub("_Zscore", "_OutlierRemovedZscore", colnames_z) - for (i in intensity_col_ids) { - zscores_1col <- (as.numeric(as.vector(unlist(outlist_nooutliers[, i]))) - outlist_nooutliers$avg.ctrls) / outlist_nooutliers$sd.ctrls - outlist_nooutliers <- cbind(outlist_nooutliers, zscores_1col) - } + outlist_nooutliers %>% apply(outlist_nooutliers, 2, function(col) { + (as.numeric(as.vector(unlist(col))) - outlist_nooutliers$avg.ctrls) / outlist_nooutliers$sd.ctrls + }) %>% + cbind(outlist_nooutliers) + # add column names for Z-scores without outliers. NB: 1 extra column so shift to +1 colnames(outlist_nooutliers)[(startcol + 1):ncol(outlist_nooutliers)] <- cnames_nooutliers @@ -318,14 +318,14 @@ openxlsx::saveWorkbook(wb, xlsx_name, overwrite = TRUE) rm(wb) #### INTERNAL STANDARDS #### -is_list <- outlist[grep("Internal standard", outlist[, "relevance"], fixed = TRUE), ] -is_codes <- rownames(is_list) +internal_stand_list <- outlist[grep("Internal standard", outlist[, "relevance"], fixed = TRUE), ] +internal_stand_codes <- rownames(internal_stand_list) # if all data from one samplename (for example P195.1) is filtered out in 3-averageTechReplicates # because of too little data (threshold parameter)i, the init.RData (repl_pattern) will contain more sample_names # than the peak data (IS), so this data needs to be removed first, before the retrieval of the summed adducts. # Write sample_names to a log file, to let user know that this sample_name contained no data. -sample_names_nodata <- setdiff(names(repl_pattern), names(is_list)) +sample_names_nodata <- setdiff(names(repl_pattern), names(internal_stand_list)) if (!is.null(sample_names_nodata)) { write.table(sample_names_nodata, file = paste(outdir, "sample_names_nodata.txt", sep = "/"), row.names = FALSE, col.names = FALSE, quote = FALSE) @@ -336,53 +336,53 @@ if (!is.null(sample_names_nodata)) { } # Retrieve IS summed adducts -is_summed <- is_list[c(names(repl_pattern), "HMDB_code")] -is_summed$HMDB.name <- is_list$name -is_summed <- reshape2::melt(is_summed, id.vars = c("HMDB_code", "HMDB.name")) -colnames(is_summed) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity") -is_summed$Matrix <- dims_matrix -is_summed$Rundate <- rundate -is_summed$Project <- project -is_summed$Intensity <- as.numeric(as.character(is_summed$Intensity)) +internal_stand_summed <- internal_stand_list[c(names(repl_pattern), "HMDB_code")] +internal_stand_summed$HMDB.name <- internal_stand_list$name +internal_stand_summed <- reshape2::melt(internal_stand_summed, id.vars = c("HMDB_code", "HMDB.name")) +colnames(internal_stand_summed) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity") +internal_stand_summed$Matrix <- dims_matrix +internal_stand_summed$Rundate <- rundate +internal_stand_summed$Project <- project +internal_stand_summed$Intensity <- as.numeric(as.character(internal_stand_summed$Intensity)) # Retrieve IS positive mode -is_pos <- as.data.frame(subset(outlist_pos_adducts_hmdb, rownames(outlist_pos_adducts_hmdb) %in% is_codes)) -is_pos$HMDB_name <- is_list[match(row.names(is_pos), is_list$HMDB_code, nomatch = NA), "name"] -is_pos$HMDB.code <- row.names(is_pos) -is_pos <- reshape2::melt(is_pos, id.vars = c("HMDB.code", "HMDB_name")) -colnames(is_pos) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity") -is_pos$Matrix <- dims_matrix -is_pos$Rundate <- rundate -is_pos$Project <- project -is_pos$Intensity <- as.numeric(as.character(is_pos$Intensity)) +internal_stand_pos <- as.data.frame(subset(outlist_pos_adducts_hmdb, rownames(outlist_pos_adducts_hmdb) %in% internal_stand_codes)) +internal_stand_pos$HMDB_name <- internal_stand_list[match(row.names(internal_stand_pos), internal_stand_list$HMDB_code, nomatch = NA), "name"] +internal_stand_pos$HMDB.code <- row.names(internal_stand_pos) +internal_stand_pos <- reshape2::melt(internal_stand_pos, id.vars = c("HMDB.code", "HMDB_name")) +colnames(internal_stand_pos) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity") +internal_stand_pos$Matrix <- dims_matrix +internal_stand_pos$Rundate <- rundate +internal_stand_pos$Project <- project +internal_stand_pos$Intensity <- as.numeric(as.character(internal_stand_pos$Intensity)) # Retrieve IS negative mode -is_neg <- as.data.frame(subset(outlist_neg_adducts_hmdb, rownames(outlist_neg_adducts_hmdb) %in% is_codes)) -is_neg$HMDB_name <- is_list[match(row.names(is_neg), is_list$HMDB_code, nomatch = NA), "name"] -is_neg$HMDB.code <- row.names(is_neg) -is_neg <- reshape2::melt(is_neg, id.vars = c("HMDB.code", "HMDB_name")) -colnames(is_neg) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity") -is_neg$Matrix <- dims_matrix -is_neg$Rundate <- rundate -is_neg$Project <- project -is_neg$Intensity <- as.numeric(as.character(is_neg$Intensity)) +internal_stand_neg <- as.data.frame(subset(outlist_neg_adducts_hmdb, rownames(outlist_neg_adducts_hmdb) %in% internal_stand_codes)) +internal_stand_neg$HMDB_name <- internal_stand_list[match(row.names(internal_stand_neg), internal_stand_list$HMDB_code, nomatch = NA), "name"] +internal_stand_neg$HMDB.code <- row.names(internal_stand_neg) +internal_stand_neg <- reshape2::melt(internal_stand_neg, id.vars = c("HMDB.code", "HMDB_name")) +colnames(internal_stand_neg) <- c("HMDB.code", "HMDB.name", "Sample", "Intensity") +internal_stand_neg$Matrix <- dims_matrix +internal_stand_neg$Rundate <- rundate +internal_stand_neg$Project <- project +internal_stand_neg$Intensity <- as.numeric(as.character(internal_stand_neg$Intensity)) # Save results -save(is_pos, is_neg, is_summed, file = paste0(outdir, "/", project, "_IS_results.RData")) +save(internal_stand_pos, internal_stand_neg, internal_stand_summed, file = paste0(outdir, "/", project, "_IS_results.RData")) # number of samples, for plotting length and width sample_count <- length(repl_pattern) # change the order of the x-axis summed plots to a natural sorted one -sample_naturalorder <- unique(as.character(is_summed$Sample)) +sample_naturalorder <- unique(as.character(internal_stand_summed$Sample)) sample_naturalorder <- stringr::str_sort(sample_naturalorder, numeric = TRUE) -is_summed$Sample_level <- factor(is_summed$Sample, levels = c(sample_naturalorder)) -is_pos$Sample_level <- factor(is_pos$Sample, levels = c(sample_naturalorder)) -is_neg$Sample_level <- factor(is_neg$Sample, levels = c(sample_naturalorder)) +internal_stand_summed$Sample_level <- factor(internal_stand_summed$Sample, levels = c(sample_naturalorder)) +internal_stand_pos$Sample_level <- factor(internal_stand_pos$Sample, levels = c(sample_naturalorder)) +internal_stand_neg$Sample_level <- factor(internal_stand_neg$Sample, levels = c(sample_naturalorder)) ## bar plots with all IS # theme for all IS bar plots -theme_is_bar <- function(my_plot) { +theme_internal_stand_bar <- function(my_plot) { my_plot + ggplot2::scale_y_continuous(breaks = scales::pretty_breaks(n = 10)) + ggplot2::theme(legend.position = "none", @@ -392,42 +392,42 @@ theme_is_bar <- function(my_plot) { } # ggplot functions -is_neg_bar_plot <- ggplot(is_neg, aes(Sample_level, Intensity)) + +internal_stand_neg_bar_plot <- ggplot(internal_stand_neg, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Neg)") + geom_bar(aes(fill = HMDB.name), stat = "identity") + labs(x = "", y = "Intensity") + facet_wrap(~HMDB.name, scales = "free_y") -is_pos_bar_plot <- ggplot(is_pos, aes(Sample_level, Intensity)) + +internal_stand_pos_bar_plot <- ggplot(internal_stand_pos, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Pos)") + geom_bar(aes(fill = HMDB.name), stat = "identity") + labs(x = "", y = "Intensity") + facet_wrap(~HMDB.name, scales = "free_y") -is_sum_bar_plot <- ggplot(is_summed, aes(Sample_level, Intensity)) + +internal_stand_sum_bar_plot <- ggplot(internal_stand_summed, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Summed)") + geom_bar(aes(fill = HMDB.name), stat = "identity") + labs(x = "", y = "Intensity") + facet_wrap(~HMDB.name, scales = "free_y") # add theme to ggplot functions -is_neg_bar_plot <- theme_is_bar(is_neg_bar_plot) -is_pos_bar_plot <- theme_is_bar(is_pos_bar_plot) -is_sum_bar_plot <- theme_is_bar(is_sum_bar_plot) +internal_stand_neg_bar_plot <- theme_internal_stand_bar(internal_stand_neg_bar_plot) +internal_stand_pos_bar_plot <- theme_internal_stand_bar(internal_stand_pos_bar_plot) +internal_stand_sum_bar_plot <- theme_internal_stand_bar(internal_stand_sum_bar_plot) # save plots to disk plot_width <- 9 + 0.35 * sample_count ggsave(paste0(outdir, "/plots/IS_bar_all_neg.png"), - plot = is_neg_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in") + plot = internal_stand_neg_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in") ggsave(paste0(outdir, "/plots/IS_bar_all_pos.png"), - plot = is_pos_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in") + plot = internal_stand_pos_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in") ggsave(paste0(outdir, "/plots/IS_bar_all_sum.png"), - plot = is_sum_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in") + plot = internal_stand_sum_bar_plot, height = plot_width / 2.5, width = plot_width, units = "in") ## Line plots with all IS # function for ggplot theme # add smaller legend in the "all IS line plots", otherwise out-of-range when more than 13 IS lines -theme_is_line <- function(my_plot) { +theme_internal_stand_line <- function(my_plot) { my_plot + guides(shape = guide_legend(override.aes = list(size = 0.5)), color = guide_legend(override.aes = list(size = 0.5)) @@ -440,44 +440,44 @@ theme_is_line <- function(my_plot) { } # ggplot functions -is_neg_line_plot <- ggplot(is_neg, aes(Sample_level, Intensity)) + +internal_stand_neg_line_plot <- ggplot(internal_stand_neg, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Neg)") + geom_point(aes(col = HMDB.name)) + geom_line(aes(col = HMDB.name, group = HMDB.name)) + labs(x = "", y = "Intensity") -is_pos_line_plot <- ggplot(is_pos, aes(Sample_level, Intensity)) + +internal_stand_pos_line_plot <- ggplot(internal_stand_pos, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Pos)") + geom_point(aes(col = HMDB.name)) + geom_line(aes(col = HMDB.name, group = HMDB.name)) + labs(x = "", y = "Intensity") -is_sum_line_plot <- ggplot(is_summed, aes(Sample_level, Intensity)) + +internal_stand_sum_line_plot <- ggplot(internal_stand_summed, aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Sum)") + geom_point(aes(col = HMDB.name)) + geom_line(aes(col = HMDB.name, group = HMDB.name)) + labs(x = "", y = "Intensity") # add theme to ggplot functions -is_sum_line_plot <- theme_is_line(is_sum_line_plot) -is_neg_line_plot <- theme_is_line(is_neg_line_plot) -is_pos_line_plot <- theme_is_line(is_pos_line_plot) +internal_stand_sum_line_plot <- theme_internal_stand_line(internal_stand_sum_line_plot) +internal_stand_neg_line_plot <- theme_internal_stand_line(internal_stand_neg_line_plot) +internal_stand_pos_line_plot <- theme_internal_stand_line(internal_stand_pos_line_plot) # save plots to disk plot_width <- 8 + 0.2 * sample_count ggsave(paste0(outdir, "/plots/IS_line_all_neg.png"), - plot = is_neg_line_plot, height = plot_width / 2.5, width = plot_width, units = "in") + plot = internal_stand_neg_line_plot, height = plot_width / 2.5, width = plot_width, units = "in") ggsave(paste0(outdir, "/plots/IS_line_all_pos.png"), - plot = is_pos_line_plot, height = plot_width / 2.5, width = plot_width, units = "in") + plot = internal_stand_pos_line_plot, height = plot_width / 2.5, width = plot_width, units = "in") ggsave(paste0(outdir, "/plots/IS_line_all_sum.png"), - plot = is_sum_line_plot, height = plot_width / 2.5, width = plot_width, units = "in") + plot = internal_stand_sum_line_plot, height = plot_width / 2.5, width = plot_width, units = "in") ## bar plots with a selection of IS -is_neg_selection <- c("2H2-Ornithine (IS)", "2H3-Glutamate (IS)", "2H2-Citrulline (IS)", "2H4_13C5-Arginine (IS)", +internal_stand_neg_selection <- c("2H2-Ornithine (IS)", "2H3-Glutamate (IS)", "2H2-Citrulline (IS)", "2H4_13C5-Arginine (IS)", "13C6-Tyrosine (IS)") -is_pos_selection <- c("2H4-Alanine (IS)", "13C6-Phenylalanine (IS)", "2H4_13C5-Arginine (IS)", "2H3-Propionylcarnitine (IS)", +internal_stand_pos_selection <- c("2H4-Alanine (IS)", "13C6-Phenylalanine (IS)", "2H4_13C5-Arginine (IS)", "2H3-Propionylcarnitine (IS)", "2H9-Isovalerylcarnitine (IS)") -is_sum_selection <- c("2H8-Valine (IS)", "2H3-Leucine (IS)", "2H3-Glutamate (IS)", "2H4_13C5-Arginine (IS)", +internal_stand_sum_selection <- c("2H8-Valine (IS)", "2H3-Leucine (IS)", "2H3-Glutamate (IS)", "2H4_13C5-Arginine (IS)", "13C6-Tyrosine (IS)") # add minimal intensity lines based on matrix (DBS or Plasma) and machine mode (neg, pos, sum) @@ -485,111 +485,111 @@ if (dims_matrix == "DBS") { hline_data_neg <- data.frame( z = c(15000, 200000, 130000, 18000, 50000), - HMDB.name = is_neg_selection + HMDB.name = internal_stand_neg_selection ) hline_data_pos <- data.frame( z = c(150000, 3300000, 1750000, 150000, 270000), - HMDB.name = is_pos_selection + HMDB.name = internal_stand_pos_selection ) hline_data_sum <- data.frame( z = c(1300000, 2500000, 500000, 1800000, 1400000), - HMDB.name = is_sum_selection + HMDB.name = internal_stand_sum_selection ) } else if (dims_matrix == "Plasma") { hline_data_neg <- data.frame( z = c(6500, 100000, 75000, 7500, 25000), - HMDB.name = is_neg_selection + HMDB.name = internal_stand_neg_selection ) hline_data_pos <- data.frame( z = c(85000, 1000000, 425000, 70000, 180000), - HMDB.name = is_pos_selection + HMDB.name = internal_stand_pos_selection ) hline_data_sum <- data.frame( z = c(700000, 1250000, 150000, 425000, 300000), - HMDB.name = is_sum_selection + HMDB.name = internal_stand_sum_selection ) } # ggplot functions -is_neg_selection_barplot <- ggplot(subset(is_neg, HMDB.name %in% is_neg_selection), aes(Sample_level, Intensity)) + +internal_stand_neg_selection_barplot <- ggplot(subset(internal_stand_neg, HMDB.name %in% internal_stand_neg_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Neg)") + geom_bar(aes(fill = HMDB.name), stat = "identity") + labs(x = "", y = "Intensity") + facet_wrap(~HMDB.name, scales = "free", ncol = 2) + if (exists("hline_data_neg")) { - geom_hline(aes(yintercept = z), subset(hline_data_neg, HMDB.name %in% is_neg$HMDB.name)) + geom_hline(aes(yintercept = z), subset(hline_data_neg, HMDB.name %in% internal_stand_neg$HMDB.name)) } -is_pos_selection_barplot <- ggplot(subset(is_pos, HMDB.name %in% is_pos_selection), aes(Sample_level, Intensity)) + +internal_stand_pos_selection_barplot <- ggplot(subset(internal_stand_pos, HMDB.name %in% internal_stand_pos_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Pos)") + geom_bar(aes(fill = HMDB.name), stat = "identity") + labs(x = "", y = "Intensity") + facet_wrap(~HMDB.name, scales = "free", ncol = 2) + if (exists("hline_data_pos")) { - geom_hline(aes(yintercept = z), subset(hline_data_pos, HMDB.name %in% is_pos$HMDB.name)) + geom_hline(aes(yintercept = z), subset(hline_data_pos, HMDB.name %in% internal_stand_pos$HMDB.name)) } -is_sum_selection_barplot <- ggplot(subset(is_summed, HMDB.name %in% is_sum_selection), aes(Sample_level, Intensity)) + +internal_stand_sum_selection_barplot <- ggplot(subset(internal_stand_summed, HMDB.name %in% internal_stand_sum_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Sum)") + geom_bar(aes(fill = HMDB.name), stat = "identity") + labs(x = "", y = "Intensity") + facet_wrap(~HMDB.name, scales = "free", ncol = 2) + if (exists("hline_data_sum")) { - geom_hline(aes(yintercept = z), subset(hline_data_sum, HMDB.name %in% is_summed$HMDB.name)) + geom_hline(aes(yintercept = z), subset(hline_data_sum, HMDB.name %in% internal_stand_summed$HMDB.name)) } # add theme to ggplot functions -is_neg_selection_barplot <- theme_is_bar(is_neg_selection_barplot) -is_pos_selection_barplot <- theme_is_bar(is_pos_selection_barplot) -is_sum_selection_barplot <- theme_is_bar(is_sum_selection_barplot) +internal_stand_neg_selection_barplot <- theme_internal_stand_bar(internal_stand_neg_selection_barplot) +internal_stand_pos_selection_barplot <- theme_internal_stand_bar(internal_stand_pos_selection_barplot) +internal_stand_sum_selection_barplot <- theme_internal_stand_bar(internal_stand_sum_selection_barplot) # save plots to disk plot_width <- 9 + 0.35 * sample_count ggsave(paste0(outdir, "/plots/IS_bar_select_neg.png"), - plot = is_neg_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in") + plot = internal_stand_neg_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in") ggsave(paste0(outdir, "/plots/IS_bar_select_pos.png"), - plot = is_pos_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in") + plot = internal_stand_pos_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in") ggsave(paste0(outdir, "/plots/IS_bar_select_sum.png"), - plot = is_sum_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in") + plot = internal_stand_sum_selection_barplot, height = plot_width / 2.0, width = plot_width, units = "in") ## line plots with a selection of IS # ggplot functions -is_neg_selection_lineplot <- ggplot(subset(is_neg, HMDB.name %in% is_neg_selection), aes(Sample_level, Intensity)) + +internal_stand_neg_selection_lineplot <- ggplot(subset(internal_stand_neg, HMDB.name %in% internal_stand_neg_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Neg)") + geom_point(aes(col = HMDB.name)) + geom_line(aes(col = HMDB.name, group = HMDB.name)) + labs(x = "", y = "Intensity") -is_pos_selection_lineplot <- ggplot(subset(is_pos, HMDB.name %in% is_pos_selection), aes(Sample_level, Intensity)) + +internal_stand_pos_selection_lineplot <- ggplot(subset(internal_stand_pos, HMDB.name %in% internal_stand_pos_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Pos)") + geom_point(aes(col = HMDB.name)) + geom_line(aes(col = HMDB.name, group = HMDB.name)) + labs(x = "", y = "Intensity") -is_sum_selection_lineplot <- ggplot(subset(is_summed, HMDB.name %in% is_sum_selection), aes(Sample_level, Intensity)) + +internal_stand_sum_selection_lineplot <- ggplot(subset(internal_stand_summed, HMDB.name %in% internal_stand_sum_selection), aes(Sample_level, Intensity)) + ggtitle("Interne Standaard (Sum)") + geom_point(aes(col = HMDB.name)) + geom_line(aes(col = HMDB.name, group = HMDB.name)) + labs(x = "", y = "Intensity") # add theme to ggplot functions -is_neg_selection_lineplot <- theme_is_line(is_neg_selection_lineplot) -is_pos_selection_lineplot <- theme_is_line(is_pos_selection_lineplot) -is_sum_selection_lineplot <- theme_is_line(is_sum_selection_lineplot) +internal_stand_neg_selection_lineplot <- theme_internal_stand_line(internal_stand_neg_selection_lineplot) +internal_stand_pos_selection_lineplot <- theme_internal_stand_line(internal_stand_pos_selection_lineplot) +internal_stand_sum_selection_lineplot <- theme_internal_stand_line(internal_stand_sum_selection_lineplot) # save plots to disk plot_width <- 8 + 0.2 * sample_count ggsave(paste0(outdir, "/plots/IS_line_select_neg.png"), - plot = is_neg_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in") + plot = internal_stand_neg_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in") ggsave(paste0(outdir, "/plots/IS_line_select_pos.png"), - plot = is_pos_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in") + plot = internal_stand_pos_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in") ggsave(paste0(outdir, "/plots/IS_line_select_sum.png"), - plot = is_sum_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in") + plot = internal_stand_sum_selection_lineplot, height = plot_width / 2.5, width = plot_width, units = "in") ### POSITIVE CONTROLS CHECK @@ -683,25 +683,25 @@ calculate_coefficient_of_variation <- function(intensity_list) { } # Internal standards lists, calculate coefficients of variation -if ("plots" %in% colnames(is_list)) { - intensity_col_ids <- 2:(which(colnames(is_list) == "HMDB_name") - 1) +if ("plots" %in% colnames(internal_stand_list)) { + intensity_col_ids <- 2:(which(colnames(internal_stand_list) == "HMDB_name") - 1) } else { - intensity_col_ids <- 1:(which(colnames(is_list) == "HMDB_name") - 1) + intensity_col_ids <- 1:(which(colnames(internal_stand_list) == "HMDB_name") - 1) } # previous intensity_col_ids is based on the assumption that there are Controls and Patients -is_list_intensities <- is_list[ , intensity_col_ids] -is_list_intensities <- calculate_coefficient_of_variation(is_list_intensities) -is_list_intensities <- cbind(IS_name = is_list$HMDB_name, is_list_intensities) +internal_stand_list_intensities <- internal_stand_list[ , intensity_col_ids] +internal_stand_list_intensities <- calculate_coefficient_of_variation(internal_stand_list_intensities) +internal_stand_list_intensities <- cbind(IS_name = internal_stand_list$HMDB_name, internal_stand_list_intensities) # separate adducts of IS -is_pos <- as.data.frame(subset(outlist_pos_adducts_hmdb, rownames(outlist_pos_adducts_hmdb) %in% is_codes)) -is_neg <- as.data.frame(subset(outlist_neg_adducts_hmdb, rownames(outlist_neg_adducts_hmdb) %in% is_codes)) -is_pos_intensities <- is_pos[ , -which(colnames(is_pos) == "HMDB_name")] -is_neg_intensities <- is_neg[ , -which(colnames(is_neg) == "HMDB_name")] -is_pos_intensities <- calculate_coefficient_of_variation(is_pos_intensities) -is_neg_intensities <- calculate_coefficient_of_variation(is_neg_intensities) -is_pos_intensities <- cbind(IS_name = is_pos$HMDB_name, is_pos_intensities) -is_neg_intensities <- cbind(IS_name = is_neg$HMDB_name, is_neg_intensities) +internal_stand_pos <- as.data.frame(subset(outlist_pos_adducts_hmdb, rownames(outlist_pos_adducts_hmdb) %in% internal_stand_codes)) +internal_stand_neg <- as.data.frame(subset(outlist_neg_adducts_hmdb, rownames(outlist_neg_adducts_hmdb) %in% internal_stand_codes)) +internal_stand_pos_intensities <- internal_stand_pos[ , -which(colnames(internal_stand_pos) == "HMDB_name")] +internal_stand_neg_intensities <- internal_stand_neg[ , -which(colnames(internal_stand_neg) == "HMDB_name")] +internal_stand_pos_intensities <- calculate_coefficient_of_variation(internal_stand_pos_intensities) +internal_stand_neg_intensities <- calculate_coefficient_of_variation(internal_stand_neg_intensities) +internal_stand_pos_intensities <- cbind(IS_name = internal_stand_pos$HMDB_name, internal_stand_pos_intensities) +internal_stand_neg_intensities <- cbind(IS_name = internal_stand_neg$HMDB_name, internal_stand_neg_intensities) # SST components. sst_comp <- read.csv(sst_components_file, header = TRUE, sep = "\t") @@ -729,13 +729,13 @@ sst_list_intensities <- cbind(SST_comp_name = sst_list$HMDB_name, sst_list_inten # Create Excel file wb <- createWorkbook("IS_SST") addWorksheet(wb, "Internal Standards") -openxlsx::writeData(wb, sheet = 1, is_list_intensities) +openxlsx::writeData(wb, sheet = 1, internal_stand_list_intensities) setColWidths(wb, 1, cols = 1, widths = 24) addWorksheet(wb, "IS pos") -openxlsx::writeData(wb, sheet = 2, is_pos_intensities) +openxlsx::writeData(wb, sheet = 2, internal_stand_pos_intensities) setColWidths(wb, 2, cols = 1, widths = 24) addWorksheet(wb, "IS neg") -openxlsx::writeData(wb, sheet = 3, is_neg_intensities) +openxlsx::writeData(wb, sheet = 3, internal_stand_neg_intensities) setColWidths(wb, 3, cols = 1, widths = 24) addWorksheet(wb, "SST components") openxlsx::writeData(wb, sheet = 4, sst_list_intensities) From 882c4034fad8f29f26ff54846018f377c431d7c5 Mon Sep 17 00:00:00 2001 From: ALuesink Date: Thu, 14 Nov 2024 14:07:55 +0100 Subject: [PATCH 17/18] Changes according to the code review --- DIMS/AverageTechReplicates.R | 2 +- DIMS/GenerateExcel.R | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/DIMS/AverageTechReplicates.R b/DIMS/AverageTechReplicates.R index 0c57ad7..98a2d41 100644 --- a/DIMS/AverageTechReplicates.R +++ b/DIMS/AverageTechReplicates.R @@ -53,7 +53,7 @@ if (dims_matrix == "DBS") { thresh2remove <- 500000000 } if (highest_mz > 700) { - thresh2remove <- 10000000 + thresh2remove <- 1000000 } diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index 28b2707..04860ea 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -46,8 +46,8 @@ robust_scaler <- function(control_intensities, control_col_ids, perc = 5) { return(trimmed_control_intensities) } -remove_outliers_for_Zscore <- function(control_intensities, outlier_threshold = 2) { - #' Remove outliers per metabolite before calculating Z-scores +remove_outliers_grubbs <- function(control_intensities, outlier_threshold = 2) { + #' Remove outliers per metabolite according to Grubb's test #' #' @param control_intensities: Vector with intensities for control samples #' @param outlier_threshold: Threshold for outliers which will be removed from controls (float) @@ -211,7 +211,7 @@ if (z_score == 1) { # calculate Z-scores after removal of outliers in Control samples if (length(control_col_ids) > 10) { for (metabolite_index in 1:nrow(outlist_nooutliers)) { - intensities_without_outliers <- remove_outliers_for_Zscore(as.numeric(outlist_nooutliers[metabolite_index, control_col_ids]), outlier_threshold) + intensities_without_outliers <- remove_outliers_grubbs(as.numeric(outlist_nooutliers[metabolite_index, control_col_ids]), outlier_threshold) outlist_nooutliers$avg.ctrls[metabolite_index] <- mean(intensities_without_outliers) outlist_nooutliers$sd.ctrls[metabolite_index] <- sd(intensities_without_outliers) outlist_nooutliers$nr.ctrls[metabolite_index] <- length(intensities_without_outliers) From 74180ee98c12ce27eda4361e1dc011ce9a983a3a Mon Sep 17 00:00:00 2001 From: ALuesink Date: Thu, 14 Nov 2024 14:36:33 +0100 Subject: [PATCH 18/18] Added docstring to function calculate_coefficient_of_variation --- DIMS/GenerateExcel.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/DIMS/GenerateExcel.R b/DIMS/GenerateExcel.R index 04860ea..fd42b4f 100644 --- a/DIMS/GenerateExcel.R +++ b/DIMS/GenerateExcel.R @@ -668,6 +668,11 @@ if (z_score == 1) { ### SST components output #### calculate_coefficient_of_variation <- function(intensity_list) { + #' Calculate coefficent of variation (cv) based on standard deviation (sd) and mean + #' + #' @param intensity_list: Matrix with intensities + #' + #' @return intensity_list_with_cv: Matrix with intensities and cv, mean, sd for (col_nr in 1:ncol(intensity_list)) { intensity_list[, col_nr] <- as.numeric(intensity_list[, col_nr]) intensity_list[, col_nr] <- round(intensity_list[, col_nr], 0)