New version
Jakob Russel committed Oct 9, 2017
1 parent b2158ef commit cfaf128
Showing 11 changed files with 254 additions and 30 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,6 +1,6 @@
Package: DAtest
Title: Comparing Differential Abundance methods
Version: 2.6.2
Version: 2.6.3
Authors@R: person("Jakob", "Russel", email = "russel2620@gmail.com", role = c("aut", "cre"))
Description: What the title says.
Depends: R (>= 3.2.5)
2 changes: 1 addition & 1 deletion NAMESPACE
@@ -48,6 +48,7 @@ export(DA.zzz)
export(add.tax.DA)
export(allDA)
export(prune.tests.DA)
export(runtimeDA)
export(spikein)
export(testDA)
export(vennDA)
@@ -58,7 +59,6 @@ import(ggplot2)
import(methods)
import(nlme)
import(pscl)
import(samr)
import(snow)
import(statmod)
import(stats)
5 changes: 4 additions & 1 deletion R/DA.sam.R
@@ -6,10 +6,11 @@
#' @param fdr.output Passed to SAMseq. (Approximate) False Discovery Rate cutoff for output in significant genes table
#' @param allResults If TRUE will return raw results from the SAMseq function
#' @param ... Additional arguments for the SAMseq function
#' @import samr
#' @export
DA.sam <- function(data, predictor, paired = NULL, fdr.output = 0.05, allResults = FALSE, ...){

library(samr)

# Extract from phyloseq
if(class(data) == "phyloseq"){
if(length(predictor) > 1 | length(paired) > 1) stop("When data is a phyloseq object predictor and paired should only contain the name of the variables in sample_data")
@@ -76,6 +77,8 @@ DA.sam <- function(data, predictor, paired = NULL, fdr.output = 0.05, allResults

if(class(data) == "phyloseq") df <- add.tax.DA(data, df)

detach("package:samr", unload = TRUE)

if(allResults){
return(res)
} else {
21 changes: 14 additions & 7 deletions R/allDA.R
@@ -13,6 +13,7 @@
#' @param args List. A list with lists of arguments passed to the different methods. See details for more.
#' @param out.anova If TRUE (default) linear models will output results and p-values from anova/drop1. If FALSE will output results for 2. level of the predictor.
#' @param alpha P-value threshold for calling significance. Default 0.05
#' @param core.check If TRUE, makes an interactive check that the specified number of cores is intended. Only if cores > 10. This is to ensure that the function doesn't automatically overload a server with workers.
#' @details Currently implemented methods:
#' \itemize{
#' \item per - Permutation test with user defined test statistic
@@ -107,13 +108,15 @@
#'
#' @export

allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("sam","qua","fri","znb","zpo","vli","qpo","poi","pea","spe","per","bay","adx","wil","ttt","ltt","ltt2","neb","erq","ere","erq2","ere2","msf","zig","ds2","lim","aov","lao","lao2","kru","lrm","llm","llm2","rai"), relative = TRUE, cores = (detectCores()-1), rng.seed = 123, p.adj = "fdr", args = list(), out.anova = TRUE, alpha = 0.05){
allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("sam","qua","fri","znb","zpo","vli","qpo","poi","pea","spe","per","bay","adx","wil","ttt","ltt","ltt2","neb","erq","ere","erq2","ere2","msf","zig","ds2","lim","aov","lao","lao2","kru","lrm","llm","llm2","rai"), relative = TRUE, cores = (detectCores()-1), rng.seed = 123, p.adj = "fdr", args = list(), out.anova = TRUE, alpha = 0.05, core.check = TRUE){

stopifnot(exists("data"),exists("predictor"))
# Check for servers
if(cores > 10){
ANSWER <- readline(paste("You are about to run allDA using",cores,"cores. Enter y to proceed "))
if(ANSWER != "y") stop("Process aborted")
if(core.check){
if(cores > 10){
ANSWER <- readline(paste("You are about to run allDA using",cores,"cores. Enter y to proceed "))
if(ANSWER != "y") stop("Process aborted")
}
}

# Extract from phyloseq
@@ -156,14 +159,18 @@ allDA <- function(data, predictor, paired = NULL, covars = NULL, tests = c("sam"

# Numeric predictor
if(is.numeric(predictor[1])){
message("predictor is assumed to be a continuous/quantitative variable")
message("predictor is assumed to be a quantitative variable")
if(length(levels(as.factor(predictor))) == 2){
ANSWER <- readline("The predictor is quantitative, but only contains 2 unique values. Are you sure this is correct? Enter y to proceed ")
if(ANSWER != "y") stop("Wrap the predictor with as.factor(predictor) to treat it as a categorical variable")
}
}

# Covars
if(!is.null(covars)){
for(i in 1:length(covars)){
if(is.numeric(covars[[i]][1])){
message(paste(names(covars)[i],"is assumed to be a continuous/quantitative variable"))
message(paste(names(covars)[i],"is assumed to be a quantitative variable"))
} else {
message(paste(names(covars)[i],"is assumed to be a categorical variable"))
}
135 changes: 135 additions & 0 deletions R/runtimeDA.R
@@ -0,0 +1,135 @@
#' Estimate runtime of testDA on large datasets
#'
#' Estimate the runtime of testDA by running it on subsets of the features. Intended for datasets with at least 5000 features.
#'
#' Runtimes of all methods are expected to scale linearly with the number of features, except "anc" and "bay", which are modelled with a second-order polynomial.
#' @param data Either a matrix with counts/abundances, OR a phyloseq object. If a matrix/data.frame is provided rows should be taxa/genes/proteins and columns samples, and there should be rownames
#' @param predictor The predictor of interest. Either a Factor or Numeric, OR if data is a phyloseq object the name of the variable in sample_data in quotation. If the predictor is numeric it will be treated as such in the analyses
#' @param paired For paired/blocked experimental designs. Either a Factor with Subject/Block ID for running paired/blocked analysis, OR if data is a phyloseq object the name of the variable in sample_data in quotation. Only for "anc", "poi", "per", "ttt", "ltt", "ltt2", "neb", "wil", "erq", "ds2", "lrm", "llm", "llm2", "lim", "lli", "lli2" and "zig"
#' @param covars Either a named list with covariates, OR if data is a phyloseq object a character vector with names of the variables in sample_data(data)
#' @param subsamples Vector with numbers of features to subsample to estimate runtime for fast methods
#' @param subsamples.slow Vector with numbers of features to subsample to estimate runtime for slow methods
#' @param tests Fast methods to include
#' @param tests.slow Slow methods to include
#' @param R Intended number of repeats for the testDA function
#' @param cores Integer. Number of cores to use for parallel computing. Default one less than available. Set to 1 for sequential computing.
#' @param print.res If TRUE the results are printed; otherwise a data.frame with the results is returned.
#' @param ... Additional arguments for the testDA function
#' @return A data.frame if print.res is FALSE
#' @importFrom parallel detectCores
#' @export
runtimeDA <- function(data, predictor, paired = NULL, covars = NULL, subsamples = c(500,1000,1500,2000,2500), subsamples.slow = c(100,200,300,400,500),
tests = c("ds2","sam", "qua", "fri", "vli", "qpo", "poi", "pea", "wil", "ttt", "ltt", "ltt2", "erq", "erq2","ere", "ere2", "msf", "zig", "lim", "lli", "lli2", "aov", "lao", "lao2", "kru", "lrm", "llm", "llm2", "spe"),
tests.slow = c("neb", "bay", "per", "zpo", "znb", "rai", "adx"), R = 10, cores = (detectCores()-1), print.res = TRUE, ...){

stopifnot(exists("data"),exists("predictor"))

if(cores > 10){
ANSWER <- readline(paste("You are about to run runtimeDA using",cores,"cores. Enter y to proceed "))
if(ANSWER != "y") stop("Process aborted")
}

# Extract from phyloseq
if(class(data) == "phyloseq"){
if(length(predictor) > 1 | length(paired) > 1) stop("When data is a phyloseq object predictor and paired should only contain the name of the variables in sample_data")
if(!predictor %in% sample_variables(data)) stop(paste(predictor,"is not present in sample_data(data)"))
if(!is.null(paired)){
if(!paired %in% sample_variables(data)) stop(paste(paired,"is not present in sample_data(data)"))
}
count_table <- otu_table(data)
if(!taxa_are_rows(data)) count_table <- t(count_table)
predictor <- suppressWarnings(as.matrix(sample_data(data)[,predictor]))
if(!is.null(paired)) paired <- suppressWarnings(as.factor(as.matrix(sample_data(data)[,paired])))
if(!is.null(covars)){
covars.n <- covars
covars <- list()
for(i in 1:length(covars.n)){
covars[[i]] <- suppressWarnings(as.matrix(sample_data(data)[,covars.n[i]]))
}
names(covars) <- covars.n
}
} else {
count_table <- data
}

# Remove Features not present in any samples
if(sum(rowSums(count_table) == 0) != 0) message(paste(sum(rowSums(count_table) == 0),"empty features removed"))
count_table <- count_table[rowSums(count_table) > 0,]

# Run subsets fast
message("Running fast methods")
test.list <- list()
for(i in 1:length(subsamples)){

# Subset and ensure that no samples are empty
j <- 0
while(j == 0){
sub <- count_table[sample(rownames(count_table),subsamples[i]),]
if(any(colSums(sub) == 0)) j <- 0 else j <- 1
}

# Run test
sub.test <- testDA(sub, predictor, paired, covars, R = 1, tests = tests, cores = cores, core.check = FALSE, ...)
test.list[[i]] <- sub.test
}

# Run subsets slow
message("Running slow methods")
test.slow.list <- list()
for(i in 1:length(subsamples.slow)){

# Subset and ensure that no samples are empty
j <- 0
while(j == 0){
sub <- count_table[sample(rownames(count_table),subsamples.slow[i]),]
if(any(colSums(sub) == 0)) j <- 0 else j <- 1
}

# Run test
sub.test <- testDA(sub, predictor, paired, covars, R = 1, tests = tests.slow, cores = cores, core.check = FALSE, ...)
test.slow.list[[i]] <- sub.test
}

tests.list <- append(test.list,test.slow.list)
subsamps <- c(subsamples,subsamples.slow)

# Extrapolate
runtimes <- lapply(tests.list, function(x) x$run.times)
runtimes <- lapply(1:length(runtimes), function(x) cbind(runtimes[[x]],subsamps[[x]]))
runtimes <- do.call(rbind,runtimes)

# Which tests have been run
all.tests <- unique(rownames(runtimes))
all.tests <- names(table(rownames(runtimes))[table(rownames(runtimes))>1])

# Collect data
extra <- data.frame(Test = all.tests,
Minutes = NA,
Minutes. = NA)
colnames(extra)[3] <- paste0(colnames(extra[3]),"R=",R)

for(i in all.tests){
extra.sub <- runtimes[rownames(runtimes) == i,]
if(i %in% c("anc","bay")){
fit <- lm(Minutes ~ poly(V2,2), data = as.data.frame(extra.sub))
} else {
fit <- lm(Minutes ~ V2, data = as.data.frame(extra.sub))
}
extra[extra$Test == i,2] <- round(predict(fit, newdata = data.frame(V2 = nrow(count_table))),2)
extra[extra$Test == i,3] <- round(predict(fit, newdata = data.frame(V2 = nrow(count_table)))*R,2)
}

extra[extra[,2] < 0,"Minutes"] <- 0
extra[extra[,3] < 0, 3] <- 0

# Order extra
extra <- extra[order(extra$Minutes, decreasing = TRUE),]

if(print.res){
# Print the results
message("Estimated run times.\nWith cores=1 the runtime will be the sum of them all.\nWith more cores the actual runtime will decrease asymptotically towards the slowest test")
print(extra, row.names = FALSE)
} else {
return(extra)
}
}
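
For reference, the extrapolation above boils down to fitting the measured runtimes against the number of features in each subset and predicting at the full feature count. A minimal standalone sketch of that idea, with purely illustrative runtime numbers:

    subsamps <- c(500, 1000, 1500, 2000, 2500)            # features per subset
    minutes  <- c(0.4, 0.8, 1.3, 1.7, 2.1)                # measured runtimes (made up)
    fit <- lm(minutes ~ subsamps)                         # linear scaling, as assumed for most methods
    predict(fit, newdata = data.frame(subsamps = 50000))  # estimated minutes for the full dataset
    # Slow-scaling methods such as "bay" would use lm(minutes ~ poly(subsamps, 2)) instead
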
29 changes: 18 additions & 11 deletions R/testDA.R
@@ -14,6 +14,7 @@
#' @param rng.seed Numeric. Seed for reproducibility. Default 123
#' @param args List. A list with lists of arguments passed to the different methods. See details for more.
#' @param out.anova If TRUE (default) linear models will output results and p-values from anova/drop1. If FALSE will output results for 2. level of the predictor.
#' @param core.check If TRUE, makes an interactive check that the specified number of cores is intended. Only if cores > 10. This is to ensure that the function doesn't automatically overload a server with workers.
#' @details Currently implemented methods:
#' \itemize{
#' \item per - Permutation test with user defined test statistic
@@ -116,15 +117,16 @@
#' @importFrom pROC roc
#' @export

testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests = c("sam","qua","fri","zpo","znb","vli","qpo","poi","pea","neb","rai","per","bay","adx","wil","ttt","ltt","ltt2","erq","erq2","ere","ere2","msf","zig","ds2","lim","lli","lli2","aov","lao","lao2","kru","lrm","llm","llm2","spe"), relative = TRUE, effectSize = 2, k = c(5,5,5), cores = (detectCores()-1), rng.seed = 123, args = list(), out.anova = TRUE){
testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests = c("sam","qua","fri","zpo","znb","vli","qpo","poi","pea","neb","rai","per","bay","adx","wil","ttt","ltt","ltt2","erq","erq2","ere","ere2","msf","zig","ds2","lim","lli","lli2","aov","lao","lao2","kru","lrm","llm","llm2","spe"), relative = TRUE, effectSize = 2, k = c(5,5,5), cores = (detectCores()-1), rng.seed = 123, args = list(), out.anova = TRUE, core.check = TRUE){

stopifnot(exists("data"),exists("predictor"))
# Check for servers
if(cores > 10){
ANSWER <- readline(paste("You are about to run testDA using",cores,"cores. Enter y to proceed "))
if(ANSWER != "y") stop("Process aborted")
if(core.check){
if(cores > 10){
ANSWER <- readline(paste("You are about to run testDA using",cores,"cores. Enter y to proceed "))
if(ANSWER != "y") stop("Process aborted")
}
}

# Time taking
t1 <- proc.time()

@@ -161,7 +163,7 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests
if(sum(colSums(count_table) == 0) > 0) stop("Some samples are empty")
if(ncol(count_table) != length(predictor)) stop("Number of samples in count_table does not match length of predictor")
if(length(levels(as.factor(predictor))) < 2) stop("predictor should have at least two levels")

# Prune tests argument
tests <- unique(tests)
if(!"zzz" %in% tests) tests <- prune.tests.DA(tests, predictor, paired, covars, relative)
@@ -194,7 +196,11 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests
# Numeric predictor
if(is.numeric(predictor[1])){
num.pred <- TRUE
message("predictor is assumed to be a continuous/quantitative variable")
message("predictor is assumed to be a quantitative variable")
if(length(levels(as.factor(predictor))) == 2){
ANSWER <- readline("The predictor is quantitative, but only contains 2 unique values. Are you sure this is correct? Enter y to proceed ")
if(ANSWER != "y") stop("Wrap the predictor with as.factor(predictor) to treat it as a categorical variable")
}
} else {
num.pred <- FALSE
}
@@ -203,7 +209,7 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests
if(!is.null(covars)){
for(i in 1:length(covars)){
if(is.numeric(covars[[i]][1])){
message(paste(names(covars)[i],"is assumed to be a continuous/quantitative variable"))
message(paste(names(covars)[i],"is assumed to be a quantitative variable"))
} else {
message(paste(names(covars)[i],"is assumed to be a categorical variable"))
}
@@ -238,7 +244,7 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests

# Run the tests in parallel
results <- foreach(i = tests.par , .options.snow = opts) %dopar% {

t1.sub <- proc.time()

# Extract run info
@@ -485,7 +491,7 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests
Samples = ncol(count_table),
Predictor = pred.det,
Paired = pair.det,
Covars = length(covars),
Covars = paste(names(covars), collapse = ", "),
RunTime = run.time,
Relative = relative,
EffectSize = effectSize,
@@ -497,8 +503,9 @@ testDA <- function(data, predictor, paired = NULL, covars = NULL, R = 10, tests

# Run times
run.times.all <- foreach(i = unique(gsub(".*_","",names(results))),.combine = rbind) %do% {
round(mean(as.numeric(run.times[gsub(".*_","",names(run.times)) == i]))/60,2)
round(mean(as.numeric(run.times[gsub(".*_","",names(run.times)) == i]))/60,4)
}
run.times.all <- as.data.frame(run.times.all)
rownames(run.times.all) <- unique(gsub(".*_","",names(results)))
colnames(run.times.all) <- "Minutes"

12 changes: 6 additions & 6 deletions R/zzz.R
@@ -1,12 +1,12 @@

.onLoad <- function(libname, pkgname){
message("DAtest version 2.6.2")
message("DAtest version 2.6.3")

# Fix samr problem
if(.Platform$OS.type == "windows"){
if("samr" %in% rownames(installed.packages())){
options(error = NULL)
}
}
#if(.Platform$OS.type == "windows"){
# if("samr" %in% rownames(installed.packages())){
# options(error = NULL)
# }
#}
}

21 changes: 20 additions & 1 deletion README.md
@@ -129,7 +129,7 @@ Therefore, we want a method with a FPR ~0.05 or lower and a high AUC.

(if you have a phyloseq object, see details further down)

mytest <- testDA(data, predictor)
mytest <- testDA(data, predictor, R = 10)

`data` is either a matrix or data.frame with taxa/genes/proteins as rows
and samples as columns (with rownames).
@@ -142,6 +142,10 @@ only the second level is spiked.

`predictor` can also be continuous/quantitative

`R` denotes how many times the spike-in and FPR/AUC calculation should
be replicated. It is advised to use at least 10, but it can be set to 1
for a fast test of the function.
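
For example, a quick test run of the function (too few replications to draw
real conclusions) could look like this:

    mytest <- testDA(data, predictor, R = 1)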

#### **The function automatically uses multiple CPUs for fast execution**

The methods run in parallel, and by default the number of cores used is
@@ -159,6 +163,21 @@ to close all connections.
If you run out of memory/RAM, reduce the number of cores used for
computing.
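
For example, to cap the run at two cores:

    mytest <- testDA(data, predictor, cores = 2)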

### *If you have a lot of features, i.e. more than 10k*

Runtime of the different methods can vary quite a lot, and some methods
are simply infeasible for datasets with tens of thousands of features.
The `runtimeDA` function can estimate the runtime of the different
methods on your dataset: it runs on small subsets of the data and then
extrapolates the runtime from these subsets. It will not be perfectly
precise, but it should give a decent estimate. The function prints how
many minutes it will take to run each method once and `R` times. If
you only use 1 core, the actual runtime will be the sum of all the
methods. With more cores the runtime will decrease and approach the
runtime of the slowest method.

runtimeDA(data, predictor)
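
If even the default subsets take too long, the subset sizes and the methods
included can be adjusted; the values below are just illustrative:

    runtimeDA(data, predictor, subsamples = c(500, 1000, 1500),
              subsamples.slow = c(100, 200, 300),
              tests.slow = c("neb", "per"))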

### *If your predictor is categorical with more than two levels:*

All linear models (also GLMs) output results (including p-values) from