Optimal Hard Thresholding to determine optimal encoding dimension (#69)

* changed estimateBestQ to use OHT * edit estimateBestQ error messages and variable names * add unit tests for OHT * update OHT tests * add denoiseR package to Suggests * add xcolor package for vignette building * added my name * update vignette * add if statement to transpose zScores if aspect ratio larger than 1 * replace small latent space error by warning * fix estimateBestQ test error * Use GHA checkout V4 & R 4.4 for mac and windows --------- Co-authored-by: AtaJadidAhari <atajadidahari@gmail.com>
gagneurlab · Jul 31, 2024 · 52581f5 · 52581f5
1 parent dc99588
commit 52581f5
Show file tree

Hide file tree

Showing 11 changed files with 257 additions and 39 deletions.
diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml
@@ -55,8 +55,8 @@ jobs:
           # This needs to be updated after each Bioconductor release. Please make sure we have the matching R and Bioc versions in it. 
           - { os: ubuntu-latest, r: '4.3', bioc: '3.18', cont: "bioconductor/bioconductor_docker:RELEASE_3_18", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
           - { os: ubuntu-latest, r: 'next', bioc: '3.19', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
-          - { os: macOS-latest, r: '4.3', bioc: '3.18'}
-          - { os: windows-latest, r: '4.3', bioc: '3.18'}
+          - { os: macOS-latest, r: '4.4', bioc: '3.19'}
+          - { os: windows-latest, r: '4.4', bioc: '3.19'}
           ## Check https://github.com/r-lib/actions/tree/master/examples
           ## for examples using the http-user-agent
     env:
@@ -81,7 +81,7 @@ jobs:
       ## https://github.com/r-lib/actions/blob/master/examples/check-standard.yaml
       ## If they update their steps, we will also need to update ours.
       - name: Checkout Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       ## R is already included in the Bioconductor docker images
       - name: Setup R from r-lib

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -12,6 +12,7 @@ Authors@R: c(
             email="mertes@in.tum.de", comment=c(ORCID="0000-0002-1091-205X")),
     person("Agne", "Matuseviciute", role=c("aut")),
     person(c("Michaela", "Fee"), "Müller", role=c("ctb")),
+    person("Andrea", "Raithel", role=c("ctb")),
     person("Vicente", "Yepez", role=c("aut"), email="yepez@in.tum.de"),
     person("Julien", "Gagneur", role=c("aut"), email="gagneur@in.tum.de"))
 Description: Identification of aberrant gene expression in RNA-seq data.
@@ -27,7 +28,7 @@ biocViews: ImmunoOncology, RNASeq, Transcriptomics, Alignment, Sequencing,
 License: MIT + file LICENSE
 NeedsCompilation: yes
 Encoding: UTF-8
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Depends:
     R (>= 3.6), 
     BiocParallel,
@@ -59,7 +60,9 @@ Imports:
     scales,
     splines,
     stats,
-    utils
+    utils,
+    RMTstat,
+    pracma
 Suggests: 
     testthat,
     knitr,
@@ -73,7 +76,8 @@ Suggests:
     covr,
     GenomeInfoDb,
     ggbio,
-    biovizBase
+    biovizBase,
+    denoiseR
 LinkingTo:
     Rcpp,
     RcppArmadillo

diff --git a/NAMESPACE b/NAMESPACE
@@ -58,9 +58,11 @@ exportMethods(plotManhattan)
 exportMethods(plotQQ)
 exportMethods(plotVolcano)
 exportMethods(results)
+import(RMTstat)
 import(data.table, except=melt)
 
 import(methods)
+import(pracma)
 importFrom(BBmisc,chunk)
 importFrom(BBmisc,isFALSE)
 importFrom(BBmisc,isScalarCharacter)

diff --git a/R/controlForConfounders.R b/R/controlForConfounders.R
@@ -108,3 +108,133 @@ computeLatentSpace <- function(ods){
 
     return(H(ods))
 }
+
+#'
+#' Estimation of Q
+#' 
+#' Estimating the best q for the given data set by Optimal Hard Thresholding
+#' 
+#' @param ods An OutriderDataSet object
+#' @param zScores A z-score matrix
+#' @import RMTstat
+#' @import pracma
+#' @return The estimated dimension of hidden confounders
+#' 
+#' @examples
+#' ods <- makeExampleOutriderDataSet()
+#' 
+#' estimateBestQ(ods)
+#' 
+#' @export
+estimateBestQ <- function(ods=NULL, zScores=NULL){
+  if (is.null(ods) & is.null(zScores)){
+    stop("Please provide an OutriderDataSet or a z-score matrix.")
+  }
+  else if (!is.null(ods)){
+    OUTRIDER:::checkOutriderDataSet(ods)
+    if (!is.null(zScores)){
+      warning("Provided z-scores are ignored and recalculated from ods.")
+    }
+
+    # Control for sequencing depth
+    if (is.null(sizeFactors(ods))){
+      ods <- estimateSizeFactors(ods)
+    }
+    controlledCounts <- t(t(counts(ods, normalized=FALSE)) / sizeFactors(ods)) 
+
+    # Log-transform controlled counts
+    logControlCounts <- log2((controlledCounts +1) / (rowMeans2(controlledCounts) +1))
+
+    # Compute Z-scores and control for large values
+    zScores <- (logControlCounts - rowMeans2(logControlCounts)) / rowSds(logControlCounts)
+  }
+  else if (!is.matrix(zScores)){
+    stop("Provided zScores are not a matrix.")
+  }
+  if (any(is.infinite(zScores))) {
+    # Check for infinite values (should be impossible!)
+    stop("Z-score matrix contains infinite values.")
+  }
+
+  # Transpose zScores if aspect ratio larger than 1
+  if (ncol(zScores)/nrow(zScores) > 1){
+    zScores <- t(zScores)
+  } 
+
+  # Perform Singular Value Decomposition (SVD) on the matrix of Z-scores 
+  # and extract singular values
+  sv <- svd(zScores)$d
+
+  # Aspect ratio of the (transposed) count matrix
+  numRows <- nrow(zScores)
+  numCols <- ncol(zScores)
+  beta <- numCols / numRows
+
+  # Compute the optimal w(beta)
+  coef <- (optimalSVHTCoef(beta) / sqrt(medianMarchenkoPastur(numCols, numRows)))
+
+  # compute cutoff
+  cutoff <- coef * median(sv)
+
+  # compute and return rank
+  if (any(sv > cutoff)){
+    latentDim <- max(which(sv > cutoff))
+  } else {
+    warning(paste("Optimal latent space dimension is smaller than 2. Check your count matrix and",
+         "verify that all samples have the expected number of counts",
+         "(hist(colSums(counts(ods)))).",
+         "For now, the latent space dimension is set to 2.", collapse = "\n"))
+    latentDim <- 2} 
+  cat("Optimal encoding dimension:", latentDim, "\n")
+  return(latentDim)
+}
+
+#'
+#' Calculate the OHT coefficient 
+#' 
+#' @noRd
+optimalSVHTCoef <- function(beta){ 
+  # Calculate lambda(beta)
+  sqrt(2 * (beta + 1) + (8 * beta) / (beta + 1 + sqrt(beta^2 + 14 * beta + 1)))
+}
+
+#'
+#' Calculate the median of the Marchenko-Pastur distribution
+#' 
+#' Formulas are derived from a robust estimator for the noise 
+#' parameter gamma in the model Z~ = Z + gamma * E.
+#' Gamma(Z~) = sigma / sqrt(n * mu)) with mu: median of the Marcenko-Pastur distribution.
+#' More detailed explanations can be found in Gavish and Donoho 2014.
+#' 
+#' @noRd
+medianMarchenkoPastur <- function(ncol, nrow){ 
+  # Compute median of Marchenko-Pastur distribution
+  beta <- ncol / nrow
+  betaMinus <- (1 - sqrt(beta))^2
+  betaPlus <- (1 + sqrt(beta))^2
+
+  # Initialize range for upper integral boundary
+  lobnd <- copy(betaMinus) 
+  hibnd <- copy(betaPlus)
+
+
+  while ((hibnd - lobnd) > 0.001){ # Iterate until convergence
+    x <- seq(lobnd, hibnd, length.out = 10) # Set range of values for upper integral boundary
+    y <- rep(0, length(x))
+    for (numCols in 1:length(x)){
+      # Approximate integral using Gauss-Kronrod Quadrature
+      y[numCols] <- quadgk(dmp, a=betaMinus, b=x[numCols], ndf=nrow, pdim=ncol)
+    }  
+
+    # Set new boundaries for x that yield the closest results to 0.5
+    if (any(y < 0.5)){
+      lobnd = max(x[y < 0.5])
+    }
+
+    if (any(y > 0.5)){
+      hibnd = min(x[y > 0.5])
+    }
+  }
+  # If hibnd and lobnd are similar enough, return their mean
+  return((hibnd + lobnd) / 2.0)
+}
diff --git a/R/helperFunctions.R b/R/helperFunctions.R
@@ -242,21 +242,3 @@ getBestQDT <- function(dt, usedEvalMethod='aucPR', digits=10){
     dt[,encodingDimension[
             seq_len(.N) == testFun(round(evaluationLoss, digits))]]
 }
-
-#'
-#' Estimation of Q
-#' 
-#' Estimating the best q for the given data set
-#' 
-#' @param ods An OutriderDataSet object
-#' @return The estimated dimension of hidden confounders
-#' 
-#' @examples
-#' ods <- makeExampleOutriderDataSet()
-#' 
-#' estimateBestQ(ods)
-#' 
-#' @export
-estimateBestQ <- function(ods){
-    round(max(2, min(500, 3.7 + 0.16*ncol(ods))))
-}
diff --git a/man/estimateBestQ.Rd b/man/estimateBestQ.Rd
diff --git a/man/normalizationFactors.Rd b/man/normalizationFactors.Rd
diff --git a/man/sizeFactors.Rd b/man/sizeFactors.Rd
diff --git a/tests/testthat/test_estimateBestQ.R b/tests/testthat/test_estimateBestQ.R
@@ -0,0 +1,92 @@
+context("Testing the estimateBestQ function (Optimal Hard Thresholding)")
+
+library(denoiseR)
+
+test_that("Input validation handles NULL and non-matrix inputs", {
+  expect_error(estimateBestQ(), 
+               "Please provide an OutriderDataSet or a z-score matrix.")
+  expect_error(estimateBestQ(NULL, NULL), 
+               "Please provide an OutriderDataSet or a z-score matrix.")
+  expect_error(estimateBestQ(zScores = "not a matrix"), 
+               "Provided zScores are not a matrix.")
+  expect_error(estimateBestQ(ods = "not an ods"), 
+               "Please provide an OutriderDataSet.")
+
+  ctsFile <- system.file('extdata', 'GTExSkinSmall.tsv',
+                         package='OUTRIDER')
+  ctsTable <- read.table(ctsFile, check.names=FALSE)
+  ods <- OutriderDataSet(countData=ctsTable)
+  # filter out non expressed genes
+  ods <- filterExpression(ods, minCounts=TRUE, filterGenes=TRUE)
+  ods <- estimateSizeFactors(ods)
+
+  expect_warning(estimateBestQ(ods = ods, zScores = matrix(c(1, 2, 3, 4, 5, 6), nrow = 3, ncol = 2)),
+                 "Provided z-scores are ignored and recalculated from ods.")
+})
+
+test_that("User is notified about invalid matrix values", {
+  expect_error(estimateBestQ(zScores = matrix(c(1, 2, 3, 4, 5, Inf), nrow = 3, ncol = 2)), 
+               "Z-score matrix contains infinite values.")
+})
+
+test_that("optimalSVHTCoef works correctly", {
+  expect_equal(optimalSVHTCoef(0.5),
+               1.98, tolerance = 0.01)
+  expect_equal(optimalSVHTCoef(0.1),
+               1.58, tolerance = 0.01)
+})
+
+test_that("medianMarchenkoPastur works correctly", {
+  # Expected outputs are derived from Table IV in Gavish and Donoho (2014)
+  expect_equal(optimalSVHTCoef(0.5) / sqrt(medianMarchenkoPastur(100, 200)),
+               2.1711, tolerance = 0.0001)
+  expect_equal(optimalSVHTCoef(0.1) / sqrt(medianMarchenkoPastur(100, 1000)),
+               1.6089, tolerance = 0.0001)
+})
+
+test_that("Encoding dimensions are properly calculated for simulated z-scores", {
+  # Simulate zScore matrix consisting of signal and noise
+  set.seed(42)
+  numGenes <- 10000
+  numSamples <- 200
+  latentDim <- 50
+  signalNoiseRatio <- 5
+  zTilde <- LRsim(numGenes, numSamples, latentDim, signalNoiseRatio)$X * 1000
+
+  expect_equal(estimateBestQ(zScores = zTilde),
+               latentDim)
+
+  # Simulate zScore matrix with beta > 1
+  set.seed(42)
+  numGenes <- 50
+  numSamples <- 200
+  latentDim <- 20
+  signalNoiseRatio <- 5
+  zTilde <- LRsim(numGenes, numSamples, latentDim, signalNoiseRatio)$X * 1000
+
+  expect_equal(estimateBestQ(zScores = zTilde),
+               latentDim)
+
+  # Simulate zScore matrix consisting of noise only
+  set.seed(42)
+  latentDim <- 0
+  zTilde <- matrix(rnorm(numGenes * numSamples), nrow = numGenes, ncol = numSamples)
+  expect_warning(expect_equal(estimateBestQ(zScores = zTilde), 2),
+               paste("Optimal latent space dimension is smaller than 2\\. Check your count matrix and",
+                 "verify that all samples have the expected number of counts",
+                 "\\(hist\\(colSums\\(counts\\(ods\\)\\)\\)\\)\\.",
+                 "For now\\, the latent space dimension is set to 2\\.", collapse = "\n"))
+})
+
+test_that("Encoding dimensions are properly calculated for real ODS", {
+  ctsFile <- system.file('extdata', 'GTExSkinSmall.tsv',
+                         package='OUTRIDER')
+  ctsTable <- read.table(ctsFile, check.names=FALSE)
+  ods <- OutriderDataSet(countData=ctsTable)
+  ods <- filterExpression(ods, minCounts=TRUE, filterGenes=TRUE)
+  ods <- estimateSizeFactors(ods)
+
+  outsingleResult <- 5 # Expected value was calculated with OutSingle
+  expect_equal(estimateBestQ(ods = ods), outsingleResult, 
+               tolerance = 1)
+})