From 52581f5de6afe6c26fae4b9b597c8341b658829e Mon Sep 17 00:00:00 2001
From: Andrea Raithel <133764907+andrearaithel@users.noreply.github.com>
Date: Wed, 31 Jul 2024 12:46:25 +0200
Subject: [PATCH] Optimal Hard Thresholding to determine optimal encoding
 dimension (#69)

* changed estimateBestQ to use OHT

* edit estimateBestQ error messages and variable names

* add unit tests for OHT

* update OHT tests

* add denoiseR package to Suggests

* add xcolor package for vignette building

* added my name

* update vignette

* add if statement to transpose zScores if aspect ratio larger than 1

* replace small latent space error by warning

* fix estimateBestQ test error

* Use GHA checkout V4 & R 4.4 for mac and windows

---------

Co-authored-by: AtaJadidAhari <atajadidahari@gmail.com>
---
 .github/workflows/check-bioc.yml    |   6 +-
 DESCRIPTION                         |  10 ++-
 NAMESPACE                           |   2 +
 R/controlForConfounders.R           | 130 ++++++++++++++++++++++++++++
 R/helperFunctions.R                 |  18 ----
 man/estimateBestQ.Rd                |   8 +-
 man/normalizationFactors.Rd         |   2 +-
 man/sizeFactors.Rd                  |   2 +-
 tests/testthat/test_estimateBestQ.R |  92 ++++++++++++++++++++
 vignettes/OUTRIDER.Rnw              |  18 ++--
 vignettes/bibliography.bib          |   8 ++
 11 files changed, 257 insertions(+), 39 deletions(-)
 create mode 100644 tests/testthat/test_estimateBestQ.R

diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml
index f56aa2a..ff9f936 100644
--- a/.github/workflows/check-bioc.yml
+++ b/.github/workflows/check-bioc.yml
@@ -55,8 +55,8 @@ jobs:
           # This needs to be updated after each Bioconductor release. Please make sure we have the matching R and Bioc versions in it. 
           - { os: ubuntu-latest, r: '4.3', bioc: '3.18', cont: "bioconductor/bioconductor_docker:RELEASE_3_18", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
           - { os: ubuntu-latest, r: 'next', bioc: '3.19', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
-          - { os: macOS-latest, r: '4.3', bioc: '3.18'}
-          - { os: windows-latest, r: '4.3', bioc: '3.18'}
+          - { os: macOS-latest, r: '4.4', bioc: '3.19'}
+          - { os: windows-latest, r: '4.4', bioc: '3.19'}
           ## Check https://github.com/r-lib/actions/tree/master/examples
           ## for examples using the http-user-agent
     env:
@@ -81,7 +81,7 @@ jobs:
       ## https://github.com/r-lib/actions/blob/master/examples/check-standard.yaml
       ## If they update their steps, we will also need to update ours.
       - name: Checkout Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       ## R is already included in the Bioconductor docker images
       - name: Setup R from r-lib
diff --git a/DESCRIPTION b/DESCRIPTION
index ed5e74e..85136d7 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -12,6 +12,7 @@ Authors@R: c(
             email="mertes@in.tum.de", comment=c(ORCID="0000-0002-1091-205X")),
     person("Agne", "Matuseviciute", role=c("aut")),
     person(c("Michaela", "Fee"), "Müller", role=c("ctb")),
+    person("Andrea", "Raithel", role=c("ctb")),
     person("Vicente", "Yepez", role=c("aut"), email="yepez@in.tum.de"),
     person("Julien", "Gagneur", role=c("aut"), email="gagneur@in.tum.de"))
 Description: Identification of aberrant gene expression in RNA-seq data.
@@ -27,7 +28,7 @@ biocViews: ImmunoOncology, RNASeq, Transcriptomics, Alignment, Sequencing,
 License: MIT + file LICENSE
 NeedsCompilation: yes
 Encoding: UTF-8
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Depends:
     R (>= 3.6), 
     BiocParallel,
@@ -59,7 +60,9 @@ Imports:
     scales,
     splines,
     stats,
-    utils
+    utils,
+    RMTstat,
+    pracma
 Suggests: 
     testthat,
     knitr,
@@ -73,7 +76,8 @@ Suggests:
     covr,
     GenomeInfoDb,
     ggbio,
-    biovizBase
+    biovizBase,
+    denoiseR
 LinkingTo:
     Rcpp,
     RcppArmadillo
diff --git a/NAMESPACE b/NAMESPACE
index 266312d..1927dac 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -58,9 +58,11 @@ exportMethods(plotManhattan)
 exportMethods(plotQQ)
 exportMethods(plotVolcano)
 exportMethods(results)
+import(RMTstat)
 import(data.table, except=melt)
 
 import(methods)
+import(pracma)
 importFrom(BBmisc,chunk)
 importFrom(BBmisc,isFALSE)
 importFrom(BBmisc,isScalarCharacter)
diff --git a/R/controlForConfounders.R b/R/controlForConfounders.R
index 2a7ea1a..5106354 100644
--- a/R/controlForConfounders.R
+++ b/R/controlForConfounders.R
@@ -108,3 +108,133 @@ computeLatentSpace <- function(ods){
     
     return(H(ods))
 }
+
+#'
+#' Estimation of Q
+#' 
+#' Estimating the best q for the given data set by Optimal Hard Thresholding
+#' 
+#' @param ods An OutriderDataSet object
+#' @param zScores A z-score matrix
+#' @import RMTstat
+#' @import pracma
+#' @return The estimated dimension of hidden confounders
+#' 
+#' @examples
+#' ods <- makeExampleOutriderDataSet()
+#' 
+#' estimateBestQ(ods)
+#' 
+#' @export
+estimateBestQ <- function(ods=NULL, zScores=NULL){
+  if (is.null(ods) & is.null(zScores)){
+    stop("Please provide an OutriderDataSet or a z-score matrix.")
+  }
+  else if (!is.null(ods)){
+    OUTRIDER:::checkOutriderDataSet(ods)
+    if (!is.null(zScores)){
+      warning("Provided z-scores are ignored and recalculated from ods.")
+    }
+    
+    # Control for sequencing depth
+    if (is.null(sizeFactors(ods))){
+      ods <- estimateSizeFactors(ods)
+    }
+    controlledCounts <- t(t(counts(ods, normalized=FALSE)) / sizeFactors(ods)) 
+    
+    # Log-transform controlled counts
+    logControlCounts <- log2((controlledCounts +1) / (rowMeans2(controlledCounts) +1))
+    
+    # Compute Z-scores and control for large values
+    zScores <- (logControlCounts - rowMeans2(logControlCounts)) / rowSds(logControlCounts)
+  }
+  else if (!is.matrix(zScores)){
+    stop("Provided zScores are not a matrix.")
+  }
+  if (any(is.infinite(zScores))) {
+    # Check for infinite values (should be impossible!)
+    stop("Z-score matrix contains infinite values.")
+  }
+  
+  # Transpose zScores if aspect ratio larger than 1
+  if (ncol(zScores)/nrow(zScores) > 1){
+    zScores <- t(zScores)
+  } 
+  
+  # Perform Singular Value Decomposition (SVD) on the matrix of Z-scores 
+  # and extract singular values
+  sv <- svd(zScores)$d
+  
+  # Aspect ratio of the (transposed) count matrix
+  numRows <- nrow(zScores)
+  numCols <- ncol(zScores)
+  beta <- numCols / numRows
+  
+  # Compute the optimal w(beta)
+  coef <- (optimalSVHTCoef(beta) / sqrt(medianMarchenkoPastur(numCols, numRows)))
+  
+  # compute cutoff
+  cutoff <- coef * median(sv)
+  
+  # compute and return rank
+  if (any(sv > cutoff)){
+    latentDim <- max(which(sv > cutoff))
+  } else {
+    warning(paste("Optimal latent space dimension is smaller than 2. Check your count matrix and",
+         "verify that all samples have the expected number of counts",
+         "(hist(colSums(counts(ods)))).",
+         "For now, the latent space dimension is set to 2.", collapse = "\n"))
+    latentDim <- 2} 
+  cat("Optimal encoding dimension:", latentDim, "\n")
+  return(latentDim)
+}
+
+#'
+#' Calculate the OHT coefficient 
+#' 
+#' @noRd
+optimalSVHTCoef <- function(beta){ 
+  # Calculate lambda(beta)
+  sqrt(2 * (beta + 1) + (8 * beta) / (beta + 1 + sqrt(beta^2 + 14 * beta + 1)))
+}
+
+#'
+#' Calculate the median of the Marchenko-Pastur distribution
+#' 
+#' Formulas are derived from a robust estimator for the noise 
+#' parameter gamma in the model Z~ = Z + gamma * E.
+#' Gamma(Z~) = sigma / sqrt(n * mu)) with mu: median of the Marcenko-Pastur distribution.
+#' More detailed explanations can be found in Gavish and Donoho 2014.
+#' 
+#' @noRd
+medianMarchenkoPastur <- function(ncol, nrow){ 
+  # Compute median of Marchenko-Pastur distribution
+  beta <- ncol / nrow
+  betaMinus <- (1 - sqrt(beta))^2
+  betaPlus <- (1 + sqrt(beta))^2
+  
+  # Initialize range for upper integral boundary
+  lobnd <- copy(betaMinus) 
+  hibnd <- copy(betaPlus)
+  
+  
+  while ((hibnd - lobnd) > 0.001){ # Iterate until convergence
+    x <- seq(lobnd, hibnd, length.out = 10) # Set range of values for upper integral boundary
+    y <- rep(0, length(x))
+    for (numCols in 1:length(x)){
+      # Approximate integral using Gauss-Kronrod Quadrature
+      y[numCols] <- quadgk(dmp, a=betaMinus, b=x[numCols], ndf=nrow, pdim=ncol)
+    }  
+    
+    # Set new boundaries for x that yield the closest results to 0.5
+    if (any(y < 0.5)){
+      lobnd = max(x[y < 0.5])
+    }
+    
+    if (any(y > 0.5)){
+      hibnd = min(x[y > 0.5])
+    }
+  }
+  # If hibnd and lobnd are similar enough, return their mean
+  return((hibnd + lobnd) / 2.0)
+}
diff --git a/R/helperFunctions.R b/R/helperFunctions.R
index 687be56..68d1557 100644
--- a/R/helperFunctions.R
+++ b/R/helperFunctions.R
@@ -242,21 +242,3 @@ getBestQDT <- function(dt, usedEvalMethod='aucPR', digits=10){
     dt[,encodingDimension[
             seq_len(.N) == testFun(round(evaluationLoss, digits))]]
 }
-
-#'
-#' Estimation of Q
-#' 
-#' Estimating the best q for the given data set
-#' 
-#' @param ods An OutriderDataSet object
-#' @return The estimated dimension of hidden confounders
-#' 
-#' @examples
-#' ods <- makeExampleOutriderDataSet()
-#' 
-#' estimateBestQ(ods)
-#' 
-#' @export
-estimateBestQ <- function(ods){
-    round(max(2, min(500, 3.7 + 0.16*ncol(ods))))
-}
diff --git a/man/estimateBestQ.Rd b/man/estimateBestQ.Rd
index dcaa4c9..0989f50 100644
--- a/man/estimateBestQ.Rd
+++ b/man/estimateBestQ.Rd
@@ -1,19 +1,21 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/helperFunctions.R
+% Please edit documentation in R/controlForConfounders.R
 \name{estimateBestQ}
 \alias{estimateBestQ}
 \title{Estimation of Q}
 \usage{
-estimateBestQ(ods)
+estimateBestQ(ods = NULL, zScores = NULL)
 }
 \arguments{
 \item{ods}{An OutriderDataSet object}
+
+\item{zScores}{A z-score matrix}
 }
 \value{
 The estimated dimension of hidden confounders
 }
 \description{
-Estimating the best q for the given data set
+Estimating the best q for the given data set by Optimal Hard Thresholding
 }
 \examples{
 ods <- makeExampleOutriderDataSet()
diff --git a/man/normalizationFactors.Rd b/man/normalizationFactors.Rd
index 85b5c6c..c0bac94 100644
--- a/man/normalizationFactors.Rd
+++ b/man/normalizationFactors.Rd
@@ -19,7 +19,7 @@ object.}
 
 \S4method{normalizationFactors}{OutriderDataSet,data.frame}(object, minE = 0.5, ...) <- value
 
-\S4method{normalizationFactors}{OutriderDataSet,`NULL`}(object) <- value
+\S4method{normalizationFactors}{OutriderDataSet,NULL}(object) <- value
 }
 \arguments{
 \item{object}{An \code{\link{OutriderDataSet}} object}
diff --git a/man/sizeFactors.Rd b/man/sizeFactors.Rd
index 78dbd8b..882292c 100644
--- a/man/sizeFactors.Rd
+++ b/man/sizeFactors.Rd
@@ -15,7 +15,7 @@
 
 \S4method{sizeFactors}{OutriderDataSet,numeric}(object) <- value
 
-\S4method{sizeFactors}{OutriderDataSet,`NULL`}(object) <- value
+\S4method{sizeFactors}{OutriderDataSet,NULL}(object) <- value
 
 \S4method{estimateSizeFactors}{OutriderDataSet}(object)
 }
diff --git a/tests/testthat/test_estimateBestQ.R b/tests/testthat/test_estimateBestQ.R
new file mode 100644
index 0000000..4790c0b
--- /dev/null
+++ b/tests/testthat/test_estimateBestQ.R
@@ -0,0 +1,92 @@
+context("Testing the estimateBestQ function (Optimal Hard Thresholding)")
+
+library(denoiseR)
+
+test_that("Input validation handles NULL and non-matrix inputs", {
+  expect_error(estimateBestQ(), 
+               "Please provide an OutriderDataSet or a z-score matrix.")
+  expect_error(estimateBestQ(NULL, NULL), 
+               "Please provide an OutriderDataSet or a z-score matrix.")
+  expect_error(estimateBestQ(zScores = "not a matrix"), 
+               "Provided zScores are not a matrix.")
+  expect_error(estimateBestQ(ods = "not an ods"), 
+               "Please provide an OutriderDataSet.")
+  
+  ctsFile <- system.file('extdata', 'GTExSkinSmall.tsv',
+                         package='OUTRIDER')
+  ctsTable <- read.table(ctsFile, check.names=FALSE)
+  ods <- OutriderDataSet(countData=ctsTable)
+  # filter out non expressed genes
+  ods <- filterExpression(ods, minCounts=TRUE, filterGenes=TRUE)
+  ods <- estimateSizeFactors(ods)
+  
+  expect_warning(estimateBestQ(ods = ods, zScores = matrix(c(1, 2, 3, 4, 5, 6), nrow = 3, ncol = 2)),
+                 "Provided z-scores are ignored and recalculated from ods.")
+})
+
+test_that("User is notified about invalid matrix values", {
+  expect_error(estimateBestQ(zScores = matrix(c(1, 2, 3, 4, 5, Inf), nrow = 3, ncol = 2)), 
+               "Z-score matrix contains infinite values.")
+})
+
+test_that("optimalSVHTCoef works correctly", {
+  expect_equal(optimalSVHTCoef(0.5),
+               1.98, tolerance = 0.01)
+  expect_equal(optimalSVHTCoef(0.1),
+               1.58, tolerance = 0.01)
+})
+
+test_that("medianMarchenkoPastur works correctly", {
+  # Expected outputs are derived from Table IV in Gavish and Donoho (2014)
+  expect_equal(optimalSVHTCoef(0.5) / sqrt(medianMarchenkoPastur(100, 200)),
+               2.1711, tolerance = 0.0001)
+  expect_equal(optimalSVHTCoef(0.1) / sqrt(medianMarchenkoPastur(100, 1000)),
+               1.6089, tolerance = 0.0001)
+})
+
+test_that("Encoding dimensions are properly calculated for simulated z-scores", {
+  # Simulate zScore matrix consisting of signal and noise
+  set.seed(42)
+  numGenes <- 10000
+  numSamples <- 200
+  latentDim <- 50
+  signalNoiseRatio <- 5
+  zTilde <- LRsim(numGenes, numSamples, latentDim, signalNoiseRatio)$X * 1000
+  
+  expect_equal(estimateBestQ(zScores = zTilde),
+               latentDim)
+  
+  # Simulate zScore matrix with beta > 1
+  set.seed(42)
+  numGenes <- 50
+  numSamples <- 200
+  latentDim <- 20
+  signalNoiseRatio <- 5
+  zTilde <- LRsim(numGenes, numSamples, latentDim, signalNoiseRatio)$X * 1000
+  
+  expect_equal(estimateBestQ(zScores = zTilde),
+               latentDim)
+  
+  # Simulate zScore matrix consisting of noise only
+  set.seed(42)
+  latentDim <- 0
+  zTilde <- matrix(rnorm(numGenes * numSamples), nrow = numGenes, ncol = numSamples)
+  expect_warning(expect_equal(estimateBestQ(zScores = zTilde), 2),
+               paste("Optimal latent space dimension is smaller than 2\\. Check your count matrix and",
+                 "verify that all samples have the expected number of counts",
+                 "\\(hist\\(colSums\\(counts\\(ods\\)\\)\\)\\)\\.",
+                 "For now\\, the latent space dimension is set to 2\\.", collapse = "\n"))
+})
+
+test_that("Encoding dimensions are properly calculated for real ODS", {
+  ctsFile <- system.file('extdata', 'GTExSkinSmall.tsv',
+                         package='OUTRIDER')
+  ctsTable <- read.table(ctsFile, check.names=FALSE)
+  ods <- OutriderDataSet(countData=ctsTable)
+  ods <- filterExpression(ods, minCounts=TRUE, filterGenes=TRUE)
+  ods <- estimateSizeFactors(ods)
+  
+  outsingleResult <- 5 # Expected value was calculated with OutSingle
+  expect_equal(estimateBestQ(ods = ods), outsingleResult, 
+               tolerance = 1)
+})
\ No newline at end of file
diff --git a/vignettes/OUTRIDER.Rnw b/vignettes/OUTRIDER.Rnw
index b1367d9..8bee49c 100644
--- a/vignettes/OUTRIDER.Rnw
+++ b/vignettes/OUTRIDER.Rnw
@@ -4,6 +4,8 @@
 %\VignetteEncoding{UTF-8}
 
 \documentclass[11pt]{article}
+\usepackage{xcolor}
+\definecolor{fgcolor}{rgb}{0,0,0}
 
 <<style-knitr, eval=TRUE, echo=FALSE, cache=FALSE, results="asis">>=
 BiocStyle::latex()
@@ -337,11 +339,7 @@ We have different ways to control for confounders present in the data.
 The first and standard way is to calculate the \Rfunction{sizeFactors} 
 as done in \deseq{}\cite{Love2014}.
 
-Additionally, the \Rfunction{controlForConfounders} function calls an 
-autoencoder that automatically controls for confounders present in the data.
-Therefore an encoding dimension $q$ needs to be set or the default value 20 is 
-used. The optimal value of $q$ can be determined using the
-\Rfunction{findEncodingDim} function. 
+Additionally, the \Rfunction{controlForConfounders} function calls a denoising autoencoder that controls for confounders by exploiting correlations in the data to reconstruct corrupted read-counts. The encoding dimension \textit{q} can be set manually or can be calculated using the Optimal Hard Thresholding (OHT) procedure established by Gavish and Donoho\cite{Gavish2013}. Alternatively, the optimal value of \textit{q} can be determined using the \Rfunction{findEncodingDim} function by testing different values for \textit{q} and calculating the one that recalls the highest number of injected outliers. The deterministic approach using OHT is much faster than the iterative procedure in \Rfunction{findEncodingDim}.
 After controlling for confounders, the heatmap should be plotted again.
 If it worked, no batches should be present and the correlations between 
 samples should be reduced and close to zero.
@@ -373,7 +371,7 @@ expression patterns.
 
 \subsection{Finding the right encoding dimension \textit{q}}
 
-In the previous section, we fixed the encoding dimension $q=21$. But having 
+In the previous section, we fixed the encoding dimension $q=21$. However, having 
 the right encoding dimension is crucial in finding outliers in the data. 
 On the one hand, if \textit{q} is too big the autoencoder will learn the 
 identity matrix and will overfit the data. On the other hand, if \textit{q}
@@ -395,10 +393,10 @@ plotEncDimSearch(ods)
 
 @
 
-Since this function runs a full OUTRIDER fit for a range of encoding dimensions,
-it is quite CPU intensive, but can increase the overall performance of the
-autoencoder and is recommended for any data set. If \textit{q} is not provided
-by the user, it will be estimated based on the number of samples.
+Since this function runs a full OUTRIDER fit for a variety of encoding dimensions, 
+it is quite CPU-intensive. Thus, it is recommended to use the faster function \Rfunction{estimateBestQ} which relies on Optimal Hard Thresholding\cite{Gavish2013}.
+It is the default method to find the optimal encoding dimension if \textit{q} is 
+not provided by the user.
 
 
 \subsubsection{Excluding samples from the autoencoder fit}
diff --git a/vignettes/bibliography.bib b/vignettes/bibliography.bib
index a0fc237..78a768f 100644
--- a/vignettes/bibliography.bib
+++ b/vignettes/bibliography.bib
@@ -94,4 +94,12 @@ @article{Stegle2012
 title = {{Using probabilistic estimation of expression residuals (PEER) to obtain increased power and interpretability of gene expression analyses}},
 volume = {7},
 year = {2012}
+}
+@article{Gavish2013,
+ abstract = {We consider recovery of low-rank matrices from noisy data by hard thresholding of singular values, where singular values below a prescribed threshold {\$}\lambda{\$} are set to 0. We study the asymptotic MSE in a framework where the matrix size is large compared to the rank of the matrix to be recovered, and the signal-to-noise ratio of the low-rank piece stays constant. The AMSE-optimal choice of hard threshold, in the case of n-by-n matrix in noise level $\backslash$sigma, is simply {\$}(4/\sqrt{3}) \sqrt{n}$\backslash$sigma $\backslash$approx 2.309 \sqrt{n}$\backslash$sigma{\$} when {\$}$\backslash$sigma{\$} is known, or simply {\$}2.858$\backslash$cdot y{\_}{med}{\$} when {\$}$\backslash$sigma{\$} is unknown, where {\$}y{\_}{med}{\$} is the median empirical singular value. For nonsquare $m$ by $n$ matrices with {\$}m $\backslash$neq n{\$}, these thresholding coefficients are replaced with different provided constants. In our asymptotic framework, this thresholding rule adapts to unknown rank and to unknown noise level in an optimal manner: it is always better than hard thresholding at any other value, no matter what the matrix is that we are trying to recover, and is always better than ideal Truncated SVD (TSVD), which truncates at the true rank of the low-rank matrix we are trying to recover. Hard thresholding at the recommended value to recover an n-by-n matrix of rank r guarantees an AMSE at most {\$}3nr$\backslash$sigma{\^{}}2{\$}. In comparison, the guarantee provided by TSVD is {\$}5nr$\backslash$sigma{\^{}}2{\$}, the guarantee provided by optimally tuned singular value soft thresholding is {\$}6nr$\backslash$sigma{\^{}}2{\$}, and the best guarantee achievable by any shrinkage of the data singular values is {\$}2nr$\backslash$sigma{\^{}}2{\$}. Empirical evidence shows that these AMSE properties of the {\$}4/\sqrt{3}{\$} thresholding rule remain valid even for relatively small n, and that performance improvement over TSVD and other shrinkage rules is substantial, turning it into the practical hard threshold of choice.},
+ author = {Gavish, Matan and Donoho, David L.},
+ date = {25.05.2013},
+ title = {{The Optimal Hard Threshold for Singular Values is 4/sqrt(3)}},
+ url = {http://arxiv.org/pdf/1305.5870},
+ file = {Gavish, Donoho 25.05.2013 - The Optimal Hard Threshold:Attachments/Gavish, Donoho 25.05.2013 - The Optimal Hard Threshold.pdf:application/pdf}
 }
\ No newline at end of file