Skip to content

Commit

Permalink
Optimal Hard Thresholding to determine optimal encoding dimension (#69)
Browse files Browse the repository at this point in the history
* changed estimateBestQ to use OHT

* edit estimateBestQ error messages and variable names

* add unit tests for OHT

* update OHT tests

* add denoiseR package to Suggests

* add xcolor package for vignette building

* added my name

* update vignette

* add if statement to transpose zScores if aspect ratio larger than 1

* replace small latent space error by warning

* fix estimateBestQ test error

* Use GHA checkout V4 & R 4.4 for mac and windows

---------

Co-authored-by: AtaJadidAhari <atajadidahari@gmail.com>
  • Loading branch information
andrearaithel and AtaJadidAhari authored Jul 31, 2024
1 parent dc99588 commit 52581f5
Show file tree
Hide file tree
Showing 11 changed files with 257 additions and 39 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/check-bioc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,8 @@ jobs:
# This needs to be updated after each Bioconductor release. Please make sure we have the matching R and Bioc versions in it.
- { os: ubuntu-latest, r: '4.3', bioc: '3.18', cont: "bioconductor/bioconductor_docker:RELEASE_3_18", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
- { os: ubuntu-latest, r: 'next', bioc: '3.19', cont: "bioconductor/bioconductor_docker:devel", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
- { os: macOS-latest, r: '4.3', bioc: '3.18'}
- { os: windows-latest, r: '4.3', bioc: '3.18'}
- { os: macOS-latest, r: '4.4', bioc: '3.19'}
- { os: windows-latest, r: '4.4', bioc: '3.19'}
## Check https://github.com/r-lib/actions/tree/master/examples
## for examples using the http-user-agent
env:
Expand All @@ -81,7 +81,7 @@ jobs:
## https://github.com/r-lib/actions/blob/master/examples/check-standard.yaml
## If they update their steps, we will also need to update ours.
- name: Checkout Repository
uses: actions/checkout@v3
uses: actions/checkout@v4

## R is already included in the Bioconductor docker images
- name: Setup R from r-lib
Expand Down
10 changes: 7 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ Authors@R: c(
email="mertes@in.tum.de", comment=c(ORCID="0000-0002-1091-205X")),
person("Agne", "Matuseviciute", role=c("aut")),
person(c("Michaela", "Fee"), "Müller", role=c("ctb")),
person("Andrea", "Raithel", role=c("ctb")),
person("Vicente", "Yepez", role=c("aut"), email="yepez@in.tum.de"),
person("Julien", "Gagneur", role=c("aut"), email="gagneur@in.tum.de"))
Description: Identification of aberrant gene expression in RNA-seq data.
Expand All @@ -27,7 +28,7 @@ biocViews: ImmunoOncology, RNASeq, Transcriptomics, Alignment, Sequencing,
License: MIT + file LICENSE
NeedsCompilation: yes
Encoding: UTF-8
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
Depends:
R (>= 3.6),
BiocParallel,
Expand Down Expand Up @@ -59,7 +60,9 @@ Imports:
scales,
splines,
stats,
utils
utils,
RMTstat,
pracma
Suggests:
testthat,
knitr,
Expand All @@ -73,7 +76,8 @@ Suggests:
covr,
GenomeInfoDb,
ggbio,
biovizBase
biovizBase,
denoiseR
LinkingTo:
Rcpp,
RcppArmadillo
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,11 @@ exportMethods(plotManhattan)
exportMethods(plotQQ)
exportMethods(plotVolcano)
exportMethods(results)
import(RMTstat)
import(data.table, except=melt)

import(methods)
import(pracma)
importFrom(BBmisc,chunk)
importFrom(BBmisc,isFALSE)
importFrom(BBmisc,isScalarCharacter)
Expand Down
130 changes: 130 additions & 0 deletions R/controlForConfounders.R
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,133 @@ computeLatentSpace <- function(ods){

return(H(ods))
}

#'
#' Estimation of Q
#'
#' Estimating the best q for the given data set by Optimal Hard Thresholding
#'
#' @param ods An OutriderDataSet object
#' @param zScores A z-score matrix
#' @import RMTstat
#' @import pracma
#' @return The estimated dimension of hidden confounders
#'
#' @examples
#' ods <- makeExampleOutriderDataSet()
#'
#' estimateBestQ(ods)
#'
#' @export
estimateBestQ <- function(ods=NULL, zScores=NULL){
if (is.null(ods) & is.null(zScores)){
stop("Please provide an OutriderDataSet or a z-score matrix.")
}
else if (!is.null(ods)){
OUTRIDER:::checkOutriderDataSet(ods)
if (!is.null(zScores)){
warning("Provided z-scores are ignored and recalculated from ods.")
}

# Control for sequencing depth
if (is.null(sizeFactors(ods))){
ods <- estimateSizeFactors(ods)
}
controlledCounts <- t(t(counts(ods, normalized=FALSE)) / sizeFactors(ods))

# Log-transform controlled counts
logControlCounts <- log2((controlledCounts +1) / (rowMeans2(controlledCounts) +1))

# Compute Z-scores and control for large values
zScores <- (logControlCounts - rowMeans2(logControlCounts)) / rowSds(logControlCounts)
}
else if (!is.matrix(zScores)){
stop("Provided zScores are not a matrix.")
}
if (any(is.infinite(zScores))) {
# Check for infinite values (should be impossible!)
stop("Z-score matrix contains infinite values.")
}

# Transpose zScores if aspect ratio larger than 1
if (ncol(zScores)/nrow(zScores) > 1){
zScores <- t(zScores)
}

# Perform Singular Value Decomposition (SVD) on the matrix of Z-scores
# and extract singular values
sv <- svd(zScores)$d

# Aspect ratio of the (transposed) count matrix
numRows <- nrow(zScores)
numCols <- ncol(zScores)
beta <- numCols / numRows

# Compute the optimal w(beta)
coef <- (optimalSVHTCoef(beta) / sqrt(medianMarchenkoPastur(numCols, numRows)))

# compute cutoff
cutoff <- coef * median(sv)

# compute and return rank
if (any(sv > cutoff)){
latentDim <- max(which(sv > cutoff))
} else {
warning(paste("Optimal latent space dimension is smaller than 2. Check your count matrix and",
"verify that all samples have the expected number of counts",
"(hist(colSums(counts(ods)))).",
"For now, the latent space dimension is set to 2.", collapse = "\n"))
latentDim <- 2}
cat("Optimal encoding dimension:", latentDim, "\n")
return(latentDim)
}

#'
#' Calculate the OHT coefficient
#'
#' @noRd
optimalSVHTCoef <- function(beta){
# Calculate lambda(beta)
sqrt(2 * (beta + 1) + (8 * beta) / (beta + 1 + sqrt(beta^2 + 14 * beta + 1)))
}

#'
#' Calculate the median of the Marchenko-Pastur distribution
#'
#' Formulas are derived from a robust estimator for the noise
#' parameter gamma in the model Z~ = Z + gamma * E.
#' Gamma(Z~) = sigma / sqrt(n * mu)) with mu: median of the Marcenko-Pastur distribution.
#' More detailed explanations can be found in Gavish and Donoho 2014.
#'
#' @noRd
medianMarchenkoPastur <- function(ncol, nrow){
# Compute median of Marchenko-Pastur distribution
beta <- ncol / nrow
betaMinus <- (1 - sqrt(beta))^2
betaPlus <- (1 + sqrt(beta))^2

# Initialize range for upper integral boundary
lobnd <- copy(betaMinus)
hibnd <- copy(betaPlus)


while ((hibnd - lobnd) > 0.001){ # Iterate until convergence
x <- seq(lobnd, hibnd, length.out = 10) # Set range of values for upper integral boundary
y <- rep(0, length(x))
for (numCols in 1:length(x)){
# Approximate integral using Gauss-Kronrod Quadrature
y[numCols] <- quadgk(dmp, a=betaMinus, b=x[numCols], ndf=nrow, pdim=ncol)
}

# Set new boundaries for x that yield the closest results to 0.5
if (any(y < 0.5)){
lobnd = max(x[y < 0.5])
}

if (any(y > 0.5)){
hibnd = min(x[y > 0.5])
}
}
# If hibnd and lobnd are similar enough, return their mean
return((hibnd + lobnd) / 2.0)
}
18 changes: 0 additions & 18 deletions R/helperFunctions.R
Original file line number Diff line number Diff line change
Expand Up @@ -242,21 +242,3 @@ getBestQDT <- function(dt, usedEvalMethod='aucPR', digits=10){
dt[,encodingDimension[
seq_len(.N) == testFun(round(evaluationLoss, digits))]]
}

#'
#' Estimation of Q
#'
#' Estimating the best q for the given data set
#'
#' @param ods An OutriderDataSet object
#' @return The estimated dimension of hidden confounders
#'
#' @examples
#' ods <- makeExampleOutriderDataSet()
#'
#' estimateBestQ(ods)
#'
#' @export
estimateBestQ <- function(ods){
round(max(2, min(500, 3.7 + 0.16*ncol(ods))))
}
8 changes: 5 additions & 3 deletions man/estimateBestQ.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/normalizationFactors.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/sizeFactors.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

92 changes: 92 additions & 0 deletions tests/testthat/test_estimateBestQ.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
context("Testing the estimateBestQ function (Optimal Hard Thresholding)")

library(denoiseR)

test_that("Input validation handles NULL and non-matrix inputs", {
expect_error(estimateBestQ(),
"Please provide an OutriderDataSet or a z-score matrix.")
expect_error(estimateBestQ(NULL, NULL),
"Please provide an OutriderDataSet or a z-score matrix.")
expect_error(estimateBestQ(zScores = "not a matrix"),
"Provided zScores are not a matrix.")
expect_error(estimateBestQ(ods = "not an ods"),
"Please provide an OutriderDataSet.")

ctsFile <- system.file('extdata', 'GTExSkinSmall.tsv',
package='OUTRIDER')
ctsTable <- read.table(ctsFile, check.names=FALSE)
ods <- OutriderDataSet(countData=ctsTable)
# filter out non expressed genes
ods <- filterExpression(ods, minCounts=TRUE, filterGenes=TRUE)
ods <- estimateSizeFactors(ods)

expect_warning(estimateBestQ(ods = ods, zScores = matrix(c(1, 2, 3, 4, 5, 6), nrow = 3, ncol = 2)),
"Provided z-scores are ignored and recalculated from ods.")
})

test_that("User is notified about invalid matrix values", {
expect_error(estimateBestQ(zScores = matrix(c(1, 2, 3, 4, 5, Inf), nrow = 3, ncol = 2)),
"Z-score matrix contains infinite values.")
})

test_that("optimalSVHTCoef works correctly", {
expect_equal(optimalSVHTCoef(0.5),
1.98, tolerance = 0.01)
expect_equal(optimalSVHTCoef(0.1),
1.58, tolerance = 0.01)
})

test_that("medianMarchenkoPastur works correctly", {
# Expected outputs are derived from Table IV in Gavish and Donoho (2014)
expect_equal(optimalSVHTCoef(0.5) / sqrt(medianMarchenkoPastur(100, 200)),
2.1711, tolerance = 0.0001)
expect_equal(optimalSVHTCoef(0.1) / sqrt(medianMarchenkoPastur(100, 1000)),
1.6089, tolerance = 0.0001)
})

test_that("Encoding dimensions are properly calculated for simulated z-scores", {
# Simulate zScore matrix consisting of signal and noise
set.seed(42)
numGenes <- 10000
numSamples <- 200
latentDim <- 50
signalNoiseRatio <- 5
zTilde <- LRsim(numGenes, numSamples, latentDim, signalNoiseRatio)$X * 1000

expect_equal(estimateBestQ(zScores = zTilde),
latentDim)

# Simulate zScore matrix with beta > 1
set.seed(42)
numGenes <- 50
numSamples <- 200
latentDim <- 20
signalNoiseRatio <- 5
zTilde <- LRsim(numGenes, numSamples, latentDim, signalNoiseRatio)$X * 1000

expect_equal(estimateBestQ(zScores = zTilde),
latentDim)

# Simulate zScore matrix consisting of noise only
set.seed(42)
latentDim <- 0
zTilde <- matrix(rnorm(numGenes * numSamples), nrow = numGenes, ncol = numSamples)
expect_warning(expect_equal(estimateBestQ(zScores = zTilde), 2),
paste("Optimal latent space dimension is smaller than 2\\. Check your count matrix and",
"verify that all samples have the expected number of counts",
"\\(hist\\(colSums\\(counts\\(ods\\)\\)\\)\\)\\.",
"For now\\, the latent space dimension is set to 2\\.", collapse = "\n"))
})

test_that("Encoding dimensions are properly calculated for real ODS", {
ctsFile <- system.file('extdata', 'GTExSkinSmall.tsv',
package='OUTRIDER')
ctsTable <- read.table(ctsFile, check.names=FALSE)
ods <- OutriderDataSet(countData=ctsTable)
ods <- filterExpression(ods, minCounts=TRUE, filterGenes=TRUE)
ods <- estimateSizeFactors(ods)

outsingleResult <- 5 # Expected value was calculated with OutSingle
expect_equal(estimateBestQ(ods = ods), outsingleResult,
tolerance = 1)
})
Loading

0 comments on commit 52581f5

Please sign in to comment.