diff --git a/NAMESPACE b/NAMESPACE index b8dbd7803..c51a5a2e0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -333,6 +333,7 @@ export(runPAGEEnrich) export(runPAGEEnrich_OLD) export(runPCA) export(runPCAprojection) +export(runPCAprojectionBatch) export(runPatternSimulation) export(runRankEnrich) export(runSpatialDeconv) diff --git a/R/dimension_reduction.R b/R/dimension_reduction.R index 7409174ad..501e38c07 100644 --- a/R/dimension_reduction.R +++ b/R/dimension_reduction.R @@ -810,7 +810,7 @@ runPCA_BiocSingular_irlba_projection = function(x, #' @title runPCAprojection #' @name runPCAprojection -#' @description runs a Principal Component Analysis +#' @description runs a Principal Component Analysis on a random subet + projection #' @param gobject giotto object #' @param spat_unit spatial unit #' @param feat_type feature type @@ -849,7 +849,7 @@ runPCAprojection = function(gobject, expression_values = c('normalized', 'scaled', 'custom'), reduction = c('cells', 'feats'), random_subset = 500, - name = NULL, + name = 'pca.projection', feats_to_use = 'hvf', return_gobject = TRUE, center = TRUE, @@ -860,7 +860,8 @@ runPCAprojection = function(gobject, rev = FALSE, set_seed = TRUE, seed_number = 1234, - verbose = TRUE) { + verbose = TRUE, + ...) { # Set feat_type and spat_unit @@ -934,7 +935,7 @@ runPCAprojection = function(gobject, pca_object = runPCA_BiocSingular_irlba_projection(x = t_flex(expr_values), ncp = ncp, center = center, - scale = scale, + scale = scale_unit, rev = rev, set_seed = set_seed, seed_number = seed_number, @@ -949,7 +950,7 @@ runPCAprojection = function(gobject, pca_object = runPCA_BiocSingular_irlba_projection(x = expr_values, ncp = ncp, center = center, - scale = scale, + scale = scale_unit, rev = rev, set_seed = set_seed, seed_number = seed_number, @@ -1012,6 +1013,339 @@ runPCAprojection = function(gobject, + +#' @title runPCAprojectionBatch +#' @name runPCAprojectionBatch +#' @description runs a Principal Component Analysis on multiple random batches + projection +#' @param gobject giotto object +#' @param spat_unit spatial unit +#' @param feat_type feature type +#' @param expression_values expression values to use +#' @param reduction cells or genes +#' @param random_subset random subset to perform PCA on +#' @param batch_number number of random batches to run +#' @param name arbitrary name for PCA run +#' @param feats_to_use subset of features to use for PCA +#' @param genes_to_use deprecated use feats_to_use +#' @param return_gobject boolean: return giotto object (default = TRUE) +#' @param center center data first (default = TRUE) +#' @param scale_unit scale features before PCA (default = TRUE) +#' @param ncp number of principal components to calculate +#' @param method which implementation to use +#' @param method_params BiocParallelParam object +#' @param rev do a reverse PCA +#' @param set_seed use of seed +#' @param seed_number seed number to use +#' @param verbose verbosity of the function +#' @param ... additional parameters for PCA (see details) +#' @return giotto object with updated PCA dimension recuction +#' @details See \code{\link[BiocSingular]{runPCA}} and \code{\link[FactoMineR]{PCA}} for more information about other parameters. +#' This PCA implementation is similar to \code{\link{runPCA}} and \code{\link{runPCAprojection}}, +#' except that it performs PCA on multiple subsets (batches) of the cells or features, +#' and predict on the others. This can significantly increase speed without sacrificing accuracy too much. +#' \itemize{ +#' \item feats_to_use = NULL: will use all features from the selected matrix +#' \item feats_to_use = : can be used to select a column name of +#' highly variable features, created by (see \code{\link{calculateHVF}}) +#' \item feats_to_use = c('geneA', 'geneB', ...): will use all manually provided features +#' } +#' @export +runPCAprojectionBatch = function(gobject, + spat_unit = NULL, + feat_type = NULL, + expression_values = c('normalized', 'scaled', 'custom'), + reduction = c('cells', 'feats'), + random_subset = 500, + batch_number = 5, + name = 'pca.projection.batch', + feats_to_use = 'hvf', + return_gobject = TRUE, + center = TRUE, + scale_unit = TRUE, + ncp = 100, + method = c('irlba'), + method_params = BiocParallel::SerialParam(), + rev = FALSE, + set_seed = TRUE, + seed_number = 1234, + verbose = TRUE, + ...) { + + + # Set feat_type and spat_unit + spat_unit = set_default_spat_unit(gobject = gobject, + spat_unit = spat_unit) + feat_type = set_default_feat_type(gobject = gobject, + spat_unit = spat_unit, + feat_type = feat_type) + + # specify name to use for pca + if(is.null(name)) { + if(feat_type == 'rna') { + name = 'pca' + } else { + name = paste0(feat_type,'.','pca') + } + } + + # expression values to be used + values = match.arg(expression_values, unique(c('normalized', 'scaled', 'custom', expression_values))) + expr_values = get_expression_values(gobject = gobject, + feat_type = feat_type, + spat_unit = spat_unit, + values = values, + output = 'exprObj') + + provenance = prov(expr_values) + + if(!is.null(slot(gobject, 'h5_file'))) { + expr_path = slot(expr_values, 'exprMat') + + expr_values = HDF5Array::h5mread(filepath = slot(gobject, 'h5_file'), + name = paste0('expression/', + feat_type,'/', + values)) + + expr_dimnames = HDF5Array::h5readDimnames(filepath = slot(gobject, 'h5_file'), + name = paste0('expression/', + feat_type,'/', + values)) + + rownames(expr_values) = expr_dimnames[[1]] + colnames(expr_values) = expr_dimnames[[2]] + + } else { + expr_values = expr_values[] # extract matrix + } + + + + ## subset matrix + if(!is.null(feats_to_use)) { + expr_values = create_feats_to_use_matrix(gobject = gobject, + spat_unit = spat_unit, + feat_type = feat_type, + sel_matrix = expr_values, + feats_to_use = feats_to_use, + verbose = verbose) + } + + + # do PCA dimension reduction + reduction = match.arg(reduction, c('cells', 'feats')) + + # PCA implementation + method = match.arg(method, c('irlba')) + + if(reduction == 'cells') { + + pca_batch_results = list() + + for(batch in 1:batch_number) { + + if(verbose) wrap_msg('start batch ', batch) + + if(isTRUE(set_seed)) { + seed_batch = seed_number+batch + } else { + seed_batch = NULL + } + + + # PCA on cells + pca_object = runPCA_BiocSingular_irlba_projection(x = t_flex(expr_values), + ncp = ncp, + center = center, + scale = scale_unit, + rev = rev, + set_seed = set_seed, + seed_number = seed_batch, + BPPARAM = method_params, + random_subset = random_subset, + verbose = verbose, + ...) + + # adjust the sign of the coordinates and loadings vector relative to first batch + # this is necessary for the next averaging step + if(batch == 1) { + pca_batch_results[[batch]] = pca_object + } else { + + for(dimension in 1:ncol(pca_object[['coords']])) { + sum_evaluation = sum(sign(pca_batch_results[[1]][['coords']][1:20, dimension]) * + sign(pca_object[['coords']][1:20, dimension])) + if(sum_evaluation < 0) { + pca_object$coords[, dimension] = -1 * pca_object$coords[, dimension] + pca_object$loadings[, dimension] = -1 * pca_object$loadings[, dimension] + } + } + pca_batch_results[[batch]] = pca_object + } + } + + if(verbose) wrap_msg('start averaging pca results of batches') + + # calculate average eigenvalues, coordinates and loadings + # TODO: test out DT approach, might be faster and more efficient for + # large datasets + + # eigenvalues + eigenvalues_list = lapply(pca_batch_results, FUN = function(x) x$eigenvalues) + eigenvalues_matrix = do.call('cbind', eigenvalues_list) + eigenvalues_mean = rowMeans_flex(eigenvalues_matrix) + + # coordinates + coords_list = lapply(pca_batch_results, FUN = function(x) x$coords) + coords_vector = do.call('c', coords_list) + coords_array = array(data = coords_vector, dim = c(ncol(expr_values), ncp, length(pca_batch_results))) + coords_all = apply(coords_array, MARGIN = c(1:2), function(arr){mean(arr, na.rm=TRUE)}) + rownames(coords_all) = rownames(pca_batch_results[[1]][['coords']]) + colnames(coords_all) = colnames(pca_batch_results[[1]][['coords']]) + + # loadings + loadings_list = lapply(pca_batch_results, FUN = function(x) x$loadings) + loadings_vector = do.call('c', loadings_list) + loadings_array = array(data = loadings_vector, dim = c(nrow(expr_values), ncp, length(pca_batch_results))) + loadings_all = apply(loadings_array, MARGIN = c(1:2), function(arr){mean(arr, na.rm=TRUE)}) + rownames(loadings_all) = rownames(pca_batch_results[[1]][['loadings']]) + colnames(loadings_all) = colnames(pca_batch_results[[1]][['loadings']]) + + + pca_object = list(eigenvalues = eigenvalues_mean, loadings = loadings_all, coords = coords_all) + + + } else { + + + pca_batch_results = list() + + for(batch in 1:batch_number) { + + if(verbose) wrap_msg('start batch ', batch) + + if(isTRUE(set_seed)) { + seed_batch = seed_number+batch + } else { + seed_batch = NULL + } + + + # PCA on features + pca_object = runPCA_BiocSingular_irlba_projection(x = expr_values, + ncp = ncp, + center = center, + scale = scale_unit, + rev = rev, + set_seed = set_seed, + seed_number = seed_number, + BPPARAM = method_params, + random_subset = random_subset, + verbose = verbose, + ...) + + + # adjust the sign of the coordinates and loadings vector relative to first batch + # this is necessary for the next averaging step + if(batch == 1) { + pca_batch_results[[batch]] = pca_object + } else { + + for(dimension in 1:ncol(pca_object[['coords']])) { + sum_evaluation = sum(sign(pca_batch_results[[1]][['coords']][1:20, dimension]) * + sign(pca_object[['coords']][1:20, dimension])) + if(sum_evaluation < 0) { + pca_object$coords[, dimension] = -1 * pca_object$coords[, dimension] + pca_object$loadings[, dimension] = -1 * pca_object$loadings[, dimension] + } + } + pca_batch_results[[batch]] = pca_object + } + } + + if(verbose) wrap_msg('start averaging pca results of batches') + + # calculate average eigenvalues, coordinates and loadings + # TODO: test out DT approach, might be faster and more efficient for + # large datasets + + # eigenvalues + eigenvalues_list = lapply(pca_batch_results, FUN = function(x) x$eigenvalues) + eigenvalues_matrix = do.call('cbind', eigenvalues_list) + eigenvalues_mean = rowMeans_flex(eigenvalues_matrix) + + # coordinates + coords_list = lapply(pca_batch_results, FUN = function(x) x$coords) + coords_vector = do.call('c', coords_list) + coords_array = array(data = coords_vector, dim = c(ncol(expr_values), ncp, length(pca_batch_results))) + coords_all = apply(coords_array, MARGIN = c(1:2), function(arr){mean(arr, na.rm=TRUE)}) + rownames(coords_all) = rownames(pca_batch_results[[1]][['coords']]) + colnames(coords_all) = colnames(pca_batch_results[[1]][['coords']]) + + # loadings + loadings_list = lapply(pca_batch_results, FUN = function(x) x$loadings) + loadings_vector = do.call('c', loadings_list) + loadings_array = array(data = loadings_vector, dim = c(nrow(expr_values), ncp, length(pca_batch_results))) + loadings_all = apply(loadings_array, MARGIN = c(1:2), function(arr){mean(arr, na.rm=TRUE)}) + rownames(loadings_all) = rownames(pca_batch_results[[1]][['loadings']]) + colnames(loadings_all) = colnames(pca_batch_results[[1]][['loadings']]) + + + pca_object = list(eigenvalues = eigenvalues_mean, loadings = loadings_all, coords = coords_all) + + + + } + + + if(return_gobject == TRUE) { + + pca_names = list_dim_reductions_names(gobject = gobject, + data_type = reduction, + spat_unit = spat_unit, + feat_type = feat_type, + dim_type = 'pca') + + if(name %in% pca_names) { + cat('\n ', name, ' has already been used, will be overwritten \n') + } + + if (reduction == "cells") { + my_row_names = colnames(expr_values) + } else { + my_row_names = rownames(expr_values) + } + + dimObject = create_dim_obj(name = name, + feat_type = feat_type, + spat_unit = spat_unit, + provenance = provenance, + reduction = reduction, + reduction_method = 'pca', + coordinates = pca_object$coords, + misc = list(eigenvalues = pca_object$eigenvalues, + loadings = pca_object$loadings), + my_rownames = my_row_names) + + ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### + gobject = set_dimReduction(gobject = gobject, dimObject = dimObject) + ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### + + + ## update parameters used ## + gobject = update_giotto_params(gobject, description = '_pca') + return(gobject) + + + + } else { + return(pca_object) + } + + +} + + + ## * PC estimates #### # ------------------ # diff --git a/man/runPCAprojection.Rd b/man/runPCAprojection.Rd index 727f3369a..43bcdc48c 100644 --- a/man/runPCAprojection.Rd +++ b/man/runPCAprojection.Rd @@ -11,7 +11,7 @@ runPCAprojection( expression_values = c("normalized", "scaled", "custom"), reduction = c("cells", "feats"), random_subset = 500, - name = NULL, + name = "pca.projection", feats_to_use = "hvf", return_gobject = TRUE, center = TRUE, @@ -22,7 +22,8 @@ runPCAprojection( rev = FALSE, set_seed = TRUE, seed_number = 1234, - verbose = TRUE + verbose = TRUE, + ... ) } \arguments{ @@ -62,15 +63,15 @@ runPCAprojection( \item{verbose}{verbosity of the function} -\item{genes_to_use}{deprecated use feats_to_use} - \item{...}{additional parameters for PCA (see details)} + +\item{genes_to_use}{deprecated use feats_to_use} } \value{ giotto object with updated PCA dimension recuction } \description{ -runs a Principal Component Analysis +runs a Principal Component Analysis on a random subet + projection } \details{ See \code{\link[BiocSingular]{runPCA}} and \code{\link[FactoMineR]{PCA}} for more information about other parameters. diff --git a/man/runPCAprojectionBatch.Rd b/man/runPCAprojectionBatch.Rd new file mode 100644 index 000000000..2533e097c --- /dev/null +++ b/man/runPCAprojectionBatch.Rd @@ -0,0 +1,90 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dimension_reduction.R +\name{runPCAprojectionBatch} +\alias{runPCAprojectionBatch} +\title{runPCAprojectionBatch} +\usage{ +runPCAprojectionBatch( + gobject, + spat_unit = NULL, + feat_type = NULL, + expression_values = c("normalized", "scaled", "custom"), + reduction = c("cells", "feats"), + random_subset = 500, + batch_number = 5, + name = "pca.projection.batch", + feats_to_use = "hvf", + return_gobject = TRUE, + center = TRUE, + scale_unit = TRUE, + ncp = 100, + method = c("irlba"), + method_params = BiocParallel::SerialParam(), + rev = FALSE, + set_seed = TRUE, + seed_number = 1234, + verbose = TRUE, + ... +) +} +\arguments{ +\item{gobject}{giotto object} + +\item{spat_unit}{spatial unit} + +\item{feat_type}{feature type} + +\item{expression_values}{expression values to use} + +\item{reduction}{cells or genes} + +\item{random_subset}{random subset to perform PCA on} + +\item{batch_number}{number of random batches to run} + +\item{name}{arbitrary name for PCA run} + +\item{feats_to_use}{subset of features to use for PCA} + +\item{return_gobject}{boolean: return giotto object (default = TRUE)} + +\item{center}{center data first (default = TRUE)} + +\item{scale_unit}{scale features before PCA (default = TRUE)} + +\item{ncp}{number of principal components to calculate} + +\item{method}{which implementation to use} + +\item{method_params}{BiocParallelParam object} + +\item{rev}{do a reverse PCA} + +\item{set_seed}{use of seed} + +\item{seed_number}{seed number to use} + +\item{verbose}{verbosity of the function} + +\item{...}{additional parameters for PCA (see details)} + +\item{genes_to_use}{deprecated use feats_to_use} +} +\value{ +giotto object with updated PCA dimension recuction +} +\description{ +runs a Principal Component Analysis on multiple random batches + projection +} +\details{ +See \code{\link[BiocSingular]{runPCA}} and \code{\link[FactoMineR]{PCA}} for more information about other parameters. +This PCA implementation is similar to \code{\link{runPCA}} and \code{\link{runPCAprojection}}, +except that it performs PCA on multiple subsets (batches) of the cells or features, +and predict on the others. This can significantly increase speed without sacrificing accuracy too much. +\itemize{ + \item feats_to_use = NULL: will use all features from the selected matrix + \item feats_to_use = : can be used to select a column name of + highly variable features, created by (see \code{\link{calculateHVF}}) + \item feats_to_use = c('geneA', 'geneB', ...): will use all manually provided features +} +}