update pca project code and documentation

drieslab · Aug 26, 2023 · 88d87eb · 88d87eb
1 parent 1ec1298
commit 88d87eb
Show file tree

Hide file tree

Showing 4 changed files with 436 additions and 10 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -333,6 +333,7 @@ export(runPAGEEnrich)
 export(runPAGEEnrich_OLD)
 export(runPCA)
 export(runPCAprojection)
+export(runPCAprojectionBatch)
 export(runPatternSimulation)
 export(runRankEnrich)
 export(runSpatialDeconv)

diff --git a/R/dimension_reduction.R b/R/dimension_reduction.R
@@ -810,7 +810,7 @@ runPCA_BiocSingular_irlba_projection = function(x,
 
 #' @title runPCAprojection
 #' @name runPCAprojection
-#' @description runs a Principal Component Analysis
+#' @description runs a Principal Component Analysis on a random subet + projection
 #' @param gobject giotto object
 #' @param spat_unit spatial unit
 #' @param feat_type feature type
@@ -849,7 +849,7 @@ runPCAprojection = function(gobject,
                             expression_values = c('normalized', 'scaled', 'custom'),
                             reduction = c('cells', 'feats'),
                             random_subset = 500,
-                            name = NULL,
+                            name = 'pca.projection',
                             feats_to_use = 'hvf',
                             return_gobject = TRUE,
                             center = TRUE,
@@ -860,7 +860,8 @@ runPCAprojection = function(gobject,
                             rev = FALSE,
                             set_seed = TRUE,
                             seed_number = 1234,
-                            verbose = TRUE) {
+                            verbose = TRUE,
+                            ...) {
 
 
   # Set feat_type and spat_unit
@@ -934,7 +935,7 @@ runPCAprojection = function(gobject,
     pca_object = runPCA_BiocSingular_irlba_projection(x = t_flex(expr_values),
                                                       ncp = ncp,
                                                       center = center,
-                                                      scale = scale,
+                                                      scale = scale_unit,
                                                       rev = rev,
                                                       set_seed = set_seed,
                                                       seed_number = seed_number,
@@ -949,7 +950,7 @@ runPCAprojection = function(gobject,
     pca_object = runPCA_BiocSingular_irlba_projection(x = expr_values,
                                                       ncp = ncp,
                                                       center = center,
-                                                      scale = scale,
+                                                      scale = scale_unit,
                                                       rev = rev,
                                                       set_seed = set_seed,
                                                       seed_number = seed_number,
@@ -1012,6 +1013,339 @@ runPCAprojection = function(gobject,
 
 
 
+
+#' @title runPCAprojectionBatch
+#' @name runPCAprojectionBatch
+#' @description runs a Principal Component Analysis on multiple random batches + projection
+#' @param gobject giotto object
+#' @param spat_unit spatial unit
+#' @param feat_type feature type
+#' @param expression_values expression values to use
+#' @param reduction cells or genes
+#' @param random_subset random subset to perform PCA on
+#' @param batch_number number of random batches to run
+#' @param name arbitrary name for PCA run
+#' @param feats_to_use subset of features to use for PCA
+#' @param genes_to_use deprecated use feats_to_use
+#' @param return_gobject boolean: return giotto object (default = TRUE)
+#' @param center center data first (default = TRUE)
+#' @param scale_unit scale features before PCA (default = TRUE)
+#' @param ncp number of principal components to calculate
+#' @param method which implementation to use
+#' @param method_params BiocParallelParam object
+#' @param rev do a reverse PCA
+#' @param set_seed use of seed
+#' @param seed_number seed number to use
+#' @param verbose verbosity of the function
+#' @param ... additional parameters for PCA (see details)
+#' @return giotto object with updated PCA dimension recuction
+#' @details See \code{\link[BiocSingular]{runPCA}} and \code{\link[FactoMineR]{PCA}} for more information about other parameters.
+#' This PCA implementation is similar to  \code{\link{runPCA}} and  \code{\link{runPCAprojection}},
+#' except that it performs PCA on multiple subsets (batches) of the cells or features,
+#' and predict on the others. This can significantly increase speed without sacrificing accuracy too much.
+#' \itemize{
+#'   \item feats_to_use = NULL: will use all features from the selected matrix
+#'   \item feats_to_use = <hvg name>: can be used to select a column name of
+#'   highly variable features, created by (see \code{\link{calculateHVF}})
+#'   \item feats_to_use = c('geneA', 'geneB', ...): will use all manually provided features
+#' }
+#' @export
+runPCAprojectionBatch = function(gobject,
+                                 spat_unit = NULL,
+                                 feat_type = NULL,
+                                 expression_values = c('normalized', 'scaled', 'custom'),
+                                 reduction = c('cells', 'feats'),
+                                 random_subset = 500,
+                                 batch_number = 5,
+                                 name = 'pca.projection.batch',
+                                 feats_to_use = 'hvf',
+                                 return_gobject = TRUE,
+                                 center = TRUE,
+                                 scale_unit = TRUE,
+                                 ncp = 100,
+                                 method = c('irlba'),
+                                 method_params = BiocParallel::SerialParam(),
+                                 rev = FALSE,
+                                 set_seed = TRUE,
+                                 seed_number = 1234,
+                                 verbose = TRUE,
+                                 ...) {
+
+
+  # Set feat_type and spat_unit
+  spat_unit = set_default_spat_unit(gobject = gobject,
+                                    spat_unit = spat_unit)
+  feat_type = set_default_feat_type(gobject = gobject,
+                                    spat_unit = spat_unit,
+                                    feat_type = feat_type)
+
+  # specify name to use for pca
+  if(is.null(name)) {
+    if(feat_type == 'rna') {
+      name = 'pca'
+    } else {
+      name = paste0(feat_type,'.','pca')
+    }
+  }
+
+  # expression values to be used
+  values = match.arg(expression_values, unique(c('normalized', 'scaled', 'custom', expression_values)))
+  expr_values = get_expression_values(gobject = gobject,
+                                      feat_type = feat_type,
+                                      spat_unit = spat_unit,
+                                      values = values,
+                                      output = 'exprObj')
+
+  provenance = prov(expr_values)
+
+  if(!is.null(slot(gobject, 'h5_file'))) {
+    expr_path = slot(expr_values, 'exprMat')
+
+    expr_values = HDF5Array::h5mread(filepath = slot(gobject, 'h5_file'),
+                                     name = paste0('expression/',
+                                                   feat_type,'/',
+                                                   values))
+
+    expr_dimnames = HDF5Array::h5readDimnames(filepath = slot(gobject, 'h5_file'),
+                                              name = paste0('expression/',
+                                                            feat_type,'/',
+                                                            values))
+
+    rownames(expr_values) = expr_dimnames[[1]]
+    colnames(expr_values) = expr_dimnames[[2]]
+
+  } else {
+    expr_values = expr_values[] # extract matrix
+  }
+
+
+
+  ## subset matrix
+  if(!is.null(feats_to_use)) {
+    expr_values = create_feats_to_use_matrix(gobject = gobject,
+                                             spat_unit = spat_unit,
+                                             feat_type = feat_type,
+                                             sel_matrix = expr_values,
+                                             feats_to_use = feats_to_use,
+                                             verbose = verbose)
+  }
+
+
+  # do PCA dimension reduction
+  reduction = match.arg(reduction, c('cells', 'feats'))
+
+  # PCA implementation
+  method = match.arg(method, c('irlba'))
+
+  if(reduction == 'cells') {
+
+    pca_batch_results = list()
+
+    for(batch in 1:batch_number) {
+
+      if(verbose) wrap_msg('start batch ', batch)
+
+      if(isTRUE(set_seed)) {
+        seed_batch = seed_number+batch
+      } else {
+        seed_batch = NULL
+      }
+
+
+      # PCA on cells
+      pca_object = runPCA_BiocSingular_irlba_projection(x = t_flex(expr_values),
+                                                        ncp = ncp,
+                                                        center = center,
+                                                        scale = scale_unit,
+                                                        rev = rev,
+                                                        set_seed = set_seed,
+                                                        seed_number = seed_batch,
+                                                        BPPARAM = method_params,
+                                                        random_subset = random_subset,
+                                                        verbose = verbose,
+                                                        ...)
+
+      # adjust the sign of the coordinates and loadings vector relative to first batch
+      # this is necessary for the next averaging step
+      if(batch == 1) {
+        pca_batch_results[[batch]] = pca_object
+      } else {
+
+        for(dimension in 1:ncol(pca_object[['coords']])) {
+          sum_evaluation = sum(sign(pca_batch_results[[1]][['coords']][1:20, dimension]) *
+                                 sign(pca_object[['coords']][1:20, dimension]))
+          if(sum_evaluation < 0) {
+            pca_object$coords[, dimension] = -1 * pca_object$coords[, dimension]
+            pca_object$loadings[, dimension] = -1 * pca_object$loadings[, dimension]
+          }
+        }
+        pca_batch_results[[batch]] = pca_object
+      }
+    }
+
+    if(verbose) wrap_msg('start averaging pca results of batches')
+
+    # calculate average eigenvalues, coordinates and loadings
+    # TODO: test out DT approach, might be faster and more efficient for
+    # large datasets
+
+    # eigenvalues
+    eigenvalues_list = lapply(pca_batch_results, FUN = function(x) x$eigenvalues)
+    eigenvalues_matrix = do.call('cbind', eigenvalues_list)
+    eigenvalues_mean = rowMeans_flex(eigenvalues_matrix)
+
+    # coordinates
+    coords_list = lapply(pca_batch_results, FUN = function(x) x$coords)
+    coords_vector = do.call('c', coords_list)
+    coords_array = array(data = coords_vector, dim = c(ncol(expr_values), ncp, length(pca_batch_results)))
+    coords_all = apply(coords_array, MARGIN = c(1:2), function(arr){mean(arr, na.rm=TRUE)})
+    rownames(coords_all) = rownames(pca_batch_results[[1]][['coords']])
+    colnames(coords_all) = colnames(pca_batch_results[[1]][['coords']])
+
+    # loadings
+    loadings_list = lapply(pca_batch_results, FUN = function(x) x$loadings)
+    loadings_vector = do.call('c', loadings_list)
+    loadings_array = array(data = loadings_vector, dim = c(nrow(expr_values), ncp, length(pca_batch_results)))
+    loadings_all = apply(loadings_array, MARGIN = c(1:2), function(arr){mean(arr, na.rm=TRUE)})
+    rownames(loadings_all) = rownames(pca_batch_results[[1]][['loadings']])
+    colnames(loadings_all) = colnames(pca_batch_results[[1]][['loadings']])
+
+
+    pca_object = list(eigenvalues = eigenvalues_mean, loadings = loadings_all, coords = coords_all)
+
+
+  } else {
+
+
+    pca_batch_results = list()
+
+    for(batch in 1:batch_number) {
+
+      if(verbose) wrap_msg('start batch ', batch)
+
+      if(isTRUE(set_seed)) {
+        seed_batch = seed_number+batch
+      } else {
+        seed_batch = NULL
+      }
+
+
+      # PCA on features
+      pca_object = runPCA_BiocSingular_irlba_projection(x = expr_values,
+                                                        ncp = ncp,
+                                                        center = center,
+                                                        scale = scale_unit,
+                                                        rev = rev,
+                                                        set_seed = set_seed,
+                                                        seed_number = seed_number,
+                                                        BPPARAM = method_params,
+                                                        random_subset = random_subset,
+                                                        verbose = verbose,
+                                                        ...)
+
+
+      # adjust the sign of the coordinates and loadings vector relative to first batch
+      # this is necessary for the next averaging step
+      if(batch == 1) {
+        pca_batch_results[[batch]] = pca_object
+      } else {
+
+        for(dimension in 1:ncol(pca_object[['coords']])) {
+          sum_evaluation = sum(sign(pca_batch_results[[1]][['coords']][1:20, dimension]) *
+                                 sign(pca_object[['coords']][1:20, dimension]))
+          if(sum_evaluation < 0) {
+            pca_object$coords[, dimension] = -1 * pca_object$coords[, dimension]
+            pca_object$loadings[, dimension] = -1 * pca_object$loadings[, dimension]
+          }
+        }
+        pca_batch_results[[batch]] = pca_object
+      }
+    }
+
+    if(verbose) wrap_msg('start averaging pca results of batches')
+
+    # calculate average eigenvalues, coordinates and loadings
+    # TODO: test out DT approach, might be faster and more efficient for
+    # large datasets
+
+    # eigenvalues
+    eigenvalues_list = lapply(pca_batch_results, FUN = function(x) x$eigenvalues)
+    eigenvalues_matrix = do.call('cbind', eigenvalues_list)
+    eigenvalues_mean = rowMeans_flex(eigenvalues_matrix)
+
+    # coordinates
+    coords_list = lapply(pca_batch_results, FUN = function(x) x$coords)
+    coords_vector = do.call('c', coords_list)
+    coords_array = array(data = coords_vector, dim = c(ncol(expr_values), ncp, length(pca_batch_results)))
+    coords_all = apply(coords_array, MARGIN = c(1:2), function(arr){mean(arr, na.rm=TRUE)})
+    rownames(coords_all) = rownames(pca_batch_results[[1]][['coords']])
+    colnames(coords_all) = colnames(pca_batch_results[[1]][['coords']])
+
+    # loadings
+    loadings_list = lapply(pca_batch_results, FUN = function(x) x$loadings)
+    loadings_vector = do.call('c', loadings_list)
+    loadings_array = array(data = loadings_vector, dim = c(nrow(expr_values), ncp, length(pca_batch_results)))
+    loadings_all = apply(loadings_array, MARGIN = c(1:2), function(arr){mean(arr, na.rm=TRUE)})
+    rownames(loadings_all) = rownames(pca_batch_results[[1]][['loadings']])
+    colnames(loadings_all) = colnames(pca_batch_results[[1]][['loadings']])
+
+
+    pca_object = list(eigenvalues = eigenvalues_mean, loadings = loadings_all, coords = coords_all)
+
+
+
+  }
+
+
+  if(return_gobject == TRUE) {
+
+    pca_names = list_dim_reductions_names(gobject = gobject,
+                                          data_type = reduction,
+                                          spat_unit = spat_unit,
+                                          feat_type = feat_type,
+                                          dim_type = 'pca')
+
+    if(name %in% pca_names) {
+      cat('\n ', name, ' has already been used, will be overwritten \n')
+    }
+
+    if (reduction == "cells") {
+      my_row_names = colnames(expr_values)
+    } else {
+      my_row_names = rownames(expr_values)
+    }
+
+    dimObject = create_dim_obj(name = name,
+                               feat_type = feat_type,
+                               spat_unit = spat_unit,
+                               provenance = provenance,
+                               reduction = reduction,
+                               reduction_method = 'pca',
+                               coordinates = pca_object$coords,
+                               misc = list(eigenvalues = pca_object$eigenvalues,
+                                           loadings = pca_object$loadings),
+                               my_rownames = my_row_names)
+
+    ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ###
+    gobject = set_dimReduction(gobject = gobject, dimObject = dimObject)
+    ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ###
+
+
+    ## update parameters used ##
+    gobject = update_giotto_params(gobject, description = '_pca')
+    return(gobject)
+
+
+
+  } else {
+    return(pca_object)
+  }
+
+
+}
+
+
+
 ## * PC estimates ####
 # ------------------ #