Skip to content

Commit

Permalink
Merge pull request #741 from RubD/suite_dev
Browse files Browse the repository at this point in the history
update pca project code and documentation
  • Loading branch information
RubD authored Aug 26, 2023
2 parents 8f90490 + 88d87eb commit 116cefc
Show file tree
Hide file tree
Showing 4 changed files with 436 additions and 10 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,7 @@ export(runPAGEEnrich)
export(runPAGEEnrich_OLD)
export(runPCA)
export(runPCAprojection)
export(runPCAprojectionBatch)
export(runPatternSimulation)
export(runRankEnrich)
export(runSpatialDeconv)
Expand Down
344 changes: 339 additions & 5 deletions R/dimension_reduction.R
Original file line number Diff line number Diff line change
Expand Up @@ -810,7 +810,7 @@ runPCA_BiocSingular_irlba_projection = function(x,

#' @title runPCAprojection
#' @name runPCAprojection
#' @description runs a Principal Component Analysis
#' @description runs a Principal Component Analysis on a random subet + projection
#' @param gobject giotto object
#' @param spat_unit spatial unit
#' @param feat_type feature type
Expand Down Expand Up @@ -849,7 +849,7 @@ runPCAprojection = function(gobject,
expression_values = c('normalized', 'scaled', 'custom'),
reduction = c('cells', 'feats'),
random_subset = 500,
name = NULL,
name = 'pca.projection',
feats_to_use = 'hvf',
return_gobject = TRUE,
center = TRUE,
Expand All @@ -860,7 +860,8 @@ runPCAprojection = function(gobject,
rev = FALSE,
set_seed = TRUE,
seed_number = 1234,
verbose = TRUE) {
verbose = TRUE,
...) {


# Set feat_type and spat_unit
Expand Down Expand Up @@ -934,7 +935,7 @@ runPCAprojection = function(gobject,
pca_object = runPCA_BiocSingular_irlba_projection(x = t_flex(expr_values),
ncp = ncp,
center = center,
scale = scale,
scale = scale_unit,
rev = rev,
set_seed = set_seed,
seed_number = seed_number,
Expand All @@ -949,7 +950,7 @@ runPCAprojection = function(gobject,
pca_object = runPCA_BiocSingular_irlba_projection(x = expr_values,
ncp = ncp,
center = center,
scale = scale,
scale = scale_unit,
rev = rev,
set_seed = set_seed,
seed_number = seed_number,
Expand Down Expand Up @@ -1012,6 +1013,339 @@ runPCAprojection = function(gobject,




#' @title runPCAprojectionBatch
#' @name runPCAprojectionBatch
#' @description runs a Principal Component Analysis on multiple random batches + projection
#' @param gobject giotto object
#' @param spat_unit spatial unit
#' @param feat_type feature type
#' @param expression_values expression values to use
#' @param reduction cells or genes
#' @param random_subset random subset to perform PCA on
#' @param batch_number number of random batches to run
#' @param name arbitrary name for PCA run
#' @param feats_to_use subset of features to use for PCA
#' @param genes_to_use deprecated use feats_to_use
#' @param return_gobject boolean: return giotto object (default = TRUE)
#' @param center center data first (default = TRUE)
#' @param scale_unit scale features before PCA (default = TRUE)
#' @param ncp number of principal components to calculate
#' @param method which implementation to use
#' @param method_params BiocParallelParam object
#' @param rev do a reverse PCA
#' @param set_seed use of seed
#' @param seed_number seed number to use
#' @param verbose verbosity of the function
#' @param ... additional parameters for PCA (see details)
#' @return giotto object with updated PCA dimension recuction
#' @details See \code{\link[BiocSingular]{runPCA}} and \code{\link[FactoMineR]{PCA}} for more information about other parameters.
#' This PCA implementation is similar to \code{\link{runPCA}} and \code{\link{runPCAprojection}},
#' except that it performs PCA on multiple subsets (batches) of the cells or features,
#' and predict on the others. This can significantly increase speed without sacrificing accuracy too much.
#' \itemize{
#' \item feats_to_use = NULL: will use all features from the selected matrix
#' \item feats_to_use = <hvg name>: can be used to select a column name of
#' highly variable features, created by (see \code{\link{calculateHVF}})
#' \item feats_to_use = c('geneA', 'geneB', ...): will use all manually provided features
#' }
#' @export
runPCAprojectionBatch = function(gobject,
spat_unit = NULL,
feat_type = NULL,
expression_values = c('normalized', 'scaled', 'custom'),
reduction = c('cells', 'feats'),
random_subset = 500,
batch_number = 5,
name = 'pca.projection.batch',
feats_to_use = 'hvf',
return_gobject = TRUE,
center = TRUE,
scale_unit = TRUE,
ncp = 100,
method = c('irlba'),
method_params = BiocParallel::SerialParam(),
rev = FALSE,
set_seed = TRUE,
seed_number = 1234,
verbose = TRUE,
...) {


# Set feat_type and spat_unit
spat_unit = set_default_spat_unit(gobject = gobject,
spat_unit = spat_unit)
feat_type = set_default_feat_type(gobject = gobject,
spat_unit = spat_unit,
feat_type = feat_type)

# specify name to use for pca
if(is.null(name)) {
if(feat_type == 'rna') {
name = 'pca'
} else {
name = paste0(feat_type,'.','pca')
}
}

# expression values to be used
values = match.arg(expression_values, unique(c('normalized', 'scaled', 'custom', expression_values)))
expr_values = get_expression_values(gobject = gobject,
feat_type = feat_type,
spat_unit = spat_unit,
values = values,
output = 'exprObj')

provenance = prov(expr_values)

if(!is.null(slot(gobject, 'h5_file'))) {
expr_path = slot(expr_values, 'exprMat')

expr_values = HDF5Array::h5mread(filepath = slot(gobject, 'h5_file'),
name = paste0('expression/',
feat_type,'/',
values))

expr_dimnames = HDF5Array::h5readDimnames(filepath = slot(gobject, 'h5_file'),
name = paste0('expression/',
feat_type,'/',
values))

rownames(expr_values) = expr_dimnames[[1]]
colnames(expr_values) = expr_dimnames[[2]]

} else {
expr_values = expr_values[] # extract matrix
}



## subset matrix
if(!is.null(feats_to_use)) {
expr_values = create_feats_to_use_matrix(gobject = gobject,
spat_unit = spat_unit,
feat_type = feat_type,
sel_matrix = expr_values,
feats_to_use = feats_to_use,
verbose = verbose)
}


# do PCA dimension reduction
reduction = match.arg(reduction, c('cells', 'feats'))

# PCA implementation
method = match.arg(method, c('irlba'))

if(reduction == 'cells') {

pca_batch_results = list()

for(batch in 1:batch_number) {

if(verbose) wrap_msg('start batch ', batch)

if(isTRUE(set_seed)) {
seed_batch = seed_number+batch
} else {
seed_batch = NULL
}


# PCA on cells
pca_object = runPCA_BiocSingular_irlba_projection(x = t_flex(expr_values),
ncp = ncp,
center = center,
scale = scale_unit,
rev = rev,
set_seed = set_seed,
seed_number = seed_batch,
BPPARAM = method_params,
random_subset = random_subset,
verbose = verbose,
...)

# adjust the sign of the coordinates and loadings vector relative to first batch
# this is necessary for the next averaging step
if(batch == 1) {
pca_batch_results[[batch]] = pca_object
} else {

for(dimension in 1:ncol(pca_object[['coords']])) {
sum_evaluation = sum(sign(pca_batch_results[[1]][['coords']][1:20, dimension]) *
sign(pca_object[['coords']][1:20, dimension]))
if(sum_evaluation < 0) {
pca_object$coords[, dimension] = -1 * pca_object$coords[, dimension]
pca_object$loadings[, dimension] = -1 * pca_object$loadings[, dimension]
}
}
pca_batch_results[[batch]] = pca_object
}
}

if(verbose) wrap_msg('start averaging pca results of batches')

# calculate average eigenvalues, coordinates and loadings
# TODO: test out DT approach, might be faster and more efficient for
# large datasets

# eigenvalues
eigenvalues_list = lapply(pca_batch_results, FUN = function(x) x$eigenvalues)
eigenvalues_matrix = do.call('cbind', eigenvalues_list)
eigenvalues_mean = rowMeans_flex(eigenvalues_matrix)

# coordinates
coords_list = lapply(pca_batch_results, FUN = function(x) x$coords)
coords_vector = do.call('c', coords_list)
coords_array = array(data = coords_vector, dim = c(ncol(expr_values), ncp, length(pca_batch_results)))
coords_all = apply(coords_array, MARGIN = c(1:2), function(arr){mean(arr, na.rm=TRUE)})
rownames(coords_all) = rownames(pca_batch_results[[1]][['coords']])
colnames(coords_all) = colnames(pca_batch_results[[1]][['coords']])

# loadings
loadings_list = lapply(pca_batch_results, FUN = function(x) x$loadings)
loadings_vector = do.call('c', loadings_list)
loadings_array = array(data = loadings_vector, dim = c(nrow(expr_values), ncp, length(pca_batch_results)))
loadings_all = apply(loadings_array, MARGIN = c(1:2), function(arr){mean(arr, na.rm=TRUE)})
rownames(loadings_all) = rownames(pca_batch_results[[1]][['loadings']])
colnames(loadings_all) = colnames(pca_batch_results[[1]][['loadings']])


pca_object = list(eigenvalues = eigenvalues_mean, loadings = loadings_all, coords = coords_all)


} else {


pca_batch_results = list()

for(batch in 1:batch_number) {

if(verbose) wrap_msg('start batch ', batch)

if(isTRUE(set_seed)) {
seed_batch = seed_number+batch
} else {
seed_batch = NULL
}


# PCA on features
pca_object = runPCA_BiocSingular_irlba_projection(x = expr_values,
ncp = ncp,
center = center,
scale = scale_unit,
rev = rev,
set_seed = set_seed,
seed_number = seed_number,
BPPARAM = method_params,
random_subset = random_subset,
verbose = verbose,
...)


# adjust the sign of the coordinates and loadings vector relative to first batch
# this is necessary for the next averaging step
if(batch == 1) {
pca_batch_results[[batch]] = pca_object
} else {

for(dimension in 1:ncol(pca_object[['coords']])) {
sum_evaluation = sum(sign(pca_batch_results[[1]][['coords']][1:20, dimension]) *
sign(pca_object[['coords']][1:20, dimension]))
if(sum_evaluation < 0) {
pca_object$coords[, dimension] = -1 * pca_object$coords[, dimension]
pca_object$loadings[, dimension] = -1 * pca_object$loadings[, dimension]
}
}
pca_batch_results[[batch]] = pca_object
}
}

if(verbose) wrap_msg('start averaging pca results of batches')

# calculate average eigenvalues, coordinates and loadings
# TODO: test out DT approach, might be faster and more efficient for
# large datasets

# eigenvalues
eigenvalues_list = lapply(pca_batch_results, FUN = function(x) x$eigenvalues)
eigenvalues_matrix = do.call('cbind', eigenvalues_list)
eigenvalues_mean = rowMeans_flex(eigenvalues_matrix)

# coordinates
coords_list = lapply(pca_batch_results, FUN = function(x) x$coords)
coords_vector = do.call('c', coords_list)
coords_array = array(data = coords_vector, dim = c(ncol(expr_values), ncp, length(pca_batch_results)))
coords_all = apply(coords_array, MARGIN = c(1:2), function(arr){mean(arr, na.rm=TRUE)})
rownames(coords_all) = rownames(pca_batch_results[[1]][['coords']])
colnames(coords_all) = colnames(pca_batch_results[[1]][['coords']])

# loadings
loadings_list = lapply(pca_batch_results, FUN = function(x) x$loadings)
loadings_vector = do.call('c', loadings_list)
loadings_array = array(data = loadings_vector, dim = c(nrow(expr_values), ncp, length(pca_batch_results)))
loadings_all = apply(loadings_array, MARGIN = c(1:2), function(arr){mean(arr, na.rm=TRUE)})
rownames(loadings_all) = rownames(pca_batch_results[[1]][['loadings']])
colnames(loadings_all) = colnames(pca_batch_results[[1]][['loadings']])


pca_object = list(eigenvalues = eigenvalues_mean, loadings = loadings_all, coords = coords_all)



}


if(return_gobject == TRUE) {

pca_names = list_dim_reductions_names(gobject = gobject,
data_type = reduction,
spat_unit = spat_unit,
feat_type = feat_type,
dim_type = 'pca')

if(name %in% pca_names) {
cat('\n ', name, ' has already been used, will be overwritten \n')
}

if (reduction == "cells") {
my_row_names = colnames(expr_values)
} else {
my_row_names = rownames(expr_values)
}

dimObject = create_dim_obj(name = name,
feat_type = feat_type,
spat_unit = spat_unit,
provenance = provenance,
reduction = reduction,
reduction_method = 'pca',
coordinates = pca_object$coords,
misc = list(eigenvalues = pca_object$eigenvalues,
loadings = pca_object$loadings),
my_rownames = my_row_names)

### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ###
gobject = set_dimReduction(gobject = gobject, dimObject = dimObject)
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ###


## update parameters used ##
gobject = update_giotto_params(gobject, description = '_pca')
return(gobject)



} else {
return(pca_object)
}


}



## * PC estimates ####
# ------------------ #

Expand Down
Loading

0 comments on commit 116cefc

Please sign in to comment.