Merge pull request #99 from zhanghao-njmu/develop

Update readme
zhanghao-njmu · Feb 9, 2023 · 5abb7f4 · 5abb7f4
2 parents 62b94d7 + a166332
commit 5abb7f4
Show file tree

Hide file tree

Showing 12 changed files with 147 additions and 136 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -91,11 +91,12 @@ Suggests:
     scDblFinder,
     scmap,
     styler,
+    sva,
     SingleR,
     testthat (>= 3.0.0),
     UCell,
     umap
 LinkingTo: Rcpp
 RoxygenNote: 7.2.3
 Config/testthat/edition: 3
-URL: https://github.com/zhanghao-njmu/SCP/, https://zhanghao-njmu.github.io/SCP/
+URL: https://github.com/zhanghao-njmu/SCP
diff --git a/NAMESPACE b/NAMESPACE
@@ -264,7 +264,6 @@ importFrom(dplyr,across)
 importFrom(dplyr,all_of)
 importFrom(dplyr,arrange)
 importFrom(dplyr,arrange_at)
-importFrom(dplyr,bind_rows)
 importFrom(dplyr,case_when)
 importFrom(dplyr,desc)
 importFrom(dplyr,distinct)
@@ -276,8 +275,6 @@ importFrom(dplyr,n)
 importFrom(dplyr,pull)
 importFrom(dplyr,reframe)
 importFrom(dplyr,summarise_at)
-importFrom(dplyr,summarize)
-importFrom(dplyr,top_n)
 importFrom(ggnewscale,new_scale)
 importFrom(ggnewscale,new_scale_color)
 importFrom(ggnewscale,new_scale_fill)

diff --git a/R/SCP-analysis.R b/R/SCP-analysis.R
@@ -42,7 +42,6 @@
 #' homologs_counts <- as(as.matrix(homologs_counts[, -1]), "dgCMatrix")
 #' homologs_counts
 #'
-#' @importFrom dplyr "%>%" group_by mutate .data bind_rows
 #' @importFrom reshape2 dcast melt
 #' @importFrom R.cache loadCache saveCache
 #' @importFrom biomaRt listEnsemblArchives useMart listDatasets useDataset getBM listAttributes useEnsembl
@@ -367,18 +366,20 @@ GeneConvert <- function(geneID, geneID_from_IDtype = "symbol", geneID_to_IDtype
     ), "\n",
     paste0(rep("=", 30), collapse = ""), "\n"
   )
-  geneID_res <- bind_rows(geneID_res_list) %>% unique()
-  if (is.null(geneID_res) || nrow(geneID_res) == 0) {
-    warning(paste0("No gene mapped"), immediate. = TRUE)
+  geneID_res <- unique(do.call(rbind, geneID_res_list))
+  if (is.null(geneID_res) || nrow(geneID_res) == 0 || all(is.na(geneID_res[["to_geneID"]]))) {
+    warning(paste0("None of the gene IDs were converted"), immediate. = TRUE)
     return(list(geneID_res = NULL, geneID_collapse = NULL, geneID_expand = NULL, Datasets = Datasets, Attributes = Attributes))
   }
-  geneID_collapse <- geneID_res %>%
-    group_by(.data[["from_geneID"]], .data[["to_IDtype"]]) %>%
-    mutate(
-      from_geneID = unique(.data[["from_geneID"]]),
-      to_IDtype = unique(.data[["to_IDtype"]]),
-      to_geneID = list(unique(.data[["to_geneID"]][!.data[["to_geneID"]] %in% c("", NA)]))
+  geneID_res_stat <- by(geneID_res, list(geneID_res[["from_geneID"]], geneID_res[["to_IDtype"]]), function(x) {
+    data.frame(
+      from_IDtype = unique(x[["from_IDtype"]]),
+      from_geneID = unique(x[["from_geneID"]]),
+      to_IDtype = unique(x[["to_IDtype"]]),
+      to_geneID = I(list(unique(x[["to_geneID"]][!x[["to_geneID"]] %in% c("", NA)])))
     )
+  })
+  geneID_collapse <- do.call(rbind, geneID_res_stat)
   geneID_collapse <- unique(as.data.frame(geneID_collapse[, c("from_geneID", "to_IDtype", "to_geneID")]))
   geneID_collapse <- geneID_collapse[sapply(geneID_collapse$to_geneID, length) > 0, ]
   geneID_collapse <- reshape2::dcast(geneID_collapse, formula = from_geneID ~ to_IDtype, value.var = "to_geneID")
@@ -2765,7 +2766,6 @@ PrepareDB <- function(species = c("Homo_sapiens", "Mus_musculus"),
 #' EnrichmentPlot(res = enrich_out, group_use = c("Ngn3 low EP", "Endocrine"), db = "GO_BP")
 #'
 #' @importFrom BiocParallel bplapply
-#' @importFrom dplyr bind_rows
 #' @export
 #'
 RunEnrichment <- function(srt = NULL, group_by = NULL, test.use = "wilcox", DE_threshold = "avg_log2FC > 0 & p_val_adj < 0.05",
@@ -2797,7 +2797,8 @@ RunEnrichment <- function(srt = NULL, group_by = NULL, test.use = "wilcox", DE_t
     }
     de <- names(srt@tools[[slot]])[index]
     de_df <- srt@tools[[slot]][[de]]
-    de_df <- dplyr::filter(de_df, eval(rlang::parse_expr(DE_threshold)))
+    de_df <- de_df[with(de_df, eval(rlang::parse_expr(DE_threshold))), , drop = FALSE]
+    rownames(de_df) <- seq_len(nrow(de_df))
 
     geneID <- de_df[["gene"]]
     geneID_groups <- de_df[["group1"]]
@@ -2891,13 +2892,12 @@ RunEnrichment <- function(srt = NULL, group_by = NULL, test.use = "wilcox", DE_t
         result[, "Database"] <- term
         result[, "Groups"] <- group
         result[, "BgVersion"] <- as.character(db_list[[species]][[term]][["version"]])
-        IDlist <- result$geneID %>%
-          strsplit("/")
-        result$geneID <- lapply(IDlist, function(x) {
+        IDlist <- strsplit(result$geneID, split = "/")
+        result$geneID <- unlist(lapply(IDlist, function(x) {
           result_ID <- geneMap[geneMap[, IDtype] %in% x, result_IDtype]
           remain_ID <- x[!x %in% geneMap[, IDtype]]
           paste0(c(result_ID, remain_ID), collapse = "/")
-        }) %>% unlist()
+        }))
         enrich_res@result <- result
         enrich_res@gene2Symbol <- as.character(gene_mapid)
 
@@ -2942,7 +2942,7 @@ RunEnrichment <- function(srt = NULL, group_by = NULL, test.use = "wilcox", DE_t
   results <- c(raw_list, sim_list)
   results <- results[!sapply(results, is.null)]
   results <- results[intersect(c(nm, paste0(nm, "_sim")), names(results))]
-  res_enrichment <- bind_rows(lapply(results, function(x) x@result))
+  res_enrichment <- do.call(rbind, lapply(results, function(x) x@result))
   rownames(res_enrichment) <- NULL
 
   time_end <- Sys.time()
@@ -3004,7 +3004,6 @@ RunEnrichment <- function(srt = NULL, group_by = NULL, test.use = "wilcox", DE_t
 #' GSEAPlot(res = gsea_out, group_use = c("Ngn3 low EP", "Endocrine"), db = "GO_BP")
 #'
 #' @importFrom BiocParallel bplapply
-#' @importFrom dplyr bind_rows
 #' @export
 #'
 RunGSEA <- function(srt = NULL, group_by = NULL, test.use = "wilcox", DE_threshold = "p_val_adj < 0.05",
@@ -3036,7 +3035,8 @@ RunGSEA <- function(srt = NULL, group_by = NULL, test.use = "wilcox", DE_thresho
     }
     de <- names(srt@tools[[slot]])[index]
     de_df <- srt@tools[[slot]][[de]]
-    de_df <- dplyr::filter(de_df, eval(rlang::parse_expr(DE_threshold)))
+    de_df <- de_df[with(de_df, eval(rlang::parse_expr(DE_threshold))), , drop = FALSE]
+    rownames(de_df) <- seq_len(nrow(de_df))
 
     geneID <- de_df[["gene"]]
     geneScore <- de_df[["avg_log2FC"]]
@@ -3184,13 +3184,12 @@ RunGSEA <- function(srt = NULL, group_by = NULL, test.use = "wilcox", DE_thresho
         result[, "Database"] <- term
         result[, "Groups"] <- group
         result[, "BgVersion"] <- as.character(db_list[[species]][[term]][["version"]])
-        IDlist <- result$core_enrichment %>%
-          strsplit("/")
-        result$core_enrichment <- lapply(IDlist, function(x) {
+        IDlist <- strsplit(result$core_enrichment, "/")
+        result$core_enrichment <- unlist(lapply(IDlist, function(x) {
           result_ID <- input[input[, IDtype] %in% x, result_IDtype]
           remain_ID <- x[!x %in% input[, IDtype]]
           paste0(c(result_ID, remain_ID), collapse = "/")
-        }) %>% unlist()
+        }))
         enrich_res@result <- result
         enrich_res@gene2Symbol <- as.character(gene_mapid)
 
@@ -3234,7 +3233,7 @@ RunGSEA <- function(srt = NULL, group_by = NULL, test.use = "wilcox", DE_thresho
   results <- c(raw_list, sim_list)
   results <- results[!sapply(results, is.null)]
   results <- results[intersect(c(nm, paste0(nm, "_sim")), names(results))]
-  res_enrichment <- bind_rows(lapply(results, function(x) x@result))
+  res_enrichment <- do.call(rbind, lapply(results, function(x) x@result))
   rownames(res_enrichment) <- NULL
 
   time_end <- Sys.time()

diff --git a/R/SCP-cell_annotation.R b/R/SCP-cell_annotation.R
@@ -90,7 +90,6 @@ NULL
 #' @importFrom methods as
 #' @importFrom Matrix t colSums rowSums
 #' @importFrom Seurat DefaultAssay GetAssayData FindVariableFeatures VariableFeatures AverageExpression FindNeighbors as.sparse
-#' @importFrom dplyr bind_rows group_by top_n pull
 #' @importFrom rlang %||%
 #' @export
 #'
@@ -142,10 +141,10 @@ RunKNNPredict <- function(srt_query, srt_ref = NULL, bulk_ref = NULL,
           de <- names(srt_query@tools[[slot]])[index]
           message("Use the DE features from ", de, " to calculate distance metric.")
           de_df <- srt_query@tools[[slot]][[de]]
-          de_df <- filter(de_df, eval(rlang::parse_expr(DE_threshold)))
-          de_top <- de_df %>%
-            group_by(gene) %>%
-            top_n(1, avg_log2FC)
+          de_df <- de_df[with(de_df, eval(rlang::parse_expr(DE_threshold))), , drop = FALSE]
+          rownames(de_df) <- seq_len(nrow(de_df))
+          de_df <- de_df[order(de_df[["avg_log2FC"]], decreasing = TRUE), , drop = FALSE]
+          de_top <- de_df[!duplicated(de_df[["gene"]]), , drop = FALSE]
           stat <- sort(table(de_top$group1))
           stat <- stat[stat > 0]
           mat <- matrix(FALSE, nrow = max(stat), ncol = length(stat))
@@ -155,15 +154,15 @@ RunKNNPredict <- function(srt_query, srt_ref = NULL, bulk_ref = NULL,
           }
           nfeatures <- sum(cumsum(rowSums(mat)) <= nfeatures)
           if (test.use == "roc") {
-            features_query <- de_top %>%
-              group_by(group1) %>%
-              top_n(nfeatures, power) %>%
-              pull("gene")
+            features_query <- unlist(by(de_top, list(de_top[["group1"]]), function(x) {
+              x <- x[order(x[["power"]], decreasing = TRUE), , drop = FALSE]
+              head(x[["gene"]], nfeatures)
+            }))
           } else {
-            features_query <- de_top %>%
-              group_by(group1) %>%
-              top_n(nfeatures, -p_val) %>%
-              pull("gene")
+            features_query <- unlist(by(de_top, list(de_top[["group1"]]), function(x) {
+              x <- x[order(x[["p_val"]], decreasing = FALSE), , drop = FALSE]
+              head(x[["gene"]], nfeatures)
+            }))
           }
           message("DE features number of the query data: ", length(features_query))
         }
@@ -233,10 +232,10 @@ RunKNNPredict <- function(srt_query, srt_ref = NULL, bulk_ref = NULL,
           de <- names(srt_ref@tools[[slot]])[index]
           message("Use the DE features from ", de, " to calculate distance metric.")
           de_df <- srt_ref@tools[[slot]][[de]]
-          de_df <- filter(de_df, eval(rlang::parse_expr(DE_threshold)))
-          de_top <- de_df %>%
-            group_by(gene) %>%
-            top_n(1, avg_log2FC)
+          de_df <- de_df[with(de_df, eval(rlang::parse_expr(DE_threshold))), , drop = FALSE]
+          rownames(de_df) <- seq_len(nrow(de_df))
+          de_df <- de_df[order(de_df[["avg_log2FC"]], decreasing = TRUE), , drop = FALSE]
+          de_top <- de_df[!duplicated(de_df[["gene"]]), , drop = FALSE]
           stat <- sort(table(de_top$group1))
           stat <- stat[stat > 0]
           mat <- matrix(FALSE, nrow = max(stat), ncol = length(stat))
@@ -246,15 +245,15 @@ RunKNNPredict <- function(srt_query, srt_ref = NULL, bulk_ref = NULL,
           }
           nfeatures <- sum(cumsum(rowSums(mat)) <= nfeatures)
           if (test.use == "roc") {
-            features_ref <- de_top %>%
-              group_by(group1) %>%
-              top_n(nfeatures, power) %>%
-              pull("gene")
+            features_ref <- unlist(by(de_top, list(de_top[["group1"]]), function(x) {
+              x <- x[order(x[["power"]], decreasing = TRUE), , drop = FALSE]
+              head(x[["gene"]], nfeatures)
+            }))
           } else {
-            features_ref <- de_top %>%
-              group_by(group1) %>%
-              top_n(nfeatures, -p_val) %>%
-              pull("gene")
+            features_ref <- unlist(by(de_top, list(de_top[["group1"]]), function(x) {
+              x <- x[order(x[["p_val"]], decreasing = FALSE), , drop = FALSE]
+              head(x[["gene"]], nfeatures)
+            }))
           }
           message("DE features number of the ref data: ", length(features_ref))
         } else {
@@ -335,7 +334,7 @@ RunKNNPredict <- function(srt_query, srt_ref = NULL, bulk_ref = NULL,
   # }",
   #     depends = c("RcppArmadillo")
   #   )
-  #   cors <- parDist(t(query), method = "custom", func = CosineCPP) %>% as.matrix()
+  #   cors <- as.matrix(parDist(t(query), method = "custom", func = CosineCPP))
   #   cors <- cors[colnames(ref.expr), colnames(tst.expr)]
   #
 
@@ -433,12 +432,11 @@ RunKNNPredict <- function(srt_query, srt_ref = NULL, bulk_ref = NULL,
         match_freq <- as.list(setNames(object = rep(k, nrow(match_k_cell)), rn))
         match_freq <- lapply(setNames(names(match_freq), names(match_freq)), function(x) setNames(k, match_k_cell[x, 1]))
       }
-      match_prob <- lapply(match_freq, function(x) {
+      match_prob <- do.call(rbind, lapply(match_freq, function(x) {
         x[level[!level %in% names(x)]] <- 0
         x <- x / sum(x)
         return(x)
-      }) %>%
-        bind_rows()
+      }))
       match_prob <- as.matrix(match_prob)
       rownames(match_prob) <- names(match_freq)
       match_best <- apply(match_prob, 1, function(x) names(x)[order(x, decreasing = TRUE)][1])