From ce5b842dab5d0ca857154d6a9166b5bdddf17cf6 Mon Sep 17 00:00:00 2001
From: ShixiangWang <wangshx@shanghaitech.edu.cn>
Date: Wed, 17 Jun 2020 12:20:37 +0800
Subject: [PATCH] :+1: speed up copy number annotation, close #230

---
 R/CN-mutex-classification-methed.R |  10 +-
 R/get.R                            | 230 ++++++++++++++++++++---------
 R/get_groups.R                     |  19 ++-
 R/helper_join_segments.R           |   9 +-
 R/read_copynumber.R                |  15 +-
 R/show_sig_fit.R                   |  13 +-
 R/show_sig_profile.R               |   5 +-
 R/sig_fit.R                        |   1 -
 R/sig_fit_bootstrap_batch.R        |   2 +-
 inst/test/test.R                   |  27 ++++
 10 files changed, 234 insertions(+), 97 deletions(-)

diff --git a/R/CN-mutex-classification-methed.R b/R/CN-mutex-classification-methed.R
index e93616fc..384fea52 100644
--- a/R/CN-mutex-classification-methed.R
+++ b/R/CN-mutex-classification-methed.R
@@ -6,9 +6,9 @@
 ## similar to previous work but here we focus on each **segment**.
 ## Secondly, we classified all segments into mutually exclusive types based on features.
 get_features_mutex <- function(CN_data,
-                              cores = 1,
-                              genome_build = c("hg19", "hg38"),
-                              feature_setting = sigminer::CN.features) {
+                               cores = 1,
+                               genome_build = c("hg19", "hg38"),
+                               feature_setting = sigminer::CN.features) {
   genome_build <- match.arg(genome_build)
   # get chromosome lengths and centromere locations
   chrlen <- get_genome_annotation(data_type = "chr_size", genome_build = genome_build)
@@ -18,7 +18,7 @@ get_features_mutex <- function(CN_data,
   future::plan("multiprocess", workers = cores)
   on.exit(future::plan(oplan), add = TRUE)
 
-  #features <- unique(feature_setting$feature)
+  # features <- unique(feature_setting$feature)
   features <- c("CN", "SS")
   # c("BP10MB", "CN", "SS", "CNCP-L", "CNCP-R", "CNCP-M", "OsCN") # more?
 
@@ -36,7 +36,7 @@ get_features_mutex <- function(CN_data,
   }
 
   res <- furrr::future_map(features, .get_feature,
-                           .progress = TRUE
+    .progress = TRUE
   )
   res <- res %>% setNames(features)
   res
diff --git a/R/get.R b/R/get.R
index 5c5d0cfe..66bccab6 100644
--- a/R/get.R
+++ b/R/get.R
@@ -365,73 +365,171 @@ get_LengthFraction <- function(CN_data,
 
   segTab <- data.table::merge.data.table(segTab, arm_data, by.x = "chromosome", by.y = "chrom", all.x = TRUE)
 
-  .annot_fun <- function(chrom, start, end, p_start, p_end, p_length, q_start, q_end, q_length, total_size) {
-    if (end <= p_end & start >= p_start) {
-      location <- paste0(sub("chr", "", chrom), "p")
-      annotation <- "short arm"
-      fraction <- (end - start + 1) / (p_end - p_start + 1)
-    } else if (end <= q_end &
-      start >= q_start) {
-      location <- paste0(sub("chr", "", chrom), "q")
-      annotation <- "long arm"
-      fraction <- (end - start + 1) / (q_end - q_start + 1)
-    } else if (start >= p_start &
-      start <= p_end &
-      end >= q_start & end <= q_end) {
-      location <- paste0(sub("chr", "", chrom), "pq") # across p and q arm
-      annotation <- "across short and long arm"
-      fraction <- 2 * ((end - start + 1) / total_size)
-    } else if (start < p_end & end < q_start) {
-      location <- paste0(sub("chr", "", chrom), "p")
-      annotation <- "short arm intersect with centromere region"
-      # only calculate region does not intersect
-      fraction <- (end - start + 1 - (end - p_end)) / (p_end - p_start + 1)
-    } else if (start > p_end &
-      start < q_start & end > q_start) {
-      location <- paste0(sub("chr", "", chrom), "q")
-      annotation <- "long arm intersect with centromere region"
-      # only calculate region does not intersect
-      fraction <- (end - start + 1 - (start - q_start)) / (q_end - q_start + 1)
-    } else {
-      location <- paste0(sub("chr", "", chrom), "pq") # suppose as pq
-      annotation <- "segment locate in centromere region"
-      fraction <- 2 * ((end - start + 1) / total_size)
-    }
-
-    dplyr::tibble(location = location, annotation = annotation, fraction = fraction)
-  }
-
-  annot_fun <- function(chrom, start, end, p_start, p_end, p_length, q_start,
-                        q_end, q_length, total_size, .pb = NULL) {
-    if (.pb$i < .pb$n) .pb$tick()$print()
-    .annot_fun(
-      chrom, start, end, p_start, p_end, p_length, q_start,
-      q_end, q_length, total_size
+  segTab[, flag := data.table::fifelse(
+    end <= p_end & start >= p_start,
+    1L,
+    data.table::fifelse(
+      end <= q_end & start >= q_start,
+      2L,
+      data.table::fifelse(
+        start >= p_start & start <= p_end & end >= q_start & end <= q_end,
+        3L,
+        data.table::fifelse(
+          start < p_end & end < q_start,
+          4L,
+          data.table::fifelse(
+            start > p_end & start < q_start & end > q_start,
+            5L,
+            6L
+          )
+        )
+      )
     )
-  }
-
-  pb <- progress_estimated(nrow(segTab), 0)
-
-  annot <- purrr::pmap_df(
-    list(
-      chrom = segTab$chromosome,
-      start = segTab$start,
-      end = segTab$end,
-      p_start = segTab$p_start,
-      p_end = segTab$p_end,
-      p_length = segTab$p_length,
-      q_start = segTab$q_start,
-      q_end = segTab$q_end,
-      q_length = segTab$q_length,
-      total_size = segTab$total_size
-    ), annot_fun,
-    .pb = pb
-  )
-
-  cbind(
-    data.table::as.data.table(segTab)[, colnames(arm_data)[-1] := NULL],
-    data.table::as.data.table(annot)
-  )
+  )]
+
+  segTab[, location := data.table::fifelse(
+    flag == 1L,
+    paste0(sub("chr", "", chromosome), "p"),
+    data.table::fifelse(
+      flag == 2L,
+      paste0(sub("chr", "", chromosome), "q"),
+      data.table::fifelse(
+        flag == 3L,
+        paste0(sub("chr", "", chromosome), "pq"),
+        data.table::fifelse(
+          flag == 4L,
+          paste0(sub("chr", "", chromosome), "p"),
+          data.table::fifelse(
+            flag == 5L,
+            paste0(sub("chr", "", chromosome), "q"),
+            paste0(sub("chr", "", chromosome), "pq")
+          )
+        )
+      )
+    )
+  )]
+
+  segTab[, annotation := data.table::fifelse(
+    flag == 1L,
+    "short arm",
+    data.table::fifelse(
+      flag == 2L,
+      "long arm",
+      data.table::fifelse(
+        flag == 3L,
+        "across short and long arm",
+        data.table::fifelse(
+          flag == 4L,
+          "short arm intersect with centromere region",
+          data.table::fifelse(
+            flag == 5L,
+            "long arm intersect with centromere region",
+            "segment locate in centromere region"
+          )
+        )
+      )
+    )
+  )]
+
+  segTab[, fraction := data.table::fifelse(
+    flag == 1L,
+    (end - start + 1) / (p_end - p_start + 1),
+    data.table::fifelse(
+      flag == 2L,
+      (end - start + 1) / (q_end - q_start + 1),
+      data.table::fifelse(
+        flag == 3L,
+        2 * ((end - start + 1) / total_size),
+        data.table::fifelse(
+          flag == 4L,
+          (end - start + 1 - (end - p_end)) / (p_end - p_start + 1),
+          data.table::fifelse(
+            flag == 5L,
+            (end - start + 1 - (start - q_start)) / (q_end - q_start + 1),
+            2 * ((end - start + 1) / total_size)
+          )
+        )
+      )
+    )
+  )]
+
+  segTab[, c(colnames(arm_data)[-1], "flag") := NULL]
+  segTab
+
+
+  # .annot_fun <- function(chrom, start, end, p_start, p_end, p_length, q_start, q_end, q_length, total_size) {
+  #   if (end <= p_end & start >= p_start) {
+  #     ## 1L
+  #     location <- paste0(sub("chr", "", chrom), "p")
+  #     annotation <- "short arm"
+  #     fraction <- (end - start + 1) / (p_end - p_start + 1)
+  #   } else if (end <= q_end &
+  #     start >= q_start) {
+  #     ## 2L
+  #     location <- paste0(sub("chr", "", chrom), "q")
+  #     annotation <- "long arm"
+  #     fraction <- (end - start + 1) / (q_end - q_start + 1)
+  #   } else if (start >= p_start &
+  #     start <= p_end &
+  #     end >= q_start & end <= q_end) {
+  #     ## 3L
+  #     location <- paste0(sub("chr", "", chrom), "pq") # across p and q arm
+  #     annotation <- "across short and long arm"
+  #     fraction <- 2 * ((end - start + 1) / total_size)
+  #   } else if (start < p_end & end < q_start) {
+  #     ## 4L
+  #     location <- paste0(sub("chr", "", chrom), "p")
+  #     annotation <- "short arm intersect with centromere region"
+  #     # only calculate region does not intersect
+  #     fraction <- (end - start + 1 - (end - p_end)) / (p_end - p_start + 1)
+  #   } else if (start > p_end &
+  #     start < q_start & end > q_start) {
+  #     ## 5L
+  #     location <- paste0(sub("chr", "", chrom), "q")
+  #     annotation <- "long arm intersect with centromere region"
+  #     # only calculate region does not intersect
+  #     fraction <- (end - start + 1 - (start - q_start)) / (q_end - q_start + 1)
+  #   } else {
+  #     ## 6L
+  #     location <- paste0(sub("chr", "", chrom), "pq") # suppose as pq
+  #     annotation <- "segment locate in centromere region"
+  #     fraction <- 2 * ((end - start + 1) / total_size)
+  #   }
+  #
+  #   dplyr::tibble(location = location, annotation = annotation, fraction = fraction)
+  # }
+  #
+  # annot_fun <- function(chrom, start, end, p_start, p_end, p_length, q_start,
+  #                       q_end, q_length, total_size, .pb = NULL) {
+  #   if (.pb$i < .pb$n) .pb$tick()$print()
+  #   .annot_fun(
+  #     chrom, start, end, p_start, p_end, p_length, q_start,
+  #     q_end, q_length, total_size
+  #   )
+  # }
+  #
+  # pb <- progress_estimated(nrow(segTab), 0)
+  #
+  # annot <- purrr::pmap_df(
+  #   list(
+  #     chrom = segTab$chromosome,
+  #     start = segTab$start,
+  #     end = segTab$end,
+  #     p_start = segTab$p_start,
+  #     p_end = segTab$p_end,
+  #     p_length = segTab$p_length,
+  #     q_start = segTab$q_start,
+  #     q_end = segTab$q_end,
+  #     q_length = segTab$q_length,
+  #     total_size = segTab$total_size
+  #   ), annot_fun,
+  #   .pb = pb
+  # )
+  #
+  # cbind(
+  #   data.table::as.data.table(segTab)[, colnames(arm_data)[-1] := NULL],
+  #   data.table::as.data.table(annot)
+  # )
 }
 
 
diff --git a/R/get_groups.R b/R/get_groups.R
index f620bd90..4b217928 100644
--- a/R/get_groups.R
+++ b/R/get_groups.R
@@ -56,9 +56,11 @@ get_groups <- function(Signature,
                        method = c("consensus", "k-means", "exposure", "samples"),
                        n_cluster = NULL,
                        match_consensus = TRUE) {
-  fit_flag = data.table::is.data.table(Signature)
-  stopifnot(inherits(Signature, "Signature") | fit_flag,
-            is.null(n_cluster) | n_cluster > 1)
+  fit_flag <- data.table::is.data.table(Signature)
+  stopifnot(
+    inherits(Signature, "Signature") | fit_flag,
+    is.null(n_cluster) | n_cluster > 1
+  )
   method <- match.arg(method)
 
   timer <- Sys.time()
@@ -73,11 +75,10 @@ get_groups <- function(Signature,
     }
     send_success("Method checked.")
 
-    if (purrr::map_lgl(Signature, ~ifelse(is.numeric(.), any(. > 1), FALSE)) %>% any()) {
+    if (purrr::map_lgl(Signature, ~ ifelse(is.numeric(.), any(. > 1), FALSE)) %>% any()) {
       send_stop("When input is {.code data.table} (from sig_fit), a relative exposure result is valid.")
     }
     send_success("Exposure should be relative checked.")
-
   } else {
     send_success("'Signature' object detected.")
   }
@@ -141,7 +142,7 @@ get_groups <- function(Signature,
       expo_df <- get_sig_exposure(Signature, type = "relative")
     }
 
-    sig_names = colnames(expo_df)[-1]
+    sig_names <- colnames(expo_df)[-1]
     common_prefix <- Biobase::lcPrefixC(sig_names)
     mps <- seq_along(sig_names)
     names(mps) <- sig_names
@@ -216,8 +217,10 @@ get_groups <- function(Signature,
     attr(data, "map_table") <- ztable
   }
 
-  send_warning("The 'enrich_sig' column is set to dominant signature in one group, ",
-               "please check and make it consistent with biological meaning (correct it by hand if necessary).")
+  send_warning(
+    "The 'enrich_sig' column is set to dominant signature in one group, ",
+    "please check and make it consistent with biological meaning (correct it by hand if necessary)."
+  )
   return(data)
 }
 
diff --git a/R/helper_join_segments.R b/R/helper_join_segments.R
index 2d194ca6..22602a74 100644
--- a/R/helper_join_segments.R
+++ b/R/helper_join_segments.R
@@ -40,9 +40,12 @@ join_segments <- function(df) {
       } else {
         dplyr::bind_cols(
           out,
-          dplyr::summarise_at(res, dplyr::vars(-c("start", "end", "segVal")),
-                              ~ifelse(is.numeric(.), mean(., na.rm = TRUE),
-                                      paste0(unique(na.omit(.)), collapse = ",")))
+          dplyr::summarise_at(
+            res, dplyr::vars(-c("start", "end", "segVal")),
+            ~ ifelse(is.numeric(.), mean(., na.rm = TRUE),
+              paste0(unique(na.omit(.)), collapse = ",")
+            )
+          )
         )
       }
     }, df = df)
diff --git a/R/read_copynumber.R b/R/read_copynumber.R
index a5e051a3..d85eaa85 100644
--- a/R/read_copynumber.R
+++ b/R/read_copynumber.R
@@ -357,8 +357,10 @@ read_copynumber <- function(input,
   # order by segment start position by each chromosome in each sample
   data_df <- data_df[, .SD[order(.SD$start, decreasing = FALSE)], by = c("sample", "chromosome")]
   all_cols <- colnames(data_df)
-  data.table::setcolorder(data_df, neworder = c(c("chromosome", "start", "end", "segVal", "sample"),
-                                                setdiff(all_cols, c("chromosome", "start", "end", "segVal", "sample"))))
+  data.table::setcolorder(data_df, neworder = c(
+    c("chromosome", "start", "end", "segVal", "sample"),
+    setdiff(all_cols, c("chromosome", "start", "end", "segVal", "sample"))
+  ))
 
   send_success("Segmental table cleaned.")
 
@@ -368,7 +370,6 @@ read_copynumber <- function(input,
     seg_cols = new_cols[1:4],
     samp_col = new_cols[5]
   )
-  message()
   send_success("Annotation done.")
 
   send_info("Summarizing per sample.")
@@ -403,6 +404,12 @@ utils::globalVariables(
     ".",
     "N",
     ".N",
-    ".SD"
+    ".SD",
+    "flag",
+    "p_start",
+    "p_end",
+    "q_start",
+    "q_end",
+    "total_size"
   )
 )
diff --git a/R/show_sig_fit.R b/R/show_sig_fit.R
index 866e5dd2..d0dc7a59 100644
--- a/R/show_sig_fit.R
+++ b/R/show_sig_fit.R
@@ -20,7 +20,6 @@ show_sig_fit <- function(fit_result, samples = NULL, signatures = NULL,
                          width = 0.3, outlier.shape = NA,
                          add = "jitter", add.params = list(alpha = 0.3),
                          ...) {
-
   fun_setting <- plot_fun <- match.arg(plot_fun)
   plot_fun <- switch(
     plot_fun,
@@ -71,18 +70,18 @@ show_sig_fit <- function(fit_result, samples = NULL, signatures = NULL,
   ## Plotting
   if (isFALSE(fun_setting == "scatter")) {
     plot_fun(dat,
-             x = "sig", y = "exposure", color = "sig", outlier.shape = outlier.shape,
-             palette = palette, width = width, add = add, add.params = add.params,
-             title = title, xlab = xlab, ylab = ylab, legend = legend, ...
+      x = "sig", y = "exposure", color = "sig", outlier.shape = outlier.shape,
+      palette = palette, width = width, add = add, add.params = add.params,
+      title = title, xlab = xlab, ylab = ylab, legend = legend, ...
     )
   } else {
     if (legend == "none") {
       send_warning("When plot_fun='scatter', setting legend='top' is recommended.")
     }
     plot_fun(dat,
-             x = "sig", y = "exposure", color = "sample", shape = "sample",
-             palette = palette,
-             title = title, xlab = xlab, ylab = ylab, legend = legend, ...
+      x = "sig", y = "exposure", color = "sample", shape = "sample",
+      palette = palette,
+      title = title, xlab = xlab, ylab = ylab, legend = legend, ...
     )
   }
 }
diff --git a/R/show_sig_profile.R b/R/show_sig_profile.R
index 0e896cab..a24d079b 100644
--- a/R/show_sig_profile.R
+++ b/R/show_sig_profile.R
@@ -429,8 +429,9 @@ show_sig_profile <- function(Signature, mode = c("SBS", "copynumber", "DBS", "ID
   if (mode == "copynumber" & startsWith(method, "M")) {
     if (!is.null(params)) {
       params <- dplyr::mutate(params,
-                              feature = mp[.data$feature],
-                              components = paste0(.data$feature, "[", sub(".*[^0-9]+(\\d+$)", "\\1", .data$components), "]"))
+        feature = mp[.data$feature],
+        components = paste0(.data$feature, "[", sub(".*[^0-9]+(\\d+$)", "\\1", .data$components), "]")
+      )
       params$class <- factor(levels(mat[["class"]])[1], levels = levels(mat[["class"]]))
       p <- p + geom_text(aes(
         x = .data$components, y = Inf,
diff --git a/R/sig_fit.R b/R/sig_fit.R
index 91e62a07..2aa04499 100644
--- a/R/sig_fit.R
+++ b/R/sig_fit.R
@@ -422,7 +422,6 @@ decompose_QP <- function(x, y, P, type = "absolute", ...) {
 
 
 decompose_SA <- function(x, y, P, type = "absolute", ...) {
-
   if (sum(x) != 0) {
     control <- list(...)
 
diff --git a/R/sig_fit_bootstrap_batch.R b/R/sig_fit_bootstrap_batch.R
index 483e539e..80e290d6 100644
--- a/R/sig_fit_bootstrap_batch.R
+++ b/R/sig_fit_bootstrap_batch.R
@@ -128,7 +128,7 @@ sig_fit_bootstrap_batch <- function(catalogue_matrix,
   if (use_parallel) {
     oplan <- future::plan()
     future::plan("multiprocess", workers = future::availableCores())
-    #furrr::future_options(seed = TRUE)
+    # furrr::future_options(seed = TRUE)
     on.exit(future::plan(oplan), add = TRUE)
     bt_list <- furrr::future_map2(as.data.frame(catalogue_matrix), colnames(catalogue_matrix), call_bt,
       y = rownames(catalogue_matrix), methods = methods, n = n, ..., .progress = TRUE,
diff --git a/inst/test/test.R b/inst/test/test.R
index fb279727..622ea96d 100644
--- a/inst/test/test.R
+++ b/inst/test/test.R
@@ -185,3 +185,30 @@ bt_res <- sig_fit_bootstrap_batch(mt_tally$nmf_matrix %>% t(),
                                   sig_index = 1:30,
                                   sig_db = "legacy",
                                   methods = c("QP"), n = 2)
+
+load(system.file("extdata", "toy_segTab.RData",
+                 package = "sigminer", mustWork = TRUE
+))
+debug(get_LengthFraction)
+cn <- read_copynumber(segTabs,
+                      seg_cols = c("chromosome", "start", "end", "segVal"),
+                      genome_build = "hg19", complement = FALSE
+)
+
+
+cn2 <- read_copynumber(segTabs,
+                      seg_cols = c("chromosome", "start", "end", "segVal"),
+                      genome_build = "hg19", complement = FALSE
+)
+
+
+identical(cn@annotation, cn2@annotation)
+
+
+load("~/biodata/DoAbsolute/CN_list.RData")
+cn <- read_copynumber(input = CN_list$ACC@data, genome_build = "hg38",
+                      seg_cols = c("chromosome", "start", "end", "segVal"))
+cn2 <- read_copynumber(input = CN_list$ACC@data, genome_build = "hg38",
+                      seg_cols = c("chromosome", "start", "end", "segVal"))
+
+identical(cn@annotation, cn2@annotation)