From ce5b842dab5d0ca857154d6a9166b5bdddf17cf6 Mon Sep 17 00:00:00 2001 From: ShixiangWang Date: Wed, 17 Jun 2020 12:20:37 +0800 Subject: [PATCH] :+1: speed up copy number annotation, close #230 --- R/CN-mutex-classification-methed.R | 10 +- R/get.R | 230 ++++++++++++++++++++--------- R/get_groups.R | 19 ++- R/helper_join_segments.R | 9 +- R/read_copynumber.R | 15 +- R/show_sig_fit.R | 13 +- R/show_sig_profile.R | 5 +- R/sig_fit.R | 1 - R/sig_fit_bootstrap_batch.R | 2 +- inst/test/test.R | 27 ++++ 10 files changed, 234 insertions(+), 97 deletions(-) diff --git a/R/CN-mutex-classification-methed.R b/R/CN-mutex-classification-methed.R index e93616fc..384fea52 100644 --- a/R/CN-mutex-classification-methed.R +++ b/R/CN-mutex-classification-methed.R @@ -6,9 +6,9 @@ ## similar to previous work but here we focus on each **segment**. ## Secondly, we classified all segments into mutually exclusive types based on features. get_features_mutex <- function(CN_data, - cores = 1, - genome_build = c("hg19", "hg38"), - feature_setting = sigminer::CN.features) { + cores = 1, + genome_build = c("hg19", "hg38"), + feature_setting = sigminer::CN.features) { genome_build <- match.arg(genome_build) # get chromosome lengths and centromere locations chrlen <- get_genome_annotation(data_type = "chr_size", genome_build = genome_build) @@ -18,7 +18,7 @@ get_features_mutex <- function(CN_data, future::plan("multiprocess", workers = cores) on.exit(future::plan(oplan), add = TRUE) - #features <- unique(feature_setting$feature) + # features <- unique(feature_setting$feature) features <- c("CN", "SS") # c("BP10MB", "CN", "SS", "CNCP-L", "CNCP-R", "CNCP-M", "OsCN") # more? @@ -36,7 +36,7 @@ get_features_mutex <- function(CN_data, } res <- furrr::future_map(features, .get_feature, - .progress = TRUE + .progress = TRUE ) res <- res %>% setNames(features) res diff --git a/R/get.R b/R/get.R index 5c5d0cfe..66bccab6 100644 --- a/R/get.R +++ b/R/get.R @@ -365,73 +365,171 @@ get_LengthFraction <- function(CN_data, segTab <- data.table::merge.data.table(segTab, arm_data, by.x = "chromosome", by.y = "chrom", all.x = TRUE) - .annot_fun <- function(chrom, start, end, p_start, p_end, p_length, q_start, q_end, q_length, total_size) { - if (end <= p_end & start >= p_start) { - location <- paste0(sub("chr", "", chrom), "p") - annotation <- "short arm" - fraction <- (end - start + 1) / (p_end - p_start + 1) - } else if (end <= q_end & - start >= q_start) { - location <- paste0(sub("chr", "", chrom), "q") - annotation <- "long arm" - fraction <- (end - start + 1) / (q_end - q_start + 1) - } else if (start >= p_start & - start <= p_end & - end >= q_start & end <= q_end) { - location <- paste0(sub("chr", "", chrom), "pq") # across p and q arm - annotation <- "across short and long arm" - fraction <- 2 * ((end - start + 1) / total_size) - } else if (start < p_end & end < q_start) { - location <- paste0(sub("chr", "", chrom), "p") - annotation <- "short arm intersect with centromere region" - # only calculate region does not intersect - fraction <- (end - start + 1 - (end - p_end)) / (p_end - p_start + 1) - } else if (start > p_end & - start < q_start & end > q_start) { - location <- paste0(sub("chr", "", chrom), "q") - annotation <- "long arm intersect with centromere region" - # only calculate region does not intersect - fraction <- (end - start + 1 - (start - q_start)) / (q_end - q_start + 1) - } else { - location <- paste0(sub("chr", "", chrom), "pq") # suppose as pq - annotation <- "segment locate in centromere region" - fraction <- 2 * ((end - start + 1) / total_size) - } - - dplyr::tibble(location = location, annotation = annotation, fraction = fraction) - } - - annot_fun <- function(chrom, start, end, p_start, p_end, p_length, q_start, - q_end, q_length, total_size, .pb = NULL) { - if (.pb$i < .pb$n) .pb$tick()$print() - .annot_fun( - chrom, start, end, p_start, p_end, p_length, q_start, - q_end, q_length, total_size + segTab[, flag := data.table::fifelse( + end <= p_end & start >= p_start, + 1L, + data.table::fifelse( + end <= q_end & start >= q_start, + 2L, + data.table::fifelse( + start >= p_start & start <= p_end & end >= q_start & end <= q_end, + 3L, + data.table::fifelse( + start < p_end & end < q_start, + 4L, + data.table::fifelse( + start > p_end & start < q_start & end > q_start, + 5L, + 6L + ) + ) + ) ) - } - - pb <- progress_estimated(nrow(segTab), 0) - - annot <- purrr::pmap_df( - list( - chrom = segTab$chromosome, - start = segTab$start, - end = segTab$end, - p_start = segTab$p_start, - p_end = segTab$p_end, - p_length = segTab$p_length, - q_start = segTab$q_start, - q_end = segTab$q_end, - q_length = segTab$q_length, - total_size = segTab$total_size - ), annot_fun, - .pb = pb - ) - - cbind( - data.table::as.data.table(segTab)[, colnames(arm_data)[-1] := NULL], - data.table::as.data.table(annot) - ) + )] + + segTab[, location := data.table::fifelse( + flag == 1L, + paste0(sub("chr", "", chromosome), "p"), + data.table::fifelse( + flag == 2L, + paste0(sub("chr", "", chromosome), "q"), + data.table::fifelse( + flag == 3L, + paste0(sub("chr", "", chromosome), "pq"), + data.table::fifelse( + flag == 4L, + paste0(sub("chr", "", chromosome), "p"), + data.table::fifelse( + flag == 5L, + paste0(sub("chr", "", chromosome), "q"), + paste0(sub("chr", "", chromosome), "pq") + ) + ) + ) + ) + )] + + segTab[, annotation := data.table::fifelse( + flag == 1L, + "short arm", + data.table::fifelse( + flag == 2L, + "long arm", + data.table::fifelse( + flag == 3L, + "across short and long arm", + data.table::fifelse( + flag == 4L, + "short arm intersect with centromere region", + data.table::fifelse( + flag == 5L, + "long arm intersect with centromere region", + "segment locate in centromere region" + ) + ) + ) + ) + )] + + segTab[, fraction := data.table::fifelse( + flag == 1L, + (end - start + 1) / (p_end - p_start + 1), + data.table::fifelse( + flag == 2L, + (end - start + 1) / (q_end - q_start + 1), + data.table::fifelse( + flag == 3L, + 2 * ((end - start + 1) / total_size), + data.table::fifelse( + flag == 4L, + (end - start + 1 - (end - p_end)) / (p_end - p_start + 1), + data.table::fifelse( + flag == 5L, + (end - start + 1 - (start - q_start)) / (q_end - q_start + 1), + 2 * ((end - start + 1) / total_size) + ) + ) + ) + ) + )] + + segTab[, c(colnames(arm_data)[-1], "flag") := NULL] + segTab + + + # .annot_fun <- function(chrom, start, end, p_start, p_end, p_length, q_start, q_end, q_length, total_size) { + # if (end <= p_end & start >= p_start) { + # ## 1L + # location <- paste0(sub("chr", "", chrom), "p") + # annotation <- "short arm" + # fraction <- (end - start + 1) / (p_end - p_start + 1) + # } else if (end <= q_end & + # start >= q_start) { + # ## 2L + # location <- paste0(sub("chr", "", chrom), "q") + # annotation <- "long arm" + # fraction <- (end - start + 1) / (q_end - q_start + 1) + # } else if (start >= p_start & + # start <= p_end & + # end >= q_start & end <= q_end) { + # ## 3L + # location <- paste0(sub("chr", "", chrom), "pq") # across p and q arm + # annotation <- "across short and long arm" + # fraction <- 2 * ((end - start + 1) / total_size) + # } else if (start < p_end & end < q_start) { + # ## 4L + # location <- paste0(sub("chr", "", chrom), "p") + # annotation <- "short arm intersect with centromere region" + # # only calculate region does not intersect + # fraction <- (end - start + 1 - (end - p_end)) / (p_end - p_start + 1) + # } else if (start > p_end & + # start < q_start & end > q_start) { + # ## 5L + # location <- paste0(sub("chr", "", chrom), "q") + # annotation <- "long arm intersect with centromere region" + # # only calculate region does not intersect + # fraction <- (end - start + 1 - (start - q_start)) / (q_end - q_start + 1) + # } else { + # ## 6L + # location <- paste0(sub("chr", "", chrom), "pq") # suppose as pq + # annotation <- "segment locate in centromere region" + # fraction <- 2 * ((end - start + 1) / total_size) + # } + # + # dplyr::tibble(location = location, annotation = annotation, fraction = fraction) + # } + # + # annot_fun <- function(chrom, start, end, p_start, p_end, p_length, q_start, + # q_end, q_length, total_size, .pb = NULL) { + # if (.pb$i < .pb$n) .pb$tick()$print() + # .annot_fun( + # chrom, start, end, p_start, p_end, p_length, q_start, + # q_end, q_length, total_size + # ) + # } + # + # pb <- progress_estimated(nrow(segTab), 0) + # + # annot <- purrr::pmap_df( + # list( + # chrom = segTab$chromosome, + # start = segTab$start, + # end = segTab$end, + # p_start = segTab$p_start, + # p_end = segTab$p_end, + # p_length = segTab$p_length, + # q_start = segTab$q_start, + # q_end = segTab$q_end, + # q_length = segTab$q_length, + # total_size = segTab$total_size + # ), annot_fun, + # .pb = pb + # ) + # + # cbind( + # data.table::as.data.table(segTab)[, colnames(arm_data)[-1] := NULL], + # data.table::as.data.table(annot) + # ) } diff --git a/R/get_groups.R b/R/get_groups.R index f620bd90..4b217928 100644 --- a/R/get_groups.R +++ b/R/get_groups.R @@ -56,9 +56,11 @@ get_groups <- function(Signature, method = c("consensus", "k-means", "exposure", "samples"), n_cluster = NULL, match_consensus = TRUE) { - fit_flag = data.table::is.data.table(Signature) - stopifnot(inherits(Signature, "Signature") | fit_flag, - is.null(n_cluster) | n_cluster > 1) + fit_flag <- data.table::is.data.table(Signature) + stopifnot( + inherits(Signature, "Signature") | fit_flag, + is.null(n_cluster) | n_cluster > 1 + ) method <- match.arg(method) timer <- Sys.time() @@ -73,11 +75,10 @@ get_groups <- function(Signature, } send_success("Method checked.") - if (purrr::map_lgl(Signature, ~ifelse(is.numeric(.), any(. > 1), FALSE)) %>% any()) { + if (purrr::map_lgl(Signature, ~ ifelse(is.numeric(.), any(. > 1), FALSE)) %>% any()) { send_stop("When input is {.code data.table} (from sig_fit), a relative exposure result is valid.") } send_success("Exposure should be relative checked.") - } else { send_success("'Signature' object detected.") } @@ -141,7 +142,7 @@ get_groups <- function(Signature, expo_df <- get_sig_exposure(Signature, type = "relative") } - sig_names = colnames(expo_df)[-1] + sig_names <- colnames(expo_df)[-1] common_prefix <- Biobase::lcPrefixC(sig_names) mps <- seq_along(sig_names) names(mps) <- sig_names @@ -216,8 +217,10 @@ get_groups <- function(Signature, attr(data, "map_table") <- ztable } - send_warning("The 'enrich_sig' column is set to dominant signature in one group, ", - "please check and make it consistent with biological meaning (correct it by hand if necessary).") + send_warning( + "The 'enrich_sig' column is set to dominant signature in one group, ", + "please check and make it consistent with biological meaning (correct it by hand if necessary)." + ) return(data) } diff --git a/R/helper_join_segments.R b/R/helper_join_segments.R index 2d194ca6..22602a74 100644 --- a/R/helper_join_segments.R +++ b/R/helper_join_segments.R @@ -40,9 +40,12 @@ join_segments <- function(df) { } else { dplyr::bind_cols( out, - dplyr::summarise_at(res, dplyr::vars(-c("start", "end", "segVal")), - ~ifelse(is.numeric(.), mean(., na.rm = TRUE), - paste0(unique(na.omit(.)), collapse = ","))) + dplyr::summarise_at( + res, dplyr::vars(-c("start", "end", "segVal")), + ~ ifelse(is.numeric(.), mean(., na.rm = TRUE), + paste0(unique(na.omit(.)), collapse = ",") + ) + ) ) } }, df = df) diff --git a/R/read_copynumber.R b/R/read_copynumber.R index a5e051a3..d85eaa85 100644 --- a/R/read_copynumber.R +++ b/R/read_copynumber.R @@ -357,8 +357,10 @@ read_copynumber <- function(input, # order by segment start position by each chromosome in each sample data_df <- data_df[, .SD[order(.SD$start, decreasing = FALSE)], by = c("sample", "chromosome")] all_cols <- colnames(data_df) - data.table::setcolorder(data_df, neworder = c(c("chromosome", "start", "end", "segVal", "sample"), - setdiff(all_cols, c("chromosome", "start", "end", "segVal", "sample")))) + data.table::setcolorder(data_df, neworder = c( + c("chromosome", "start", "end", "segVal", "sample"), + setdiff(all_cols, c("chromosome", "start", "end", "segVal", "sample")) + )) send_success("Segmental table cleaned.") @@ -368,7 +370,6 @@ read_copynumber <- function(input, seg_cols = new_cols[1:4], samp_col = new_cols[5] ) - message() send_success("Annotation done.") send_info("Summarizing per sample.") @@ -403,6 +404,12 @@ utils::globalVariables( ".", "N", ".N", - ".SD" + ".SD", + "flag", + "p_start", + "p_end", + "q_start", + "q_end", + "total_size" ) ) diff --git a/R/show_sig_fit.R b/R/show_sig_fit.R index 866e5dd2..d0dc7a59 100644 --- a/R/show_sig_fit.R +++ b/R/show_sig_fit.R @@ -20,7 +20,6 @@ show_sig_fit <- function(fit_result, samples = NULL, signatures = NULL, width = 0.3, outlier.shape = NA, add = "jitter", add.params = list(alpha = 0.3), ...) { - fun_setting <- plot_fun <- match.arg(plot_fun) plot_fun <- switch( plot_fun, @@ -71,18 +70,18 @@ show_sig_fit <- function(fit_result, samples = NULL, signatures = NULL, ## Plotting if (isFALSE(fun_setting == "scatter")) { plot_fun(dat, - x = "sig", y = "exposure", color = "sig", outlier.shape = outlier.shape, - palette = palette, width = width, add = add, add.params = add.params, - title = title, xlab = xlab, ylab = ylab, legend = legend, ... + x = "sig", y = "exposure", color = "sig", outlier.shape = outlier.shape, + palette = palette, width = width, add = add, add.params = add.params, + title = title, xlab = xlab, ylab = ylab, legend = legend, ... ) } else { if (legend == "none") { send_warning("When plot_fun='scatter', setting legend='top' is recommended.") } plot_fun(dat, - x = "sig", y = "exposure", color = "sample", shape = "sample", - palette = palette, - title = title, xlab = xlab, ylab = ylab, legend = legend, ... + x = "sig", y = "exposure", color = "sample", shape = "sample", + palette = palette, + title = title, xlab = xlab, ylab = ylab, legend = legend, ... ) } } diff --git a/R/show_sig_profile.R b/R/show_sig_profile.R index 0e896cab..a24d079b 100644 --- a/R/show_sig_profile.R +++ b/R/show_sig_profile.R @@ -429,8 +429,9 @@ show_sig_profile <- function(Signature, mode = c("SBS", "copynumber", "DBS", "ID if (mode == "copynumber" & startsWith(method, "M")) { if (!is.null(params)) { params <- dplyr::mutate(params, - feature = mp[.data$feature], - components = paste0(.data$feature, "[", sub(".*[^0-9]+(\\d+$)", "\\1", .data$components), "]")) + feature = mp[.data$feature], + components = paste0(.data$feature, "[", sub(".*[^0-9]+(\\d+$)", "\\1", .data$components), "]") + ) params$class <- factor(levels(mat[["class"]])[1], levels = levels(mat[["class"]])) p <- p + geom_text(aes( x = .data$components, y = Inf, diff --git a/R/sig_fit.R b/R/sig_fit.R index 91e62a07..2aa04499 100644 --- a/R/sig_fit.R +++ b/R/sig_fit.R @@ -422,7 +422,6 @@ decompose_QP <- function(x, y, P, type = "absolute", ...) { decompose_SA <- function(x, y, P, type = "absolute", ...) { - if (sum(x) != 0) { control <- list(...) diff --git a/R/sig_fit_bootstrap_batch.R b/R/sig_fit_bootstrap_batch.R index 483e539e..80e290d6 100644 --- a/R/sig_fit_bootstrap_batch.R +++ b/R/sig_fit_bootstrap_batch.R @@ -128,7 +128,7 @@ sig_fit_bootstrap_batch <- function(catalogue_matrix, if (use_parallel) { oplan <- future::plan() future::plan("multiprocess", workers = future::availableCores()) - #furrr::future_options(seed = TRUE) + # furrr::future_options(seed = TRUE) on.exit(future::plan(oplan), add = TRUE) bt_list <- furrr::future_map2(as.data.frame(catalogue_matrix), colnames(catalogue_matrix), call_bt, y = rownames(catalogue_matrix), methods = methods, n = n, ..., .progress = TRUE, diff --git a/inst/test/test.R b/inst/test/test.R index fb279727..622ea96d 100644 --- a/inst/test/test.R +++ b/inst/test/test.R @@ -185,3 +185,30 @@ bt_res <- sig_fit_bootstrap_batch(mt_tally$nmf_matrix %>% t(), sig_index = 1:30, sig_db = "legacy", methods = c("QP"), n = 2) + +load(system.file("extdata", "toy_segTab.RData", + package = "sigminer", mustWork = TRUE +)) +debug(get_LengthFraction) +cn <- read_copynumber(segTabs, + seg_cols = c("chromosome", "start", "end", "segVal"), + genome_build = "hg19", complement = FALSE +) + + +cn2 <- read_copynumber(segTabs, + seg_cols = c("chromosome", "start", "end", "segVal"), + genome_build = "hg19", complement = FALSE +) + + +identical(cn@annotation, cn2@annotation) + + +load("~/biodata/DoAbsolute/CN_list.RData") +cn <- read_copynumber(input = CN_list$ACC@data, genome_build = "hg38", + seg_cols = c("chromosome", "start", "end", "segVal")) +cn2 <- read_copynumber(input = CN_list$ACC@data, genome_build = "hg38", + seg_cols = c("chromosome", "start", "end", "segVal")) + +identical(cn@annotation, cn2@annotation)