Skip to content

Commit

Permalink
👍 speed up copy number annotation, close #230
Browse files Browse the repository at this point in the history
  • Loading branch information
ShixiangWang committed Jun 17, 2020
1 parent 33442b5 commit ce5b842
Show file tree
Hide file tree
Showing 10 changed files with 234 additions and 97 deletions.
10 changes: 5 additions & 5 deletions R/CN-mutex-classification-methed.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@
## similar to previous work but here we focus on each **segment**.
## Secondly, we classified all segments into mutually exclusive types based on features.
get_features_mutex <- function(CN_data,
cores = 1,
genome_build = c("hg19", "hg38"),
feature_setting = sigminer::CN.features) {
cores = 1,
genome_build = c("hg19", "hg38"),
feature_setting = sigminer::CN.features) {
genome_build <- match.arg(genome_build)
# get chromosome lengths and centromere locations
chrlen <- get_genome_annotation(data_type = "chr_size", genome_build = genome_build)
Expand All @@ -18,7 +18,7 @@ get_features_mutex <- function(CN_data,
future::plan("multiprocess", workers = cores)
on.exit(future::plan(oplan), add = TRUE)

#features <- unique(feature_setting$feature)
# features <- unique(feature_setting$feature)
features <- c("CN", "SS")
# c("BP10MB", "CN", "SS", "CNCP-L", "CNCP-R", "CNCP-M", "OsCN") # more?

Expand All @@ -36,7 +36,7 @@ get_features_mutex <- function(CN_data,
}

res <- furrr::future_map(features, .get_feature,
.progress = TRUE
.progress = TRUE
)
res <- res %>% setNames(features)
res
Expand Down
230 changes: 164 additions & 66 deletions R/get.R
Original file line number Diff line number Diff line change
Expand Up @@ -365,73 +365,171 @@ get_LengthFraction <- function(CN_data,

segTab <- data.table::merge.data.table(segTab, arm_data, by.x = "chromosome", by.y = "chrom", all.x = TRUE)

.annot_fun <- function(chrom, start, end, p_start, p_end, p_length, q_start, q_end, q_length, total_size) {
if (end <= p_end & start >= p_start) {
location <- paste0(sub("chr", "", chrom), "p")
annotation <- "short arm"
fraction <- (end - start + 1) / (p_end - p_start + 1)
} else if (end <= q_end &
start >= q_start) {
location <- paste0(sub("chr", "", chrom), "q")
annotation <- "long arm"
fraction <- (end - start + 1) / (q_end - q_start + 1)
} else if (start >= p_start &
start <= p_end &
end >= q_start & end <= q_end) {
location <- paste0(sub("chr", "", chrom), "pq") # across p and q arm
annotation <- "across short and long arm"
fraction <- 2 * ((end - start + 1) / total_size)
} else if (start < p_end & end < q_start) {
location <- paste0(sub("chr", "", chrom), "p")
annotation <- "short arm intersect with centromere region"
# only calculate region does not intersect
fraction <- (end - start + 1 - (end - p_end)) / (p_end - p_start + 1)
} else if (start > p_end &
start < q_start & end > q_start) {
location <- paste0(sub("chr", "", chrom), "q")
annotation <- "long arm intersect with centromere region"
# only calculate region does not intersect
fraction <- (end - start + 1 - (start - q_start)) / (q_end - q_start + 1)
} else {
location <- paste0(sub("chr", "", chrom), "pq") # suppose as pq
annotation <- "segment locate in centromere region"
fraction <- 2 * ((end - start + 1) / total_size)
}

dplyr::tibble(location = location, annotation = annotation, fraction = fraction)
}

annot_fun <- function(chrom, start, end, p_start, p_end, p_length, q_start,
q_end, q_length, total_size, .pb = NULL) {
if (.pb$i < .pb$n) .pb$tick()$print()
.annot_fun(
chrom, start, end, p_start, p_end, p_length, q_start,
q_end, q_length, total_size
segTab[, flag := data.table::fifelse(
end <= p_end & start >= p_start,
1L,
data.table::fifelse(
end <= q_end & start >= q_start,
2L,
data.table::fifelse(
start >= p_start & start <= p_end & end >= q_start & end <= q_end,
3L,
data.table::fifelse(
start < p_end & end < q_start,
4L,
data.table::fifelse(
start > p_end & start < q_start & end > q_start,
5L,
6L
)
)
)
)
}

pb <- progress_estimated(nrow(segTab), 0)

annot <- purrr::pmap_df(
list(
chrom = segTab$chromosome,
start = segTab$start,
end = segTab$end,
p_start = segTab$p_start,
p_end = segTab$p_end,
p_length = segTab$p_length,
q_start = segTab$q_start,
q_end = segTab$q_end,
q_length = segTab$q_length,
total_size = segTab$total_size
), annot_fun,
.pb = pb
)

cbind(
data.table::as.data.table(segTab)[, colnames(arm_data)[-1] := NULL],
data.table::as.data.table(annot)
)
)]

segTab[, location := data.table::fifelse(
flag == 1L,
paste0(sub("chr", "", chromosome), "p"),
data.table::fifelse(
flag == 2L,
paste0(sub("chr", "", chromosome), "q"),
data.table::fifelse(
flag == 3L,
paste0(sub("chr", "", chromosome), "pq"),
data.table::fifelse(
flag == 4L,
paste0(sub("chr", "", chromosome), "p"),
data.table::fifelse(
flag == 5L,
paste0(sub("chr", "", chromosome), "q"),
paste0(sub("chr", "", chromosome), "pq")
)
)
)
)
)]

segTab[, annotation := data.table::fifelse(
flag == 1L,
"short arm",
data.table::fifelse(
flag == 2L,
"long arm",
data.table::fifelse(
flag == 3L,
"across short and long arm",
data.table::fifelse(
flag == 4L,
"short arm intersect with centromere region",
data.table::fifelse(
flag == 5L,
"long arm intersect with centromere region",
"segment locate in centromere region"
)
)
)
)
)]

segTab[, fraction := data.table::fifelse(
flag == 1L,
(end - start + 1) / (p_end - p_start + 1),
data.table::fifelse(
flag == 2L,
(end - start + 1) / (q_end - q_start + 1),
data.table::fifelse(
flag == 3L,
2 * ((end - start + 1) / total_size),
data.table::fifelse(
flag == 4L,
(end - start + 1 - (end - p_end)) / (p_end - p_start + 1),
data.table::fifelse(
flag == 5L,
(end - start + 1 - (start - q_start)) / (q_end - q_start + 1),
2 * ((end - start + 1) / total_size)
)
)
)
)
)]

segTab[, c(colnames(arm_data)[-1], "flag") := NULL]
segTab


# .annot_fun <- function(chrom, start, end, p_start, p_end, p_length, q_start, q_end, q_length, total_size) {
# if (end <= p_end & start >= p_start) {
# ## 1L
# location <- paste0(sub("chr", "", chrom), "p")
# annotation <- "short arm"
# fraction <- (end - start + 1) / (p_end - p_start + 1)
# } else if (end <= q_end &
# start >= q_start) {
# ## 2L
# location <- paste0(sub("chr", "", chrom), "q")
# annotation <- "long arm"
# fraction <- (end - start + 1) / (q_end - q_start + 1)
# } else if (start >= p_start &
# start <= p_end &
# end >= q_start & end <= q_end) {
# ## 3L
# location <- paste0(sub("chr", "", chrom), "pq") # across p and q arm
# annotation <- "across short and long arm"
# fraction <- 2 * ((end - start + 1) / total_size)
# } else if (start < p_end & end < q_start) {
# ## 4L
# location <- paste0(sub("chr", "", chrom), "p")
# annotation <- "short arm intersect with centromere region"
# # only calculate region does not intersect
# fraction <- (end - start + 1 - (end - p_end)) / (p_end - p_start + 1)
# } else if (start > p_end &
# start < q_start & end > q_start) {
# ## 5L
# location <- paste0(sub("chr", "", chrom), "q")
# annotation <- "long arm intersect with centromere region"
# # only calculate region does not intersect
# fraction <- (end - start + 1 - (start - q_start)) / (q_end - q_start + 1)
# } else {
# ## 6L
# location <- paste0(sub("chr", "", chrom), "pq") # suppose as pq
# annotation <- "segment locate in centromere region"
# fraction <- 2 * ((end - start + 1) / total_size)
# }
#
# dplyr::tibble(location = location, annotation = annotation, fraction = fraction)
# }
#
# annot_fun <- function(chrom, start, end, p_start, p_end, p_length, q_start,
# q_end, q_length, total_size, .pb = NULL) {
# if (.pb$i < .pb$n) .pb$tick()$print()
# .annot_fun(
# chrom, start, end, p_start, p_end, p_length, q_start,
# q_end, q_length, total_size
# )
# }
#
# pb <- progress_estimated(nrow(segTab), 0)
#
# annot <- purrr::pmap_df(
# list(
# chrom = segTab$chromosome,
# start = segTab$start,
# end = segTab$end,
# p_start = segTab$p_start,
# p_end = segTab$p_end,
# p_length = segTab$p_length,
# q_start = segTab$q_start,
# q_end = segTab$q_end,
# q_length = segTab$q_length,
# total_size = segTab$total_size
# ), annot_fun,
# .pb = pb
# )
#
# cbind(
# data.table::as.data.table(segTab)[, colnames(arm_data)[-1] := NULL],
# data.table::as.data.table(annot)
# )
}


Expand Down
19 changes: 11 additions & 8 deletions R/get_groups.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,11 @@ get_groups <- function(Signature,
method = c("consensus", "k-means", "exposure", "samples"),
n_cluster = NULL,
match_consensus = TRUE) {
fit_flag = data.table::is.data.table(Signature)
stopifnot(inherits(Signature, "Signature") | fit_flag,
is.null(n_cluster) | n_cluster > 1)
fit_flag <- data.table::is.data.table(Signature)
stopifnot(
inherits(Signature, "Signature") | fit_flag,
is.null(n_cluster) | n_cluster > 1
)
method <- match.arg(method)

timer <- Sys.time()
Expand All @@ -73,11 +75,10 @@ get_groups <- function(Signature,
}
send_success("Method checked.")

if (purrr::map_lgl(Signature, ~ifelse(is.numeric(.), any(. > 1), FALSE)) %>% any()) {
if (purrr::map_lgl(Signature, ~ ifelse(is.numeric(.), any(. > 1), FALSE)) %>% any()) {
send_stop("When input is {.code data.table} (from sig_fit), a relative exposure result is valid.")
}
send_success("Exposure should be relative checked.")

} else {
send_success("'Signature' object detected.")
}
Expand Down Expand Up @@ -141,7 +142,7 @@ get_groups <- function(Signature,
expo_df <- get_sig_exposure(Signature, type = "relative")
}

sig_names = colnames(expo_df)[-1]
sig_names <- colnames(expo_df)[-1]
common_prefix <- Biobase::lcPrefixC(sig_names)
mps <- seq_along(sig_names)
names(mps) <- sig_names
Expand Down Expand Up @@ -216,8 +217,10 @@ get_groups <- function(Signature,
attr(data, "map_table") <- ztable
}

send_warning("The 'enrich_sig' column is set to dominant signature in one group, ",
"please check and make it consistent with biological meaning (correct it by hand if necessary).")
send_warning(
"The 'enrich_sig' column is set to dominant signature in one group, ",
"please check and make it consistent with biological meaning (correct it by hand if necessary)."
)
return(data)
}

Expand Down
9 changes: 6 additions & 3 deletions R/helper_join_segments.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,12 @@ join_segments <- function(df) {
} else {
dplyr::bind_cols(
out,
dplyr::summarise_at(res, dplyr::vars(-c("start", "end", "segVal")),
~ifelse(is.numeric(.), mean(., na.rm = TRUE),
paste0(unique(na.omit(.)), collapse = ",")))
dplyr::summarise_at(
res, dplyr::vars(-c("start", "end", "segVal")),
~ ifelse(is.numeric(.), mean(., na.rm = TRUE),
paste0(unique(na.omit(.)), collapse = ",")
)
)
)
}
}, df = df)
Expand Down
15 changes: 11 additions & 4 deletions R/read_copynumber.R
Original file line number Diff line number Diff line change
Expand Up @@ -357,8 +357,10 @@ read_copynumber <- function(input,
# order by segment start position by each chromosome in each sample
data_df <- data_df[, .SD[order(.SD$start, decreasing = FALSE)], by = c("sample", "chromosome")]
all_cols <- colnames(data_df)
data.table::setcolorder(data_df, neworder = c(c("chromosome", "start", "end", "segVal", "sample"),
setdiff(all_cols, c("chromosome", "start", "end", "segVal", "sample"))))
data.table::setcolorder(data_df, neworder = c(
c("chromosome", "start", "end", "segVal", "sample"),
setdiff(all_cols, c("chromosome", "start", "end", "segVal", "sample"))
))

send_success("Segmental table cleaned.")

Expand All @@ -368,7 +370,6 @@ read_copynumber <- function(input,
seg_cols = new_cols[1:4],
samp_col = new_cols[5]
)
message()
send_success("Annotation done.")

send_info("Summarizing per sample.")
Expand Down Expand Up @@ -403,6 +404,12 @@ utils::globalVariables(
".",
"N",
".N",
".SD"
".SD",
"flag",
"p_start",
"p_end",
"q_start",
"q_end",
"total_size"
)
)
Loading

0 comments on commit ce5b842

Please sign in to comment.