From bcc7023f04d2d4b0dda2813855bddc912708466d Mon Sep 17 00:00:00 2001 From: egeulgen Date: Thu, 2 May 2024 21:35:52 +0100 Subject: [PATCH 1/2] remove conversion functionality from `get_kegg_gsets()` and return KEGG IDs --- R/data_generation.R | 29 +++++---------------------- man/get_kegg_gsets.Rd | 2 +- tests/testthat/test-data_generation.R | 10 ++------- 3 files changed, 8 insertions(+), 33 deletions(-) diff --git a/R/data_generation.R b/R/data_generation.R index f4875a99..8f407ef6 100644 --- a/R/data_generation.R +++ b/R/data_generation.R @@ -175,26 +175,13 @@ gset_list_from_gmt <- function(path2gmt, descriptions_idx = 2) { #' of all available organisms, see \url{https://www.genome.jp/kegg/catalog/org_list.html} #' #' @return list containing 2 elements: \itemize{ -#' \item{gene_sets - A list containing the genes involved in each KEGG pathway} +#' \item{gene_sets - A list containing KEGG IDs for the genes involved in each KEGG pathway} #' \item{descriptions - A named vector containing the descriptions for each KEGG pathway} #' } get_kegg_gsets <- function(org_code = "hsa") { message("Grab a cup of coffee, this will take a while...") - gene_table_url <- paste0("https://rest.kegg.jp/list/", org_code) - gene_table_result <- httr::GET(gene_table_url) - gene_table_result <- httr::content(gene_table_result, "text") - - parsed_gene_table_result <- strsplit(gene_table_result, "\n")[[1]] - kegg_gene_table <- data.frame( - kegg_id = unname(vapply(parsed_gene_table_result, function(x) unlist(strsplit(x, "\t"))[1], "org:123")), - symbol = unname(vapply(parsed_gene_table_result, function(x) unlist(strsplit(unlist(strsplit(x, "\t"))[4], ";"))[1], "symbol")) - ) - # remove mistaken lines - kegg_gene_table <- kegg_gene_table[grep("^((,\\s)?[A-Za-z0-9_-]+(\\@)?)+$", kegg_gene_table$symbol), ] - - all_pathways_url <- paste0("https://rest.kegg.jp/list/pathway/", org_code) all_pathways_result <- httr::GET(all_pathways_url) all_pathways_result <- httr::content(all_pathways_result, 
"text") @@ -205,16 +192,10 @@ get_kegg_gsets <- function(org_code = "hsa") { genes_by_pathway <- lapply(pathway_ids, function(pw_id) { pathways_graph <- ggkegg::pathway(pid = pw_id, directory = tempdir(), use_cache = FALSE, return_tbl_graph = FALSE) - all_pw_gene_ids <- igraph::V(pathways_graph)$name[igraph::V(pathways_graph)$type == "gene"] - all_pw_gene_ids <- unlist(strsplit(all_pw_gene_ids, " ")) - all_pw_gene_ids <- unique(all_pw_gene_ids) - - all_pw_gene_symbols <- kegg_gene_table$symbol[match(all_pw_gene_ids, kegg_gene_table$kegg_id)] - all_pw_gene_symbols <- all_pw_gene_symbols[!is.na(all_pw_gene_symbols)] - all_pw_gene_symbols <- unname(vapply(all_pw_gene_symbols, function(x) unlist(strsplit(x, ", "))[1], "symbol")) - all_pw_gene_symbols <- unique(all_pw_gene_symbols) - - return(all_pw_gene_symbols) + all_pw_kegg_ids <- igraph::V(pathways_graph)$name[igraph::V(pathways_graph)$type == "gene"] + all_pw_kegg_ids <- unlist(strsplit(all_pw_kegg_ids, " ")) + all_pw_kegg_ids <- unique(all_pw_kegg_ids) + return(all_pw_kegg_ids) }) names(genes_by_pathway) <- pathway_ids diff --git a/man/get_kegg_gsets.Rd b/man/get_kegg_gsets.Rd index 2c53c133..e052ed64 100644 --- a/man/get_kegg_gsets.Rd +++ b/man/get_kegg_gsets.Rd @@ -12,7 +12,7 @@ of all available organisms, see \url{https://www.genome.jp/kegg/catalog/org_list } \value{ list containing 2 elements: \itemize{ -\item{gene_sets - A list containing the genes involved in each KEGG pathway} +\item{gene_sets - A list containing KEGG IDs for the genes involved in each KEGG pathway} \item{descriptions - A named vector containing the descriptions for each KEGG pathway} } } diff --git a/tests/testthat/test-data_generation.R b/tests/testthat/test-data_generation.R index f69c976e..df0c39b4 100644 --- a/tests/testthat/test-data_generation.R +++ b/tests/testthat/test-data_generation.R @@ -92,17 +92,11 @@ test_that("`gset_list_from_gmt()` -- works as expected", { test_that("`get_kegg_gsets()` -- works as expected", { 
skip_on_cran() - mock_responses <- c( - httr::content(httr::GET(paste0("https://rest.kegg.jp/list/eco")), "text"), - "eco00010\tdescription\neco00071\tdescription2" - ) - - call_count <- 0 + mock_response <- "eco00010\tdescription\neco00071\tdescription2" # function to manage sequential responses mock_content <- function(...) { - call_count <<- call_count + 1 - return(mock_responses[call_count]) + return(mock_response) } From d090c97eff4ec0a6f2e8f68e313ad0d64de9fcde Mon Sep 17 00:00:00 2001 From: egeulgen Date: Thu, 2 May 2024 21:36:17 +0100 Subject: [PATCH 2/2] update NEWS; bump dev version --- DESCRIPTION | 2 +- NEWS.md | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 632ed936..18cee1bc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: pathfindR Type: Package Title: Enrichment Analysis Utilizing Active Subnetworks -Version: 2.4.0.9000 +Version: 2.4.0.9001 Authors@R: c(person("Ege", "Ulgen", role = c("cre", "cph"), email = "egeulgen@gmail.com", diff --git a/NEWS.md b/NEWS.md index 3b25693d..b1d3a566 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # pathfindR (development version) +## Minor Changes and Bug Fixes + +- fixed a bug regarding KEGG gene set fetching: removed the conversion functionality in `get_kegg_gsets()`, which now returns KEGG IDs so that the user can convert the returned identifiers using a more appropriate tool (e.g. BioMart) should they wish + # pathfindR 2.4.0 ## Major Changes