From bcc7023f04d2d4b0dda2813855bddc912708466d Mon Sep 17 00:00:00 2001
From: egeulgen <egeulgen@gmail.com>
Date: Thu, 2 May 2024 21:35:52 +0100
Subject: [PATCH 1/2] remove conversion functionality from `get_kegg_gsets()`
 and return KEGG IDs

---
 R/data_generation.R                   | 29 +++++----------------------
 man/get_kegg_gsets.Rd                 |  2 +-
 tests/testthat/test-data_generation.R | 10 ++-------
 3 files changed, 8 insertions(+), 33 deletions(-)

diff --git a/R/data_generation.R b/R/data_generation.R
index f4875a99..8f407ef6 100644
--- a/R/data_generation.R
+++ b/R/data_generation.R
@@ -175,26 +175,13 @@ gset_list_from_gmt <- function(path2gmt, descriptions_idx = 2) {
 #' of all available organisms, see \url{https://www.genome.jp/kegg/catalog/org_list.html}
 #'
 #' @return list containing 2 elements: \itemize{
-#' \item{gene_sets - A list containing the genes involved in each KEGG pathway}
+#' \item{gene_sets - A list containing KEGG IDs for the genes involved in each KEGG pathway}
 #' \item{descriptions - A named vector containing the descriptions for each KEGG pathway}
 #' }
 get_kegg_gsets <- function(org_code = "hsa") {
 
   message("Grab a cup of coffee, this will take a while...")
 
-  gene_table_url <- paste0("https://rest.kegg.jp/list/", org_code)
-  gene_table_result <- httr::GET(gene_table_url)
-  gene_table_result <- httr::content(gene_table_result, "text")
-
-  parsed_gene_table_result <- strsplit(gene_table_result, "\n")[[1]]
-  kegg_gene_table <- data.frame(
-    kegg_id = unname(vapply(parsed_gene_table_result, function(x) unlist(strsplit(x, "\t"))[1], "org:123")),
-    symbol = unname(vapply(parsed_gene_table_result, function(x) unlist(strsplit(unlist(strsplit(x, "\t"))[4], ";"))[1], "symbol"))
-  )
-  # remove mistaken lines
-  kegg_gene_table <- kegg_gene_table[grep("^((,\\s)?[A-Za-z0-9_-]+(\\@)?)+$", kegg_gene_table$symbol), ]
-
-
   all_pathways_url <- paste0("https://rest.kegg.jp/list/pathway/", org_code)
   all_pathways_result <- httr::GET(all_pathways_url)
   all_pathways_result <- httr::content(all_pathways_result, "text")
@@ -205,16 +192,10 @@ get_kegg_gsets <- function(org_code = "hsa") {
 
   genes_by_pathway <- lapply(pathway_ids, function(pw_id) {
     pathways_graph <- ggkegg::pathway(pid = pw_id, directory = tempdir(), use_cache = FALSE, return_tbl_graph = FALSE)
-    all_pw_gene_ids <- igraph::V(pathways_graph)$name[igraph::V(pathways_graph)$type == "gene"]
-    all_pw_gene_ids <- unlist(strsplit(all_pw_gene_ids, " "))
-    all_pw_gene_ids <- unique(all_pw_gene_ids)
-
-    all_pw_gene_symbols <- kegg_gene_table$symbol[match(all_pw_gene_ids, kegg_gene_table$kegg_id)]
-    all_pw_gene_symbols <- all_pw_gene_symbols[!is.na(all_pw_gene_symbols)]
-    all_pw_gene_symbols <- unname(vapply(all_pw_gene_symbols, function(x) unlist(strsplit(x, ", "))[1], "symbol"))
-    all_pw_gene_symbols <- unique(all_pw_gene_symbols)
-
-    return(all_pw_gene_symbols)
+    all_pw_kegg_ids <- igraph::V(pathways_graph)$name[igraph::V(pathways_graph)$type == "gene"]
+    all_pw_kegg_ids <- unlist(strsplit(all_pw_kegg_ids, " "))
+    all_pw_kegg_ids <- unique(all_pw_kegg_ids)
+    return(all_pw_kegg_ids)
   })
 
   names(genes_by_pathway) <- pathway_ids
diff --git a/man/get_kegg_gsets.Rd b/man/get_kegg_gsets.Rd
index 2c53c133..e052ed64 100644
--- a/man/get_kegg_gsets.Rd
+++ b/man/get_kegg_gsets.Rd
@@ -12,7 +12,7 @@ of all available organisms, see \url{https://www.genome.jp/kegg/catalog/org_list
 }
 \value{
 list containing 2 elements: \itemize{
-\item{gene_sets - A list containing the genes involved in each KEGG pathway}
+\item{gene_sets - A list containing KEGG IDs for the genes involved in each KEGG pathway}
 \item{descriptions - A named vector containing the descriptions for each KEGG pathway}
 }
 }
diff --git a/tests/testthat/test-data_generation.R b/tests/testthat/test-data_generation.R
index f69c976e..df0c39b4 100644
--- a/tests/testthat/test-data_generation.R
+++ b/tests/testthat/test-data_generation.R
@@ -92,17 +92,11 @@ test_that("`gset_list_from_gmt()` -- works as expected", {
 
 test_that("`get_kegg_gsets()` -- works as expected", {
   skip_on_cran()
-  mock_responses <- c(
-    httr::content(httr::GET(paste0("https://rest.kegg.jp/list/eco")), "text"),
-    "eco00010\tdescription\neco00071\tdescription2"
-  )
-
-  call_count <- 0
+  mock_response <- "eco00010\tdescription\neco00071\tdescription2"
 
   # function to manage sequential responses
   mock_content <- function(...) {
-    call_count <<- call_count + 1
-    return(mock_responses[call_count])
+    return(mock_response)
   }
 
 

From d090c97eff4ec0a6f2e8f68e313ad0d64de9fcde Mon Sep 17 00:00:00 2001
From: egeulgen <egeulgen@gmail.com>
Date: Thu, 2 May 2024 21:36:17 +0100
Subject: [PATCH 2/2] update NEWS; bump dev version

---
 DESCRIPTION | 2 +-
 NEWS.md     | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 632ed936..18cee1bc 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: pathfindR
 Type: Package
 Title: Enrichment Analysis Utilizing Active Subnetworks
-Version: 2.4.0.9000
+Version: 2.4.0.9001
 Authors@R: c(person("Ege", "Ulgen",
                     role = c("cre", "cph"), 
                     email = "egeulgen@gmail.com",
diff --git a/NEWS.md b/NEWS.md
index 3b25693d..b1d3a566 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,9 @@
 # pathfindR (development version)
 
+## Minor Changes and Bug Fixes
+
+- fixed a bug regarding KEGG gene set fetching: removed the conversion functionality in `get_kegg_gsets()` which now returns KEGG IDs so that the user can convert the returned identifiers using a more appropriate tool (e.g. BioMart) should they wish
+
 # pathfindR 2.4.0
 
 ## Major Changes