Add a future integration layer to speed up the download process and a…

… progressr layer to announce progress.
coatless-rpkg · Nov 18, 2024 · 946dfad · 946dfad
1 parent 2a4a0ac
commit 946dfad
Show file tree

Hide file tree

Showing 16 changed files with 243 additions and 74 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
 .Rproj.user
 .Rhistory
 docs
+cache
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -14,14 +14,16 @@ Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.2
 Imports: 
     httr2,
-    jsonlite,
     xml2,
     dplyr,
-    fs,
-    glue,
+    jsonlite,
     purrr,
-    readr,
-    tibble
+    glue,
+    fs,
+    tibble,
+    future.apply,
+    progressr,
+    readr
 Collate: 
     'cache.R'
     'constants.R'

diff --git a/NAMESPACE b/NAMESPACE
@@ -5,4 +5,3 @@ export(fetch_department_courses)
 export(fetch_departments)
 export(parse_courses)
 export(read_cache)
-export(write_cache)
diff --git a/R/cache.R b/R/cache.R
@@ -1,30 +1,71 @@
-#' Write course data to cache
+#' Initialize cache directories
 #'
-#' @param data Course data frame
-#' @param cache_dir Directory to cache results
+#' @param cache_dir Base cache directory
+#' @return List of cache paths
+#' @keywords internal
+init_cache_dirs <- function(cache_dir) {
+  if (is.null(cache_dir)) {
+    return(NULL)
+  }
+
+  paths <- list(
+    base = cache_dir,
+    json = file.path(cache_dir, "json"),
+    xml = file.path(cache_dir, "xml")
+  )
+
+  fs::dir_create(paths$json)
+  fs::dir_create(paths$xml)
+
+  paths
+}
+
+#' Write XML data to cache
+#'
+#' @param content XML content
+#' @param cache_dir Base cache directory
 #' @param dept Department code
-#' @export
-write_cache <- function(data, cache_dir, dept) {
-  fs::dir_create(cache_dir)
+#' @keywords internal
+write_xml_cache <- function(content, cache_dir, dept) {
+  if (is.null(cache_dir)) return(NULL)
+
+  paths <- init_cache_dirs(cache_dir)
+  xml_path <- fs::path(paths$xml, paste0(dept, ".xml"))
+  readr::write_file(content, xml_path)
+}
+
+#' Write JSON data to cache
+#'
+#' @param data Data frame or list to cache
+#' @param cache_dir Base cache directory
+#' @param filename Filename without extension
+#' @keywords internal
+write_json_cache <- function(data, cache_dir, filename) {
+  if (is.null(cache_dir)) return(NULL)
+
+  paths <- init_cache_dirs(cache_dir)
   jsonlite::write_json(
     data,
-    fs::path(cache_dir, paste0(dept, ".json")),
+    fs::path(paths$json, paste0(filename, ".json")),
     pretty = TRUE
   )
 }
 
 #' Read course data from cache
 #'
-#' @param cache_dir Directory containing cached files
+#' @param cache_dir Base cache directory
 #' @param dept Department code (optional)
 #' @return Data frame of course data
 #' @export
 read_cache <- function(cache_dir, dept = NULL) {
+  paths <- init_cache_dirs(cache_dir)
+
   if (is.null(dept)) {
-    files <- fs::dir_ls(cache_dir, glob = "*.json")
-    purrr::map_dfr(files, jsonlite::read_json, simplifyVector = TRUE)
+    files <- fs::dir_ls(paths$json, glob = "*.json")
+    results <- lapply(files, jsonlite::read_json, simplifyVector = TRUE)
+    dplyr::bind_rows(results)
   } else {
-    file <- fs::path(cache_dir, paste0(dept, ".json"))
+    file <- fs::path(paths$json, paste0(dept, ".json"))
     if (fs::file_exists(file)) {
       jsonlite::read_json(file, simplifyVector = TRUE)
     } else {

diff --git a/R/fetch.R b/R/fetch.R
@@ -1,6 +1,6 @@
 #' Fetch department list from Stanford ExploreCourses
 #'
-#' @param cache_dir Directory to cache results
+#' @param cache_dir Base cache directory
 #' @return A data frame containing department information
 #' @export
 #' @include constants.R
@@ -28,24 +28,23 @@ fetch_departments <- function(cache_dir = NULL) {
   })
 
   if (!is.null(cache_dir)) {
-    fs::dir_create(cache_dir)
-    jsonlite::write_json(
-      departments,
-      fs::path(cache_dir, "departments.json"),
-      pretty = TRUE
-    )
+    write_json_cache(departments, cache_dir, "departments")
   }
 
   departments
 }
 
-#' Fetch courses for a specific department
+
+#' Fetch courses for a specific department with progress reporting
 #'
 #' @param name Department code
-#' @param cache_dir Directory to cache results
-#' @return XML content of courses
+#' @param cache_dir Base cache directory
+#' @param p Progress handler
+#' @return Data frame of parsed course information
 #' @export
-fetch_department_courses <- function(name, cache_dir = NULL) {
+fetch_department_courses <- function(name, cache_dir = NULL, p = NULL) {
+  if (!is.null(p)) p(message = sprintf("Fetching %s", name))
+
   url <- glue::glue(COURSE_ENDPOINT, name = name)
 
   req <- httr2::request(url) |>
@@ -54,19 +53,50 @@ fetch_department_courses <- function(name, cache_dir = NULL) {
   content <- httr2::resp_body_string(req)
 
   if (!is.null(cache_dir)) {
-    fs::dir_create(cache_dir)
-
-    xml_path <- fs::path(cache_dir, paste0(name, ".xml"))
-    readr::write_file(content, xml_path)
+    write_xml_cache(content, cache_dir, name)
   }
 
-
+  # Parse XML directly to data frame
   courses <- parse_courses(content)
   courses$department <- name
 
   courses
 }
 
+#' Process a single department
+#'
+#' @param name Department code
+#' @param cache_dir Base cache directory
+#' @param p Progress handler
+#' @return Data frame of parsed course information
+#' @keywords internal
+process_department <- function(name, cache_dir = NULL, p = NULL) {
+  if (!is.null(p)) p(message = sprintf("Fetching %s", name))
+
+  url <- glue::glue(COURSE_ENDPOINT, name = name)
+
+  # Try to fetch and parse the data
+  tryCatch({
+    req <- httr2::request(url) |>
+      httr2::req_perform()
+
+    content <- httr2::resp_body_string(req)
+
+    if (!is.null(cache_dir)) {
+      write_xml_cache(content, cache_dir, name)
+    }
+
+    # Parse XML directly to data frame
+    courses <- parse_courses(content)
+    courses$department <- name
+
+    courses
+  }, error = function(e) {
+    warning(sprintf("Error processing department %s: %s", name, e$message))
+    NULL
+  })
+}
+
 #' Parse course XML into a data frame
 #'
 #' @param xml_content XML content from fetch_department_courses
@@ -142,20 +172,38 @@ parse_courses <- function(xml_content) {
   course_data
 }
 
-#' Fetch and process courses for multiple departments
+#' Fetch courses for a specific department with progress reporting
+#'
+#' @param name Department code
+#' @param cache_dir Base cache directory
+#' @param p Progress handler
+#' @return Data frame of parsed course information
+#' @export
+fetch_department_courses <- function(name, cache_dir = NULL, p = NULL) {
+  process_department(name, cache_dir, p)
+}
+
+#' Fetch and process courses for multiple departments in parallel
 #'
 #' @param departments Character vector of department codes
-#' @param cache_dir Directory to cache results
-#' @return A list of data frames containing course information
+#' @param cache_dir Base cache directory
+#' @return A data frame containing course information
 #' @export
 fetch_all_courses <- function(departments = NULL, cache_dir = NULL) {
   if (is.null(departments)) {
     departments <- fetch_departments(cache_dir)$name
   }
 
-  purrr::map_dfr(departments, function(dept) {
-    message("Fetching department: ", dept)
-    courses <- fetch_department_courses(dept, cache_dir)
-    courses
-  })
+  p <- progressr::progressor(steps = length(departments))
+
+  results <- future.apply::future_lapply(
+    departments,
+    fetch_department_courses,
+    cache_dir = cache_dir,
+    p = p,
+    future.seed = TRUE
+  )
+
+  # Combine results
+  dplyr::bind_rows(results)
 }
diff --git a/README.Rmd b/README.Rmd
@@ -40,7 +40,8 @@ remotes::remotes("coatless-rpkg/explorecourses")
 
 First, load the package:
 
-```r
+```{r}
+#| eval: false
 library(explorecourses)
 ```
 
@@ -53,7 +54,8 @@ The package contains three main functions:
 By default, we'll retrieve all courses across all departments for the current
 academic year using:
 
-```r
+```{r}
+#| eval: false
 all_courses <- fetch_all_courses()
 ```
 
@@ -62,16 +64,21 @@ This information is stored in the `schedule_ay24_25` data frame.
 For just a single department, we can use it's code to retrieve a list of all
 classes:
 
-```r
+```{r}
+#| eval: false
 department_courses <- fetch_department_courses("STATS")
 ```
 
 To determine possible department shortcodes, we can use:
 
-```r
+```{r}
+#| eval: false
 departments <- fetch_departments()
 ```
 
+This will return a data frame with the department short name, long name, and school
+the department is associated with.
+
 ## License
 
 AGPL (>= 3)
diff --git a/README.md b/README.md
@@ -65,6 +65,9 @@ To determine possible department shortcodes, we can use:
 departments <- fetch_departments()
 ```
 
+This will return a data frame with the department short name, long name,
+and school the department is associated with.
+
 ## License
 
 AGPL (\>= 3)
diff --git a/man/fetch_all_courses.Rd b/man/fetch_all_courses.Rd
diff --git a/man/fetch_department_courses.Rd b/man/fetch_department_courses.Rd
diff --git a/man/fetch_departments.Rd b/man/fetch_departments.Rd
diff --git a/man/init_cache_dirs.Rd b/man/init_cache_dirs.Rd