Skip to content

Commit

Permalink
Add a future integration layer to speed up the download process and a…
Browse files Browse the repository at this point in the history
… progressr layer to announce progress.
  • Loading branch information
coatless committed Nov 18, 2024
1 parent 2a4a0ac commit 946dfad
Show file tree
Hide file tree
Showing 16 changed files with 243 additions and 74 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
.Rproj.user
.Rhistory
docs
cache
12 changes: 7 additions & 5 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,16 @@ Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.2
Imports:
httr2,
jsonlite,
xml2,
dplyr,
fs,
glue,
jsonlite,
purrr,
readr,
tibble
glue,
fs,
tibble,
future.apply,
progressr,
readr
Collate:
'cache.R'
'constants.R'
Expand Down
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,3 @@ export(fetch_department_courses)
export(fetch_departments)
export(parse_courses)
export(read_cache)
export(write_cache)
63 changes: 52 additions & 11 deletions R/cache.R
Original file line number Diff line number Diff line change
@@ -1,30 +1,71 @@
#' Write course data to cache
#' Initialize cache directories
#'
#' @param data Course data frame
#' @param cache_dir Directory to cache results
#' @param cache_dir Base cache directory
#' @return List of cache paths
#' @keywords internal
init_cache_dirs <- function(cache_dir) {
if (is.null(cache_dir)) {
return(NULL)
}

paths <- list(
base = cache_dir,
json = file.path(cache_dir, "json"),
xml = file.path(cache_dir, "xml")
)

fs::dir_create(paths$json)
fs::dir_create(paths$xml)

paths
}

#' Write XML data to cache
#'
#' @param content XML content
#' @param cache_dir Base cache directory
#' @param dept Department code
#' @export
write_cache <- function(data, cache_dir, dept) {
fs::dir_create(cache_dir)
#' @keywords internal
write_xml_cache <- function(content, cache_dir, dept) {
if (is.null(cache_dir)) return(NULL)

paths <- init_cache_dirs(cache_dir)
xml_path <- fs::path(paths$xml, paste0(dept, ".xml"))
readr::write_file(content, xml_path)
}

#' Write JSON data to cache
#'
#' @param data Data frame or list to cache
#' @param cache_dir Base cache directory
#' @param filename Filename without extension
#' @keywords internal
write_json_cache <- function(data, cache_dir, filename) {
if (is.null(cache_dir)) return(NULL)

paths <- init_cache_dirs(cache_dir)
jsonlite::write_json(
data,
fs::path(cache_dir, paste0(dept, ".json")),
fs::path(paths$json, paste0(filename, ".json")),
pretty = TRUE
)
}

#' Read course data from cache
#'
#' @param cache_dir Directory containing cached files
#' @param cache_dir Base cache directory
#' @param dept Department code (optional)
#' @return Data frame of course data
#' @export
read_cache <- function(cache_dir, dept = NULL) {
paths <- init_cache_dirs(cache_dir)

if (is.null(dept)) {
files <- fs::dir_ls(cache_dir, glob = "*.json")
purrr::map_dfr(files, jsonlite::read_json, simplifyVector = TRUE)
files <- fs::dir_ls(paths$json, glob = "*.json")
results <- lapply(files, jsonlite::read_json, simplifyVector = TRUE)
dplyr::bind_rows(results)
} else {
file <- fs::path(cache_dir, paste0(dept, ".json"))
file <- fs::path(paths$json, paste0(dept, ".json"))
if (fs::file_exists(file)) {
jsonlite::read_json(file, simplifyVector = TRUE)
} else {
Expand Down
96 changes: 72 additions & 24 deletions R/fetch.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#' Fetch department list from Stanford ExploreCourses
#'
#' @param cache_dir Directory to cache results
#' @param cache_dir Base cache directory
#' @return A data frame containing department information
#' @export
#' @include constants.R
Expand Down Expand Up @@ -28,24 +28,23 @@ fetch_departments <- function(cache_dir = NULL) {
})

if (!is.null(cache_dir)) {
fs::dir_create(cache_dir)
jsonlite::write_json(
departments,
fs::path(cache_dir, "departments.json"),
pretty = TRUE
)
write_json_cache(departments, cache_dir, "departments")
}

departments
}

#' Fetch courses for a specific department

#' Fetch courses for a specific department with progress reporting
#'
#' @param name Department code
#' @param cache_dir Directory to cache results
#' @return XML content of courses
#' @param cache_dir Base cache directory
#' @param p Progress handler
#' @return Data frame of parsed course information
#' @export
fetch_department_courses <- function(name, cache_dir = NULL) {
fetch_department_courses <- function(name, cache_dir = NULL, p = NULL) {
if (!is.null(p)) p(message = sprintf("Fetching %s", name))

url <- glue::glue(COURSE_ENDPOINT, name = name)

req <- httr2::request(url) |>
Expand All @@ -54,19 +53,50 @@ fetch_department_courses <- function(name, cache_dir = NULL) {
content <- httr2::resp_body_string(req)

if (!is.null(cache_dir)) {
fs::dir_create(cache_dir)

xml_path <- fs::path(cache_dir, paste0(name, ".xml"))
readr::write_file(content, xml_path)
write_xml_cache(content, cache_dir, name)
}


# Parse XML directly to data frame
courses <- parse_courses(content)
courses$department <- name

courses
}

#' Process a single department
#'
#' @param name Department code
#' @param cache_dir Base cache directory
#' @param p Progress handler
#' @return Data frame of parsed course information
#' @keywords internal
process_department <- function(name, cache_dir = NULL, p = NULL) {
if (!is.null(p)) p(message = sprintf("Fetching %s", name))

url <- glue::glue(COURSE_ENDPOINT, name = name)

# Try to fetch and parse the data
tryCatch({
req <- httr2::request(url) |>
httr2::req_perform()

content <- httr2::resp_body_string(req)

if (!is.null(cache_dir)) {
write_xml_cache(content, cache_dir, name)
}

# Parse XML directly to data frame
courses <- parse_courses(content)
courses$department <- name

courses
}, error = function(e) {
warning(sprintf("Error processing department %s: %s", name, e$message))
NULL
})
}

#' Parse course XML into a data frame
#'
#' @param xml_content XML content from fetch_department_courses
Expand Down Expand Up @@ -142,20 +172,38 @@ parse_courses <- function(xml_content) {
course_data
}

#' Fetch and process courses for multiple departments
#' Fetch courses for a specific department with progress reporting
#'
#' @param name Department code
#' @param cache_dir Base cache directory
#' @param p Progress handler
#' @return Data frame of parsed course information
#' @export
fetch_department_courses <- function(name, cache_dir = NULL, p = NULL) {
process_department(name, cache_dir, p)
}

#' Fetch and process courses for multiple departments in parallel
#'
#' @param departments Character vector of department codes
#' @param cache_dir Directory to cache results
#' @return A list of data frames containing course information
#' @param cache_dir Base cache directory
#' @return A data frame containing course information
#' @export
fetch_all_courses <- function(departments = NULL, cache_dir = NULL) {
if (is.null(departments)) {
departments <- fetch_departments(cache_dir)$name
}

purrr::map_dfr(departments, function(dept) {
message("Fetching department: ", dept)
courses <- fetch_department_courses(dept, cache_dir)
courses
})
p <- progressr::progressor(steps = length(departments))

results <- future.apply::future_lapply(
departments,
fetch_department_courses,
cache_dir = cache_dir,
p = p,
future.seed = TRUE
)

# Combine results
dplyr::bind_rows(results)
}
15 changes: 11 additions & 4 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ remotes::remotes("coatless-rpkg/explorecourses")

First, load the package:

```r
```{r}
#| eval: false
library(explorecourses)
```

Expand All @@ -53,7 +54,8 @@ The package contains three main functions:
By default, we'll retrieve all courses across all departments for the current
academic year using:

```r
```{r}
#| eval: false
all_courses <- fetch_all_courses()
```

Expand All @@ -62,16 +64,21 @@ This information is stored in the `schedule_ay24_25` data frame.
For just a single department, we can use it's code to retrieve a list of all
classes:

```r
```{r}
#| eval: false
department_courses <- fetch_department_courses("STATS")
```

To determine possible department shortcodes, we can use:

```r
```{r}
#| eval: false
departments <- fetch_departments()
```

This will return a data frame with the department short name, long name, and school
the department is associated with.

## License

AGPL (>= 3)
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ To determine possible department shortcodes, we can use:
departments <- fetch_departments()
```

This will return a data frame with the department short name, long name,
and school the department is associated with.

## License

AGPL (\>= 3)
8 changes: 4 additions & 4 deletions man/fetch_all_courses.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 13 additions & 5 deletions man/fetch_department_courses.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/fetch_departments.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions man/init_cache_dirs.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 946dfad

Please sign in to comment.