Skip to content

Commit

Permalink
Add depth data [minor] (#102)
Browse files Browse the repository at this point in the history
* Update water depths [minor]
  • Loading branch information
gmyenni authored Jan 11, 2024
1 parent 93eb1ba commit 51be02a
Show file tree
Hide file tree
Showing 8 changed files with 23,877 additions and 17,054 deletions.
49 changes: 26 additions & 23 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Install packages, update data, test, and archive
# Install packages, update data, test, and archive

name: CI

Expand All @@ -17,7 +17,7 @@ on:
jobs:
build:
if: contains(toJson(github.event.commits), '[skip ci]') == false

runs-on: ${{ matrix.config.os }}

name: ${{ matrix.config.os }} (${{ matrix.config.r }})
Expand All @@ -26,64 +26,67 @@ jobs:
matrix:
config:
- {os: ubuntu-latest, r: '4.2.2', rspm: "https://packagemanager.rstudio.com/cran/__linux__/bionic/latest"}

env:
R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
RSPM: ${{ matrix.config.rspm }}
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

steps:
- uses: actions/checkout@v3
with:
fetch-depth: 3

- uses: r-lib/actions/setup-r@v2

- name: Cache R packages
uses: actions/cache@v3
with:
path: ${{ env.R_LIBS_USER }}
key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-2-${{ hashFiles('.github/depends.Rds') }}
restore-keys: |
${{ runner.os }}-${{ hashFiles('.github/R-version') }}-2-
- name: Install system dependencies
run: sudo apt-get install libgit2-dev libcurl4-openssl-dev libudunits2-dev libgdal-dev libnetcdf-dev libgeos-dev libproj-dev

run: |
sudo apt-get -y update && sudo apt-get install -y \
libgit2-dev libicu-dev gdal-bin proj-data proj-bin libv8-dev libprotobuf-dev protobuf-compiler \
libudunits2-dev libgdal-dev libgeos-dev libproj-dev libfontconfig1-dev libjq-dev libmysqlclient-dev libpng-dev
sudo apt-get update
- name: Install packages
run: Rscript install-packages.R
run: Rscript install-packages.R

- name: Update data
run: Rscript update-data.R

- name: Test
run: Rscript testthat.R

- name: Check for new files
id: check_files
run: |
if [[ -z "$(git status --porcelain)" ]] ; then
if [[ -z "$(git status --porcelain)" ]] ; then
echo "Git status is empty"
echo "diff=FALSE" >> $GITHUB_OUTPUT
else
else
echo "diff=TRUE" >> $GITHUB_OUTPUT
fi
- name: Bump version
id: version
env:
env:
LOG: ${{ github.event_name == 'pull_request' && '$(git log --no-merges --format=%B -n 1 HEAD^2)' || '$(git log --no-merges --format=%B -n 1 HEAD)' }}
CRON: ${{ github.event_name == 'schedule' && 'TRUE' || 'FALSE' }}
run: |
R -e "source('version_bump.R'); bump_version('${{ env.LOG }}', ${{ env.CRON }}, ${{ steps.check_files.outputs.diff }})"
if echo "$(git status --porcelain)" | grep -q version.txt ; then
if echo "$(git status --porcelain)" | grep -q version.txt ; then
echo "Version changed"
echo "new_ver=TRUE" >> $GITHUB_OUTPUT
else
else
echo "No version bump"
echo "new_ver=FALSE" >> $GITHUB_OUTPUT
fi
- name: Setup Tag
if: github.event_name != 'pull_request' && steps.version.outputs.new_ver == 'TRUE'
id: tagging
Expand All @@ -92,18 +95,18 @@ jobs:
echo "tag=$value" >> $GITHUB_OUTPUT
value="v${value}"
echo "release=$value" >> $GITHUB_OUTPUT
- name: Tag new version for release
if: github.event_name != 'pull_request' && steps.version.outputs.new_ver == 'TRUE'
env:
env:
JOB_TAGS: ${{ github.event_name == 'schedule' && '[skip ci] [cron]' || '[skip ci]' }}
uses: EndBug/add-and-commit@v9
with:
author_name: Weecology Deploy Bot
author_email: weecologydeploy@weecology.org
message: "Update data and trigger archive: GitHub Build ${{ github.run_number }} ${{ env.JOB_TAGS }}"
tag: ${{ steps.tagging.outputs.tag }}

- name: Create Release
if: github.event_name != 'pull_request' && steps.version.outputs.new_ver == 'TRUE'
id: create_release
Expand Down
136 changes: 136 additions & 0 deletions DataCleaningScripts/download_eden.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# Functions to find and download EDEN water depth data
#' @name get_metadata
#'
#' @title Get EDEN metadata
#'
#' @description Scrape the USGS THREDDS catalog page for the EDEN water
#'   depth files and return one row per file with its size, modification
#'   time, and the year parsed from the file name.
#'
#' @return data.frame with columns `dataset`, `size`,
#'   `last_modified` (POSIXct), and `year` (integer).
#'
#' @export
#'

get_metadata <- function() {
  url <- "https://sflthredds.er.usgs.gov/thredds/catalog/eden/depths/catalog.html"
  metadata <- url %>%
    rvest::read_html() %>%
    rvest::html_table()
  # The catalog listing is the first table on the page. Return the cleaned
  # table directly (the original ended in an assignment, which returned the
  # value invisibly).
  metadata[[1]] %>%
    dplyr::filter(Dataset != "depths") %>% # Drop directory name from first row
    dplyr::rename(dataset = Dataset, size = Size, last_modified = `Last Modified`) %>%
    dplyr::mutate(last_modified = as.POSIXct(last_modified,
                                             format = "%Y-%m-%dT%H:%M:%S"),
                  # File names begin with the 4-digit year, e.g. "2020_..."
                  year = as.integer(substr(dataset, start = 1, stop = 4)))
}

#' @name get_data_urls
#'
#' @title Get EDEN depths data URLs for download
#'
#' @param file_names file names to download from metadata
#'
#' @return list with elements `file_names` (as given) and `urls`
#'   (full fileServer URLs for each file)
#'
#' @export
#'

get_data_urls <- function(file_names) {
  depths_server <- "https://sflthredds.er.usgs.gov/thredds/fileServer/eden/depths"
  # Pair each file name with its full download URL on the THREDDS file server
  list(file_names = file_names,
       urls = file.path(depths_server, file_names))
}

#' @name get_last_download
#'
#' @title Get list of EDEN depths data already downloaded
#'
#' @param eden_path path where the EDEN data should be stored
#' @param metadata EDEN file metadata
#' @param force_update if TRUE update all data files even if checks indicate
#'   that remote files are unchanged since the current local copies were
#'   created
#'
#' @return table of files already downloaded; when no record exists (or
#'   `force_update` is TRUE), a placeholder table with zero sizes and an
#'   ancient timestamp so every remote file appears newer
#'
#' @export
#'
get_last_download <- function(eden_path = file.path("Water"),
                              metadata, force_update = FALSE) {
  record_file <- file.path(eden_path, "last_download.csv")
  # Scalar && (not vectorized &) is the correct operator in an if() condition;
  # file.exists() is clearer than scanning list.files() output
  if (file.exists(record_file) && !force_update) {
    last_download <- read.csv(record_file)
  } else {
    # No download record (or forced refresh): fabricate a record that makes
    # every remote file look newer than the local copy
    last_download <- data.frame(dataset = metadata$dataset, size = "0 Mbytes",
                                last_modified = as.POSIXct("1900-01-01 00:00:01",
                                                           format = "%Y-%m-%d %H:%M:%S"))
  }
  return(last_download)
}

#' @name get_files_to_update
#'
#' @title Determine list of new EDEN files to download
#'
#' @param eden_path path where the EDEN data should be stored
#' @param metadata EDEN file metadata
#' @param force_update if TRUE update all data files even if checks indicate
#'   that remote files are unchanged since the current local copies were
#'   created
#'
#' @export
#'
get_files_to_update <- function(eden_path = file.path("Water"),
                                metadata, force_update = FALSE){
  # Compare the remote catalog against the record of the previous download
  previous <- get_last_download(eden_path, metadata, force_update = force_update)
  changed <- metadata %>%
    dplyr::left_join(previous, by = "dataset", suffix = c(".curr", ".last")) %>%
    dplyr::filter(last_modified.curr > last_modified.last |
                    size.curr != size.last |
                    is.na(last_modified.last))
  # Re-fetch a two-year window around each changed file's year
  target_years <- c(changed$year - 2, changed$year - 1, changed$year,
                    changed$year + 1, changed$year + 2)
  metadata %>%
    dplyr::filter(year %in% target_years)
}

#' @name update_last_download
#'
#' @title Write new metadata file for files already downloaded
#'
#' @param eden_path path where the EDEN data should be stored
#' @param metadata EDEN file metadata
#'
#' @export
#'
update_last_download <- function(eden_path = file.path("Water"),
                                 metadata){
  # row.names = FALSE keeps a spurious row-name column out of the CSV that
  # read.csv() would otherwise read back as an extra "X" column.
  # (Removed an unused `current_files` local from the original.)
  write.csv(metadata, file.path(eden_path, "last_download.csv"),
            row.names = FALSE)
}

#' @name download_eden_depths
#'
#' @title Download the EDEN depths data
#'
#' @param eden_path path where the EDEN data should be stored
#' @param force_update if TRUE update all data files even if checks indicate
#'   that remote files are unchanged since the current local copies were
#'   created
#'
#' @return char vector of downloaded/updated files
#'
#' @export
#'
download_eden_depths <- function(eden_path = file.path("Water"),
                                 force_update = FALSE) {

  if (!dir.exists(eden_path)) {
    dir.create(eden_path, recursive = TRUE)
  }

  metadata <- get_metadata()
  to_update <- get_files_to_update(eden_path, metadata,
                                   force_update = force_update)
  data_urls <- get_data_urls(to_update$dataset)

  # Raise the download timeout for the large NetCDF files, but restore the
  # caller's setting on exit instead of leaking a global options() change
  old_opts <- options(timeout = max(226, getOption("timeout")))
  on.exit(options(old_opts), add = TRUE)

  downloaded <- mapply(download.file,
                       data_urls$urls,
                       file.path(eden_path, data_urls$file_names))

  # Record the catalog state so the next run only fetches changed files
  update_last_download(eden_path, metadata)

  return(file.path(eden_path, data_urls$file_names))
}
Loading

0 comments on commit 51be02a

Please sign in to comment.