R/utils.R

# Copyright 2018 Province of British Columbia
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.

base_url <- function() "https://catalogue.data.gov.bc.ca/api/3/"

bcdata_user_agent <- function(){
  "https://github.com/bcgov/bcdata"
}

compact <- function(l) Filter(Negate(is.null), l)


bcdc_number_wfs_records <- function(query_list, client){

  if(!is.null(query_list$propertyName)){
    query_list$propertyName <- NULL
  }

  query_list <- c(resultType = "hits", query_list)
  res_max <- client$post(body = query_list, encode = "form")
  txt_max <- res_max$parse("UTF-8")

  ## resultType is only returned as XML.
  ## regex to extract the number
  as.numeric(sub(".*numberMatched=\"([0-9]{1,20})\".*", "\\1", txt_max))

}

specify_geom_name <- function(cols_df, CQL_statement){
  # Find the geometry field and get the name of the field
  geom_col <- geom_col_name(cols_df)

  # substitute the geometry column name into the CQL statement and add sql class
  dbplyr::sql(glue::glue(CQL_statement, geom_name = geom_col))
}

bcdc_read_sf <- function(x, ...){

  if(length(x) == 1){

    return(sf::read_sf(x, stringsAsFactors = FALSE, quiet = TRUE, ...))

  } else{
    # tests that cover this are skipped due to large size
    # nocov start
    ## Parse the Paginated response
    message("Parsing data")
    sf_responses <- lapply(x, function(x) {sf::read_sf(x, stringsAsFactors = FALSE, quiet = TRUE, ...)})

    do.call(rbind, sf_responses)
    # nocov end
  }


}

slug_from_url <- function(x) {
  if (grepl("^(http|www)", x)) x <- basename(x)
  x
}

formats_supported <- function(){
  c(bcdc_read_functions()[["format"]], "zip")
}

bcdc_http_client <- function(url = NULL) {

  crul::HttpClient$new(url = url,
                       headers = list(`User-Agent` = bcdata_user_agent()))

}

## Check if there is internet
## h/t to https://github.com/ropensci/handlr/blob/pluralize/tests/testthat/helper-handlr.R
has_internet <- function() {
  z <- try(suppressWarnings(readLines('https://www.google.com', n = 1)),
           silent = TRUE)
  !inherits(z, "try-error")
}

# Need the actual name of the geometry column
geom_col_name <- function(x) {
  geom_type <- intersect(x$remote_col_type, gml_types())
  x[x$remote_col_type == geom_type, , drop = FALSE]$col_name
}

#' @param x a resource_df from formatted record
#' @noRd
wfs_available <- function(x) {
  x$location %in% c("bcgwdatastore", "bcgeographicwarehouse") &
    x$format == "wms"
}

#' @param x a resource_df from formatted record
#' @noRd
other_format_available <- function(x) {
  x$ext %in% formats_supported() &
    !x$location %in% c("bcgwdatastore", "bcgeographicwarehouse")
}

wfs_to_r_col_type <- function(col){

  dplyr::case_when(
    col == "xsd:string" ~ "character",
    col == "xsd:date" ~ "date",
    col == "xsd:decimal" ~ "numeric",
    col == "xsd:hexBinary" ~ "numeric",
    grepl("^gml:", col) ~ "sfc geometry",
    TRUE ~ as.character(col)
  )
}


##from a record
formats_from_record <- function(x, trim = TRUE){

  resource_df <- dplyr::tibble(
      name = purrr::map_chr(x$resources, "name"),
      url = purrr::map_chr(x$resources, safe_file_ext),
      format = purrr::map_chr(x$resources, "format")
    )
  x <- formats_from_resource(resource_df)

  if(trim) return(x[x != ""])

  x
}

formats_from_resource <- function(x){
  dplyr::case_when(
    x$format == x$url ~ x$format,
    x$format == "wms" ~ "wms",
    !nzchar(x$url) ~ x$format,
    !nzchar(safe_file_ext(x)) ~ x$format,
    x$url == "zip" ~ paste0(x$format, "(zipped)"),
    TRUE ~ safe_file_ext(x)
  )
}

safe_file_ext <- function(resource) {
  url_format <- tools::file_ext(resource$url)
  url_format <- ifelse(url_format == "zip", resource$format, url_format)
  url_format
}

gml_types <- function(x) {
  c(
    "gml:PointPropertyType",
    "gml:CurvePropertyType",
    "gml:SurfacePropertyType",
    "gml:GeometryPropertyType",
    "gml:MultiPointPropertyType",
    "gml:MultiCurvePropertyType",
    "gml:MultiSurfacePropertyType",
    "gml:MultiGeometryPropertyType"
  )
}

get_record_warn_once <- function(...) {
  silence <- isTRUE(getOption("silence_named_get_record_warning"))
  warned <- bcdata_env$named_get_record_warned
  if (!silence && !warned) {
    warning(..., call. = FALSE)
    assign("named_get_record_warned", TRUE, envir = bcdata_env)
  }
}


clean_wfs <- function(x){
  dplyr::case_when(
    x == "WMS getCapabilities request" ~ "WFS request (Spatial Data)",
    x == "wms" ~ "wfs",
    TRUE ~ x
  )
}

read_from_url <- function(resource, ...){
  if (nrow(resource) > 1) stop("more than one resource specified", call. = FALSE)
  file_url <- resource$url
  reported_format <- safe_file_ext(resource)
  if (!reported_format %in% formats_supported()) {
    stop("Reading ", reported_format, " files is not currently supported in bcdata.")
  }

  cli <- bcdc_http_client(file_url)

  ## Establish where to download file
  tmp <- tempfile(tmpdir = unique_temp_dir(),
                  fileext = paste0(".", tools::file_ext(file_url)))
  on.exit(unlink(tmp))

  r <- cli$get(disk = tmp)
  r$raise_for_status()

  tmp <- handle_zip(tmp)
  final_format <- tools::file_ext(tmp)

  # Match the read function to the file format format and retrieve the function
  funs <- bcdc_read_functions()
  fun <- funs[funs$format == final_format, ]

  # This assumes that the function we are using to read the data takes the
  # data as the first argument - will need revisiting if we find a situation
  # where that's not the case
  message("Reading the data using the ", fun$fun, " function from the ",
          fun$package, " package.")
  tryCatch(do.call(fun$fun, list(tmp, ...)),
           error = function(e) {
             stop("Could not read data set. The file can be found here:\n '",
                  tmp, "'\n if you would like to try to read it manually.\n\n",
                  call. = FALSE)
           })
}

resource_to_tibble <- function(x){
  dplyr::tibble(
    name = purrr::map_chr(x, "name"),
    url = purrr::map_chr(x, "url"),
    id = purrr::map_chr(x, "id"),
    format = purrr::map_chr(x, "format"),
    ext = purrr::map_chr(x, safe_file_ext),
    package_id = purrr::map_chr(x, "package_id"),
    location = simplify_string(purrr::map_chr(x, "resource_storage_location"))
  )
}

simplify_string <- function(x) {
  tolower(gsub("\\s+", "", x))
}

pagination_sort_col <- function(cols_df) {
  cols <- cols_df[["col_name"]]
  # use OBJECTID or OBJECT_ID as default sort key, if present
  # if not present (several GSR tables), use SEQUENCE_ID
  # Then try FEATURE_ID
  for (idcol in c("OBJECTID", "OBJECT_ID", "SEQUENCE_ID", "FEATURE_ID")) {
    if (idcol %in% cols) return(idcol)
  }
  #Otherwise use the first column - this is likely pretty fragile
  warning("Unable to find a suitable column to sort on for pagination. Using",
          " the first column (", cols[1],
          "). Please check your data for obvious duplicated or missing rows.",
          call. = FALSE)
  cols[1]
}

handle_zip <- function(x) {
  # Just give file back if it's not zipped
  if (!is_filetype(x, "zip")) {
    return(x)
  }
  # decompress into same dir
  dir <- dirname(x)
  utils::unzip(x, exdir = dir)
  unlink(x)
  files <- list_supported_files(dir)
  # check if it's a shapefile

  if (length(files) > 1L) {
    stop("More than one supported file in zip file. It has been downloaded and ",
         "extracted to '", dir, "', where you can access its contents manually.",
         call. = FALSE)
  }

  files
}


unique_temp_dir <- function(pattern = "bcdata_") {
  dir <- tempfile(pattern = pattern)
  dir.create(file.path(dirname(dir), basename(dir)))
  dir
}

list_supported_files <- function(dir) {
  files <- list.files(dir, full.names = TRUE)
  supported <- is_filetype(files, formats_supported())

  if (!any(supported)) {
    # check if any are directories, if not bail, otherwise extract them
    if (!length(list.dirs(dir, recursive = FALSE))) {
      stop("No supported files found", call. = FALSE)
    }
      files <- list.files(dir, full.names = TRUE, recursive = TRUE)
      supported <- is_filetype(files, formats_supported())
  }

  files[supported]
}

catch_catalogue_error <- function(catalogue_response) {
  msg <- "The BC data catalogue is currently unable to process this request\n"

  if (inherits(catalogue_response, "Paginator")) {
    statuses <- catalogue_response$status_code()
    status_failed <- any(statuses >= 300)
    msg <- paste0(msg, paste0(length(statuses), " paginated requests issued"))

  } else {
    status_failed <- catalogue_response$status_code >= 300
    request_res <- catalogue_response$request_headers
    response_res <- catalogue_response$response_headers

    msg <- paste0(msg, "Catalogue request:\n")
    for (i in seq_along(request_res)) {
      msg <- paste0(
        msg, "  ", names(request_res)[i], ": ",
        request_res[i], "\n"
      )
    }
    msg <- paste0(msg, "Catalogue response:\n")
    for (i in seq_along(response_res)) {
      msg <- paste0(
        msg, "  ", names(response_res)[i], ": ",
        response_res[i], "\n"
      )
    }
  }

  if (status_failed) {
    stop(msg, call. = FALSE)
  }
}