Skip to content

Commit

Permalink
Merge pull request #110 from USEPA/Aug11cm
Browse files Browse the repository at this point in the history
Aug11cm
  • Loading branch information
mthawley authored Aug 25, 2022
2 parents 73c4c4a + 9f74799 commit 3e954b3
Show file tree
Hide file tree
Showing 17 changed files with 21,235 additions and 23 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
^_pkgdown\.yml$
^docs$
^pkgdown$
^\.github$
1 change: 1 addition & 0 deletions .github/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.html
52 changes: 52 additions & 0 deletions .github/workflows/pkgdown.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Workflow derived from https://github.com/r-lib/actions/tree/v2/examples
# Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help
on:
push:
branches: [develop]
pull_request:
branches: [develop]
release:
types: [published]
workflow_dispatch:

name: pkgdown

jobs:
pkgdown:
runs-on: ubuntu-latest
# Only restrict concurrency for non-PR jobs
concurrency:
group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }}
env:
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
steps:
- uses: actions/checkout@v2

- uses: r-lib/actions/setup-pandoc@v2

- uses: r-lib/actions/setup-r@v2
with:
use-public-rspm: true

- uses: r-lib/actions/setup-r-dependencies@v2
with:
extra-packages: any::pkgdown, local::.
needs: website

- name: Build site
run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE)
shell: Rscript {0}

- name: Deploy to GitHub pages 🚀
if: github.event_name != 'pull_request'
uses: JamesIves/github-pages-deploy-action@4.1.4
with:
clean: false
branch: gh-pages
folder: docs

url: https://usepa.github.io/TADA/

template:
bootstrap: 5
bootswatch: cerulean
5 changes: 4 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ Imports:
stringr,
utils,
RColorBrewer,
stats
stats,
tidyverse,
lubridate
Depends:
R (>= 2.10)
Suggests:
Expand All @@ -40,3 +42,4 @@ Suggests:
VignetteBuilder: knitr, rmarkdown
Language: en-US
Config/testthat/edition: 3
URL: https://usepa.github.io/TADA/
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ export(PotentialDuplicateRowID)
export(QAPPDocAvailable)
export(QAPPapproved)
export(RemoveEmptyColumns)
export(TADABigdataRetrieval)
export(TADAdataRetrieval)
export(WQXTargetUnits)
export(readWQPwebservice)
Expand Down
199 changes: 198 additions & 1 deletion R/DataDiscoveryRetrieval.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,25 @@
#'
#' Retrieve data from Water Quality Portal (WQP) and output a TADA-compatible
#' dataset.
#'
#' Keep in mind that all the query filters for the WQP work as an AND
#' but within the fields there are ORs. So for example,
#' characteristics – if you choose pH & DO – it’s an OR. Similarly, if you
#' choose VA and IL, it’s an OR. But the combo of fields are ANDs.
#' Such as State/VA AND Characteristic/DO.
#' "Characteristic" and "Characteristic Group" also work as an AND.
#'
#' @param statecode Code that identifies a state
#' @param startDate Start Date
#' @param countycode Code that identifies a county
#' @param siteid Unique monitoring station identifier
#' @param siteType Type of waterbody
#' @param characteristicName Name of characteristic
#' @param characteristicName Name of parameter
#' @param ActivityMediaName Sampling substrate such as water, air, or sediment
#' @param endDate End Date
#'
#' @return TADA-compatible dataframe
#'
#' @export
#'

Expand All @@ -21,6 +30,7 @@ TADAdataRetrieval <- function(statecode = "null",
siteid = "null",
siteType = "null",
characteristicName = "null",
ActivityMediaName = "null",
endDate = "null"
) {

Expand Down Expand Up @@ -62,6 +72,12 @@ TADAdataRetrieval <- function(statecode = "null",
WQPquery <- c(WQPquery, characteristicName = characteristicName)
}

if (length(ActivityMediaName)>1) {
WQPquery <- c(WQPquery, ActivityMediaName = list(ActivityMediaName))
} else if (ActivityMediaName != "null") {
WQPquery <- c(WQPquery, ActivityMediaName = ActivityMediaName)
}

if (length(endDate)>1) {
WQPquery <- c(WQPquery, endDate = list(endDate))
} else if (endDate != "null") {
Expand Down Expand Up @@ -281,3 +297,184 @@ TADAprofileCheck <- function(.data) {
stop("The dataframe does not contain the required fields to use TADA. Use either the full physical/chemical profile downloaded from WQP or download the TADA profile template available on the EPA TADA webpage.")
}
}



#' Large WQP data pulls using dataRetrieval for all data from all sites in the
#' contiguous United States.
#'
#' This function uses the WQP summary service to limit the amount
#' downloaded to only relevant data. For large data sets, that can
#' save a lot of time and ultimately reduce the complexity of subsequent
#' data processing.
#'
#' This function will join data from multiple WQP profiles and output a
#' TADA-compatible dataset.
#'
#' @param startDate Start date in YYYY-MM-DD format, for example, "1995-01-01"
#' @param endDate End date in YYYY-MM-DD format, for example, "2020-12-31"
#' @param characteristicName Name of water quality parameter
#' @param siteType Name of water body type (e.g., "Stream", "Lake, Reservoir, Impoundment")
#'
#' @return TADA-compatible dataframe
#'
#' @export
#'

TADABigdataRetrieval <- function(startDate = "null",
                                 endDate = "null",
                                 characteristicName = "null",
                                 siteType = "null") {

  # Year bounds used to filter the WQP summary-service results down to the
  # requested window before any site-level data is downloaded.
  startYearLo <- lubridate::year(lubridate::ymd(startDate))
  startYearHi <- lubridate::year(lubridate::ymd(endDate))

  # dataRetrieval expects multi-value filters wrapped in a list; scalar
  # values (including the "null" sentinel) are passed through unchanged.
  if (length(characteristicName) > 1) {
    characteristicName <- list(characteristicName)
  }
  if (length(siteType) > 1) {
    siteType <- list(siteType)
  }

  state_cd_cont <- utils::read.csv(file = "inst/extdata/statecode.csv")

  for (state_row in seq_len(nrow(state_cd_cont))) {

    state_cd <- as.numeric(state_cd_cont$STATE[state_row])
    state_nm <- state_cd_cont$STUSAB[state_row]

    # Summary service: cheaply identify sites with relevant data so the
    # expensive result downloads below only hit sites that matter.
    df_summary <- dataRetrieval::readWQPsummary(
      statecode = state_cd,
      characteristicName = characteristicName,
      siteType = siteType,
      startDate = startDate
    )

    sites <- df_summary %>%
      dplyr::filter(
        YearSummarized >= startYearLo,
        YearSummarized <= startYearHi
      )

    siteid_all <- unique(sites$MonitoringLocationIdentifier)

    if (length(siteid_all) > 0) {

      group_size <- 250 # sites per WQP query, keeps each request manageable
      n_groups <- ceiling(length(siteid_all) / group_size)
      state_chunks <- vector("list", n_groups) # preallocate, bind once at end

      for (chunk in seq_len(n_groups)) {

        # 1-based chunk bounds (the original 0-based `j:k` silently dropped
        # index 0 and produced uneven chunk sizes).
        first <- (chunk - 1) * group_size + 1
        last <- min(chunk * group_size, length(siteid_all))
        chunk_sites <- siteid_all[first:last]

        results.DR <- dataRetrieval::readWQPdata(
          siteid = chunk_sites,
          characteristicName = characteristicName
        )

        narrow.DR <- dataRetrieval::readWQPdata(
          siteid = chunk_sites,
          characteristicName = characteristicName,
          dataProfile = "narrowResult"
        )

        sites.DR <- dataRetrieval::whatWQPsites(
          siteid = chunk_sites,
          characteristicName = characteristicName
        )

        # Join station data to full phys/chem (results.DR)
        join1 <- results.DR %>%
          # join stations to results
          dplyr::left_join(sites.DR, by = "MonitoringLocationIdentifier") %>%
          # remove ".x" suffix from column names
          dplyr::rename_at(
            dplyr::vars(dplyr::ends_with(".x")),
            ~ stringr::str_replace(., "\\..$", "")
          ) %>%
          # remove columns with ".y" suffix
          dplyr::select_at(dplyr::vars(-dplyr::ends_with(".y")))

        # Join Speciation column from narrow to full profile
        join2 <- join1 %>%
          dplyr::left_join(
            dplyr::select(
              narrow.DR, ActivityIdentifier, MonitoringLocationIdentifier,
              CharacteristicName, ResultMeasureValue,
              MethodSpecificationName
            ),
            by = c(
              "ActivityIdentifier", "MonitoringLocationIdentifier",
              "CharacteristicName", "ResultMeasureValue"
            )
          )

        # Character type keeps bind_rows from failing on mixed numeric/text
        # result columns across chunks.
        join2$ResultMeasureValue <- as.character(join2$ResultMeasureValue)

        state_chunks[[chunk]] <- join2
        message(state_nm, ": sites ", first, "-", last, " of ", length(siteid_all))
      }

      # BUG FIX: the original `if (i==0) df = join2 else join2 = rbind(df, join2)`
      # never grew `df`, so only the first and last chunks per state survived.
      # All chunks are now combined before caching to disk.
      state_data <- dplyr::bind_rows(state_chunks)

      if (nrow(state_data) > 0) {
        # Cache each state's raw pull so a partial run can be recovered.
        saveRDS(
          state_data,
          file = file.path("inst/tempdata", paste0(state_nm, "_raw_data.rds"))
        )
      }
    }
  }

  all_data <- data.frame()
  for (state in state_cd_cont$STUSAB) {
    # BUG FIX: the original re-read `tempfilename` (whichever file name was
    # last written) for every state; each state now reads its own cache file.
    # States with no data never wrote a file, so a failed read yields NULL
    # and the state is skipped.
    state_file <- file.path("inst/tempdata", paste0(state, "_raw_data.rds"))
    state_df <- tryCatch(
      readRDS(state_file),
      error = function(e) NULL
    )

    if (!is.null(state_df) && nrow(state_df) > 0) {
      all_data <- dplyr::bind_rows(all_data, state_df)
    }
  }

  # Summary filtering above is year-granular; trim to the exact date window.
  finalprofile <- all_data %>%
    dplyr::filter(
      ActivityStartDate <= endDate,
      ActivityStartDate >= startDate
    )

  finalprofile2 <- autoclean(finalprofile)
  # not sure if above is working correctly, thousands of "duplicated" rows are removed
  # you will still need to filter on activity media subdivision now

  return(finalprofile2)
}
2 changes: 1 addition & 1 deletion R/Transformations.R
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ WQXTargetUnits <- function(.data, transform = TRUE) {
#'
#' @return When transform = FALSE and flag = TRUE, Harmonization Reference Table
#' columns are appended to the dataset only. When transform = TRUE and flag = TRUE,
#' Harmoinzation columns are appended to the dataset and transformations are
#' Harmonization columns are appended to the dataset and transformations are
#' executed. When transform = TRUE and flag = FALSE, transformations are executed
#' only. When transform = FALSE and flag = FALSE, an error is returned (function
#' would return the input dataframe unchanged if input was allowed).
Expand Down
4 changes: 2 additions & 2 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
url: https://pkgdown.r-lib.org
url: https://usepa.github.io/TADA/
template:
bootstrap: 5
bootswatch: cerulean

50 changes: 50 additions & 0 deletions inst/extdata/statecode.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"STATE","STATE_NAME","STUSAB","STATENS"
"01","Alabama","AL","01779775"
"04","Arizona","AZ","01779777"
"05","Arkansas","AR","00068085"
"06","California","CA","01779778"
"08","Colorado","CO","01779779"
"09","Connecticut","CT","01779780"
"10","Delaware","DE","01779781"
"11","District of Columbia","DC","01702382"
"12","Florida","FL","00294478"
"13","Georgia","GA","01705317"
"16","Idaho","ID","01779783"
"17","Illinois","IL","01779784"
"18","Indiana","IN","00448508"
"19","Iowa","IA","01779785"
"20","Kansas","KS","00481813"
"21","Kentucky","KY","01779786"
"22","Louisiana","LA","01629543"
"23","Maine","ME","01779787"
"24","Maryland","MD","01714934"
"25","Massachusetts","MA","00606926"
"26","Michigan","MI","01779789"
"27","Minnesota","MN","00662849"
"28","Mississippi","MS","01779790"
"29","Missouri","MO","01779791"
"30","Montana","MT","00767982"
"31","Nebraska","NE","01779792"
"32","Nevada","NV","01779793"
"33","New Hampshire","NH","01779794"
"34","New Jersey","NJ","01779795"
"35","New Mexico","NM","00897535"
"36","New York","NY","01779796"
"37","North Carolina","NC","01027616"
"38","North Dakota","ND","01779797"
"39","Ohio","OH","01085497"
"40","Oklahoma","OK","01102857"
"41","Oregon","OR","01155107"
"42","Pennsylvania","PA","01779798"
"44","Rhode Island","RI","01219835"
"45","South Carolina","SC","01779799"
"46","South Dakota","SD","01785534"
"47","Tennessee","TN","01325873"
"48","Texas","TX","01779801"
"49","Utah","UT","01455989"
"50","Vermont","VT","01779802"
"51","Virginia","VA","01779803"
"53","Washington","WA","01779804"
"54","West Virginia","WV","01779805"
"55","Wisconsin","WI","01779806"
"56","Wyoming","WY","01779807"
2 changes: 1 addition & 1 deletion man/HarmonizeData.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 3e954b3

Please sign in to comment.