diff --git a/NPSdataverse.Rproj b/NPSdataverse.Rproj index 553f642..69fafd4 100644 --- a/NPSdataverse.Rproj +++ b/NPSdataverse.Rproj @@ -1,5 +1,4 @@ Version: 1.0 -ProjectId: a49a17c2-d2d1-45c6-a1ff-3bcdbf2ac52e RestoreWorkspace: No SaveWorkspace: No diff --git a/paper.bib b/paper.bib index 2f340ad..cce9dab 100644 --- a/paper.bib +++ b/paper.bib @@ -92,7 +92,7 @@ @article{EML2019 title = {Ecological Metadata Language version 2.2.0}, url = {https://eml.ecoinformatics.org}, DOI={10.5063/f11834t2}, publisher = {KNB Data Repository}, - author = {Jones, Matthew and O’Brien, Margaret and Mecum, Bryce and Boettiger, Carl and Schildhauer, Mark and Maier, Mitchell and Whiteaker, Timothy and Earl, Stevan and Chong, Steven}, + author = {Jones, Matthew and O'Brien, Margaret and Mecum, Bryce and Boettiger, Carl and Schildhauer, Mark and Maier, Mitchell and Whiteaker, Timothy and Earl, Stevan and Chong, Steven}, year = {2019} } @@ -114,7 +114,7 @@ @article{Boettiger2019 } @article{Jones2006, - author = "Jones, Matthew B. and Schildhauer, Mark P. and Reichman, O.J. and Bowers, Shawn", + author = "Jones, Matthew and Schildhauer, Mark P. and Reichman, O.J. 
and Bowers, Shawn", title = "The New Bioinformatics: Integrating Ecological Data from the Gene to the Biosphere", journal= "Annual Review of Ecology, Evolution, and Systematics", year = "2006", @@ -179,42 +179,52 @@ @Manual{Smith2022 https://ediorg.github.io/EMLassemblyline/}, } -@Manual{Baker_QCkit2024, +@Manual{Baker_QCkit2025, title = {QCkit: NPS Inventory and Monitoring Quality Control Toolkit}, - author = {Robert Baker and Judd Patterson and Joe DeVivo and Issac Quevedo and Sarah Wright}, - year = {2024}, - note = {R package version 0.1.7}, + author = {Baker, Robert and Judd Patterson and Joe DeVivo and Issac Quevedo and Sarah Wright}, + year = {2025}, + note = {R package version 1.0.0}, url = {https://github.com/nationalparkservice/QCkit/}, } -@Manual{Baker_NPSdataverse2024, +@Manual{Baker_NPSdataverse2025, title = {NPSdataverse: Tools and Packages for Data and Metadata Manipulation}, - author = {Robert Baker and Judd Patterson and Joe DeVivo}, - year = {2024}, - note = {R package version 0.1.0}, + author = {Baker, Robert and Judd Patterson and Joe DeVivo}, + year = {2025}, + note = {R package version 1.0.0}, url = {https://github.com/nationalparkservice/NPSdataverse}, } -@Manual{Baker_EMLeditor2024, +@Manual{Baker_EMLeditor2025, title = {EMLeditor: View and Edit EML Metadata}, - author = {Robert Baker and Judd Patterson}, - year = {2024}, - note = {R package version 0.1.6}, + author = {Baker, Robert and Judd Patterson}, + year = {2025}, + note = {R package version 1.0.0}, url = {https://github.com/nationalparkservice/EMLeditor}, } -@Manual{Baker_DPchecker2024, +@Manual{Baker_DPchecker2025, title = {DPchecker: Checks Data Packages for Congruence}, - author = {Rob Baker and Sarah E. Wright}, - year = {2024}, - note = {R package version 0.3.4}, + author = {Baker, Robert and Sarah E. 
Wright}, + year = {2025}, + note = {R package version 1.0.0}, url = {https://nationalparkservice.github.io/DPchecker/}, } -@Manual{Baker_NPSutils2024, +@Manual{Baker_NPSutils2025, title = {NPSutils: Collection of Functions to read and manipulate information from the NPS DataStore}, - author = {Robert Baker and Joe DeVivo and Judd Patterson}, - year = {2024}, - note = {R package version 0.3.1}, + author = {Baker, Robert and Joe DeVivo and Judd Patterson}, + year = {2025}, + note = {R package version 1.0.0}, url = {https://github.com/nationalparkservice/NPSutils}, } + +@techreport{Bailard2024, + title = {Mojave Desert Network Desert Springs Data Package 2016 - 2023}, + author = {Bailard, Jennifer and Mark Lehman}, + year = {2024}, + institution = {National Park Service}, + type = {}, + number = {}, + note = {\url{https://doi.org/10.57830/2300498}} + } diff --git a/paper.md b/paper.md index 0e2335b..8b2a3e0 100644 --- a/paper.md +++ b/paper.md @@ -15,12 +15,19 @@ tags: - data package - data publication - data access -date: "21 October 2024" -output: - pdf_document: default - html_document: - df_print: paged - word_document: default +date: "29 January 2025" +output: pdf_document +affiliations: +- name: National Park Service, USA + index: 1 +- name: Environmental Data Initiative, USA + index: 2 +- name: University of Wisconsin, USA + index: 3 +- name: Student Conservation Association, USA + index: 4 +- name: University of California, Berkeley, USA + index: 5 authors: - name: Robert L. 
Baker orcid: "0000-0001-7591-5035" @@ -54,38 +61,24 @@ authors: equal-contrib: true affiliation: 1 bibliography: paper.bib -editor_options: - markdown: - wrap: 72 -affiliations: -- name: National Park Service, USA - index: 1 -- name: Environmental Data Initiative, USA - index: 2 -- name: University of Wisconsin, USA - index: 3 -- name: Student Conservation Association, USA - index: 4 -- name: University of California, Berkeley, USA - index: 5 --- # Summary -The [NPSdataverse](https://nationalparkservice.github.io/NPSdataverse/) is a suite of R packages developed to create, document, publish, and access data and metadata in open and machine-readable format. NPSdataverse is modeled off of the tidyverse concept of several packages built with a common goal [@Wickham2019]. The NPSdataverse supports Ecological Metadata Language (EML) metadata and .csv data files. Some of the constituent R packages ([EML](https://docs.ropensci.org/EML/) and [EMLassemblyline](https://ediorg.github.io/EMLassemblyline/)) are general-use and aimed at authoring EML documents. Other R packages ([QCkit](https://nationalparkservice.github.io/QCkit/), [EMLeditor](https://nationalparkservice.github.io/EMLeditor/), [DPchecker](https://nationalparkservice.github.io/DPchecker/) and [NPSutils](https://nationalparkservice.github.io/NPSutils/)) are designed and maintained by the National Park Service (NPS). Although many functions within the NPSdataverse packages are NPS-specific (particularly some API calls), whenever possible the functions are written so that they can also be used by the general public. Scientists conducting permitted research in NPS units can utilize the NPSdataverse to efficiently and consistently meet the data delivery requirements of their permits. Additionally, the packages will be useful for data management plans in a wide variety of grant proposals and for anyone that needs to create open data and machine readable metadata. 
Finally, the ability to swiftly and easily author, edit, and check Ecological Metadata Language (EML) metadata in a reproducible fashion will be useful for data publication at any number of repositories or data journals. +The [NPSdataverse](https://nationalparkservice.github.io/NPSdataverse/) is a suite of R packages developed to create, document, publish, and access data and metadata in open and machine-readable formats. NPSdataverse is modeled off of the tidyverse concept of several packages built with a common goal [@Wickham2019]. The NPSdataverse supports Ecological Metadata Language (EML) metadata and .csv data files. Some of the constituent R packages ([EML](https://docs.ropensci.org/EML/) and [EMLassemblyline](https://ediorg.github.io/EMLassemblyline/)) are general-use and aimed at authoring EML documents. Other R packages ([QCkit](https://nationalparkservice.github.io/QCkit/), [EMLeditor](https://nationalparkservice.github.io/EMLeditor/), [DPchecker](https://nationalparkservice.github.io/DPchecker/) and [NPSutils](https://nationalparkservice.github.io/NPSutils/)) are designed and maintained by the National Park Service (NPS). Although many functions within the NPSdataverse packages are NPS-specific (particularly some API calls), whenever possible the functions are written so that they can also be used by the general public. Scientists conducting permitted research in NPS units can utilize the NPSdataverse to efficiently and consistently meet the data delivery requirements of their permits. Additionally, the packages will be useful for data management plans in a wide variety of grant proposals and for anyone that needs to create open data and machine-readable metadata. The ability to swiftly and easily author, edit, and check Ecological Metadata Language (EML) metadata in a reproducible fashion will be useful for data publication at any number of repositories or data journals. 
Finally, a scripted interface for downloading NPS data and leveraging metadata while loading it into R or other platforms for subsequent analyses and visualizations will be useful to researchers in the government, academia, and industry as well as the public. # Statement of Need -Following a movement for transparency in scientific research and data accessibility, the U.S. implemented the federal OPEN Government Data Act [@OpenData2018]. The Open Data Act mandates that federal agencies provide data in open formats with metadata. Subsequently, many funding agencies such as the National Science Foundation have required grant awardees make data public, often including metadata [@nsf2015]. Multiple publishers have followed suit [@Wiley2022; @Springer2023] and require data availability statements upon publication. +Following a movement for transparency in scientific research and data accessibility, the U.S. implemented the federal OPEN Government Data Act [@OpenData2018]. The Open Data Act mandates that federal agencies provide data in open formats with metadata. Subsequently, many funding agencies such as the National Science Foundation have required grant awardees make their data public, often including metadata [@nsf2015]. Multiple publishers have followed suit [@Wiley2022; @Springer2023] and require data availability statements upon publication. One goal of open science, and requirement of the recent "Nelson Memo" from the U.S. Office of Science and Technology Policy [@Nelson2022], is to make data FAIR: findable, accessible, interoperable, and reusable [@Wilkinson2016]. These goals are often achieved by including structured, machine-readable metadata that conforms to a defined schema along with the data. Ecological Metadata Language (EML) is one metadata standard that is particularly amenable to studies with rich taxonomy [@Jones2006; @EML2019]. 
It has been adopted by multiple research organizations including the Environmental Data Initiative (EDI), National Ecological Observatory Network (NEON), Global Biodiversity Information Facility (GBIF), Swedish Biodiversity Data Infrastructure (SBDI), French Biodiversity Hub ("Pole National de Donnees de Biodiversite"), U.S. National Park Service, and others. -Nevertheless, actual availability of data and metadata varies [@Federer2018; @Tedersoo2021], perhaps because there is a need for more infrastructure and tools to meet the goals of open data and open science [@Huston2019]. Multiple solutions have been presented, including ezEML, a tool for authoring metadata in Ecological Metadata Language and publishing data and metadata to a repository [@Vanderbilt2022]. ezEML has an intuitive graphical user interface with a relatively low learning curve; however, it does have some drawbacks. For instance, ezEML is not scriptable, which makes repeated deployments of the same or similar workflows challenging and can limit reproducibility. ezEML also requires that the user upload their data to an external site for processing, which may not be suitable for sensitive data. Here we introduce the NPSdataverse, a series of R packages for authoring, editing, and checking EML metadata locally in a robust, repeatable, and scriptable fashion. R Packages within the NPSdataverse leverage earlier work using R to create and manipulate XML based EML files [@Boettiger2019]. Building upon that framework, we add user-friendly EML creation workflows; integration with taxonomic databases; fast, easy editing of existing metadata; congruence checks to test correspondence between data and metadata; and integration with public repositories such as the National Park Service's [DataStore](https://irma.nps.gov/DataStore/). The EML metadata file in .xml format along with the .csv data files it describes comprise a "data package". 
In addition, R packages within the NPSdataverse also include data functions that expedite quality control, facilitate interoperability, provide the ability to download data directly from DataStore, and leverage the rich EML associated with the data regardless of repository of origin. +Nevertheless, actual availability of data and metadata varies [@Federer2018; @Tedersoo2021], perhaps because there is a need for more infrastructure and tools to meet the goals of open data and open science [@Huston2019]. Multiple solutions have been presented, including ezEML, a tool for authoring metadata in Ecological Metadata Language and publishing data and metadata to a repository [@Vanderbilt2022]. ezEML has an intuitive graphical user interface with a relatively low learning curve; however, it does have some drawbacks. For instance, ezEML is not scriptable, which makes repeated deployments of the same or similar workflows challenging and can limit reproducibility. ezEML also requires that the user upload their data to an external site for processing, which may not be suitable for sensitive data. Here we introduce the NPSdataverse, a series of R packages for authoring, editing, and checking EML metadata locally in a robust, repeatable, and scriptable fashion. R Packages within the NPSdataverse leverage earlier work using R to create and manipulate XML based EML files [@Boettiger2019]. Building upon that framework, we add user-friendly EML creation workflows; integration with taxonomic databases; fast, easy editing of existing metadata; congruence checks to test correspondence between data and metadata; and integration with public repositories such as the National Park Service's [DataStore](https://irma.nps.gov/DataStore/). 
R packages within the NPSdataverse also include functions that expedite data quality control, facilitate data interoperability, provide the ability to download data directly from DataStore, and leverage the rich EML associated with the data regardless of repository of origin. # NPSdataverse R package -The [NPSdataverse](https://nationalparkservice.github.io/NPSdataverse/) package is a meta-package that loads packages within the NPSdataverse into R [@Baker_NPSdataverse2024]. It provides a convenient way to download, install, and load many of the R packages needed to create and access data packages consisting of rich Ecological Metadata Language metadata and .csv data files: +The [NPSdataverse](https://nationalparkservice.github.io/NPSdataverse/) package is a meta-package that loads packages within the NPSdataverse into R [@Baker_NPSdataverse2025]. The NPSdataverse provides a convenient way to download, install, and load many of the R packages needed to create and access data packages, which consist of rich Ecological Metadata Language metadata and .csv data files: ``` pak::pkg_install("nationalparkservice/NPSdataverse") @@ -96,7 +89,7 @@ library(NPSdataverse) # QCkit R package -[QCkit](https://nationalparkservice.github.io/QCkit/) ("Quality Control kit") is primarily a data processing package designed to prepare data for metadata creation and publication [@Baker_QCkit2024]. This package serves two main functions: 1) Providing a suite of data quality control functions to be used across datasets regardless of the project, and 2) a suite of functions to apply data standards that promotes interoperability among datasets. For instance, `QCkit` includes functions that can help manage date-time formatting, can check data files for threatened or endangered species, and can help increase inter-operability by suggesting appropriate [Darwin Core](https://dwc.tdwg.org/) standards for naming data. 
`QCkit` also facilitates documenting data processing with functions that can generate a DataStore reference based on GitHub.com releases. The DataStore reference can hold processing scripts, code, or packages and have Digital Object Identifiers (DOIs) attached to them that are registered with [DataCite](https://datacite.org/) once the DataStore reference is activated. `QCkit` is designed as an expandable framework that can adapt to new quality control tests or as new data standards are adopted. +[QCkit](https://nationalparkservice.github.io/QCkit/) ("Quality Control kit") is primarily a data processing package designed to prepare data for metadata creation and publication [@Baker_QCkit2025]. This package serves two main functions: 1) Providing a suite of data quality control functions to be used across datasets regardless of the project, and 2) a suite of functions to apply data standards that promotes interoperability among datasets. For instance, `QCkit` includes functions that can help manage date-time formatting, can check data files for threatened or endangered species, and can help increase inter-operability by suggesting appropriate [Darwin Core](https://dwc.tdwg.org/) standards for naming data. `QCkit` also facilitates documenting data processing with functions that can generate a DataStore reference based on GitHub.com releases. The DataStore reference can hold processing scripts, code, or packages and have Digital Object Identifiers (DOIs) attached to them that are registered with [DataCite](https://datacite.org/) once the DataStore reference is activated. `QCkit` is designed as an expandable framework that can adapt to new quality control tests or as new data standards are adopted. 
# EML R package @@ -108,7 +101,7 @@ The [EMLassemblyline](https://ediorg.github.io/EMLassemblyline/) package builds # EMLeditor R package -The [EMLeditor](https://nationalparkservice.github.io/EMLeditor/) package allows users to quickly and easily view components of metadata in R and make on-the-fly edits to metadata [@Baker_EMLeditor2024]. Edits made to EML objects using `EMLeditor` do not require re-running the `EMLassemblyline` functions to make EML. This is a significant improvement because running `EMLassemblyline` functions can be time consuming, especially if there are many taxa that need to be resolved. `EMLeditor` includes the ability to pick specific licenses (CC0, CC-BY, etc), add [ORCIDs](https://orcid.org/), include organizations as authors, and much more. `EMLeditor` also adds specific content necessary to be compliant with NPS's DataStore. With the proper permissions, `EMLeditor` can be used to generate draft references and reserve DOIs on DataStore as well as upload data and metadata files to DataStore. Finally, `EMLeditor` contains a .rmd template file that, after loading the package, is accessible in Rstudio under `Files > New File > R markdown`. The template provides an editable script that walks the user through using `EMLassemblyline`, `EMLeditor`, and `DPchecker` to create and validate EML metadata in R. +The [EMLeditor](https://nationalparkservice.github.io/EMLeditor/) package allows users to quickly and easily view components of metadata in R and make on-the-fly edits to metadata [@Baker_EMLeditor2025]. Edits made to EML objects using `EMLeditor` do not require re-running the `EMLassemblyline` functions to make EML. This is a significant improvement because running `EMLassemblyline` functions can be time consuming, especially if there are many taxa that need to be resolved. `EMLeditor` includes the ability to pick specific licenses (CC0, CC-BY, etc), add [ORCIDs](https://orcid.org/), include organizations as authors, and much more. 
`EMLeditor` also adds specific content necessary to be compliant with NPS's DataStore. With the proper permissions, `EMLeditor` can be used to generate draft references and reserve DOIs on DataStore as well as upload data and metadata files to DataStore. Finally, `EMLeditor` contains a .rmd template file that, after loading the package, is accessible in Rstudio under `Files > New File > R markdown`. The template provides an editable script that walks the user through using `EMLassemblyline`, `EMLeditor`, and `DPchecker` to create and validate EML metadata in R. `EMLeditor` "set" class functions (which includes all functions that begin with "set_" such as "`EMLeditor::set_abstract()`") will add several NPS-specific items to the metadata using their default settings. For instance, these functions will set NPS as the publisher, Fort Collins as the publication location, and will add a "for or by NPS = TRUE" statement to the metadata. To invoke these functions without adding the NPS-specific metadata elements, set the parameter `NPS = FALSE` when calling each "set_" class function. Non-NPS publisher information can be added using the `EMLeditor::set_publisher()` function with the parameters `for_or_by_NPS` and `NPS` set to `FALSE`: @@ -134,7 +127,7 @@ new_metadata2 <- set_publisher(eml_object = new_metadata1, for_or_by_NPS = FALSE, NPS = FALSE) ``` -By default, `EMLeditor` functions provide verbose user feedback and may require user input to confirm some operations. These checks are intended to help guide users, prevent inadvertent mistakes, and limit unnecessary API calls. However, requiring user input can hamper highly scripted approaches and limits reproducibility. Therefore, all `EMLeditor` functions can be set to circumvent these requirements using the parameter `force = TRUE`. +By default, `EMLeditor` functions provide verbose user feedback and may require user input to confirm some operations. 
These checks are intended to help guide users, reduce mistakes, and limit unnecessary API calls. However, requiring user input can hamper highly scripted approaches and limits reproducibility. Therefore, all `EMLeditor` functions can be set to circumvent these requirements using the parameter `force = TRUE`. ``` #example setting the abstract while suppressing user feedback and input: @@ -146,7 +139,7 @@ new_metadata <- set_abstract(eml_object = old_metadata, # DPchecker R Package -The [DPchecker](https://nationalparkservice.github.io/DPchecker/) ("Data Package checker") package provides feedback on data-metadata congruence [@Baker_DPchecker2024]. Here, a "data package" consists of the EML metadata file with a filename that ends in *_metadata.xml and one or more data files in .csv format, all of which are in a single directory (and the directory contains no extraneous .csv or .xml files). `DPchecker` is useful for both data package authors and reviewers. `DPchecker` goes beyond validating EML objects in R against the EML schema. Using the `DPchecker::run_congruence_checks` function, `DPchecker` will conduct a series of 46 tests. These are divided into several categories to check whether: +The [DPchecker](https://nationalparkservice.github.io/DPchecker/) ("Data Package checker") package provides feedback on data-metadata congruence [@Baker_DPchecker2025]. Here, a "data package" consists of the EML metadata file with a filename that ends in *_metadata.xml and one or more data files in .csv format, all of which are in a single directory (and the directory contains no extraneous .csv or .xml files). `DPchecker` is useful for both data package authors and reviewers. `DPchecker` goes beyond validating EML objects in R against the EML schema. Using the `DPchecker::run_congruence_checks` function, `DPchecker` will conduct a series of 46 tests. These are divided into several categories to check whether: 1. 
Metadata are well formatted (file names are not duplicated, files specify the field delimiter, data files have URLs, the proper delimiter and header row numbers are present, etc.). 2. Metadata elements necessary for DataStore automated extraction are present (creators have valid surnames, publication date is present and in the correct ISO-8601 format, keywords are present, abstract and methods are present and well formatted, etc). @@ -158,9 +151,9 @@ For each test, the data package may fail with an error, fail with a warning, or # NPSutils R Package -The `[NPSutils](https://nationalparkservice.github.io/NPSutils/)` ("NPS utilities") package serves primarily as a way to access data [@Baker_NPSutils2024]. `NPSutils` provides avenues for directly downloading data from DataStore using R. `NPSutils` can also import data downloaded from any repository into R and take advantage of rich EML metadata to call column types. `NPSutils` provides some basic meta-analysis capability. `NPSutils` can also be used to import data and metadata into common data visualization tools. +The [NPSutils](https://nationalparkservice.github.io/NPSutils/) ("NPS utilities") package serves primarily as a way to access data [@Baker_NPSutils2025]. `NPSutils` provides avenues for directly downloading data from DataStore using R. `NPSutils` can also import data downloaded from any repository (provided it is properly formatted as a data package) into R and take advantage of rich EML metadata to call column types. `NPSutils` provides some basic meta-analysis capability. `NPSutils` can also be used to import data and metadata into common data visualization tools. 
-Example of how to download and access data: +Example of how to download and access a data package titled "Mojave Desert Network Desert Springs Data Package 2016 - 2023" [@Bailard2024]: ``` # download a data package from datastore: # the data package will be downloaded to ./data/2300498 @@ -175,6 +168,6 @@ mojn <- NPSutils::load_data_package(2300498, assign_attributes = TRUE) # Acknowledgements -We acknowledge contributions from across the National Park Service, but in particular from the Inventory and Monitoring Division. Members of the NPS Long Term Data Management Governing Board provided critical guidance and insight (in addition to several of the authors, these include Kristen Bonebrake, Adam Kozlowski, Ryan Monello, Mark Isley, and Megan Swan). Justin Mills (currently at U.S. Fish and Wildlife Service) and Derrick Dardano helped with navigating API and Active Directory interfaces, Marsha Leavitt made and explained numerous updates to DataStore. Dan Gussett, Kate Miller, and Pete Budde facilitated software availability, and Meg White supported and endorsed the project. We are particularly indebted to our strong user base and their very helpful feedback including Alison Loar, Christina Appleby, Kirk Sherrill, Lisa Nelson and Tom Phillipi. Numerous Student Conservation Association interns made contributions to the code base including Sarah Kelso, James Brown, and Amy Sherman. Alissa Graff (currently at the Internal Revenue Service) provided important input on early versions of NPSutils. +We acknowledge contributions from across the National Park Service, but in particular from the Inventory and Monitoring Division. Members of the NPS Long Term Data Management Governance Board provided critical guidance and insight (in addition to several of the authors, these include Kristen Bonebrake, Adam Kozlowski, Ryan Monello, Mark Isley, and Megan Swan). Justin Mills (currently at U.S. 
Fish and Wildlife Service) and Derrick Dardano helped with navigating API and Active Directory interfaces, and Marsha Leavitt made and explained numerous updates to DataStore. Dan Gussett, Kate Miller, and Pete Budde facilitated software availability, and Meg White supported and endorsed the project. We are particularly indebted to our strong user base and their very helpful feedback including Alison Loar, Christina Appleby, Kirk Sherrill, Lisa Nelson and Tom Phillipi. Numerous Student Conservation Association interns made contributions to the code base including Sarah Kelso, James Brown, and Amy Sherman. Alissa Graff (currently at the Internal Revenue Service) provided important input on early versions of NPSutils. # References diff --git a/paper.pdf b/paper.pdf new file mode 100644 index 0000000..e396245 Binary files /dev/null and b/paper.pdf differ