greenelab · dhimmel · Dec 19, 2017 · Dec 12, 2017 · Dec 12, 2017 · Dec 12, 2017
diff --git a/README.md b/README.md
@@ -18,6 +18,10 @@ conda env create --file=environment.yml
 Then use `source activate library-access` and `source deactivate` to activate or deactivate the environment.
 On windows, use `activate library-access` and `deactivate` instead.
 
+## Using the Code
+
+The code files in this repository assume that your working directory is set to the top-level directory of this repository.
+
 ## License
 
 The files in this repository are released under the CC0 1.0 public domain dedication ([`LICENSE-CC0.md`](LICENSE-CC0.md)), excepting those that match the glob patterns listed below.

diff --git a/environment.yml b/environment.yml
@@ -9,6 +9,12 @@ dependencies:
 - anaconda::pytest=3.2.1
 - anaconda::python=3.6.1
 - anaconda::r-base=3.4.1
+- anaconda::r-dplyr=0.7.0
+- anaconda::r-ggplot2=2.2.1
+- anaconda::r-knitr=1.16
+- anaconda::r-markdown=0.8
+- anaconda::r-readr=1.1.1
+- anaconda::r-rmarkdown=1.5
 - anaconda::requests=2.14.2
 - anaconda::spyder=3.1.4
 - anaconda::sqlalchemy=1.1.9

diff --git a/evaluate_library_access_from_output_tsv/create_stratefied_sample_of_dois.R b/evaluate_library_access_from_output_tsv/create_stratefied_sample_of_dois.R
@@ -0,0 +1,56 @@
+# Load dependencies ------------------------------------------------------------
+
+# Bind only the magrittr pipe (re-exported by dplyr); dplyr is not attached.
+`%>%` <- dplyr::`%>%`
+
+# Settings ---------------------------------------------------------------------
+
+# Input: compressed TSV, relative to the repository's top-level directory.
+lzma_compressed_library_access_data_location <- file.path(
+  'data', 'library_coverage_xml_and_fulltext_indicators.tsv.xz'
+)
+
+# Rows drawn per full_text_indicator status (2 statuses => 2x this in total).
+sample_size_per_cell <- 100
+
+output_tsv_location <- file.path(
+  'evaluate_library_access_from_output_tsv',
+  'manual-doi-checks.tsv'
+)
+
+# Fixed seed, so that the random sampling always returns the same result.
+randomizer_seed_to_set <- 3
+
+# Read the dataset -------------------------------------------------------------
+
+library_access_data <- readr::read_tsv(
+  gzfile(lzma_compressed_library_access_data_location)
+)
+# View(library_access_data)  # Check the dataset
+
+# Convert the stratification variable to a factor:
+library_access_data <- library_access_data %>%
+  dplyr::mutate(full_text_indicator = as.factor(full_text_indicator))
+
+# Create stratified sample, and clean up the tibble ----------------------------
+
+# Seed the RNG right before sampling so the draw is reproducible, then take
+# sample_size_per_cell rows from every full_text_indicator level.
+set.seed(randomizer_seed_to_set)
+stratefied_sample <- library_access_data %>%
+  dplyr::group_by(full_text_indicator) %>%
+  dplyr::sample_n(sample_size_per_cell) %>%
+  dplyr::ungroup() %>%  # Do not carry the grouping into the written table.
+  dplyr::rename(full_text_indicator_automated = full_text_indicator) %>%
+  dplyr::mutate(  # Empty columns, to be filled in during the manual checks:
+    date_of_manual_full_text_check_inside_campus = NA,
+    full_text_indicator_manual_inside_campus = NA,
+    date_of_manual_full_text_check_outside_campus = NA,
+    full_text_indicator_manual_outside_campus = NA
+  )
+
+# Write the output to a TSV ----------------------------------------------------
+
+readr::write_tsv(
+  stratefied_sample, output_tsv_location, na = ''
+)
diff --git a/evaluate_library_access_from_output_tsv.Rmd → ...aluate_library_access_from_output_tsv.Rmd b/evaluate_library_access_from_output_tsv.Rmd → ...aluate_library_access_from_output_tsv.Rmd
@@ -5,24 +5,30 @@ date: "2017"
 output: pdf_document
 ---
 
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = TRUE)
-knitr::opts_chunk$set(include = FALSE)
-knitr::opts_chunk$set(results = "asis")
-knitr::opts_chunk$set(cache = TRUE)
-```
-
-```{r settings}
-lzma_compressed_library_access_tsv_location <- "data/library_coverage_xml_and_fulltext_indicators.tsv.xz"
+```{r settings, include = FALSE}
+lzma_compressed_library_access_tsv_location <- file.path(
+  'data', 'library_coverage_xml_and_fulltext_indicators.tsv.xz'
+)
 
 original_dataset_with_oa_color_column_location <- paste0(
   'https://github.com/greenelab/scihub/raw/',
   '4172526ac7433357b31790578ad6f59948b6db26/data/',
-  'state-of-oa-dois.tsv.xz')
+  'state-of-oa-dois.tsv.xz'
+)
+
+repository_root_directory <- '..'  # This sets the Working Directory that knitr
+# uses when knitting this document back to the top directory of this repository.
 ```
 
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = FALSE)
+knitr::opts_chunk$set(include = FALSE)
+knitr::opts_chunk$set(results = "asis")
+knitr::opts_chunk$set(cache = TRUE)
+knitr::opts_knit$set(root.dir = repository_root_directory)
+```
 
-```{r read datasets}
+```{r read and merge datasets}
 lzma_compressed_library_access_tsv <- read.table(
   gzfile(lzma_compressed_library_access_tsv_location),
   sep = '\t',
@@ -46,11 +52,12 @@ original_dataset_with_oa_color_column <- read.table(
   header = TRUE
 )
 # View(original_dataset_with_oa_color_column)  # Check the dataset
-```
 
-```{r merge the datasets}
-# Combine the datasets so that we have doi, full_text_indicator, and oadoi_color
-merged_datasets <- merge(
+# Merge the datasets ---------------------------------------------------------
+
+# Combine the datasets so that we have doi, full_text_indicator,
+# and oadoi_color
+merged_datasets <- dplyr::inner_join(
   original_dataset_with_oa_color_column,
   lzma_compressed_library_access_tsv,
   by = "doi"
@@ -81,13 +88,15 @@ frequency_and_proportion_table <- data.frame(
   "no_access_percent" = proportion_table_by_oa_color[,1],
   "yes_access_percent" = proportion_table_by_oa_color[,2],
   "yes_access_rate" = frequency_table_by_oa_color[, 2],
-  "oa_color_total" = frequency_table_by_oa_color[, 1] + frequency_table_by_oa_color[, 2]
+  "oa_color_total" = frequency_table_by_oa_color[, 1] +
+    frequency_table_by_oa_color[, 2]
 )
 rownames(frequency_and_proportion_table) <- NULL
 # View(frequency_and_proportion_table)
 ```
 
-We queried `r nrow(merged_datasets)` DOIs of the the `r nrow(original_dataset_with_oa_color_column)` listed in the original State of OA dataset. Queried DOIs included the following OA "colors:" `r paste(unique(merged_datasets$oadoi_color), collapse = ", ")`.
+We queried `r nrow(merged_datasets)` DOIs of the `r nrow(original_dataset_with_oa_color_column)` listed in the original State of OA dataset.
+Queried DOIs included the following OA "colors": `r paste(unique(merged_datasets$oadoi_color), collapse = ", ")`.
 
 The proportions of access, alongside the rate of access, are presented below:
 

diff --git a/evaluate_library_access_from_output_tsv/facilitate_going_through_dois_manually.R b/evaluate_library_access_from_output_tsv/facilitate_going_through_dois_manually.R
@@ -0,0 +1,94 @@
+# Settings ---------------------------------------------------------------------
+
+# Sheet of sampled DOIs that the manual checks are recorded in.
+manual_tsv_location <- file.path(
+  'evaluate_library_access_from_output_tsv', 'manual-doi-checks.tsv'
+)
+
+# Open the tsv -----------------------------------------------------------------
+
+# Empty cells are read in as NA.
+dataset_to_go_through <- readr::read_tsv(
+  file = manual_tsv_location, na = ''
+)
+# View(dataset_to_go_through)
+
+# Facilitate going through the rows that haven't been filled in ----------------
+
+# readline() returns '' at once outside interactive sessions, which would make
+# the prompt loop below spin forever, so fail fast instead.
+if (!interactive()) {
+  stop('This script needs an interactive R session.', call. = FALSE)
+}
+
+repeat {  # Ask until answered which pair of columns this session fills in.
+  user_location_input <- tolower(trimws(readline(paste0(
+    'Are you on the university campus network ',
+    '(y for on-campus, n for off-campus)? [y/n] '
+  ))))
+  if (user_location_input == 'y') {
+    column_for_data_entry <- 'full_text_indicator_manual_inside_campus'
+    column_for_date <- 'date_of_manual_full_text_check_inside_campus'
+    break
+  }
+  if (user_location_input == 'n') {
+    column_for_data_entry <- 'full_text_indicator_manual_outside_campus'
+    column_for_date <- 'date_of_manual_full_text_check_outside_campus'
+    break
+  }
+  message('Please enter y or n. Asking again...')
+}
+
+# Cells receive '1', '0' or 'invalid' plus a date string, so make both target
+# columns character up front instead of relying on implicit type coercion
+# when a string is assigned into a logical, numeric or date column.
+for (column_name in c(column_for_data_entry, column_for_date)) {
+  dataset_to_go_through[[column_name]] <- as.character(
+    dataset_to_go_through[[column_name]]
+  )
+}
+
+# Visit every row whose check for the chosen location is still missing (NA).
+for (row_number in which(
+  is.na(dataset_to_go_through[[column_for_data_entry]])
+)) {
+  # `[[` yields a plain string; `[row, 'doi']` on a tibble is a 1x1 tibble.
+  doi_for_row <- dataset_to_go_through[['doi']][row_number]
+  url_to_visit <- paste0(
+    'https://doi.org/',
+    doi_for_row
+  )
+
+  message('Opening URL "', url_to_visit, '"...')
+  utils::browseURL(url_to_visit)
+
+  # Keep asking until one of the three accepted answers is given.
+  repeat {
+    user_full_text_input <- tolower(trimws(readline(paste0(
+      'Do we have full-text access to this DOI? [y/n/invalid]\n',
+      '  ("invalid" = invalid DOI) '
+    ))))
+    if (user_full_text_input %in% c('y', 'n', 'invalid')) {
+      break
+    }
+    message('Please enter y, n, or invalid. Asking again...')
+  }
+
+  # Record when the check was done and its outcome (1 = access, 0 = none).
+  dataset_to_go_through[[column_for_date]][row_number] <-
+    as.character(Sys.Date())
+  dataset_to_go_through[[column_for_data_entry]][row_number] <- switch(
+    user_full_text_input,
+    y = '1',
+    n = '0',
+    invalid = 'invalid'
+  )
+
+  # Save after every row, so that progress survives an interrupted session.
+  # readr::write_tsv() matches the reader used to load this file.
+  readr::write_tsv(
+    dataset_to_go_through,
+    manual_tsv_location,
+    na = ''
+  )
+}