diff --git a/README.md b/README.md index b12620f..f8c1f27 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,10 @@ conda env create --file=environment.yml Then use `source activate library-access` and `source deactivate` to activate or deactivate the environment. On windows, use `activate library-access` and `deactivate` instead. +## Using the Code + +The code files in this repository assume that your working directory is set to the top-level directory of this repository. + ## License The files in this repository are released under the CC0 1.0 public domain dedication ([`LICENSE-CC0.md`](LICENSE-CC0.md)), excepting those that match the glob patterns listed below. diff --git a/environment.yml b/environment.yml index f6cf28c..97a37b2 100644 --- a/environment.yml +++ b/environment.yml @@ -9,6 +9,12 @@ dependencies: - anaconda::pytest=3.2.1 - anaconda::python=3.6.1 - anaconda::r-base=3.4.1 +- anaconda::r-dplyr=0.7.0 +- anaconda::r-ggplot2=2.2.1 +- anaconda::r-knitr=1.16 +- anaconda::r-markdown=0.8 +- anaconda::r-readr=1.1.1 +- anaconda::r-rmarkdown=1.5 - anaconda::requests=2.14.2 - anaconda::spyder=3.1.4 - anaconda::sqlalchemy=1.1.9 diff --git a/evaluate_library_access_from_output_tsv.Rmd b/evaluate_library_access_from_output_tsv.Rmd deleted file mode 100644 index 79eeb8e..0000000 --- a/evaluate_library_access_from_output_tsv.Rmd +++ /dev/null @@ -1,94 +0,0 @@ ---- -title: "Evaluate Library Access from the Output TSV" -author: "Jacob Levernier" -date: "2017" -output: pdf_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -knitr::opts_chunk$set(include = FALSE) -knitr::opts_chunk$set(results = "asis") -knitr::opts_chunk$set(cache = TRUE) -``` - -```{r settings} -lzma_compressed_library_access_tsv_location <- "data/library_coverage_xml_and_fulltext_indicators.tsv.xz" - -original_dataset_with_oa_color_column_location <- paste0( - 'https://github.com/greenelab/scihub/raw/', - '4172526ac7433357b31790578ad6f59948b6db26/data/', - 
'state-of-oa-dois.tsv.xz') -``` - - -```{r read datasets} -lzma_compressed_library_access_tsv <- read.table( - gzfile(lzma_compressed_library_access_tsv_location), - sep = '\t', - header = TRUE -) -# View(lzma_compressed_library_access_tsv) # Check the dataset - -# Create a temporary filepath for downloading the original dataset. -# Then download and read it. -tmp_filpath_for_original_dataset <- tempfile() - -download.file( - original_dataset_with_oa_color_column_location, - destfile = tmp_filpath_for_original_dataset, - mode = 'wb' -) - -original_dataset_with_oa_color_column <- read.table( - gzfile(tmp_filpath_for_original_dataset), - sep = '\t', - header = TRUE -) -# View(original_dataset_with_oa_color_column) # Check the dataset -``` - -```{r merge the datasets} -# Combine the datasets so that we have doi, full_text_indicator, and oadoi_color -merged_datasets <- merge( - original_dataset_with_oa_color_column, - lzma_compressed_library_access_tsv, - by = "doi" -) -# View(merged_datasets) # Check our work -``` - -## Summary of the downloaded dataset - -```{r analyze the merged dataset} -merged_datasets_without_doi_column <- merged_datasets[ - , # Use all rows - c("oadoi_color", "full_text_indicator") -] - -frequency_table_by_oa_color <- table(merged_datasets_without_doi_column) -# View(frequency_table_by_oa_color) - -proportion_table_by_oa_color <- round( - prop.table( - frequency_table_by_oa_color, - margin = 1)*100, - digits = 2 -) - -frequency_and_proportion_table <- data.frame( - "oa_doi_color" = rownames(proportion_table_by_oa_color), - "no_access_percent" = proportion_table_by_oa_color[,1], - "yes_access_percent" = proportion_table_by_oa_color[,2], - "yes_access_rate" = frequency_table_by_oa_color[, 2], - "oa_color_total" = frequency_table_by_oa_color[, 1] + frequency_table_by_oa_color[, 2] -) -rownames(frequency_and_proportion_table) <- NULL -# View(frequency_and_proportion_table) -``` - -We queried `r nrow(merged_datasets)` DOIs of the the `r 
nrow(original_dataset_with_oa_color_column)` listed in the original State of OA dataset. Queried DOIs included the following OA "colors:" `r paste(unique(merged_datasets$oadoi_color), collapse = ", ")`.
-
-The proportions of access, alongside the rate of access, are presented below:
-
-`r knitr::kable(frequency_and_proportion_table, format = "markdown")`
diff --git a/evaluate_library_access_from_output_tsv/create_stratefied_sample_of_dois.R b/evaluate_library_access_from_output_tsv/create_stratefied_sample_of_dois.R
new file mode 100644
index 0000000..648ff30
--- /dev/null
+++ b/evaluate_library_access_from_output_tsv/create_stratefied_sample_of_dois.R
@@ -0,0 +1,59 @@
+# Load dependencies ------------------------------------------------------------
+
+# Make the magrittr pipe available without attaching all of dplyr:
+`%>%` <- dplyr::`%>%`
+
+# Settings ---------------------------------------------------------------------
+
+lzma_compressed_library_access_data_location <- file.path(
+  'data', 'library_coverage_xml_and_fulltext_indicators.tsv.xz'
+)
+
+# Number of DOIs to draw for each full_text_indicator value (0 and 1), so the
+# sample contains 2 * sample_size_per_cell rows in total:
+sample_size_per_cell <- 100
+
+output_tsv_location <- file.path(
+  'evaluate_library_access_from_output_tsv',
+  'manual-doi-checks.tsv'
+)
+
+# Fixed seed, so that random sampling always returns the same result:
+randomizer_seed_to_set <- 3
+
+# Read the dataset -------------------------------------------------------------
+
+library_access_data <- readr::read_tsv(
+  lzma_compressed_library_access_data_location
+)
+# View(library_access_data) # Check the dataset
+
+# Convert the grouping variable to a factor:
+library_access_data <- library_access_data %>% dplyr::mutate(
+  full_text_indicator = as.factor(full_text_indicator)
+)
+
+# Create stratified sample, and clean up the tibble ----------------------------
+
+set.seed(randomizer_seed_to_set)
+stratefied_sample <- library_access_data %>%
+  dplyr::group_by(full_text_indicator) %>%
+  dplyr::sample_n(sample_size_per_cell) %>%
+  # Drop the grouping once sampling is done, so a plain tibble is written out:
+  dplyr::ungroup() %>%
+  # Add columns to fill in manually to the stratified sample dataframe:
+  dplyr::rename('full_text_indicator_automated' = 'full_text_indicator') %>%
+  dplyr::mutate(
+    date_of_manual_full_text_check_inside_campus = NA,
+    full_text_indicator_manual_inside_campus = NA,
+    date_of_manual_full_text_check_outside_campus = NA,
+    full_text_indicator_manual_outside_campus = NA
+  )
+
+# Write the output to a TSV ----------------------------------------------------
+
+readr::write_tsv(
+  stratefied_sample,
+  output_tsv_location,
+  na = ''
+)
diff --git a/evaluate_library_access_from_output_tsv/facilitate_going_through_dois_manually.R b/evaluate_library_access_from_output_tsv/facilitate_going_through_dois_manually.R
new file mode 100644
index 0000000..dbfe4d8
--- /dev/null
+++ b/evaluate_library_access_from_output_tsv/facilitate_going_through_dois_manually.R
@@ -0,0 +1,94 @@
+# Settings ---------------------------------------------------------------------
+
+manual_tsv_location <- file.path(
+  'evaluate_library_access_from_output_tsv',
+  'manual-doi-checks.tsv'
+)
+
+# Open the tsv -----------------------------------------------------------------
+
+dataset_to_go_through <- readr::read_tsv(
+  manual_tsv_location,
+  na = ''
+)
+# View(dataset_to_go_through)
+
+# Facilitate going through the rows that haven't been filled in ----------------
+
+# readline() returns '' straight away outside of an interactive session, which
+# would make the prompts below loop forever; stop early with a clear error.
+if (!interactive()) {
+  stop('This script must be run in an interactive R session.', call. = FALSE)
+}
+
+# Ask which network the checks are being made from; the answer decides which
+# pair of columns (inside- or outside-campus) gets filled in below.
+repeat {
+  user_location_input <- tolower(readline(paste0(
+    'Are you on the university campus network ',
+    '(y for on-campus, n for off-campus)? [y/n] '
+  )))
+
+  if (user_location_input %in% c('y', 'n')) {
+    break
+  }
+
+  message('Please enter y or n. Asking again...')
+}
+
+if (user_location_input == 'y') {
+  column_for_data_entry <- 'full_text_indicator_manual_inside_campus'
+  column_for_date <- 'date_of_manual_full_text_check_inside_campus'
+} else {
+  column_for_data_entry <- 'full_text_indicator_manual_outside_campus'
+  column_for_date <- 'date_of_manual_full_text_check_outside_campus'
+}
+
+# Treat both columns as text, so that 1 / 0 / 'invalid' and the check date can
+# be stored no matter which column types read_tsv() guessed for the file.
+for (column_name in c(column_for_data_entry, column_for_date)) {
+  dataset_to_go_through[[column_name]] <- as.character(
+    dataset_to_go_through[[column_name]]
+  )
+}
+
+# Only visit rows that have no manual answer yet for this network location:
+rows_to_check <- which(is.na(dataset_to_go_through[[column_for_data_entry]]))
+
+for (row_number in rows_to_check) {
+  url_to_visit <- paste0(
+    'https://doi.org/',
+    dataset_to_go_through[['doi']][row_number]
+  )
+
+  message('Opening URL "', url_to_visit, '"...')
+  utils::browseURL(url_to_visit)
+
+  repeat {
+    user_full_text_input <- tolower(readline(paste0(
+      'Do we have full-text access to this DOI? [y/n/invalid]\n',
+      '("invalid" = invalid DOI) '
+    )))
+
+    if (user_full_text_input %in% c('y', 'n', 'invalid')) {
+      break
+    }
+
+    message('Please enter y, n, or invalid. Asking again...')
+  }
+
+  # Record when the check was made and what the answer was:
+  dataset_to_go_through[[column_for_date]][row_number] <-
+    as.character(Sys.Date())
+  dataset_to_go_through[[column_for_data_entry]][row_number] <- switch(
+    user_full_text_input,
+    y = '1',
+    n = '0',
+    invalid = 'invalid'
+  )
+
+  # Save after every answer. readr::write_tsv() keeps the file in the unquoted
+  # format that create_stratefied_sample_of_dois.R wrote; write.table() would
+  # wrap the DOIs and the column names in double quotes.
+  readr::write_tsv(dataset_to_go_through, manual_tsv_location, na = '')
+}
diff --git a/evaluate_library_access_from_output_tsv/manual-doi-checks.tsv b/evaluate_library_access_from_output_tsv/manual-doi-checks.tsv
new file mode 100644
index 0000000..3e1bf94
--- /dev/null
+++ b/evaluate_library_access_from_output_tsv/manual-doi-checks.tsv
@@ -0,0 +1,201 @@
+doi full_text_indicator_automated date_of_manual_full_text_check_inside_campus full_text_indicator_manual_inside_campus date_of_manual_full_text_check_outside_campus full_text_indicator_manual_outside_campus
+10.1007/bf01441062 0
+10.20531/tfb.2016.44.1.11 0
+10.1080/00377996.1953.9957299 0
+10.1038/s41550-016-0030 0
+10.1111/j.1478-4408.1958.tb02258.x 0
+10.1111/j.1550-7408.1962.tb02648.x 0
+10.1007/978-3-658-12388-8_1 0
+10.1017/s1357729800051109 0
+10.1108/s1479-3679(2013)0000020014 0
+10.1136/bmj.1.4706.586 0
+10.1093/molehr/gaq017 0
+10.1093/mnras/89.4.329 0
+10.1093/qjmed/hct203 0
+10.1097/acm.0000000000000545 0
+10.3109/17453059209051382 0
+10.2165/00003495-198300252-00083 0
+10.1007/978-1-4614-2251-8_15 0
+10.1179/136404609x12572514498892 0
+10.3855/jidc.4620 0
+10.1017/s0007485307005305 0
+10.1016/0021-9517(79)90166-0 0
+10.1002/14651858.cd008009.pub2 0
+10.1007/978-94-011-4683-8_4 0
+10.1002/prac.18430290165 0
+10.1016/0306-2619(90)90086-s 0
+10.1680/geot.2004.54.3.233 0
+10.1111/j.1468-5914.1986.tb00063.x 0
+10.3934/dcdsb.2014.19.485 0
+10.1097/ccm.0b013e31821b85c6 0
+10.1515/ci.2008.30.1.8 0
+10.1080/00222338008068111 0
+10.1080/00150190902848156 0
+10.1007/bf01557174 0
+10.1080/14786419.2014.988714 0
+10.1016/j.scient.2011.05.025 0
+10.1051/eas/1042000 0
+10.3727/096368912x655154 0
+10.1007/bf03164593 0
+10.1109/ijcnn.2007.4371306 0
+10.1007/s00261-016-0956-8 0
+10.1017/s0020860400011542 0
+10.1587/transele.e92.c.1504 0
+10.1007/bf01678469 0
+10.1107/s0108767388009286 0
+10.1080/0889311x.2014.973868 0 +10.1016/s0171-2985(80)80037-4 0 +10.1002/chin.197531174 0 +10.1002/zaac.19402430401 0 +10.1029/2011jd016541 0 +10.17816/jowd6265-11 0 +10.1016/0022-1031(69)90003-1 0 +10.1007/s12494-010-0025-6 0 +10.3233/jnd-160146 0 +10.7748/eldc.6.5.41.s39 0 +10.2306/scienceasia1513-1874.2013.39.204 0 +10.3934/dcds.2016103 0 +10.1080/19447015408688036 0 +10.1016/0002-9394(54)91770-5 0 +10.1007/978-94-007-6173-5_139-1 0 +10.1017/s0004972715001860 0 +10.2139/ssrn.1394745 0 +10.1002/chin.198935250 0 +10.18201/ijisae.273053 0 +10.1002/zamm.19660460319 0 +10.1517/13543776.2012.684946 0 +10.1029/2000wr900383 0 +10.1517/17460441.2015.1079618 0 +10.1093/toxsci/kft259 0 +10.1071/fp14035 0 +10.1002/phbl.19510070201 0 +10.1515/ijnsns-2011-0005 0 +10.1515/jnet.1983.8.4.255 0 +10.3892/or.2012.2190 0 +10.5408/1.3604824 0 +10.1093/nar/gkh603 0 +10.1097/00001888-197312000-00009 0 +10.1007/bf01221875 0 +10.1007/bf01257666 0 +10.1587/transcom.e94.b.1944 0 +10.1504/ijsnet.2016.074278 0 +10.15833/kafeiam.22.1.031 0 +10.1142/s0218488513500244 0 +10.1080/00218839.1979.11099957 0 +10.1002/14651858.cd001055.pub3 0 +10.5042/jacpr.2010.0335 0 +10.2172/1057364 0 +10.1007/s12513-012-0083-4 0 +10.1093/jac/dkt265 0 +10.1136/bmj.2.4776.167 0 +10.4028/www.scientific.net/amm.760.175 0 +10.1002/14651858.cd004403.pub2 0 +10.1016/s0160-3450(15)30895-3 0 +10.1080/10510978309368140 0 +10.2139/ssrn.945322 0 +10.3139/146.110260 0 +10.1016/j.jgyn.2013.12.004 0 +10.1038/gim.2016.139 0 +10.1029/2004gl021817 0 +10.1007/bf02174690 0 +10.1159/000350583 0 +10.1155/2013/313905 1 +10.1111/j.1440-1754.2010.01778.x 1 +10.1016/j.dam.2008.12.008 1 +10.1111/nph.14486 1 +10.1097/phm.0b013e3181c56938 1 +10.1016/j.vetmic.2013.10.018 1 +10.1002/lt.21788 1 +10.1021/i650568a711 1 +10.1007/s004260050046 1 +10.1016/j.physletb.2015.10.071 1 +10.1016/j.sleep.2013.11.336 1 +10.1007/s00419-012-0674-9 1 +10.1016/0272-6386(95)90111-6 1 +10.1016/0370-2693(85)90704-x 1 +10.2307/2205376 1 
+10.1111/j.1749-6632.1986.tb26492.x 1 +10.1186/s12889-016-3008-y 1 +10.1111/cico.12174 1 +10.1016/j.ejor.2008.02.026 1 +10.1038/srep34005 1 +10.1115/1.3139652 1 +10.1186/s12967-014-0256-4 1 +10.1186/s12891-017-1482-8 1 +10.1016/j.cognition.2010.08.010 1 +10.1097/fjc.0b013e3181ac8e12 1 +10.1038/121170a0 1 +10.1109/chicc.2016.7554656 1 +10.1021/om200288e 1 +10.15261/serdj.22.177 1 +10.1016/j.ridd.2008.09.003 1 +10.1016/j.powtec.2010.02.012 1 +10.1055/s-0035-1558152 1 +10.1016/j.tripleo.2011.04.007 1 +10.1016/j.enzmictec.2011.10.004 1 +10.1209/0295-5075/93/50003 1 +10.1080/09638191003599527 1 +10.1007/s10719-015-9582-x 1 +10.1007/s11069-010-9629-z 1 +10.1089/neu.2014.3492 1 +10.1080/09585192.2016.1242508 1 +10.1051/0004-6361/201321042 1 +10.1016/j.rehab.2014.03.1380 1 +10.1111/j.1365-4632.2012.05709.x 1 +10.1371/journal.pone.0071370 1 +10.1353/leg.0.0005 1 +10.1016/j.jim.2009.05.011 1 +10.9789/2175-5361.2013v5n4p556 1 +10.1210/jc.2009-0474 1 +10.1093/rpd/ncs007 1 +10.1016/j.clon.2009.11.003 1 +10.1158/1078-0432.ccr-10-3105 1 +10.1007/s11999-011-2149-7 1 +10.1590/s0101-32622002000200003 1 +10.1016/j.pec.2014.08.021 1 +10.1109/tcyb.2016.2545688 1 +10.1016/j.jempfin.2015.09.001 1 +10.1016/j.ajic.2010.12.005 1 +10.1007/s10957-010-9771-5 1 +10.1038/jhh.2016.76 1 +10.1002/jnr.1149 1 +10.3758/s13414-014-0713-4 1 +10.1063/1.4869572 1 +10.1016/s0168-8278(87)80474-9 1 +10.1525/jps.2005.35.1.60 1 +10.1143/jpsj.50.3131 1 +10.1146/annurev-psych-010213-115100 1 +10.1590/s1413-35552007000100013 1 +10.1016/j.biopha.2016.08.042 1 +10.1177/0002764217693277 1 +10.1080/14754835.2014.923754 1 +10.1057/palgrave.development.1100225 1 +10.1007/s11046-014-9755-3 1 +10.1093/humrep/det283 1 +10.1890/11-1817.1 1 +10.1016/j.carbon.2014.03.019 1 +10.1080/01490419.2011.637154 1 +10.1111/gequ.10192 1 +10.1167/iovs.12-10257 1 +10.1016/j.carbon.2014.05.057 1 +10.1088/0957-4484/26/6/065702 1 +10.1016/j.chroma.2012.12.040 1 +10.1016/j.paid.2009.10.022 1 +10.1016/j.ajem.2010.02.002 1 
+10.1007/s10703-010-0095-8 1 +10.1016/j.jclepro.2011.12.037 1 +10.1016/j.wasman.2013.10.006 1 +10.1371/journal.pgen.1001343 1 +10.1007/s11845-010-0519-x 1 +10.1016/s0011-8486(11)00306-2 1 +10.1016/j.sedgeo.2017.03.003 1 +10.1039/c3gc36844k 1 +10.1364/oe.24.029109 1 +10.1016/j.saa.2010.06.006 1 +10.1086/684103 1 +10.1111/j.2044-8325.2012.02058.x 1 +10.1021/acs.jafc.5b00932 1 +10.1186/s13072-017-0121-9 1 +10.2214/ajr.13.12055 1 +10.1021/ma900452x 1 +10.1016/j.jsis.2016.02.004 1