fix #233 drop crminer as a dependency - moved over any functionqality…

… from crminer here
ropensci-archive · Feb 11, 2021 · 32c97fd · 32c97fd
1 parent 27585f9
commit 32c97fd
Show file tree

Hide file tree

Showing 14 changed files with 470 additions and 19 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -7,7 +7,7 @@ Description: Provides a single interface to many sources of full text
     included for searching for articles, downloading full or partial
     text, downloading supplementary materials, converting to various
     data formats.
-Version: 1.6.3.91
+Version: 1.6.4.91
 License: MIT + file LICENSE
 Authors@R: c(
     person("Scott", "Chamberlain",
@@ -35,7 +35,6 @@ Imports:
     jsonlite,
     rplos (>= 0.8.0),
     rcrossref (>= 0.8.0),
-    crminer (>= 0.2.0),
     microdemic (>= 0.2.0),
     aRxiv,
     rentrez (>= 1.1.0),

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(as_ftdmurl,character)
+S3method(as_ftdmurl,ftdmurl)
 S3method(ft_abstract,character)
 S3method(ft_abstract,default)
 S3method(ft_abstract,numeric)
@@ -18,15 +20,19 @@ S3method(ft_links,ft)
 S3method(ft_links,ft_ind)
 S3method(ft_text,default)
 S3method(ft_text,ft_data)
+S3method(print,crm_pdf)
+S3method(print,crm_pdf_text)
 S3method(print,ft)
 S3method(print,ft_abstract)
 S3method(print,ft_data)
 S3method(print,ft_ind)
 S3method(print,ft_links)
 S3method(print,ft_parsed)
+S3method(print,ftdmurl)
 S3method(print,pdft_char)
 export("%>%")
 export(as.ft_data)
+export(as_ftdmurl)
 export(biorxiv_search)
 export(bmc_search)
 export(cache_file_info)
@@ -43,6 +49,7 @@ export(ft_browse)
 export(ft_browse_sections)
 export(ft_chunks)
 export(ft_collect)
+export(ft_cr_links)
 export(ft_extract)
 export(ft_extract_corpus)
 export(ft_get)

diff --git a/R/as_ftdmurl.R b/R/as_ftdmurl.R
@@ -0,0 +1,65 @@
+#' Coerce a url to a tdmurl with a specific type
+#'
+#' A tmd url is just a URL with some attributes to make it easier
+#' to handle within other functions in this package.
+#'
+#' @export
+#' @param url (character) A URL.
+#' @param type (character) One of 'xml' (default), 'html', 'plain', 'pdf',
+#' 'unspecified', or 'all'
+#' @param doi (character) A DOI, optional, default: `NULL`
+#' @param member (character) Crossref member id. optional
+#' @param intended_application (character) intended application string, 
+#' optional
+#' @examples
+#' as_ftdmurl("http://downloads.hindawi.com/journals/bmri/2014/201717.xml",
+#'    "xml")
+#' as_ftdmurl("http://downloads.hindawi.com/journals/bmri/2014/201717.pdf",
+#'    "pdf")
+#' out <-
+#'  as_ftdmurl("http://downloads.hindawi.com/journals/bmri/2014/201717.pdf",
+#'    "pdf", "10.1155/2014/201717")
+#' attributes(out)
+#' identical(attr(out, "type"), "pdf")
+as_ftdmurl <- function(url, type, doi = NULL, member = NULL, 
+  intended_application = NULL) {
+
+  UseMethod("as_ftdmurl")
+}
+
+#' @export
+#' @rdname as_ftdmurl
+as_ftdmurl.ftdmurl <- function(url, type, doi = NULL, member = NULL, 
+  intended_application = NULL) {
+
+  return(url)
+}
+
+#' @export
+#' @rdname as_ftdmurl
+as_ftdmurl.character <- function(url, type, doi = NULL, 
+  member = NULL, intended_application = NULL) {
+
+  makeurl(check_url(url), type, doi, member, intended_application)
+}
+
+#' @export
+print.ftdmurl <- function(x, ...) {
+  cat("<url> ", x[[1]], "\n", sep = "")
+}
+
+
+# helpers --------
+makeurl <- function(x, y, z, member, intended_application) {
+  structure(stats::setNames(list(x), match_type(y)),
+            class = "ftdmurl", type = match_type(y), doi = z,
+            member = member, intended_application = intended_application)
+}
+
+check_url <- function(x) {
+  if (!grepl("https?://", x)) stop("Not a proper url") else x
+}
+
+match_type <- function(x) {
+  match.arg(x, c("xml","html","plain","pdf","unspecified","all"))
+}
diff --git a/R/ft_cr_links.R b/R/ft_cr_links.R
@@ -0,0 +1,203 @@
+#' Get Crossref full text links from a DOI
+#'
+#' @export
+#' @param doi (character) A Digital Object Identifier (DOI). required.
+#' @param type (character) One of 'xml', 'html', 'plain', 'pdf',
+#' 'unspecified', or 'all' (default). required.
+#' @param ... Named parameters passed on to [crul::HttpClient()]
+#'
+#' @details Note that this function is not vectorized.
+#'
+#' Some links returned will not in fact lead you to full text
+#' content as you would understandbly think and expect. That is, if you
+#' use the `filter` parameter with e.g., [rcrossref::cr_works()]
+#' and filter to only full text content, some links may actually give back
+#' only metadata for an article. Elsevier is perhaps the worst offender,
+#' for one because they have a lot of entries in Crossref TDM, but most
+#' of the links that are apparently full text are not in fact full text,
+#' but only metadata. You can get full text if you are part of a subscribing
+#' institution to that specific Elsever content, but otherwise, you're SOL.
+#'
+#' Note that there are still some bugs in the data returned form CrossRef.
+#' For example, for the publisher eLife, they return a single URL with
+#' content-type application/pdf, but the URL is not for a PDF, but for both
+#' XML and PDF, and content-type can be set with that URL as either XML or
+#' PDF to get that type.
+#'
+#' In another example, all Elsevier URLs at time of writing are have
+#' `http` scheme, while those don't actually work, so we have a
+#' custom fix in this function for that publisher. Anyway, expect changes...
+#'
+#' @section Register for the Polite Pool:
+#' See of 'Authentication' setion of the [fulltext-package] manual page
+#'
+#' @return `NULL` if no full text links given; a list of tdmurl objects if
+#' links found. a tdmurl object is an S3 class wrapped around a simple list,
+#' with attributes for:
+#'
+#' - type: type, matchin type passed to the function
+#' - doi: DOI
+#' - member: Crossref member ID
+#' - intended_application: intended application, e.g., text-mining
+#'
+#' @examples \dontrun{
+#' dois <- c("10.1245/s10434-016-5211-6",
+#' "10.17159/2413-3108/2016/v0i55a49", "10.17159/2413-3108/2015/v0i53a455",
+#' "10.17159/2413-3108/2006/v0i18a982", "10.1007/s10665-016-9845-y", 
+#' "10.1016/j.ad.2015.06.020", "10.1016/j.medipa.2014.03.002")
+#'
+#' # pdf link
+#' ft_cr_links(doi = "10.5555/515151", "pdf")
+#'
+#' # xml and plain text links
+#' ft_cr_links(dois[1], "pdf")
+#' ft_cr_links(dois[6], "xml")
+#' ft_cr_links(dois[7], "plain")
+#' ft_cr_links(dois[1]) # all is the default
+#'
+#' # pdf link
+#' ft_cr_links(doi = "10.5555/515151", "pdf")
+#' ft_cr_links(doi = "10.3897/phytokeys.52.5250", "pdf")
+#'
+#' # many calls, use e.g., lapply
+#' lapply(dois[1:3], ft_cr_links)
+#'
+#' # elsevier
+#' ## DOI that is open acccess
+#' ft_cr_links('10.1016/j.physletb.2010.10.049')
+#' ## DOI that is not open acccess
+#' ft_cr_links('10.1006/jeth.1993.1066')
+#' }
+ft_cr_links <- function(doi, type = 'all', ...) {
+  res <- .crm_works_links(dois = doi, ...)[[1]]
+  if (is.null(unlist(res$links))) {
+    return(list())
+  } else {
+    elife <- grepl("elife", res$links[[1]]$URL)
+    withtype <- if (type == 'all') {
+      res$links
+    } else {
+      Filter(function(x) grepl(type, x$`content-type`), res$links)
+    }
+
+    if (is.null(withtype) || length(withtype) == 0) {
+      return(list())
+    } else {
+      withtype <- stats::setNames(withtype, sapply(withtype, function(x){
+        if (x$`content-type` == "unspecified") {
+          "unspecified"
+        } else {
+          strsplit(x$`content-type`, "/")[[1]][[2]]
+        }
+      }))
+
+      if (elife) {
+        withtype <- Filter(function(w) !grepl("lookup", w$URL), withtype)
+      }
+
+      if (basename(res$member) %in% c("2258", "179")) {
+        withtype <- lapply(withtype, function(z) {
+          z$URL <- sub("http://", "https://", z$URL)
+          z
+        })
+      }
+
+      if (basename(res$member) == "78") {
+        withtype <- lapply(withtype, function(z) {
+          z$URL <- sub("http://", "https://", z$URL)
+          z
+        })
+        pdf <- list(pdf =
+          utils::modifyList(withtype[[1]],
+            list(
+              URL = sub("text/xml", "application/pdf", withtype[[1]]$URL),
+              `content-type` = "application/pdf"
+            )
+        ))
+        withtype <- c(withtype, pdf)
+      }
+
+      if (type == "all") {
+        lapply(withtype, function(b) {
+          makeurl(b$URL, st(b$`content-type`), doi, res$member, b$`intended-application`)
+        })
+      } else {
+        y <- match.arg(type, c('xml', 'plain', 'html', 'pdf', 'unspecified'))
+        makeurl(x = withtype[[y]]$URL, y = y, z = doi, res$member,
+          withtype[[y]]$`intended-application`)
+      }
+    }
+  }
+}
+
+.crm_works_links <- function(dois = NULL, ...) {
+  get_links <- function(x, ...) {
+    tmp <- crm_GET(sprintf("works/%s", x), NULL, FALSE, ...)
+    trylinks <- tryCatch(tmp$message$link, error = function(e) e)
+    if (inherits(trylinks, "error")) {
+      NULL
+    } else {
+      list(links = trylinks, member = tmp$message$member)
+    }
+  }
+  stats::setNames(lapply(dois, get_links, ...), dois)
+}
+
+st <- function(x){
+  if (grepl("/", x)) {
+    strsplit(x, "/")[[1]][[2]]
+  } else {
+    x
+  }
+}
+
+crm_GET <- function(endpoint, args = list(), todf = TRUE, on_error = warning,
+                    parse = TRUE, ...) {
+
+  url <- sprintf("https://api.crossref.org/%s", endpoint)
+  cli <- crul::HttpClient$new(
+    url = url,
+    headers = list(
+      `User-Agent` = make_ua(),
+      `X-USER-AGENT` = make_ua()
+    ),
+    opts = list(...)
+  )
+  res <- cli$get(query = args)
+  doi <- gsub("works/|/agency|funders/", "", endpoint)
+  if (!res$status_code < 300) {
+    on_error(sprintf("%s: %s - (%s)", res$status_code, get_err(res), doi),
+             call. = FALSE)
+    list(message = NULL)
+  } else {
+    stopifnot(res$response_headers$`content-type` ==
+                "application/json;charset=UTF-8")
+    res <- res$parse("UTF-8")
+    if (parse) jsonlite::fromJSON(res, todf) else res
+  }
+}
+
+get_err <- function(x) {
+  xx <- x$parse("UTF-8")
+  if (x$response_headers$`content-type` == "text/plain") {
+    tmp <- xx
+  } else if (x$response_headers$`content-type` == "text/html") {
+    html <- xml2::read_html(xx)
+    tmp <- xml2::xml_text(xml2::xml_find_first(html, '//h3[@class="info"]'))
+  } else if (
+    x$response_headers$`content-type` == "application/json;charset=UTF-8"
+  ) {
+    tmp <- jsonlite::fromJSON(xx, FALSE)
+  } else {
+    tmp <- xx
+  }
+  if (inherits(tmp, "list")) {
+    tmp$message[[1]]$message
+  } else {
+    if (any(class(tmp) %in% c("HTMLInternalDocument", "xml_document"))) {
+      "Server error - check query - or api.crossref.org may have problems"
+    } else {
+      tmp
+    }
+  }
+}
diff --git a/R/ft_extract.R b/R/ft_extract.R
@@ -33,7 +33,7 @@ ft_extract <- function(x) {
 #' @export
 ft_extract.character <- function(x) {
   if (!file.exists(x)) stop("File does not exist", call. = FALSE)
-  res <- crminer::crm_extract(x)
+  res <- .crm_extract(x)
   structure(list(meta = res$info, data = res$text), class = "pdft_char", 
             path = x)
 }
@@ -68,3 +68,39 @@ print.pdft_char <- function(x, ...) {
   cat("  Creation date: ", as.character(as.Date(x$meta$created)), "\n", 
       sep = "")
 }
+
+.crm_extract <- function(path = NULL, raw = NULL, try_ocr = FALSE, ...) {
+  assert(try_ocr, "logical")
+  stopifnot(xor(is.null(path), is.null(raw)))
+  if (!is.null(path)) {
+    path <- path.expand(path)
+    if (!file.exists(path)) stop("path does not exist", call. = FALSE)  
+  } else {
+    assert(raw, "raw")
+    path <- raw
+  }
+  fun <- if (try_ocr) pdftools::pdf_ocr_text else pdftools::pdf_text
+
+  structure(
+    list(
+      info = pdftools::pdf_info(path, ...),
+      text = fun(path, ...)
+    ),
+    class = "crm_pdf",
+    path = if (is.character(path)) path else 'raw'
+  )
+}
+
+#' @export
+print.crm_pdf <- function(x, ...) {
+  cat("<document>", attr(x, "path"), "\n", sep = "")
+  cat("  Pages: ", x$info$pages, "\n", sep = "")
+  cat("  No. characters: ", sum(nchar(x$text)), "\n", sep = "")
+  cat("  Created: ", as.character(as.Date(x$info$created)), "\n",
+      sep = "")
+}
+#' @export
+print.crm_pdf_text <- function(x, ...) {
+  cat("<document>", attr(x, "path"), "\n", sep = "")
+  cat("  No. characters: ", sum(nchar(x$text)), "\n", sep = "")
+}
diff --git a/R/fulltext-package.R b/R/fulltext-package.R
@@ -139,12 +139,18 @@
 #'
 #' **Crossref**: Crossref encourages requests with contact information
 #' (an email address) and will forward you to a dedicated API cluster
-#' for improved performance when you share your email address with them.
+#' for improved performance when you share your email address with them. This 
+#' is called the "Polite Pool". 
 #' https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
 #' To pass your email address to Crossref via this client, store it
 #' as an environment variable in `.Renviron` like
-#' `crossref_email = name@example.com`
-#'
+#' `crossref_email=name@example.com`, or `CROSSREF_EMAIL=name@example.com`.
+#' Save the file and restart your R session. To stop sharing your email when
+#' using rcrossref simply delete it from your `.Renviron` file OR to temporarily
+#' not use your email unset it for the session
+#' like `Sys.unsetenv('crossref_email')`. To be sure your in the polite pool
+#' use curl verbose by e.g., `ft_cr_links(doi = "10.5555/515151", verbose = TRUE)`
+#' 
 #' **Crossref TDM**: TDM = "Text and Data Mining". This used to apply to just
 #' two publishers - Wiley and Elsevier - This service officially shut down at
 #' the end of 2020. For Elsevier, see the