Skip to content
This repository has been archived by the owner on Sep 9, 2022. It is now read-only.

Commit

Permalink
fix #233 drop crminer as a dependency - moved over any functionqality…
Browse files Browse the repository at this point in the history
… from crminer here
  • Loading branch information
sckott committed Feb 11, 2021
1 parent 27585f9 commit 32c97fd
Show file tree
Hide file tree
Showing 14 changed files with 470 additions and 19 deletions.
3 changes: 1 addition & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Description: Provides a single interface to many sources of full text
included for searching for articles, downloading full or partial
text, downloading supplementary materials, converting to various
data formats.
Version: 1.6.3.91
Version: 1.6.4.91
License: MIT + file LICENSE
Authors@R: c(
person("Scott", "Chamberlain",
Expand Down Expand Up @@ -35,7 +35,6 @@ Imports:
jsonlite,
rplos (>= 0.8.0),
rcrossref (>= 0.8.0),
crminer (>= 0.2.0),
microdemic (>= 0.2.0),
aRxiv,
rentrez (>= 1.1.0),
Expand Down
7 changes: 7 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Generated by roxygen2: do not edit by hand

S3method(as_ftdmurl,character)
S3method(as_ftdmurl,ftdmurl)
S3method(ft_abstract,character)
S3method(ft_abstract,default)
S3method(ft_abstract,numeric)
Expand All @@ -18,15 +20,19 @@ S3method(ft_links,ft)
S3method(ft_links,ft_ind)
S3method(ft_text,default)
S3method(ft_text,ft_data)
S3method(print,crm_pdf)
S3method(print,crm_pdf_text)
S3method(print,ft)
S3method(print,ft_abstract)
S3method(print,ft_data)
S3method(print,ft_ind)
S3method(print,ft_links)
S3method(print,ft_parsed)
S3method(print,ftdmurl)
S3method(print,pdft_char)
export("%>%")
export(as.ft_data)
export(as_ftdmurl)
export(biorxiv_search)
export(bmc_search)
export(cache_file_info)
Expand All @@ -43,6 +49,7 @@ export(ft_browse)
export(ft_browse_sections)
export(ft_chunks)
export(ft_collect)
export(ft_cr_links)
export(ft_extract)
export(ft_extract_corpus)
export(ft_get)
Expand Down
65 changes: 65 additions & 0 deletions R/as_ftdmurl.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#' Coerce a url to a tdmurl with a specific type
#'
#' A tmd url is just a URL with some attributes to make it easier
#' to handle within other functions in this package.
#'
#' @export
#' @param url (character) A URL.
#' @param type (character) One of 'xml' (default), 'html', 'plain', 'pdf',
#' 'unspecified', or 'all'
#' @param doi (character) A DOI, optional, default: `NULL`
#' @param member (character) Crossref member id. optional
#' @param intended_application (character) intended application string,
#' optional
#' @examples
#' as_ftdmurl("http://downloads.hindawi.com/journals/bmri/2014/201717.xml",
#' "xml")
#' as_ftdmurl("http://downloads.hindawi.com/journals/bmri/2014/201717.pdf",
#' "pdf")
#' out <-
#' as_ftdmurl("http://downloads.hindawi.com/journals/bmri/2014/201717.pdf",
#' "pdf", "10.1155/2014/201717")
#' attributes(out)
#' identical(attr(out, "type"), "pdf")
as_ftdmurl <- function(url, type, doi = NULL, member = NULL,
intended_application = NULL) {

UseMethod("as_ftdmurl")
}

#' @export
#' @rdname as_ftdmurl
as_ftdmurl.ftdmurl <- function(url, type, doi = NULL, member = NULL,
intended_application = NULL) {

return(url)
}

#' @export
#' @rdname as_ftdmurl
as_ftdmurl.character <- function(url, type, doi = NULL,
member = NULL, intended_application = NULL) {

makeurl(check_url(url), type, doi, member, intended_application)
}

#' @export
print.ftdmurl <- function(x, ...) {
cat("<url> ", x[[1]], "\n", sep = "")
}


# helpers --------
makeurl <- function(x, y, z, member, intended_application) {
structure(stats::setNames(list(x), match_type(y)),
class = "ftdmurl", type = match_type(y), doi = z,
member = member, intended_application = intended_application)
}

check_url <- function(x) {
if (!grepl("https?://", x)) stop("Not a proper url") else x
}

match_type <- function(x) {
match.arg(x, c("xml","html","plain","pdf","unspecified","all"))
}
203 changes: 203 additions & 0 deletions R/ft_cr_links.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
#' Get Crossref full text links from a DOI
#'
#' @export
#' @param doi (character) A Digital Object Identifier (DOI). required.
#' @param type (character) One of 'xml', 'html', 'plain', 'pdf',
#' 'unspecified', or 'all' (default). required.
#' @param ... Named parameters passed on to [crul::HttpClient()]
#'
#' @details Note that this function is not vectorized.
#'
#' Some links returned will not in fact lead you to full text
#' content as you would understandbly think and expect. That is, if you
#' use the `filter` parameter with e.g., [rcrossref::cr_works()]
#' and filter to only full text content, some links may actually give back
#' only metadata for an article. Elsevier is perhaps the worst offender,
#' for one because they have a lot of entries in Crossref TDM, but most
#' of the links that are apparently full text are not in fact full text,
#' but only metadata. You can get full text if you are part of a subscribing
#' institution to that specific Elsever content, but otherwise, you're SOL.
#'
#' Note that there are still some bugs in the data returned form CrossRef.
#' For example, for the publisher eLife, they return a single URL with
#' content-type application/pdf, but the URL is not for a PDF, but for both
#' XML and PDF, and content-type can be set with that URL as either XML or
#' PDF to get that type.
#'
#' In another example, all Elsevier URLs at time of writing are have
#' `http` scheme, while those don't actually work, so we have a
#' custom fix in this function for that publisher. Anyway, expect changes...
#'
#' @section Register for the Polite Pool:
#' See of 'Authentication' setion of the [fulltext-package] manual page
#'
#' @return `NULL` if no full text links given; a list of tdmurl objects if
#' links found. a tdmurl object is an S3 class wrapped around a simple list,
#' with attributes for:
#'
#' - type: type, matchin type passed to the function
#' - doi: DOI
#' - member: Crossref member ID
#' - intended_application: intended application, e.g., text-mining
#'
#' @examples \dontrun{
#' dois <- c("10.1245/s10434-016-5211-6",
#' "10.17159/2413-3108/2016/v0i55a49", "10.17159/2413-3108/2015/v0i53a455",
#' "10.17159/2413-3108/2006/v0i18a982", "10.1007/s10665-016-9845-y",
#' "10.1016/j.ad.2015.06.020", "10.1016/j.medipa.2014.03.002")
#'
#' # pdf link
#' ft_cr_links(doi = "10.5555/515151", "pdf")
#'
#' # xml and plain text links
#' ft_cr_links(dois[1], "pdf")
#' ft_cr_links(dois[6], "xml")
#' ft_cr_links(dois[7], "plain")
#' ft_cr_links(dois[1]) # all is the default
#'
#' # pdf link
#' ft_cr_links(doi = "10.5555/515151", "pdf")
#' ft_cr_links(doi = "10.3897/phytokeys.52.5250", "pdf")
#'
#' # many calls, use e.g., lapply
#' lapply(dois[1:3], ft_cr_links)
#'
#' # elsevier
#' ## DOI that is open acccess
#' ft_cr_links('10.1016/j.physletb.2010.10.049')
#' ## DOI that is not open acccess
#' ft_cr_links('10.1006/jeth.1993.1066')
#' }
ft_cr_links <- function(doi, type = 'all', ...) {
res <- .crm_works_links(dois = doi, ...)[[1]]
if (is.null(unlist(res$links))) {
return(list())
} else {
elife <- grepl("elife", res$links[[1]]$URL)
withtype <- if (type == 'all') {
res$links
} else {
Filter(function(x) grepl(type, x$`content-type`), res$links)
}

if (is.null(withtype) || length(withtype) == 0) {
return(list())
} else {
withtype <- stats::setNames(withtype, sapply(withtype, function(x){
if (x$`content-type` == "unspecified") {
"unspecified"
} else {
strsplit(x$`content-type`, "/")[[1]][[2]]
}
}))

if (elife) {
withtype <- Filter(function(w) !grepl("lookup", w$URL), withtype)
}

if (basename(res$member) %in% c("2258", "179")) {
withtype <- lapply(withtype, function(z) {
z$URL <- sub("http://", "https://", z$URL)
z
})
}

if (basename(res$member) == "78") {
withtype <- lapply(withtype, function(z) {
z$URL <- sub("http://", "https://", z$URL)
z
})
pdf <- list(pdf =
utils::modifyList(withtype[[1]],
list(
URL = sub("text/xml", "application/pdf", withtype[[1]]$URL),
`content-type` = "application/pdf"
)
))
withtype <- c(withtype, pdf)
}

if (type == "all") {
lapply(withtype, function(b) {
makeurl(b$URL, st(b$`content-type`), doi, res$member, b$`intended-application`)
})
} else {
y <- match.arg(type, c('xml', 'plain', 'html', 'pdf', 'unspecified'))
makeurl(x = withtype[[y]]$URL, y = y, z = doi, res$member,
withtype[[y]]$`intended-application`)
}
}
}
}

.crm_works_links <- function(dois = NULL, ...) {
get_links <- function(x, ...) {
tmp <- crm_GET(sprintf("works/%s", x), NULL, FALSE, ...)
trylinks <- tryCatch(tmp$message$link, error = function(e) e)
if (inherits(trylinks, "error")) {
NULL
} else {
list(links = trylinks, member = tmp$message$member)
}
}
stats::setNames(lapply(dois, get_links, ...), dois)
}

st <- function(x){
if (grepl("/", x)) {
strsplit(x, "/")[[1]][[2]]
} else {
x
}
}

crm_GET <- function(endpoint, args = list(), todf = TRUE, on_error = warning,
parse = TRUE, ...) {

url <- sprintf("https://api.crossref.org/%s", endpoint)
cli <- crul::HttpClient$new(
url = url,
headers = list(
`User-Agent` = make_ua(),
`X-USER-AGENT` = make_ua()
),
opts = list(...)
)
res <- cli$get(query = args)
doi <- gsub("works/|/agency|funders/", "", endpoint)
if (!res$status_code < 300) {
on_error(sprintf("%s: %s - (%s)", res$status_code, get_err(res), doi),
call. = FALSE)
list(message = NULL)
} else {
stopifnot(res$response_headers$`content-type` ==
"application/json;charset=UTF-8")
res <- res$parse("UTF-8")
if (parse) jsonlite::fromJSON(res, todf) else res
}
}

get_err <- function(x) {
xx <- x$parse("UTF-8")
if (x$response_headers$`content-type` == "text/plain") {
tmp <- xx
} else if (x$response_headers$`content-type` == "text/html") {
html <- xml2::read_html(xx)
tmp <- xml2::xml_text(xml2::xml_find_first(html, '//h3[@class="info"]'))
} else if (
x$response_headers$`content-type` == "application/json;charset=UTF-8"
) {
tmp <- jsonlite::fromJSON(xx, FALSE)
} else {
tmp <- xx
}
if (inherits(tmp, "list")) {
tmp$message[[1]]$message
} else {
if (any(class(tmp) %in% c("HTMLInternalDocument", "xml_document"))) {
"Server error - check query - or api.crossref.org may have problems"
} else {
tmp
}
}
}
38 changes: 37 additions & 1 deletion R/ft_extract.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ ft_extract <- function(x) {
#' @export
ft_extract.character <- function(x) {
if (!file.exists(x)) stop("File does not exist", call. = FALSE)
res <- crminer::crm_extract(x)
res <- .crm_extract(x)
structure(list(meta = res$info, data = res$text), class = "pdft_char",
path = x)
}
Expand Down Expand Up @@ -68,3 +68,39 @@ print.pdft_char <- function(x, ...) {
cat(" Creation date: ", as.character(as.Date(x$meta$created)), "\n",
sep = "")
}

.crm_extract <- function(path = NULL, raw = NULL, try_ocr = FALSE, ...) {
assert(try_ocr, "logical")
stopifnot(xor(is.null(path), is.null(raw)))
if (!is.null(path)) {
path <- path.expand(path)
if (!file.exists(path)) stop("path does not exist", call. = FALSE)
} else {
assert(raw, "raw")
path <- raw
}
fun <- if (try_ocr) pdftools::pdf_ocr_text else pdftools::pdf_text

structure(
list(
info = pdftools::pdf_info(path, ...),
text = fun(path, ...)
),
class = "crm_pdf",
path = if (is.character(path)) path else 'raw'
)
}

#' @export
print.crm_pdf <- function(x, ...) {
cat("<document>", attr(x, "path"), "\n", sep = "")
cat(" Pages: ", x$info$pages, "\n", sep = "")
cat(" No. characters: ", sum(nchar(x$text)), "\n", sep = "")
cat(" Created: ", as.character(as.Date(x$info$created)), "\n",
sep = "")
}
#' @export
print.crm_pdf_text <- function(x, ...) {
cat("<document>", attr(x, "path"), "\n", sep = "")
cat(" No. characters: ", sum(nchar(x$text)), "\n", sep = "")
}
12 changes: 9 additions & 3 deletions R/fulltext-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -139,12 +139,18 @@
#'
#' **Crossref**: Crossref encourages requests with contact information
#' (an email address) and will forward you to a dedicated API cluster
#' for improved performance when you share your email address with them.
#' for improved performance when you share your email address with them. This
#' is called the "Polite Pool".
#' https://github.com/CrossRef/rest-api-doc#good-manners--more-reliable-service
#' To pass your email address to Crossref via this client, store it
#' as an environment variable in `.Renviron` like
#' `crossref_email = name@example.com`
#'
#' `crossref_email=name@example.com`, or `CROSSREF_EMAIL=name@example.com`.
#' Save the file and restart your R session. To stop sharing your email when
#' using rcrossref simply delete it from your `.Renviron` file OR to temporarily
#' not use your email unset it for the session
#' like `Sys.unsetenv('crossref_email')`. To be sure your in the polite pool
#' use curl verbose by e.g., `ft_cr_links(doi = "10.5555/515151", verbose = TRUE)`
#'
#' **Crossref TDM**: TDM = "Text and Data Mining". This used to apply to just
#' two publishers - Wiley and Elsevier - This service officially shut down at
#' the end of 2020. For Elsevier, see the
Expand Down
Loading

0 comments on commit 32c97fd

Please sign in to comment.