From 564bf38daedc5cbd014696b4bd7f8c4315867300 Mon Sep 17 00:00:00 2001 From: PAULOPERNA <41304883+PauloPerna@users.noreply.github.com> Date: Mon, 6 Dec 2021 11:19:12 -0300 Subject: [PATCH 1/5] Update ocr.R --- R/ocr.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/ocr.R b/R/ocr.R index c1a495b..165c61e 100644 --- a/R/ocr.R +++ b/R/ocr.R @@ -10,18 +10,18 @@ #' languge of the engine. #' @param pages which pages of the pdf file to extract #' @param dpi resolution to render image that is passed to [tesseract::ocr]. -pdf_ocr_text <- function(pdf, pages = NULL, opw = "", upw = "", language = "eng", dpi = 600){ +pdf_ocr_text <- function(pdf, pages = NULL, opw = "", upw = "", language = "eng", dpi = 600, verbose = FALSE){ engine <- tesseract::tesseract(language) - images <- pdf_convert(pdf = pdf, pages = pages, opw = opw, upw = upw, dpi = dpi) + images <- pdf_convert(pdf = pdf, pages = pages, opw = opw, upw = upw, dpi = dpi, verbose = verbose) on.exit(unlink(images)) vapply(images, tesseract::ocr, character(1), engine = engine, USE.NAMES = FALSE) } #' @export #' @rdname pdf_ocr -pdf_ocr_data <- function(pdf, pages = NULL, opw = "", upw = "", language = "eng", dpi = 600){ +pdf_ocr_data <- function(pdf, pages = NULL, opw = "", upw = "", language = "eng", dpi = 600, verbose = FALSE){ engine <- tesseract::tesseract(language) - images <- pdf_convert(pdf = pdf, pages = pages, opw = opw, upw = upw, dpi = dpi) + images <- pdf_convert(pdf = pdf, pages = pages, opw = opw, upw = upw, dpi = dpi, verbose = verbose) on.exit(unlink(images)) lapply(images, tesseract::ocr_data, engine = engine) } From 424de3c02ff1368a36f86128fa1052c62c626a49 Mon Sep 17 00:00:00 2001 From: PAULOPERNA <41304883+PauloPerna@users.noreply.github.com> Date: Mon, 6 Dec 2021 11:50:21 -0300 Subject: [PATCH 2/5] Update ocr.R --- R/ocr.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/ocr.R b/R/ocr.R index 165c61e..bc910be 100644 --- a/R/ocr.R +++ b/R/ocr.R @@ -10,6 +10,7 @@ #' 
languge of the engine. #' @param pages which pages of the pdf file to extract #' @param dpi resolution to render image that is passed to [tesseract::ocr]. +#' @param verbose passed on to [pdf_convert] as its `verbose` argument. pdf_ocr_text <- function(pdf, pages = NULL, opw = "", upw = "", language = "eng", dpi = 600, verbose = FALSE){ engine <- tesseract::tesseract(language) images <- pdf_convert(pdf = pdf, pages = pages, opw = opw, upw = upw, dpi = dpi, verbose = verbose) From 5442cf653a573cdc951a7846334c06d6ed94c64c Mon Sep 17 00:00:00 2001 From: PAULOPERNA <41304883+PauloPerna@users.noreply.github.com> Date: Mon, 6 Dec 2021 13:40:24 -0300 Subject: [PATCH 3/5] Update RcppExports.R --- R/RcppExports.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 16c4583..216d919 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -14,11 +14,11 @@ poppler_pdf_info <- function(x, opw, upw) { } poppler_pdf_data <- function(x, get_font_info, opw, upw) { - .Call('_pdftools_poppler_pdf_data', PACKAGE = 'pdftools', x, get_font_info, opw, upw) + .Call('_pdftools_poppler_pdf_data', PACKAGE = 'pdftools', x, get_font_info, opw, upw, verbose) } poppler_pdf_text <- function(x, opw, upw) { - .Call('_pdftools_poppler_pdf_text', PACKAGE = 'pdftools', x, opw, upw) + .Call('_pdftools_poppler_pdf_text', PACKAGE = 'pdftools', x, opw, upw, verbose) } poppler_pdf_pagesize <- function(x, opw, upw) { From cb7b5e95a9dc678b4de1fad287b966cd4ad462d0 Mon Sep 17 00:00:00 2001 From: PAULOPERNA <41304883+PauloPerna@users.noreply.github.com> Date: Mon, 6 Dec 2021 13:42:33 -0300 Subject: [PATCH 4/5] Update pdf_ocr.Rd --- man/pdf_ocr.Rd | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/man/pdf_ocr.Rd b/man/pdf_ocr.Rd index 3417b76..ac2d76c 100644 --- a/man/pdf_ocr.Rd +++ b/man/pdf_ocr.Rd @@ -11,7 +11,8 @@ pdf_ocr_text( opw = "", upw = "", language = "eng", - dpi = 600 + dpi = 600, + verbose = FALSE ) pdf_ocr_data( @@ -20,7 +21,8 @@ 
pdf_ocr_data( opw = "", upw = "", language = "eng", - dpi = 600 + dpi = 600, + verbose = FALSE ) } \arguments{ From 4c5f6e07f9f2b0687300762033ec2319b5c94c51 Mon Sep 17 00:00:00 2001 From: PAULOPERNA <41304883+PauloPerna@users.noreply.github.com> Date: Mon, 6 Dec 2021 14:14:56 -0300 Subject: [PATCH 5/5] Update RcppExports.R --- R/RcppExports.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 216d919..16c4583 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -14,11 +14,11 @@ poppler_pdf_info <- function(x, opw, upw) { } poppler_pdf_data <- function(x, get_font_info, opw, upw) { - .Call('_pdftools_poppler_pdf_data', PACKAGE = 'pdftools', x, get_font_info, opw, upw, verbose) + .Call('_pdftools_poppler_pdf_data', PACKAGE = 'pdftools', x, get_font_info, opw, upw) } poppler_pdf_text <- function(x, opw, upw) { - .Call('_pdftools_poppler_pdf_text', PACKAGE = 'pdftools', x, opw, upw, verbose) + .Call('_pdftools_poppler_pdf_text', PACKAGE = 'pdftools', x, opw, upw) } poppler_pdf_pagesize <- function(x, opw, upw) {