-
Notifications
You must be signed in to change notification settings - Fork 71
/
extract_tables.R
156 lines (146 loc) · 8.51 KB
/
extract_tables.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#' @title extract_tables
#' @description Extract tables from a file
#' @param file A character string specifying the path or URL to a PDF file.
#' @param pages An optional integer vector specifying pages to extract from.
#' @param area An optional list, of length equal to the number of pages specified, where each entry contains a four-element numeric vector of coordinates (top,left,bottom,right) containing the table for the corresponding page. As a convenience, a list of length 1 can be used to extract the same area from all (specified) pages. Only specify \code{area} or \code{columns}. Warning: \code{area} is ignored if \code{guess} is \code{TRUE}.
#' @param columns An optional list, of length equal to the number of pages specified, where each entry contains a numeric vector of horizontal (x) coordinates separating columns of data for the corresponding page. As a convenience, a list of length 1 can be used to specify the same columns for all (specified) pages. Only specify \code{area} or \code{columns}. Warning: \code{columns} is ignored if \code{guess} is \code{TRUE}.
#' @param col_names A logical indicating whether to include column names in the output tibbles. Default is \code{TRUE}.
#' @param guess A logical indicating whether to guess the locations of tables on each page. If \code{FALSE}, \code{area} or \code{columns} must be specified; if \code{TRUE}, \code{area} and \code{columns} are ignored.
#' @param method A string identifying the preferred method of table extraction.
#' \itemize{
#' \item \code{method = "decide"} (default) automatically decide (for each page) whether spreadsheet-like formatting is present and "lattice" is appropriate
#' \item \code{method = "lattice"} use Tabula's spreadsheet extraction algorithm
#' \item \code{method = "stream"} use Tabula's basic extraction algorithm
#' }
#' @param output A string specifying the output format that the Java response object (a Java ArrayList of Tabula Tables) is coerced to. The default, \dQuote{tibble}, returns a list of tibbles. See Details for other options.
#' @param outdir Output directory for files if \code{output} is set to
#' \code{"csv"}, \code{"tsv"} or \code{"json"}, ignored otherwise. If
#' \code{NULL} (default), uses the R session's temporary directory \code{tempdir()}.
#' @param password Optionally, a character string containing a user password to access a secured PDF.
#' @param encoding Optionally, a character string specifying an encoding for the text, to be passed to the assignment method of \code{\link[base]{Encoding}}.
#' @param copy Specifies whether the original local file(s) should be copied to
#' \code{tempdir()} before processing. \code{FALSE} by default. The argument is
#' ignored if \code{file} is a URL.
#' @param \dots These are additional arguments passed to the internal functions dispatched by \code{method}.
#' @details This function mimics the behavior of the Tabula command line utility. By default, it returns a list of tibbles containing the tables extracted from a file. This behavior can be changed with the \code{output} argument:
#' \itemize{
#' \item \code{output = "tibble"} (default) attempts to coerce the structure returned by \code{output = "character"} into a list of tibbles and returns character strings where this fails.
#' \item \code{output = "matrix"} returns a list of character matrices.
#' \item \code{output = "character"} returns a list of single-element character vectors, where each vector is a tab-delimited, line-separated string of concatenated table cells.
#' \item \code{output = "csv"} writes the tables to comma-separated (CSV) files using Tabula's CSVWriter method in the directory given by \code{outdir}. \code{output = "tsv"} does the same but with tab-separated (TSV) files using Tabula's TSVWriter and \code{output = "json"} does the same using Tabula's JSONWriter method. Each of these three options returns the path to the directory containing the extracted table files.
#' \item \code{output = "asis"} returns the Java object reference, which can be useful for debugging or for writing a custom parser.
#' }
#' \code{\link{extract_areas}} implements this functionality in an interactive mode allowing the user to specify extraction areas for each page.
#' @return By default, a list of tibbles. This can be changed by specifying an alternative value of \code{output} (see Details).
#' @references \href{https://tabula.technology/}{Tabula}
#' @author Thomas J. Leeper <thosjleeper@gmail.com>, Tom Paskhalis <tpaskhalis@gmail.com>
#' @examples
#' # simple demo file
#' f <- system.file("examples", "mtcars.pdf", package = "tabulapdf")
#'
#' # extract tables from only second page
#' extract_tables(f, pages = 2)
#' @seealso \code{\link{extract_areas}}, \code{\link{get_page_dims}}, \code{\link{make_thumbnails}}, \code{\link{split_pdf}}
#' @importFrom utils download.file
#' @importFrom readr read_delim
#' @importFrom tools file_path_sans_ext
#' @importFrom rJava J new .jfloat .jcall
#' @export
extract_tables <- function(file,
                           pages = NULL,
                           area = NULL,
                           columns = NULL,
                           col_names = TRUE,
                           guess = TRUE,
                           method = c("decide", "lattice", "stream"),
                           output = c(
                             "tibble", "matrix", "character",
                             "asis", "csv", "tsv", "json"
                           ),
                           outdir = NULL,
                           password = NULL,
                           encoding = NULL,
                           copy = FALSE,
                           ...) {
  # Resolve the enumerated arguments to a single, validated string each.
  method <- match.arg(method)
  output <- match.arg(output)

  if (isTRUE(guess) && (!is.null(area) || !is.null(columns))) {
    warning("Argument guess is TRUE: arguments area and columns are ignored.")
  }

  # Fall back to the session temp directory when no output directory is given.
  outdir <- normalizePath(if (is.null(outdir)) tempdir() else outdir)

  # Open the document and make sure it is closed however the function exits;
  # add = TRUE keeps any other exit handlers intact.
  pdfDocument <- load_doc(file, password = password, copy = copy)
  on.exit(pdfDocument$close(), add = TRUE)
  oe <- new(J("technology.tabula.ObjectExtractor"), pdfDocument)

  # Build the page iterator: every page, or only the requested ones.
  if (is.null(pages)) {
    pageIterator <- oe$extract()
  } else {
    pages <- as.integer(pages)
    pageIterator <- oe$extract(make_pages(pages))
  }

  # Per-page extraction areas and column separators.
  npages <- pdfDocument$getNumberOfPages()
  area <- make_area(area = area, pages = pages, npages = npages, target = "tabula")
  columns <- make_columns(columns = columns, pages = pages, npages = npages)

  # Extraction algorithms shared by all pages.
  basicExtractor <- new(J("technology.tabula.extractors.BasicExtractionAlgorithm"))
  spreadsheetExtractor <- new(J("technology.tabula.extractors.SpreadsheetExtractionAlgorithm"))
  # Table detector, created on first use and then reused for later pages.
  detector <- NULL

  # With an explicit method the algorithm is fixed up front; with "decide" it
  # is chosen per page inside the loop.
  use <- if (method == "decide") NULL else method

  tables <- new(J("java.util.ArrayList"))
  p <- 1L # position of the current page within the iterator
  while (.jcall(pageIterator, "Z", "hasNext")) {
    page <- .jcall(pageIterator, "Ljava/lang/Object;", "next")

    # NOTE(review): this crop is applied whenever an area entry exists, even if
    # guess is TRUE, although the warning above says area is ignored in that
    # case -- confirm which behavior is intended.
    if (!is.null(area[[p]])) {
      page <- page$getArea(area[[p]])
    }

    # decide whether to use spreadsheet or basic extractor
    if (method == "decide") {
      tabular <- spreadsheetExtractor$isTabular(page)
      use <- if (identical(FALSE, tabular)) "stream" else "lattice"
    }

    # NOTE(review): the spreadsheet extractor is only reached when guess is
    # TRUE; with guess = FALSE the basic extractor is used whatever `method`
    # says -- confirm this is intended.
    if (isTRUE(guess) && use == "lattice") {
      tables$add(spreadsheetExtractor$extract(page))
    } else if (isTRUE(guess)) {
      # detect table locations, then run the basic extractor on each one
      if (is.null(detector)) {
        detector <- new(J("technology.tabula.detectors.NurminenDetectionAlgorithm"))
      }
      guessesIterator <- detector$detect(page)$iterator()
      while (.jcall(guessesIterator, "Z", "hasNext")) {
        guessRect <- .jcall(guessesIterator, "Ljava/lang/Object;", "next")
        tables$add(basicExtractor$extract(page$getArea(guessRect)))
      }
    } else if (is.null(columns[[p]])) {
      tables$add(basicExtractor$extract(page))
    } else {
      tables$add(basicExtractor$extract(page, columns[[p]]))
    }

    p <- p + 1L # iterate page number
  }

  # Convert (or write out) the collected tables in the requested format.
  # `output` has already been validated by match.arg(), so one arm always
  # matches.
  switch(output,
    "csv" = write_csvs(tables, file = file, outdir = outdir, ...),
    "tsv" = write_tsvs(tables, file = file, outdir = outdir, ...),
    "json" = write_jsons(tables, file = file, outdir = outdir, ...),
    "character" = list_characters(tables, encoding = encoding, ...),
    "matrix" = list_matrices(tables, encoding = encoding, ...),
    "tibble" = list_tibbles(tables, encoding = encoding, col_names = col_names, ...),
    "asis" = tables
  )
}