#' Read a text file or files
#'
#' Read texts and (if any) associated document-level meta-data from one or more source files.
#' The texts come from the textual component of the source files, and the
#' document-level metadata ("docvars") come from either the file contents
#' or the filenames.
#' @param file the complete filename(s) to be read. This is designed to
#' automagically handle a number of common scenarios, so the value can be a
#' single filename, a vector of file names, a remote URL, or a file "mask" using a
#' "glob"-type wildcard value. Currently available filetypes are:
#'
#' **Single file formats:**
#'
#' \describe{
#' \item{`txt`}{plain text files.
#' So-called structured text files, which describe both texts and metadata:
#' for all structured text filetypes, the column, field, or node
#' which contains the text must be specified with the `text_field`
#' parameter, and all other fields are treated as docvars.}
#' \item{`json`}{data in some form of JavaScript
#' Object Notation, consisting of the texts and optionally additional docvars.
#' The supported formats are:
#' \itemize{
#' \item a single JSON object per file
#' \item line-delimited JSON, with one object per line
#' \item line-delimited JSON, of the format produced from a Twitter stream.
#' This type of file has special handling which simplifies the Twitter format
#' into docvars. The correct format for each JSON file is automatically detected.}}
#' \item{`csv,tab,tsv`}{comma- or tab-separated values}
#' \item{`html`}{HTML documents, including specialized formats from known
#' sources, such as Nexis-formatted HTML. See the `source` parameter
#' below.}
#' \item{`xml`}{XML documents are supported -- those of the
#' kind that can be read by [xml2::read_xml()] and navigated through
#' [xml2::xml_find_all()]. For XML files, an additional
#' argument `collapse` may be passed through `...` that names the character(s) to use in
#' appending different text elements together.}
#' \item{`pdf`}{PDF formatted files, converted through \pkg{pdftools}.}
#' \item{`odt`}{Open Document Text formatted files.}
#' \item{`doc, docx`}{Microsoft Word formatted files.}
#' \item{`rtf`}{Rich Text Format files.}
#'
#' **Reading multiple files and file types:**
#'
#' In addition, `file` need not be a path to a single
#' local file; it can also be any combination of the above types, such as:
#' \item{a wildcard value}{any valid
#' pathname with a wildcard ("glob") expression that can be expanded by the
#' operating system. This may match multiple file types.}
#' \item{a URL to a remote file}{which is first downloaded, then loaded}
#' \item{`zip,tar,tar.gz,tar.bz`}{archive files, which are unzipped. The
#' contained files must be either at the top level or in a single directory.
#' Archives, remote URLs, and glob patterns can resolve to any of the other
#' filetypes, so you could have, for example, a remote URL to a zip file
#' containing Twitter JSON files (see the examples below).}
#' }
#' @param text_field,docid_field a variable (column) name or column number
#' indicating where to find the texts that form the documents for the corpus
#' and their identifiers. This must be specified for `.csv`,
#' `.json`, and `.xls`/`.xlsx` files. For XML files, an XPath
#' expression can be specified.
#' @param docvarsfrom used to specify that docvars should be taken from the
#' filenames, when the `readtext` inputs are filenames and the elements
#' of the filenames are document variables, separated by a delimiter
#' (`dvsep`). This allows easy assignment of docvars from filenames such
#' as `1789-Washington.txt`, `1793-Washington.txt`, etc., split by `dvsep`,
#' or from meta-data embedded in the text file header (`headers`).
#' If `docvarsfrom` is set to `"filepaths"`, the full path to the
#' file is considered, not just the filename.
#' @param dvsep separator (a regular expression character string) used in
#' filenames to delimit docvar elements if `docvarsfrom="filenames"`
#' or `docvarsfrom="filepaths"` is used
#' @param docvarnames character vector of variable names for `docvars`, if
#' `docvarsfrom` is specified. If this argument is not used, default
#' docvar names will be used (`docvar1`, `docvar2`, ...).
#' @param encoding vector: either the encoding of all files, or one encoding
#' for each file
#' @param ignore_missing_files if `FALSE`, an error is thrown when the `file`
#' argument does not resolve to an existing file.
#' Note that this can happen in a number of ways, including passing a path
#' to a file that does not exist, to an empty archive file, or to a glob
#' pattern that matches no files.
#' @param source used to specify specific formats of some input file types, such
#' as JSON or HTML. Currently supported types are `"twitter"` for JSON and
#' `"nexis"` for HTML.
#' @param cache if `TRUE`, save remote file to a temporary folder. Only used
#' when `file` is a URL.
#' @param verbosity \itemize{
#' \item 0: output errors only
#' \item 1: output errors and warnings (default)
#' \item 2: output a brief summary message
#' \item 3: output detailed file-related messages
#' } See the examples for a demonstration of level 3.
#' @param ... additional arguments passed through to low-level file reading
#' functions, such as [file()], [fread()], etc. Useful
#' for specifying an input encoding option, which is specified in the same way
#' as it would be given to [iconv()]. See the Encoding section of
#' [file] for details.
#' @return a data.frame consisting of the columns `doc_id` and `text`
#' that contain a document identifier and the texts respectively, with any
#' additional columns consisting of document-level variables either found
#' in the file containing the texts, or created through the
#' `readtext` call.
#' @export
#' @importFrom utils unzip type.convert
#' @importFrom httr GET write_disk
#' @examples
#' \dontrun{
#' ## get the data directory
#' if (!interactive()) pkgload::load_all()
#' DATA_DIR <- system.file("extdata/", package = "readtext")
#'
#' ## read in some text data
#' # all UDHR files
#' (rt1 <- readtext(paste0(DATA_DIR, "/txt/UDHR/*")))
#'
#' # manifestos with docvars from filenames
#' (rt2 <- readtext(paste0(DATA_DIR, "/txt/EU_manifestos/*.txt"),
#'                  docvarsfrom = "filenames",
#'                  docvarnames = c("unit", "context", "year", "language", "party"),
#'                  encoding = "LATIN1"))
#'
#' # recurse through subdirectories
#' (rt3 <- readtext(paste0(DATA_DIR, "/txt/movie_reviews/*"),
#'                  docvarsfrom = "filepaths", docvarnames = "sentiment"))
#'
#' ## read in csv data
#' (rt4 <- readtext(paste0(DATA_DIR, "/csv/inaugCorpus.csv")))
#'
#' ## read in tab-separated data
#' (rt5 <- readtext(paste0(DATA_DIR, "/tsv/dailsample.tsv"), text_field = "speech"))
#'
#' ## read in JSON data
#' (rt6 <- readtext(paste0(DATA_DIR, "/json/inaugural_sample.json"), text_field = "texts"))
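#'
#' ## read JSON from a remote archive -- a hedged sketch only: the URL below is
#' ## hypothetical, illustrating that `file` may be a URL or archive that
#' ## resolves to Twitter JSON (see the `file` and `source` parameters)
#' # rt6b <- readtext("https://example.com/tweets.zip", source = "twitter")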
#'
#' ## read in pdf data
#' # UDHR
#' (rt7 <- readtext(paste0(DATA_DIR, "/pdf/UDHR/*.pdf"),
#'                  docvarsfrom = "filenames",
#'                  docvarnames = c("document", "language")))
#' Encoding(rt7$text)
#'
#' ## read in Word data (.doc)
#' (rt8 <- readtext(paste0(DATA_DIR, "/word/*.doc")))
#' Encoding(rt8$text)
#'
#' ## read in Word data (.docx)
#' (rt9 <- readtext(paste0(DATA_DIR, "/word/*.docx")))
#' Encoding(rt9$text)
#'
#' ## use elements of path and filename as docvars
#' (rt10 <- readtext(paste0(DATA_DIR, "/pdf/UDHR/*.pdf"),
#'                   docvarsfrom = "filepaths", dvsep = "[/_.]"))
#' }
readtext <- function(file, ignore_missing_files = FALSE, text_field = NULL,
                     docid_field = NULL,
                     docvarsfrom = c("metadata", "filenames", "filepaths"), dvsep = "_",
                     docvarnames = NULL, encoding = NULL, source = NULL, cache = TRUE,
                     verbosity = readtext_options("verbosity"),
                     ...) {
    args <- list(...)
    if ("textfield" %in% names(args)) {
        warning("textfield is deprecated; use text_field instead.")
        text_field <- args[["textfield"]]
    }
    # # in case the function was called without attaching the package,
    # # in which case the option is never set
    # if (is.null(verbosity))
    #     verbosity <- 1
    if (!verbosity %in% 0:3)
        stop("verbosity must be one of 0, 1, 2, 3.")
    if (!all(is.character(file)))
        stop("file must be a character (specifying file location(s)).")
    if (!is.null(source) && !is.character(source))
        stop("source must be a character.")
    docvarsfrom <- match.arg(docvarsfrom)
    # # just use the first, if both are specified?
    # if (is.missing(docvarsfrom))
    #
    # if (!all(docvarsfrom %in% c("metadata", "filenames")))
    #     stop("illegal docvarsfrom value")
    if (is.null(text_field))
        text_field <- 1
    if (is.null(encoding))
        encoding <- getOption("encoding")
    if (is.null(source))
        source <- "auto"
    if (verbosity >= 2)
        message("Reading texts from ", file)
    # TODO: files need to be imported as they are discovered. Currently
    # list_files() uses a lot of storage space for temporary files when there
    # are a lot of archives.
    files <- list_files(file, ignore_missing_files, FALSE, cache, verbosity)
    if (length(encoding) == 1) {
        encoding <- rep(encoding, length(files))
    } else {
        if (length(encoding) != length(files))
            stop("Encoding parameter must be length 1, or as long as the number of files")
    }
    sources <- mapply(function(x, e) {
        get_source(x, text_field = text_field, docid_field = docid_field,
                   encoding = e, source = source, verbosity = verbosity, ...)
    }, files, encoding, SIMPLIFY = FALSE)
    # combine all of the data.frames returned
    result <- data.frame(doc_id = "",
                         data.table::rbindlist(sources, use.names = TRUE, fill = TRUE),
                         stringsAsFactors = FALSE)
    # this is in case some smart-alec (like AO) globs different directories
    # for identical filenames
    ids <- lapply(sources, row.names)
    id <- unlist(ids, use.names = FALSE)
    if (any(duplicated(id))) {
        prefix <- rep(basename_unique(files, path_only = TRUE), lengths(ids))
        # if (lengths(prefix) > 1)
        id <- paste(prefix, id, sep = "/")
    }
    if (docvarsfrom %in% c("filepaths", "filenames")) {
        docvar <- get_docvars_filenames(files, dvsep, docvarnames,
                                        docvarsfrom == "filepaths", verbosity)
        result <- cbind(result, impute_types(docvar))
    }
    # change rownames to doc_id
    result$doc_id <- id
    rownames(result) <- NULL
    if (verbosity >= 2)
message(" ... read ", nrow(result), " document", if (nrow(result) == 1) "" else "s.")
    class(result) <- c("readtext", "data.frame")
    result
}
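
# A minimal sketch (not run; filenames are hypothetical) of how the `encoding`
# argument above is recycled: a single value applies to all files, otherwise
# one value per file is required.
#   readtext(c("a.txt", "b.txt"), encoding = "UTF-8")               # recycled to both
#   readtext(c("a.txt", "b.txt"), encoding = c("UTF-8", "latin1"))  # one per file
#   readtext(c("a.txt", "b.txt"), encoding = c("UTF-8", "latin1", "ASCII"))  # error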
## Read each file as appropriate, calling the get_* functions for recognized
## file types
get_source <- function(path, text_field, docid_field, replace_specialchar = FALSE,
                       verbosity = 1, ...,
                       # deprecated arguments
                       textfield) {
    ext <- tolower(file_ext(path))
    if (ext %in% extensions()) {
        if (dir.exists(path)) {
            call <- deparse(sys.call(1))
            call <- sub(path, paste0(sub("/$", "", path), "/*"), call, fixed = TRUE)
            stop("File '", path, "' does not exist, but a directory of this name does exist. ",
                 "To read all files in a directory, you must pass a glob expression like ", call, ".")
        }
    } else {
        if (verbosity >= 1)
warning("Unsupported extension ", sQuote(ext), " of file ", path , " treating as plain text.")
ext <- "txt"
}
if (verbosity >= 3)
message(" ... reading (", ext, ") file: ", path)
result <- switch(ext,
txt = get_txt(path, ...),
csv = get_csv(path, text_field, docid_field, sep = ",", ...),
tsv = get_csv(path, text_field, docid_field, sep = "\t", ...),
tab = get_csv(path, text_field, docid_field, sep = "\t", ...),
json = get_json(path, text_field, docid_field, verbosity = verbosity, ...),
xml = get_xml(path, text_field, verbosity = verbosity, ...),
html = get_html(path, verbosity = verbosity, ...),
pdf = get_pdf(path, ...),
odt = get_odt(path, ...),
docx = get_docx(path, ...),
doc = get_doc(path, ...),
rtf = get_rtf(path, ...),
xls = get_excel(path, text_field, docid_field, ...),
xlsx = get_excel(path, text_field, docid_field, ...),
ods = get_ods(path, text_field, docid_field, ...)
)
# assign filename (variants) unique text names
len <- nrow(result)
# TODO: stop using row.names as it errors when duplicated
if (len > 1) {
if (is.null(docid_field))
row.names(result) <- paste(basename(path), seq_len(len), sep = ".")
} else {
row.names(result) <- basename(path)
}
if (replace_specialchar)
result$text <- replace_charclass(result$text)
return(result)
}
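
# Illustrative only (internal function; the filename here is hypothetical):
# a file with an unrecognised extension falls back to plain-text reading
# with a warning.
#   get_source("notes.md", text_field = 1, docid_field = NULL)
#   # Warning: Unsupported extension 'md' of file notes.md, treating as plain text.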
replace_charclass <- function(text) {
    # map Unicode punctuation character classes to plain ASCII equivalents;
    # the names of `mapping` are the regex patterns, the values the replacements
    mapping <- c("\\p{Dash_Punctuation}"    = "-",
                 "\\p{Space_Separator}"     = " ",
                 "\\p{Initial_Punctuation}" = "'",
                 "\\p{Final_Punctuation}"   = "'",
                 "\\p{Private_Use}"         = "",
                 "\\p{Unassigned}"          = "")
    for (i in seq_along(mapping))
        text <- stri_replace_all(text, mapping[[i]], regex = names(mapping)[i])
    return(text)
}
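
# Illustrative only (hypothetical input): replace_charclass() normalises
# typographic punctuation, e.g. en/em dashes become "-" and curly quotes "'".
#   replace_charclass("\u2013x\u2014 \u2018y\u2019")
#   ## [1] "-x- 'y'"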