Skip to content

Commit

Permalink
Better error handling for guess_max
Browse files Browse the repository at this point in the history
  • Loading branch information
jimhester committed Feb 9, 2017
1 parent 61f899f commit 8f906bb
Show file tree
Hide file tree
Showing 8 changed files with 35 additions and 7 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
* parsing problems now include the filename (#581).
* Numeric parser now returns the full string if it contains no numbers (#548).
* `read_table()` can now handle `pipe()` connections (#552).
* The `guess_max` argument now throws errors on inappropriate inputs (#588).

* parsing problems in `read_delim()` and `read_fwf()` when columns are skipped using col_types now report the correct column name (#573, @cb4ds)

Expand Down
18 changes: 14 additions & 4 deletions R/col_types.R
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ col_concise <- function(x) {

col_spec_standardise <- function(file, col_names = TRUE, col_types = NULL,
guessed_types = NULL,
comment = "", skip = 0, n = 1000,
comment = "", skip = 0, guess_max = 1000,
tokenizer = tokenizer_csv(),
locale = default_locale(),
drop_skipped_names = FALSE) {
Expand Down Expand Up @@ -353,7 +353,7 @@ col_spec_standardise <- function(file, col_names = TRUE, col_types = NULL,
if (any(is_guess)) {
if (is.null(guessed_types)) {
ds <- datasource(file, skip = skip, comment = comment)
guessed_types <- guess_types(ds, tokenizer, locale, n = n)
guessed_types <- guess_types(ds, tokenizer, locale, guess_max = guess_max)
}

# Need to be careful here: there might be more guesses than types/names
Expand All @@ -364,8 +364,18 @@ col_spec_standardise <- function(file, col_names = TRUE, col_types = NULL,
spec
}

guess_types <- function(datasource, tokenizer, locale, n = 1000) {
guess_types_(datasource, tokenizer, locale, n = n)
guess_types <- function(datasource, tokenizer, locale, guess_max = 1000, max_limit = .Machine$integer.max %/% 100) {
if (is.na(guess_max) || guess_max < 0) {
stop("`guess_max` must be a positive integer", call. = FALSE)
}

if (guess_max > max_limit) {
warning("`guess_max` is a very large value, setting to `", max_limit,
"` to avoid exhausting memory", call. = FALSE)
guess_max <- max_limit
}

guess_types_(datasource, tokenizer, locale, n = guess_max)
}

guess_header <- function(datasource, tokenizer, locale = default_locale()) {
Expand Down
2 changes: 1 addition & 1 deletion R/read_delim.R
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ read_delimited <- function(file, tokenizer, col_names = TRUE, col_types = NULL,
}

spec <- col_spec_standardise(
data, skip = skip, comment = comment, n = guess_max,
data, skip = skip, comment = comment, guess_max = guess_max,
col_names = col_names, col_types = col_types, tokenizer = tokenizer,
locale = locale)

Expand Down
2 changes: 1 addition & 1 deletion R/read_fwf.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ read_fwf <- function(file, col_positions, col_types = NULL,
spec <- col_spec_standardise(
file,
skip = skip,
n = guess_max,
guess_max = guess_max,
tokenizer = tokenizer,
locale = locale,
col_names = col_positions$col_names,
Expand Down
2 changes: 1 addition & 1 deletion R/read_table.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ read_table <- function(file, col_names = TRUE, col_types = NULL,
tokenizer <- tokenizer_fwf(columns$begin, columns$end, na = na, comment = comment)

spec <- col_spec_standardise(
file = ds, skip = skip, n = guess_max,
file = ds, skip = skip, guess_max = guess_max,
col_names = col_names, col_types = col_types,
locale = locale, tokenizer = tokenizer
)
Expand Down
5 changes: 5 additions & 0 deletions R/type_convert.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@
#'
#' df <- data.frame(x = c("NA", "10"), stringsAsFactors = FALSE)
#' str(type_convert(df))
#'
#' # Type convert can be used to infer types from an entire dataset
#' type_convert(
#' read_csv(readr_example("mtcars.csv"),
#' col_types = cols(.default = col_character())))
type_convert <- function(df, col_types = NULL, na = c("", "NA"), trim_ws = TRUE,
locale = default_locale()) {
stopifnot(is.data.frame(df))
Expand Down
5 changes: 5 additions & 0 deletions man/type_convert.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions tests/testthat/test-col-spec.R
Original file line number Diff line number Diff line change
Expand Up @@ -226,3 +226,10 @@ test_that("long expressions are wrapped (597)", {
)
')
})

test_that("guess_types errors on invalid inputs", {
expect_error(col_spec_standardise("a,b,c\n", guess_max = NA), "`guess_max` must be a positive integer")
expect_error(col_spec_standardise("a,b,c\n", guess_max = -1), "`guess_max` must be a positive integer")

expect_warning(col_spec_standardise("a,b,c\n", guess_max = Inf), "`guess_max` is a very large value")
})

0 comments on commit 8f906bb

Please sign in to comment.