Skip to content

Commit

Permalink
Better error handling for guess_max (#590)
Browse files Browse the repository at this point in the history
* Better error handling for guess_max

Fixes #588

* More robust checks for guess_max

* Line up arguments in example

* Simplify boolean
  • Loading branch information
jimhester authored Feb 9, 2017
1 parent 61f899f commit 8e7febc
Show file tree
Hide file tree
Showing 9 changed files with 60 additions and 7 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
* parsing problems now include the filename (#581).
* Numeric parser now returns the full string if it contains no numbers (#548).
* `read_table()` can now handle `pipe()` connections (#552).
* The `guess_max` argument now throws errors on inappropriate inputs (#588).

* parsing problems in `read_delim()` and `read_fwf()` when columns are skipped using col_types now report the correct column name (#573, @cb4ds)

Expand Down
28 changes: 24 additions & 4 deletions R/col_types.R
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ col_concise <- function(x) {

col_spec_standardise <- function(file, col_names = TRUE, col_types = NULL,
guessed_types = NULL,
comment = "", skip = 0, n = 1000,
comment = "", skip = 0, guess_max = 1000,
tokenizer = tokenizer_csv(),
locale = default_locale(),
drop_skipped_names = FALSE) {
Expand Down Expand Up @@ -353,7 +353,7 @@ col_spec_standardise <- function(file, col_names = TRUE, col_types = NULL,
if (any(is_guess)) {
if (is.null(guessed_types)) {
ds <- datasource(file, skip = skip, comment = comment)
guessed_types <- guess_types(ds, tokenizer, locale, n = n)
guessed_types <- guess_types(ds, tokenizer, locale, guess_max = guess_max)
}

# Need to be careful here: there might be more guesses than types/names
Expand All @@ -364,8 +364,28 @@ col_spec_standardise <- function(file, col_names = TRUE, col_types = NULL,
spec
}

guess_types <- function(datasource, tokenizer, locale, n = 1000) {
guess_types_(datasource, tokenizer, locale, n = n)

check_guess_max <- function(guess_max, max_limit = .Machine$integer.max %/% 100) {

if (length(guess_max) != 1 || !is.numeric(guess_max) || !is_integerish(guess_max) ||
is.na(guess_max) || guess_max < 0) {
stop("`guess_max` must be a positive integer", call. = FALSE)
}

if (guess_max > max_limit) {
warning("`guess_max` is a very large value, setting to `", max_limit,
"` to avoid exhausting memory", call. = FALSE)
guess_max <- max_limit
}
guess_max
}

guess_types <- function(datasource, tokenizer, locale, guess_max = 1000,
max_limit = .Machine$integer.max %/% 100) {

guess_max <- check_guess_max(guess_max, max_limit)

guess_types_(datasource, tokenizer, locale, n = guess_max)
}

guess_header <- function(datasource, tokenizer, locale = default_locale()) {
Expand Down
2 changes: 1 addition & 1 deletion R/read_delim.R
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ read_delimited <- function(file, tokenizer, col_names = TRUE, col_types = NULL,
}

spec <- col_spec_standardise(
data, skip = skip, comment = comment, n = guess_max,
data, skip = skip, comment = comment, guess_max = guess_max,
col_names = col_names, col_types = col_types, tokenizer = tokenizer,
locale = locale)

Expand Down
2 changes: 1 addition & 1 deletion R/read_fwf.R
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ read_fwf <- function(file, col_positions, col_types = NULL,
spec <- col_spec_standardise(
file,
skip = skip,
n = guess_max,
guess_max = guess_max,
tokenizer = tokenizer,
locale = locale,
col_names = col_positions$col_names,
Expand Down
2 changes: 1 addition & 1 deletion R/read_table.R
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ read_table <- function(file, col_names = TRUE, col_types = NULL,
tokenizer <- tokenizer_fwf(columns$begin, columns$end, na = na, comment = comment)

spec <- col_spec_standardise(
file = ds, skip = skip, n = guess_max,
file = ds, skip = skip, guess_max = guess_max,
col_names = col_names, col_types = col_types,
locale = locale, tokenizer = tokenizer
)
Expand Down
5 changes: 5 additions & 0 deletions R/type_convert.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@
#'
#' df <- data.frame(x = c("NA", "10"), stringsAsFactors = FALSE)
#' str(type_convert(df))
#'
#' # Type convert can be used to infer types from an entire dataset
#' type_convert(
#' read_csv(readr_example("mtcars.csv"),
#' col_types = cols(.default = col_character())))
type_convert <- function(df, col_types = NULL, na = c("", "NA"), trim_ws = TRUE,
locale = default_locale()) {
stopifnot(is.data.frame(df))
Expand Down
4 changes: 4 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,7 @@ show_progress <- function() {
deparse2 <- function(expr, ..., sep = "\n") {
paste(deparse(expr, ...), collapse = sep)
}

is_integerish <- function(x) {
floor(x) == x
}
5 changes: 5 additions & 0 deletions man/type_convert.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions tests/testthat/test-col-spec.R
Original file line number Diff line number Diff line change
Expand Up @@ -226,3 +226,21 @@ test_that("long expressions are wrapped (597)", {
)
')
})

test_that("guess_types errors on invalid inputs", {
expect_error(col_spec_standardise("a,b,c\n", guess_max = NA), "`guess_max` must be a positive integer")
expect_error(col_spec_standardise("a,b,c\n", guess_max = -1), "`guess_max` must be a positive integer")

expect_warning(col_spec_standardise("a,b,c\n", guess_max = Inf), "`guess_max` is a very large value")
})

test_that("check_guess_max errors on invalid inputs", {
expect_error(check_guess_max(NULL), "`guess_max` must be a positive integer")
expect_error(check_guess_max("test"), "`guess_max` must be a positive integer")
expect_error(check_guess_max(letters), "`guess_max` must be a positive integer")
expect_error(check_guess_max(1:2), "`guess_max` must be a positive integer")
expect_error(check_guess_max(NA), "`guess_max` must be a positive integer")
expect_error(check_guess_max(-1), "`guess_max` must be a positive integer")

expect_warning(check_guess_max(Inf), "`guess_max` is a very large value")
})

0 comments on commit 8e7febc

Please sign in to comment.