Better error handling for guess_max

Fixes tidyverse#588
jimhester · Feb 9, 2017 · 8f906bb · 8f906bb
1 parent 61f899f
commit 8f906bb
Show file tree

Hide file tree

Showing 8 changed files with 35 additions and 7 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -8,6 +8,7 @@
 * parsing problems now include the filename (#581).
 * Numeric parser now returns the full string if it contains no numbers (#548).
 * `read_table()` can now handle `pipe()` connections (#552).
+* The `guess_max` argument now throws errors on inappropriate inputs (#588).
 
 * parsing problems in `read_delim()` and `read_fwf()` when columns are skipped using col_types now report the correct column name (#573, @cb4ds)
 

diff --git a/R/col_types.R b/R/col_types.R
@@ -218,7 +218,7 @@ col_concise <- function(x) {
 
 col_spec_standardise <- function(file, col_names = TRUE, col_types = NULL,
                                  guessed_types = NULL,
-                                 comment = "", skip = 0, n = 1000,
+                                 comment = "", skip = 0, guess_max = 1000,
                                  tokenizer = tokenizer_csv(),
                                  locale = default_locale(),
                                  drop_skipped_names = FALSE) {
@@ -353,7 +353,7 @@ col_spec_standardise <- function(file, col_names = TRUE, col_types = NULL,
   if (any(is_guess)) {
     if (is.null(guessed_types)) {
       ds <- datasource(file, skip = skip, comment = comment)
-      guessed_types <- guess_types(ds, tokenizer, locale, n = n)
+      guessed_types <- guess_types(ds, tokenizer, locale, guess_max = guess_max)
     }
 
     # Need to be careful here: there might be more guesses than types/names
@@ -364,8 +364,18 @@ col_spec_standardise <- function(file, col_names = TRUE, col_types = NULL,
   spec
 }
 
-guess_types <- function(datasource, tokenizer, locale, n = 1000) {
-  guess_types_(datasource, tokenizer, locale, n = n)
+guess_types <- function(datasource, tokenizer, locale, guess_max = 1000, max_limit = .Machine$integer.max %/% 100) {
+  if (is.na(guess_max) || guess_max < 0) {
+    stop("`guess_max` must be a positive integer", call. = FALSE)
+  }
+
+  if (guess_max > max_limit) {
+    warning("`guess_max` is a very large value, setting to `", max_limit,
+      "` to avoid exhausting memory", call. = FALSE)
+    guess_max <- max_limit
+  }
+
+  guess_types_(datasource, tokenizer, locale, n = guess_max)
 }
 
 guess_header <- function(datasource, tokenizer, locale = default_locale()) {

diff --git a/R/read_delim.R b/R/read_delim.R
@@ -174,7 +174,7 @@ read_delimited <- function(file, tokenizer, col_names = TRUE, col_types = NULL,
   }
 
   spec <- col_spec_standardise(
-    data, skip = skip, comment = comment, n = guess_max,
+    data, skip = skip, comment = comment, guess_max = guess_max,
     col_names = col_names, col_types = col_types, tokenizer = tokenizer,
     locale = locale)
 

diff --git a/R/read_fwf.R b/R/read_fwf.R
@@ -41,7 +41,7 @@ read_fwf <- function(file, col_positions, col_types = NULL,
   spec <- col_spec_standardise(
     file,
     skip = skip,
-    n = guess_max,
+    guess_max = guess_max,
     tokenizer = tokenizer,
     locale = locale,
     col_names = col_positions$col_names,

diff --git a/R/read_table.R b/R/read_table.R
@@ -37,7 +37,7 @@ read_table <- function(file, col_names = TRUE, col_types = NULL,
   tokenizer <- tokenizer_fwf(columns$begin, columns$end, na = na, comment = comment)
 
   spec <- col_spec_standardise(
-    file = ds, skip = skip, n = guess_max,
+    file = ds, skip = skip, guess_max = guess_max,
     col_names = col_names, col_types = col_types,
     locale = locale, tokenizer = tokenizer
   )

diff --git a/R/type_convert.R b/R/type_convert.R
@@ -33,6 +33,11 @@
 #'
 #' df <- data.frame(x = c("NA", "10"), stringsAsFactors = FALSE)
 #' str(type_convert(df))
+#'
+#' # Type convert can be used to infer types from an entire dataset
+#' type_convert(
+#'   read_csv(readr_example("mtcars.csv"),
+#'     col_types = cols(.default = col_character())))
 type_convert <- function(df, col_types = NULL, na = c("", "NA"), trim_ws = TRUE,
                          locale = default_locale()) {
   stopifnot(is.data.frame(df))

diff --git a/man/type_convert.Rd b/man/type_convert.Rd
diff --git a/tests/testthat/test-col-spec.R b/tests/testthat/test-col-spec.R
@@ -226,3 +226,10 @@ test_that("long expressions are wrapped (597)", {
 )
 ')
 })
+
+test_that("guess_types errors on invalid inputs", {
+  expect_error(col_spec_standardise("a,b,c\n", guess_max = NA), "`guess_max` must be a positive integer")
+  expect_error(col_spec_standardise("a,b,c\n", guess_max = -1), "`guess_max` must be a positive integer")
+
+  expect_warning(col_spec_standardise("a,b,c\n", guess_max = Inf), "`guess_max` is a very large value")
+})