Fix and warn on duplicate & missing col names

Fixes #318. Fixes #364
tidyverse · Jul 13, 2016 · 06330ff · 06330ff
1 parent fedd5a5
commit 06330ff
Show file tree

Hide file tree

Showing 10 changed files with 82 additions and 11 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,9 @@
 # readr 0.2.2.9000
 
+* Missing column names are now given a default name (`X2`, `X7` etc) (#318).
+  Duplicated column names are now deduplicated. Both changes generate a warning;
+  to suppress it supply explicit `col_names` (setting skip = 1 if needed).
+
 * `parse_number()` is slightly more flexible - it now parses numbers up
   to the first ill-formed character. For example `parse_number("-3-")`
   and `parse_number("...3...")` now return -3 and 3 respectively.
@@ -105,9 +109,6 @@
 
 * `parse_time("NA")` works as expected (#398).
 
-* Quick hack to return something instead of NA for missing column names
-  (#318)
-
 * Add support for parsing years with col_date("%Y") or col_datetime("%Y") 
 
 * `parse_logical()` now accepts `0`, `1` as well as lowercase `t`, `f`, `true`, `false`. 

diff --git a/R/col_types.R b/R/col_types.R
@@ -217,6 +217,38 @@ col_spec_standardise <- function(file, col_names = TRUE, col_types = NULL,
     stop("`col_names` must be TRUE, FALSE or a character vector", call. = FALSE)
   }
 
+  missing_names <- is.na(col_names)
+  if (any(missing_names)) {
+    new_names <- paste0("X", seq_along(col_names)[missing_names])
+    col_names[missing_names] <- new_names
+    warning(
+      "Missing column names filled in: ",
+      paste0(
+        encodeString(new_names, quote = "'"),
+        " [", which(missing_names), "]",
+        collapse = ", "
+      ), call. = FALSE
+    )
+  }
+
+  if (anyDuplicated(col_names)) {
+    dups <- duplicated(col_names)
+
+    old_names <- col_names
+    col_names <- make.unique(col_names, sep = "_")
+
+    warning(
+      "Duplicated column names deduplicated: ",
+      paste0(
+        encodeString(old_names[dups], quote = "'"),
+        " => ",
+        encodeString(col_names[dups], quote = "'"),
+        " [", which(dups), "]",
+        collapse = ", "
+      ), call. = FALSE
+    )
+  }
+
   # Figure out column types ----------------------------------------------------
 
   spec <- as.col_spec(col_types)

diff --git a/R/read_delim.R b/R/read_delim.R
@@ -22,6 +22,10 @@ NULL
 #'   If \code{col_names} is a character vector, the values will be used as the
 #'   names of the columns, and the first row of the input will be read into
 #'   the first row of the output data frame.
+#'
+#'   Missing (\code{NA}) column names will generate a warning, and be filled
+#'   in with dummy names \code{X1}, \code{X2} etc. Duplicate column names
+#'   will generate a warning and be made unique with a numeric suffix.
 #' @param col_types One of \code{NULL}, a \code{\link{cols}} specification, or
 #'   a string. See \code{vignette("column-types")} for more details.
 #'

diff --git a/README.md b/README.md
@@ -51,8 +51,9 @@ See `vignette("column-types")` on how readr parses columns, and how you can over
 * Characters are never automatically converted to factors (i.e. no more 
   `stringsAsFactors = FALSE`).
 
-* Column names are left as is, not munged into valid R identifiers
-  (i.e. there is no `check.names = TRUE`).
+* Valid column names are left as is, not munged into valid R identifiers
+  (i.e. there is no `check.names = TRUE`). Missing column names are filled
+  in with `X1`, `X2` etc, and duplicated column names are deduplicated.
 
 * The data frame is given class `c("tbl_df", "tbl", "data.frame")` so 
   if you also use [dplyr](https://github.com/hadley/dplyr/) you'll get an 

diff --git a/man/read_delim.Rd b/man/read_delim.Rd
diff --git a/man/read_log.Rd b/man/read_log.Rd
diff --git a/man/read_table.Rd b/man/read_table.Rd
diff --git a/man/spec_delim.Rd b/man/spec_delim.Rd
diff --git a/src/parse.cpp b/src/parse.cpp
@@ -72,8 +72,6 @@ RObject guess_header_(List sourceSpec, List tokenizerSpec, List locale_) {
 
     if (t.type() == TOKEN_STRING) {
       out.setValue(t.col(), t);
-    } else {
-      out.setValue(t.col(), tfm::format("X%i", bad_i++));
     }
   }
 

diff --git a/tests/testthat/test-col-spec.R b/tests/testthat/test-col-spec.R
@@ -54,6 +54,25 @@ test_that("col_spec_standardise works properly with 1 row inputs and no header c
   expect_is(col_spec_standardise("1\n", col_names = FALSE)[[1]]$X1, "collector_integer")
 })
 
+test_that("warns about duplicated names", {
+  expect_warning(col_spec_standardise("a,a\n1,2"), "Duplicated column names")
+  expect_warning(col_spec_standardise("X2,\n1,2"), "Duplicated column names")
+  expect_warning(
+    col_spec_standardise("1,2\n1,2", col_names = c("X", "X")),
+    "Duplicated column names"
+  )
+})
+
+test_that("warn about missing col names and fill in", {
+  expect_warning(col_spec_standardise(",\n1,2"), "Missing column names")
+  expect_warning(
+    col_spec_standardise("1,2\n1,2", col_names = c("X", NA)),
+    "Missing column names"
+  )
+})
+
+# Printing ----------------------------------------------------------------
+
 regex_escape <- function(x) {
   chars <- c("*", ".", "?", "^", "+", "$", "|", "(", ")", "[", "]", "{", "}", "\\")
   gsub(paste0("([\\", paste0(collapse = "\\", chars), "])"), "\\\\\\1", x, perl = TRUE)