Fixing of ragged fixed width format files and reading a subset of columns (#353)

To read ragged fwf files, the last position must be NA. Fixes #300. Fixes #326.
tidyverse · Jun 7, 2016 · 8b253b2 · 8b253b2
1 parent e41bc7e
commit 8b253b2
Show file tree

Hide file tree

Showing 5 changed files with 81 additions and 24 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,9 @@
 # readr 0.2.2.9000
 
+* Fix bug in `read_fwf()`: it now properly reads a subset of columns.
+  If the final column is ragged, supply `NA` as the final `end` in
+  `fwf_positions()` or as the final width in `fwf_widths()` (#353, @ghaarsma).
+
 * readr now imports tibble so that you get consistent `tbl_df` behaviour 
   (#317, #385).
 
@@ -19,6 +23,8 @@
 
 * `read_delim()` gains a `trim_ws` argument (#312, noamross)
 
+
+
 # readr 0.2.2
 
 * Fix bug when checking empty values for missingness (caused valgrind issue

diff --git a/R/read_fwf.R b/R/read_fwf.R
@@ -13,8 +13,8 @@
 #' @inheritParams read_delim
 #' @param col_positions Column positions, as created by \code{fwf_empty},
 #'   \code{fwf_widths} or \code{fwf_positions}. To read in only selected fields,
-#'   use \code{fwf_positions}. The width of the last column will be silently
-#'   extended to the next line break.
+#'   use \code{fwf_positions}. If the width of the last column is variable (a
+#'   ragged fwf file), supply the last end position as NA.
 #' @export
 #' @examples
 #' fwf_sample <- system.file("extdata/fwf-sample.txt", package = "readr")
@@ -65,7 +65,8 @@ fwf_empty <- function(file, skip = 0, col_names = NULL) {
 
 #' @rdname read_fwf
 #' @export
-#' @param widths Width of each field.
+#' @param widths Width of each field. Use NA as width of last field when
+#'    reading a ragged fwf file.
 #' @param col_names Either NULL, or a character vector column names.
 fwf_widths <- function(widths, col_names = NULL) {
   pos <- cumsum(c(1, widths))
@@ -76,7 +77,9 @@ fwf_widths <- function(widths, col_names = NULL) {
 #' @rdname read_fwf
 #' @export
 #' @param start,end Starting and ending (inclusive) positions of each field.
+#'    Use \code{NA} as the last end position when reading a ragged fwf file.
 fwf_positions <- function(start, end, col_names = NULL) {
+
   stopifnot(length(start) == length(end))
 
   if (is.null(col_names)) {

diff --git a/src/TokenizerFwf.cpp b/src/TokenizerFwf.cpp
@@ -81,10 +81,14 @@ TokenizerFwf::TokenizerFwf(const std::vector<int>& beginOffset, const std::vecto
 {
   if (beginOffset_.size() != endOffset_.size())
     Rcpp::stop("Begin (%i) and end (%i) specifications must have equal length",
-      beginOffset_.size(), endOffset_.size());
+               beginOffset_.size(), endOffset_.size());
+
+  // File is assumed to be ragged (last column can have variable width)
+  // when the last element of endOffset_ is NA
+  isRagged_ = endOffset_[endOffset_.size() - 1L] == NA_INTEGER;
 
   max_ = 0;
-  for (int j = 0; j < cols_; ++j) {
+  for (int j = 0; j < (cols_ - isRagged_); ++j) {
     if (endOffset_[j] <= beginOffset_[j])
       Rcpp::stop("Begin offset (%i) must be smaller than end offset (%i)",
         beginOffset_[j], endOffset_[j]);
@@ -133,29 +137,30 @@ Token TokenizerFwf::nextToken() {
   }
 
   SourceIterator fieldEnd = fieldBegin;
-  int width = endOffset_[col_] - beginOffset_[col_];
   bool lastCol = (col_ == cols_ - 1), tooShort = false, hasNull = false;
 
-  // Find the end of the field, stopping for newlines
-  for(int i = 0; i < width; ++i) {
-    if (fieldEnd == end_ || *fieldEnd == '\n' || *fieldEnd == '\r') {
-      warn(row_, col_, tfm::format("%i chars", width), tfm::format("%i", i));
-
-      tooShort = true;
-      break;
-    }
-    if (*fieldEnd == '\0')
-      hasNull = true;
-
-    fieldEnd++;
-  }
-  // Last column is often ragged, so read until end of line (ignoring width)
-  if (lastCol) {
+  if (lastCol && isRagged_) {
+    // Last column is ragged, so read until end of line (ignoring width)
     while(fieldEnd != end_ && *fieldEnd != '\r' && *fieldEnd != '\n') {
       if (*fieldEnd == '\0')
         hasNull = true;
       fieldEnd++;
     }
+  } else {
+    int width = endOffset_[col_] - beginOffset_[col_];
+    // Find the end of the field, stopping for newlines
+    for(int i = 0; i < width; ++i) {
+      if (fieldEnd == end_ || *fieldEnd == '\n' || *fieldEnd == '\r') {
+        warn(row_, col_, tfm::format("%i chars", width), tfm::format("%i", i));
+
+        tooShort = true;
+        break;
+      }
+      if (*fieldEnd == '\0')
+        hasNull = true;
+
+      fieldEnd++;
+    }
   }
 
   Token t = fieldToken(fieldBegin, fieldEnd, hasNull);
@@ -164,6 +169,16 @@ Token TokenizerFwf::nextToken() {
     row_++;
     col_ = 0;
 
+    if (!(tooShort || isRagged_)) {
+      // Advance to the end of the line in case we are not there yet.
+      // This is needed when the last column in the file is not being read.
+      while(fieldEnd != end_ && *fieldEnd != '\r' && *fieldEnd != '\n') {
+        if (*fieldEnd == '\0')
+          hasNull = true;
+        fieldEnd++;
+      }
+    }
+
     curLine_ = fieldEnd;
     advanceForLF(&curLine_, end_);
     if (curLine_ != end_)

diff --git a/src/TokenizerFwf.h b/src/TokenizerFwf.h
@@ -12,7 +12,7 @@ class TokenizerFwf : public Tokenizer {
 
   SourceIterator begin_, curLine_, end_;
   int row_, col_, cols_, max_;
-  bool moreTokens_;
+  bool moreTokens_, isRagged_;
 
 public:
 

diff --git a/tests/testthat/test-read-fwf.R b/tests/testthat/test-read-fwf.R
@@ -14,8 +14,8 @@ test_that("passing \"\" to read_fwf's 'na' option", {
                c("bar", NA))
 })
 
-test_that("ragged last column silently expanded", {
-  x <- read_fwf("1a\n2ab\n3abc", fwf_widths(c(1, 1)))
+test_that("ragged last column expanded with NA", {
+  x <- read_fwf("1a\n2ab\n3abc", fwf_widths(c(1, NA)))
   expect_equal(x$X2, c("a", "ab", "abc"))
   expect_equal(n_problems(x), 0)
 })
@@ -26,6 +26,39 @@ test_that("ragged last column shrunk with warning", {
   expect_equal(n_problems(x), 2)
 })
 
+test_that("read all columns with positions, non ragged", {
+  col_pos <- fwf_positions(c(1,3,6),c(2,5,6))
+  x <- read_fwf('12345A\n67890BBBBBBBBB\n54321C',col_positions = col_pos)
+  expect_equal(x$X3, c("A", "B", "C"))
+  expect_equal(n_problems(x), 0)
+})
+
+test_that("read subset columns with positions", {
+  col_pos <- fwf_positions(c(1,3),c(2,5))
+  x <- read_fwf('12345A\n67890BBBBBBBBB\n54321C',col_positions = col_pos)
+  expect_equal(x$X1, c(12, 67, 54))
+  expect_equal(x$X2, c(345, 890, 321))
+  expect_equal(n_problems(x), 0)
+})
+
+test_that("read columns with positions, ragged", {
+  col_pos <- fwf_positions(c(1,3,6),c(2,5,NA))
+  x <- read_fwf('12345A\n67890BBBBBBBBB\n54321C',col_positions = col_pos)
+  expect_equal(x$X1, c(12, 67, 54))
+  expect_equal(x$X2, c(345, 890, 321))
+  expect_equal(x$X3, c('A', 'BBBBBBBBB', 'C'))
+  expect_equal(n_problems(x), 0)
+})
+
+test_that("read columns with width, ragged", {
+  col_pos <- fwf_widths(c(2,3,NA))
+  x <- read_fwf('12345A\n67890BBBBBBBBB\n54321C',col_positions = col_pos)
+  expect_equal(x$X1, c(12, 67, 54))
+  expect_equal(x$X2, c(345, 890, 321))
+  expect_equal(x$X3, c('A', 'BBBBBBBBB', 'C'))
+  expect_equal(n_problems(x), 0)
+})
+
 # read_table -------------------------------------------------------------------
 
 test_that("read_table silently reads ragged last column", {