Check for line breaks between columns

Fixes #469
tidyverse · Jul 13, 2016 · 851220f · 851220f
1 parent 6df1fd0
commit 851220f
Show file tree

Hide file tree

Showing 5 changed files with 68 additions and 19 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,10 @@
 # readr 0.2.2.9000
 
+* `read_fwf()` now checks for line breaks in between specified columns (#469).
+
+* `fwf_empty()` returns sets last end value to NA to automatically trim
+  ragged columns.
+
 * `read_fwf()` now can now skip columns without adding an extranous column of 
   missing values (#322).
 

diff --git a/R/read_fwf.R b/R/read_fwf.R
@@ -67,6 +67,7 @@ fwf_empty <- function(file, skip = 0, col_names = NULL) {
   ds <- datasource(file, skip = skip)
 
   out <- whitespaceColumns(ds)
+  out$end[length(out$end)] <- NA
 
   if (is.null(col_names)) {
     col_names <- paste0("X", seq_along(out$begin))

diff --git a/src/TokenizerFwf.cpp b/src/TokenizerFwf.cpp
@@ -99,7 +99,9 @@ TokenizerFwf::TokenizerFwf(const std::vector<int>& beginOffset, const std::vecto
 }
 
 void TokenizerFwf::tokenize(SourceIterator begin, SourceIterator end) {
+  cur_ = begin;
   curLine_ = begin;
+
   begin_ = begin;
   end_ = end;
 
@@ -109,33 +111,47 @@ void TokenizerFwf::tokenize(SourceIterator begin, SourceIterator end) {
 }
 
 std::pair<double,size_t> TokenizerFwf::progress() {
-  size_t bytes = curLine_ - begin_;
+  size_t bytes = cur_ - begin_;
   return std::make_pair(bytes / (double) (end_ - begin_), bytes);
 }
 
-// bool advance(SourceIterator start, SourceIterator end, int n) {
-//   for(int i = 0; i < n; ++i) {
-//     start++;
-//     if (start == end)
-//       return false;
-//
-//     if (*start == '\r' || *start == '\n')
-//       return false;
-//   }
-//
-//   return true;
-// }
-
 Token TokenizerFwf::nextToken() {
   if (!moreTokens_)
     return Token(TOKEN_EOF, 0, 0);
 
-  SourceIterator fieldBegin = curLine_ + beginOffset_[col_];
-  if (fieldBegin >= end_) {
+  // Find start of field
+  SourceIterator fieldBegin = cur_;
+  findBeginning:
+  int skip = beginOffset_[col_] - (cur_ - curLine_);
+  for (int i = 0; i < skip; ++i) {
+    if (fieldBegin == end_)
+      break;
+
+    if (*fieldBegin == '\n' || *fieldBegin == '\r') {
+      warn(row_, col_,
+        tfm::format("%i chars between fields", skip),
+        tfm::format("%i chars until end of line", i)
+      );
+
+      row_++;
+      col_ = 0;
+
+      advanceForLF(&fieldBegin, end_);
+      if (fieldBegin != end_)
+        fieldBegin++;
+      cur_ = curLine_ = fieldBegin;
+      goto findBeginning;
+    }
+    fieldBegin++;
+  }
+
+  if (fieldBegin == end_) {
+    // need to warn here if col != 0/cols - 1
     moreTokens_ = false;
     return Token(TOKEN_EOF, 0, 0);
   }
 
+  // Find end of field
   SourceIterator fieldEnd = fieldBegin;
   bool lastCol = (col_ == cols_ - 1), tooShort = false, hasNull = false;
 
@@ -183,8 +199,10 @@ Token TokenizerFwf::nextToken() {
     advanceForLF(&curLine_, end_);
     if (curLine_ != end_)
       curLine_++;
+    cur_ = curLine_;
   } else {
     col_++;
+    cur_ = fieldEnd;
   }
 
   return t;

diff --git a/src/TokenizerFwf.h b/src/TokenizerFwf.h
@@ -10,7 +10,7 @@ class TokenizerFwf : public Tokenizer {
   std::vector<int> beginOffset_, endOffset_;
   std::vector<std::string> NA_;
 
-  SourceIterator begin_, curLine_, end_;
+  SourceIterator begin_, cur_, curLine_, end_;
   int row_, col_, cols_, max_;
   bool moreTokens_, isRagged_;
 

diff --git a/tests/testthat/test-read-fwf.R b/tests/testthat/test-read-fwf.R
@@ -1,9 +1,9 @@
 context("read_fwf")
 
 test_that("trailing spaces ommitted", {
-  spec <- fwf_empty("fwf-trailing.txt")
+  spec <- fwf_empty(test_path("fwf-trailing.txt"))
   expect_equal(spec$begin, c(0, 4))
-  expect_equal(spec$end, c(3, 7))
+  expect_equal(spec$end, c(3, NA))
 
   df <- read_fwf("fwf-trailing.txt", spec, progress = FALSE)
   expect_equal(df$X1, df$X2)
@@ -74,6 +74,31 @@ test_that("read_fwf returns an empty data.frame on an empty file", {
    expect_equal(read_fwf("empty-file", progress = FALSE), tibble::data_frame())
 })
 
+test_that("check for line breaks in between widths", {
+  txt1 <- paste(
+    "1 1",
+    "2",
+    "1 1 ",
+    sep = "\n"
+  )
+  expect_warning(out1 <- read_fwf(txt1, fwf_empty(txt1)))
+  expect_equal(n_problems(out1), 2)
+
+  txt2 <- paste(
+    " 1 1",
+    " 2",
+    " 1 1 ",
+    sep = "\n"
+  )
+  expect_warning(out2 <- read_fwf(txt2, fwf_empty(txt2)))
+  expect_equal(n_problems(out2), 2)
+
+  exp <- tibble::tibble(X1 = c(1L, 2L, 1L), X2 = c(1L, NA, 1L))
+  expect_equal(out1, exp)
+  expect_equal(out2, exp)
+
+})
+
 # read_table -------------------------------------------------------------------
 
 test_that("read_table silently reads ragged last column", {