Skip to content

Commit

Permalink
Check for line breaks between columns
Browse files Browse the repository at this point in the history
Fixes #469
  • Loading branch information
hadley committed Jul 13, 2016
1 parent 6df1fd0 commit 851220f
Show file tree
Hide file tree
Showing 5 changed files with 68 additions and 19 deletions.
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# readr 0.2.2.9000

* `read_fwf()` now checks for line breaks in between specified columns (#469).

* `fwf_empty()` returns sets last end value to NA to automatically trim
ragged columns.

* `read_fwf()` now can now skip columns without adding an extranous column of
missing values (#322).

Expand Down
1 change: 1 addition & 0 deletions R/read_fwf.R
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ fwf_empty <- function(file, skip = 0, col_names = NULL) {
ds <- datasource(file, skip = skip)

out <- whitespaceColumns(ds)
out$end[length(out$end)] <- NA

if (is.null(col_names)) {
col_names <- paste0("X", seq_along(out$begin))
Expand Down
50 changes: 34 additions & 16 deletions src/TokenizerFwf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,9 @@ TokenizerFwf::TokenizerFwf(const std::vector<int>& beginOffset, const std::vecto
}

void TokenizerFwf::tokenize(SourceIterator begin, SourceIterator end) {
cur_ = begin;
curLine_ = begin;

begin_ = begin;
end_ = end;

Expand All @@ -109,33 +111,47 @@ void TokenizerFwf::tokenize(SourceIterator begin, SourceIterator end) {
}

std::pair<double,size_t> TokenizerFwf::progress() {
size_t bytes = curLine_ - begin_;
size_t bytes = cur_ - begin_;
return std::make_pair(bytes / (double) (end_ - begin_), bytes);
}

// bool advance(SourceIterator start, SourceIterator end, int n) {
// for(int i = 0; i < n; ++i) {
// start++;
// if (start == end)
// return false;
//
// if (*start == '\r' || *start == '\n')
// return false;
// }
//
// return true;
// }

Token TokenizerFwf::nextToken() {
if (!moreTokens_)
return Token(TOKEN_EOF, 0, 0);

SourceIterator fieldBegin = curLine_ + beginOffset_[col_];
if (fieldBegin >= end_) {
// Find start of field
SourceIterator fieldBegin = cur_;
findBeginning:
int skip = beginOffset_[col_] - (cur_ - curLine_);
for (int i = 0; i < skip; ++i) {
if (fieldBegin == end_)
break;

if (*fieldBegin == '\n' || *fieldBegin == '\r') {
warn(row_, col_,
tfm::format("%i chars between fields", skip),
tfm::format("%i chars until end of line", i)
);

row_++;
col_ = 0;

advanceForLF(&fieldBegin, end_);
if (fieldBegin != end_)
fieldBegin++;
cur_ = curLine_ = fieldBegin;
goto findBeginning;
}
fieldBegin++;
}

if (fieldBegin == end_) {
// need to warn here if col != 0/cols - 1
moreTokens_ = false;
return Token(TOKEN_EOF, 0, 0);
}

// Find end of field
SourceIterator fieldEnd = fieldBegin;
bool lastCol = (col_ == cols_ - 1), tooShort = false, hasNull = false;

Expand Down Expand Up @@ -183,8 +199,10 @@ Token TokenizerFwf::nextToken() {
advanceForLF(&curLine_, end_);
if (curLine_ != end_)
curLine_++;
cur_ = curLine_;
} else {
col_++;
cur_ = fieldEnd;
}

return t;
Expand Down
2 changes: 1 addition & 1 deletion src/TokenizerFwf.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class TokenizerFwf : public Tokenizer {
std::vector<int> beginOffset_, endOffset_;
std::vector<std::string> NA_;

SourceIterator begin_, curLine_, end_;
SourceIterator begin_, cur_, curLine_, end_;
int row_, col_, cols_, max_;
bool moreTokens_, isRagged_;

Expand Down
29 changes: 27 additions & 2 deletions tests/testthat/test-read-fwf.R
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
context("read_fwf")

test_that("trailing spaces ommitted", {
spec <- fwf_empty("fwf-trailing.txt")
spec <- fwf_empty(test_path("fwf-trailing.txt"))
expect_equal(spec$begin, c(0, 4))
expect_equal(spec$end, c(3, 7))
expect_equal(spec$end, c(3, NA))

df <- read_fwf("fwf-trailing.txt", spec, progress = FALSE)
expect_equal(df$X1, df$X2)
Expand Down Expand Up @@ -74,6 +74,31 @@ test_that("read_fwf returns an empty data.frame on an empty file", {
expect_equal(read_fwf("empty-file", progress = FALSE), tibble::data_frame())
})

test_that("check for line breaks in between widths", {
txt1 <- paste(
"1 1",
"2",
"1 1 ",
sep = "\n"
)
expect_warning(out1 <- read_fwf(txt1, fwf_empty(txt1)))
expect_equal(n_problems(out1), 2)

txt2 <- paste(
" 1 1",
" 2",
" 1 1 ",
sep = "\n"
)
expect_warning(out2 <- read_fwf(txt2, fwf_empty(txt2)))
expect_equal(n_problems(out2), 2)

exp <- tibble::tibble(X1 = c(1L, 2L, 1L), X2 = c(1L, NA, 1L))
expect_equal(out1, exp)
expect_equal(out2, exp)

})

# read_table -------------------------------------------------------------------

test_that("read_table silently reads ragged last column", {
Expand Down

0 comments on commit 851220f

Please sign in to comment.