Skip to content

Commit

Permalink
Fixing of ragged fixed width format files and reading a subset of col…
Browse files Browse the repository at this point in the history
…umns (#353)

To read ragged fwf files, the last position must be NA. 

Fixes #300. Fixes #326.
  • Loading branch information
ghaarsma authored and hadley committed Jun 7, 2016
1 parent e41bc7e commit 8b253b2
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 24 deletions.
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# readr 0.2.2.9000

* Fix bug in `read_fwf()` so that it now properly reads a subset of columns.
If the final column is ragged, supply `NA` as the final `end` value in
`fwf_positions()` or as the final width in `fwf_widths()` (#353, @ghaarsma).

* readr now imports tibble so that you get consistent `tbl_df` behaviour
(#317, #385).

Expand All @@ -19,6 +23,8 @@

* `read_delim()` gains a `trim_ws` argument (#312, noamross)

# readr 0.2.2

* Fix bug when checking empty values for missingness (caused valgrind issue
Expand Down
9 changes: 6 additions & 3 deletions R/read_fwf.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
#' @inheritParams read_delim
#' @param col_positions Column positions, as created by \code{fwf_empty},
#' \code{fwf_widths} or \code{fwf_positions}. To read in only selected fields,
#' use \code{fwf_positions}. The width of the last column will be silently
#' extended to the next line break.
#' use \code{fwf_positions}. If the width of the last column is variable (a
#' ragged fwf file), supply the last end position as NA.
#' @export
#' @examples
#' fwf_sample <- system.file("extdata/fwf-sample.txt", package = "readr")
Expand Down Expand Up @@ -65,7 +65,8 @@ fwf_empty <- function(file, skip = 0, col_names = NULL) {

#' @rdname read_fwf
#' @export
#' @param widths Width of each field.
#' @param widths Width of each field. Use NA as width of last field when
#' reading a ragged fwf file.
#' @param col_names Either NULL, or a character vector column names.
fwf_widths <- function(widths, col_names = NULL) {
pos <- cumsum(c(1, widths))
Expand All @@ -76,7 +77,9 @@ fwf_widths <- function(widths, col_names = NULL) {
#' @rdname read_fwf
#' @export
#' @param start,end Starting and ending (inclusive) positions of each field.
#' Use NA as last end field when reading a ragged fwf file.
fwf_positions <- function(start, end, col_names = NULL) {

stopifnot(length(start) == length(end))

if (is.null(col_names)) {
Expand Down
51 changes: 33 additions & 18 deletions src/TokenizerFwf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,14 @@ TokenizerFwf::TokenizerFwf(const std::vector<int>& beginOffset, const std::vecto
{
if (beginOffset_.size() != endOffset_.size())
Rcpp::stop("Begin (%i) and end (%i) specifications must have equal length",
beginOffset_.size(), endOffset_.size());
beginOffset_.size(), endOffset_.size());

// File is assumed to be ragged (last column can have variable width)
// when the last element of endOffset_ is NA
isRagged_ = endOffset_[endOffset_.size() - 1L] == NA_INTEGER;

max_ = 0;
for (int j = 0; j < cols_; ++j) {
for (int j = 0; j < (cols_ - isRagged_); ++j) {
if (endOffset_[j] <= beginOffset_[j])
Rcpp::stop("Begin offset (%i) must be smaller than end offset (%i)",
beginOffset_[j], endOffset_[j]);
Expand Down Expand Up @@ -133,29 +137,30 @@ Token TokenizerFwf::nextToken() {
}

SourceIterator fieldEnd = fieldBegin;
int width = endOffset_[col_] - beginOffset_[col_];
bool lastCol = (col_ == cols_ - 1), tooShort = false, hasNull = false;

// Find the end of the field, stopping for newlines
for(int i = 0; i < width; ++i) {
if (fieldEnd == end_ || *fieldEnd == '\n' || *fieldEnd == '\r') {
warn(row_, col_, tfm::format("%i chars", width), tfm::format("%i", i));

tooShort = true;
break;
}
if (*fieldEnd == '\0')
hasNull = true;

fieldEnd++;
}
// Last column is often ragged, so read until end of line (ignoring width)
if (lastCol) {
if (lastCol && isRagged_) {
// Last column is ragged, so read until end of line (ignoring width)
while(fieldEnd != end_ && *fieldEnd != '\r' && *fieldEnd != '\n') {
if (*fieldEnd == '\0')
hasNull = true;
fieldEnd++;
}
} else {
int width = endOffset_[col_] - beginOffset_[col_];
// Find the end of the field, stopping for newlines
for(int i = 0; i < width; ++i) {
if (fieldEnd == end_ || *fieldEnd == '\n' || *fieldEnd == '\r') {
warn(row_, col_, tfm::format("%i chars", width), tfm::format("%i", i));

tooShort = true;
break;
}
if (*fieldEnd == '\0')
hasNull = true;

fieldEnd++;
}
}

Token t = fieldToken(fieldBegin, fieldEnd, hasNull);
Expand All @@ -164,6 +169,16 @@ Token TokenizerFwf::nextToken() {
row_++;
col_ = 0;

if (!(tooShort || isRagged_)) {
// Proceed to the end of the line when you are possibly not there.
// This is needed in case the last column in the file is not being read.
while(fieldEnd != end_ && *fieldEnd != '\r' && *fieldEnd != '\n') {
if (*fieldEnd == '\0')
hasNull = true;
fieldEnd++;
}
}

curLine_ = fieldEnd;
advanceForLF(&curLine_, end_);
if (curLine_ != end_)
Expand Down
2 changes: 1 addition & 1 deletion src/TokenizerFwf.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class TokenizerFwf : public Tokenizer {

SourceIterator begin_, curLine_, end_;
int row_, col_, cols_, max_;
bool moreTokens_;
bool moreTokens_, isRagged_;

public:

Expand Down
37 changes: 35 additions & 2 deletions tests/testthat/test-read-fwf.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ test_that("passing \"\" to read_fwf's 'na' option", {
c("bar", NA))
})

test_that("ragged last column silently expanded", {
x <- read_fwf("1a\n2ab\n3abc", fwf_widths(c(1, 1)))
test_that("ragged last column expanded with NA", {
x <- read_fwf("1a\n2ab\n3abc", fwf_widths(c(1, NA)))
expect_equal(x$X2, c("a", "ab", "abc"))
expect_equal(n_problems(x), 0)
})
Expand All @@ -26,6 +26,39 @@ test_that("ragged last column shrunk with warning", {
expect_equal(n_problems(x), 2)
})

test_that("read all columns with positions, non ragged", {
  # Three fields at columns 1-2, 3-5 and 6. The last field has a fixed end,
  # so the extra characters on the second line must not end up in X3 and
  # must not be reported as a problem.
  input <- '12345A\n67890BBBBBBBBB\n54321C'
  positions <- fwf_positions(start = c(1, 3, 6), end = c(2, 5, 6))
  parsed <- read_fwf(input, col_positions = positions)

  expect_equal(parsed$X3, c("A", "B", "C"))
  expect_equal(n_problems(parsed), 0)
})

test_that("read subset columns with positions", {
  # Only the fields at columns 1-2 and 3-5 are requested; whatever follows
  # on each line is not read and must not be reported as a problem.
  input <- '12345A\n67890BBBBBBBBB\n54321C'
  positions <- fwf_positions(start = c(1, 3), end = c(2, 5))
  parsed <- read_fwf(input, col_positions = positions)

  expect_equal(parsed$X1, c(12, 67, 54))
  expect_equal(parsed$X2, c(345, 890, 321))
  expect_equal(n_problems(parsed), 0)
})

test_that("read columns with positions, ragged", {
  # An NA end position on the last field marks it as ragged: X3 is expected
  # to hold everything from column 6 to the end of each line.
  input <- '12345A\n67890BBBBBBBBB\n54321C'
  positions <- fwf_positions(start = c(1, 3, 6), end = c(2, 5, NA))
  parsed <- read_fwf(input, col_positions = positions)

  expect_equal(parsed$X1, c(12, 67, 54))
  expect_equal(parsed$X2, c(345, 890, 321))
  expect_equal(parsed$X3, c('A', 'BBBBBBBBB', 'C'))
  expect_equal(n_problems(parsed), 0)
})

test_that("read columns with width, ragged", {
  # An NA width on the last field marks it as ragged: X3 is expected to hold
  # everything after the first two fields up to the end of each line.
  input <- '12345A\n67890BBBBBBBBB\n54321C'
  positions <- fwf_widths(widths = c(2, 3, NA))
  parsed <- read_fwf(input, col_positions = positions)

  expect_equal(parsed$X1, c(12, 67, 54))
  expect_equal(parsed$X2, c(345, 890, 321))
  expect_equal(parsed$X3, c('A', 'BBBBBBBBB', 'C'))
  expect_equal(n_problems(parsed), 0)
})

# read_table -------------------------------------------------------------------

test_that("read_table silently reads ragged last column", {
Expand Down

0 comments on commit 8b253b2

Please sign in to comment.