From e1f86e5676a52cc2dfe620cdba6a2468a0de9b56 Mon Sep 17 00:00:00 2001 From: Jim Hester Date: Mon, 14 Sep 2020 10:40:46 -0400 Subject: [PATCH] Ignore quotes when skipping in read_lines Fixes #991 --- NEWS.md | 2 ++ R/lines.R | 4 ++-- R/source.R | 34 ++++++++++++++++---------------- src/Source.cpp | 23 ++++++++++++--------- src/Source.h | 6 ++++-- src/SourceFile.h | 5 +++-- src/SourceRaw.h | 5 +++-- src/SourceString.h | 5 +++-- tests/testthat/test-read-lines.R | 15 ++++++++++++++ 9 files changed, 63 insertions(+), 36 deletions(-) diff --git a/NEWS.md b/NEWS.md index 336b6c92..047f66a0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,6 +9,8 @@ ## Additional features and fixes +* `read_lines()` now ignores quotations when skipping lines (#991). + * `write_csv2()` now formats decimal numbers more consistently with `utils::write.csv2()` (#1087) * `fwf_positions(end)` no longer has a default argument and must be specified (#996) diff --git a/R/lines.R b/R/lines.R index 44b7536f..387f04d2 100644 --- a/R/lines.R +++ b/R/lines.R @@ -36,7 +36,7 @@ read_lines <- function(file, skip = 0, skip_empty_rows = FALSE, n_max = -1L, if (empty_file(file)) { return(character()) } - ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows) + ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows, skip_quote = FALSE) read_lines_(ds, skip_empty_rows = skip_empty_rows, locale_ = locale, na = na, n_max = n_max, progress = progress) } @@ -47,7 +47,7 @@ read_lines_raw <- function(file, skip = 0, if (empty_file(file)) { return(list()) } - ds <- datasource(file, skip = skip, skip_empty_rows = FALSE) + ds <- datasource(file, skip = skip, skip_empty_rows = FALSE, skip_quote = FALSE) read_lines_raw_(ds, n_max = n_max, progress = progress) } diff --git a/R/source.R b/R/source.R index 71274ed5..b56fbc64 100644 --- a/R/source.R +++ b/R/source.R @@ -35,7 +35,7 @@ #' con <- rawConnection(charToRaw("abc\n123")) #' datasource(con) #' close(con) -datasource <- function(file, skip = 0, skip_empty_rows = FALSE, comment = "") { +datasource <- function(file, skip = 0, skip_empty_rows = FALSE, comment = "", skip_quote = TRUE) { if (inherits(file, "source")) { # If `skip` and `comment` arguments are expliictly passed, we want to use @@ -50,20 +50,20 @@ datasource <- function(file, skip = 0, skip_empty_rows = FALSE, comment = "") { file } else if (is.connection(file)) { - datasource_connection(file, skip, skip_empty_rows, comment) + datasource_connection(file, skip, skip_empty_rows, comment, skip_quote) } else if (is.raw(file)) { - datasource_raw(file, skip, skip_empty_rows, comment) + datasource_raw(file, skip, skip_empty_rows, comment, skip_quote) } else if (is.character(file)) { if (length(file) > 1) { - datasource_string(paste(file, collapse = "\n"), skip, skip_empty_rows, comment) + datasource_string(paste(file, collapse = "\n"), skip, skip_empty_rows, comment, skip_quote) } else if (grepl("\n", file)) { - datasource_string(file, skip, skip_empty_rows, comment) + datasource_string(file, skip, skip_empty_rows, comment, skip_quote) } else { file <- standardise_path(file) if (is.connection(file)) { - datasource_connection(file, skip, skip_empty_rows, comment) + datasource_connection(file, skip, skip_empty_rows, comment, skip_quote) } else { - datasource_file(file, skip, skip_empty_rows, comment) + datasource_file(file, skip, skip_empty_rows, comment, skip_quote) } } } else { @@ -73,32 +73,32 @@ datasource <- function(file, skip = 0, skip_empty_rows = FALSE, comment = "") { # Constructors ----------------------------------------------------------------- -new_datasource <- function(type, x, skip, skip_empty_rows = TRUE, comment = "", ...) { - structure(list(x, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, ...), +new_datasource <- function(type, x, skip, skip_empty_rows = TRUE, comment = "", skip_quote = TRUE, ...) { + structure(list(x, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, skip_quote = skip_quote, ...), class = c(paste0("source_", type), "source")) } -datasource_string <- function(text, skip, skip_empty_rows = TRUE, comment = "") { - new_datasource("string", text, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment) +datasource_string <- function(text, skip, skip_empty_rows = TRUE, comment = "", skip_quote = TRUE) { + new_datasource("string", text, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, skip_quote = skip_quote) } -datasource_file <- function(path, skip, skip_empty_rows = TRUE, comment = "", ...) { +datasource_file <- function(path, skip, skip_empty_rows = TRUE, comment = "", skip_quote = TRUE, ...) { path <- check_path(path) - new_datasource("file", path, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, ...) + new_datasource("file", path, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, skip_quote = skip_quote, ...) } -datasource_connection <- function(path, skip, skip_empty_rows = TRUE, comment = "") { +datasource_connection <- function(path, skip, skip_empty_rows = TRUE, comment = "", skip_quote = TRUE) { # We read the connection to a temporary file, then register a finalizer to # cleanup the temp file after the datasource object is removed. file <- read_connection(path) env <- new.env(parent = emptyenv()) reg.finalizer(env, function(env) unlink(file)) - datasource_file(file, skip, skip_empty_rows = skip_empty_rows, comment = comment, env = env) + datasource_file(file, skip, skip_empty_rows = skip_empty_rows, comment = comment, env = env, skip_quote = skip_quote) } -datasource_raw <- function(text, skip, skip_empty_rows, comment) { - new_datasource("raw", text, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment) +datasource_raw <- function(text, skip, skip_empty_rows, comment, skip_quote = TRUE) { + new_datasource("raw", text, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, skip_quote = skip_quote) } # Helpers ---------------------------------------------------------------------- diff --git a/src/Source.cpp b/src/Source.cpp index ca965f35..b3640a35 100644 --- a/src/Source.cpp +++ b/src/Source.cpp @@ -12,15 +12,18 @@ SourcePtr Source::create(cpp11::list spec) { int skip = cpp11::as_cpp(spec["skip"]); bool skipEmptyRows = cpp11::as_cpp(spec["skip_empty_rows"]); std::string comment = cpp11::as_cpp(spec["comment"]); + bool skipQuote = cpp11::as_cpp(spec["skip_quote"]); if (subclass == "source_raw") { - return SourcePtr(new SourceRaw(spec[0], skip, skipEmptyRows, comment)); + return SourcePtr( + new SourceRaw(spec[0], skip, skipEmptyRows, comment, skipQuote)); } else if (subclass == "source_string") { - return SourcePtr(new SourceString(spec[0], skip, skipEmptyRows, comment)); + return SourcePtr( + new SourceString(spec[0], skip, skipEmptyRows, comment, skipQuote)); } else if (subclass == "source_file") { cpp11::strings path(spec[0]); return SourcePtr(new SourceFile( - Rf_translateChar(path[0]), skip, skipEmptyRows, comment)); + Rf_translateChar(path[0]), skip, skipEmptyRows, comment, skipQuote)); } cpp11::stop("Unknown source type"); @@ -32,14 +35,16 @@ const char* Source::skipLines( const char* end, int n, bool skipEmptyRows, - const std::string& comment) { + const std::string& comment, + bool skipQuote) { bool hasComment = comment != ""; bool isComment; const char* cur = begin; while (cur < end && n > 0) { - cur = skipLine(cur, end, hasComment && inComment(cur, end, comment)); + cur = skipLine( + cur, end, hasComment && inComment(cur, end, comment), skipQuote); --n; ++skippedRows_; } @@ -48,19 +53,19 @@ const char* Source::skipLines( while (cur < end && ((skipEmptyRows && (*cur == '\n' || *cur == '\r')) || (isComment = hasComment && inComment(cur, end, comment)))) { - cur = skipLine(cur, end, isComment); + cur = skipLine(cur, end, isComment, skipQuote); ++skippedRows_; } return cur; } -const char* -Source::skipLine(const char* begin, const char* end, bool isComment) { +const char* Source::skipLine( + const char* begin, const char* end, bool isComment, bool skipQuote) { const char* cur = begin; // skip the rest of the line until the newline while (cur < end && !(*cur == '\n' || *cur == '\r')) { - if (!isComment && *cur == '"') { + if (!isComment && skipQuote && *cur == '"') { cur = skipDoubleQuoted(cur, end); } else { advanceForLF(&cur, end); diff --git a/src/Source.h b/src/Source.h index bcc1ebdb..db3f95c0 100644 --- a/src/Source.h +++ b/src/Source.h @@ -22,9 +22,11 @@ class Source { const char* end, int n, bool skipEmptyRows = true, - const std::string& comment = ""); + const std::string& comment = "", + bool skipQuote = true); - const char* skipLine(const char* begin, const char* end, bool isComment); + const char* + skipLine(const char* begin, const char* end, bool isComment, bool skipQuote); const char* skipDoubleQuoted(const char* begin, const char* end); diff --git a/src/SourceFile.h b/src/SourceFile.h index 425228ab..4c8642f0 100644 --- a/src/SourceFile.h +++ b/src/SourceFile.h @@ -18,7 +18,8 @@ class SourceFile : public Source { const std::string& path, int skip = 0, bool skipEmptyRows = true, - const std::string& comment = "") { + const std::string& comment = "", + bool skipQuotes = true) { try { fm_ = boost::interprocess::file_mapping( path.c_str(), boost::interprocess::read_only); @@ -35,7 +36,7 @@ class SourceFile : public Source { begin_ = skipBom(begin_, end_); // Skip lines, if needed - begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment); + begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment, skipQuotes); } const char* begin() { return begin_; } diff --git a/src/SourceRaw.h b/src/SourceRaw.h index 97feec8f..c57e383e 100644 --- a/src/SourceRaw.h +++ b/src/SourceRaw.h @@ -14,7 +14,8 @@ class SourceRaw : public Source { cpp11::raws x, int skip = 0, bool skipEmptyRows = true, - const std::string& comment = "") + const std::string& comment = "", + bool skipQuotes = true) : x_(x) { begin_ = (const char*)RAW(x); end_ = (const char*)RAW(x) + Rf_xlength(x); @@ -23,7 +24,7 @@ class SourceRaw : public Source { begin_ = skipBom(begin_, end_); // Skip lines, if needed - begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment); + begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment, skipQuotes); } const char* begin() { return begin_; } diff --git a/src/SourceString.h b/src/SourceString.h index 2d94fb5f..4e84a150 100644 --- a/src/SourceString.h +++ b/src/SourceString.h @@ -16,7 +16,8 @@ class SourceString : public Source { cpp11::strings x, int skip = 0, bool skipEmptyRows = true, - const std::string& comment = "") + const std::string& comment = "", + bool skipQuotes = true) : string_(static_cast(x[0])) { begin_ = CHAR(string_); @@ -26,7 +27,7 @@ class SourceString : public Source { begin_ = skipBom(begin_, end_); // Skip lines, if needed - begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment); + begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment, skipQuotes); } const char* begin() { return begin_; } diff --git a/tests/testthat/test-read-lines.R b/tests/testthat/test-read-lines.R index a76c5066..fd6bd46b 100644 --- a/tests/testthat/test-read-lines.R +++ b/tests/testthat/test-read-lines.R @@ -70,6 +70,21 @@ test_that("read_lines(skip_empty_rows) works when blank lines are at the end of expect_equal(read_lines(tmp, skip_empty_rows = TRUE), "test") }) +test_that("read_lines(skip_empty_rows) works if there are double quotes in the lines (#991)", { + data <- +"a\"b +cde +f\"g +hij" + + expect_equal( + read_lines(data, skip = 1), + c("cde", + "f\"g", + "hij") + ) +}) + # These tests are slow so are commented out #test_that("long vectors are supported", { #tmp <- tempfile(fileext = ".gz")