Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ARROW-16480: [R] Update read_csv_arrow and open_dataset parse_options, read_options, and convert_options to take lists #15270

Merged
merged 8 commits into from
Jan 10, 2023
13 changes: 13 additions & 0 deletions r/R/csv.R
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,7 @@ read_delim_arrow <- function(file,
}
on.exit(file$close())
}

reader <- CsvTableReader$create(
file,
read_options = read_options,
Expand Down Expand Up @@ -326,6 +327,18 @@ CsvTableReader$create <- function(file,
...) {
assert_is(file, "InputStream")

if (is.list(read_options)) {
read_options <- do.call(CsvReadOptions$create, read_options)
}

if (is.list(parse_options)) {
parse_options <- do.call(CsvParseOptions$create, parse_options)
}

if (is.list(convert_options)) {
convert_options <- do.call(CsvConvertOptions$create, convert_options)
}

if (!(tolower(read_options$encoding) %in% c("utf-8", "utf8"))) {
file <- MakeReencodeInputStream(file, read_options$encoding)
}
Expand Down
12 changes: 12 additions & 0 deletions r/R/dataset-format.R
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,18 @@ CsvFileFormat$create <- function(...,
))
}

if (!inherits(read_options, "CsvReadOptions")) {
read_options <- do.call(CsvReadOptions$create, read_options)
}

if (!inherits(convert_options, "CsvConvertOptions")) {
convert_options <- do.call(CsvConvertOptions$create, convert_options)
}

if (!inherits(opts, "CsvParseOptions")) {
opts <- do.call(CsvParseOptions$create, opts)
}

column_names <- read_options$column_names
schema_names <- names(schema)

Expand Down
23 changes: 23 additions & 0 deletions r/tests/testthat/test-csv.R
Original file line number Diff line number Diff line change
Expand Up @@ -664,3 +664,26 @@ test_that("Shows an error message when trying to read a timestamp with time zone
"CSV conversion error to timestamp\\[ns\\]: expected no zone offset in"
)
})

# read_csv_arrow() is called twice on the same file: once with the three
# option arguments given as plain lists, once with them given as objects built
# by the Csv*Options$create() constructors. The two results must compare equal.
test_that("CSV reading/parsing/convert options can be passed in as lists", {
  csv_path <- tempfile()
  on.exit(unlink(csv_path))

  writeLines('"x"\nNA\nNA\n"NULL"\n\n"foo"\n', csv_path)

  # Options supplied as bare lists
  from_lists <- read_csv_arrow(
    csv_path,
    convert_options = list(
      null_values = c("NA", "NULL"),
      strings_can_be_null = TRUE
    ),
    parse_options = list(ignore_empty_lines = FALSE),
    read_options = list(skip_rows = 1L)
  )

  # Options supplied as constructed option objects
  # NOTE(review): this variant also lists NA in null_values, which the list
  # variant above does not -- confirm the difference is intended.
  from_objects <- read_csv_arrow(
    csv_path,
    convert_options = CsvConvertOptions$create(
      null_values = c(NA, "NA", "NULL"),
      strings_can_be_null = TRUE
    ),
    parse_options = CsvParseOptions$create(ignore_empty_lines = FALSE),
    read_options = CsvReadOptions$create(skip_rows = 1L)
  )

  expect_equal(from_lists, from_objects)
})
26 changes: 25 additions & 1 deletion r/tests/testthat/test-dataset-csv.R
Original file line number Diff line number Diff line change
Expand Up @@ -444,11 +444,35 @@ test_that("skip argument in open_dataset", {
})

# open_dataset() must fail with a specific message when `schema` is not a
# Schema object.
test_that("error message if non-schema passed in as schema to open_dataset", {
  # `schema` below is the schema function itself, not an actual schema
  expected_msg <- "`schema` must be an object of class 'Schema' not 'function'."

  expect_error(
    open_dataset(csv_dir, format = "csv", schema = schema),
    regexp = expected_msg,
    fixed = TRUE
  )
})

# open_dataset() is called twice on the same CSV file: once with
# convert_options/read_options given as plain lists, once with them given as
# objects built by the Csv*Options$create() constructors. After collect(), the
# two results must compare equal.
test_that("CSV reading/parsing/convert options can be passed in as lists", {
  tf <- tempfile()
  on.exit(unlink(tf))

  # The original call ended in a dangling comma (an empty trailing argument);
  # writeLines() only needs the text and the connection here.
  writeLines('"x"\n"y"\nNA\nNA\n"NULL"\n\n"foo"\n', tf)

  # Options supplied as bare lists
  ds1 <- open_dataset(
    tf,
    format = "csv",
    convert_options = list(
      null_values = c("NA", "NULL"),
      strings_can_be_null = TRUE
    ),
    read_options = list(skip_rows = 1L)
  ) %>%
    collect()

  # Options supplied as constructed option objects
  # NOTE(review): this variant also lists NA in null_values, which the list
  # variant above does not -- confirm the difference is intended.
  ds2 <- open_dataset(
    tf,
    format = "csv",
    convert_options = CsvConvertOptions$create(
      null_values = c(NA, "NA", "NULL"),
      strings_can_be_null = TRUE
    ),
    read_options = CsvReadOptions$create(skip_rows = 1L)
  ) %>%
    collect()

  expect_equal(ds1, ds2)
})