Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use nil_values instead of null_character in IO operations #667

Merged
merged 3 commits into from
Jul 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions lib/explorer/backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ defmodule Explorer.Backend.DataFrame do
entry :: fs_entry(),
dtypes,
delimiter :: String.t(),
null_character :: String.t(),
nil_values :: list(String.t()),
skip_rows :: integer(),
header? :: boolean(),
encoding :: String.t(),
Expand All @@ -61,7 +61,7 @@ defmodule Explorer.Backend.DataFrame do
contents :: String.t(),
dtypes,
delimiter :: String.t(),
null_character :: String.t(),
nil_values :: list(String.t()),
skip_rows :: integer(),
header? :: boolean(),
encoding :: String.t(),
Expand Down
12 changes: 6 additions & 6 deletions lib/explorer/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,7 @@ defmodule Explorer.DataFrame do

* `:max_rows` - Maximum number of lines to read. (default: `nil`)

* `:null_character` - The string that should be interpreted as a nil value. (default: `"NA"`)
* `:nil_values` - A list of strings that should be interpreted as nil values. (default: `[]`)

* `:skip_rows` - The number of lines to skip at the beginning of the file. (default: `0`)

Expand Down Expand Up @@ -553,7 +553,7 @@ defmodule Explorer.DataFrame do
encoding: "utf8",
header: true,
max_rows: nil,
null_character: "NA",
nil_values: [],
skip_rows: 0,
columns: nil,
infer_schema_length: @default_infer_schema_length,
Expand All @@ -568,7 +568,7 @@ defmodule Explorer.DataFrame do
entry,
check_dtypes!(opts[:dtypes]),
opts[:delimiter],
opts[:null_character],
opts[:nil_values],
opts[:skip_rows],
opts[:header],
opts[:encoding],
Expand Down Expand Up @@ -611,7 +611,7 @@ defmodule Explorer.DataFrame do
imputed from the first 1000 rows. (default: `[]`)
* `:header` - Does the file have a header of column names as the first row or not? (default: `true`)
* `:max_rows` - Maximum number of lines to read. (default: `nil`)
* `:null_character` - The string that should be interpreted as a nil value. (default: `"NA"`)
* `:nil_values` - A list of strings that should be interpreted as nil values. (default: `[]`)
* `:skip_rows` - The number of lines to skip at the beginning of the file. (default: `0`)
* `:columns` - A list of column names or indexes to keep. If present, only these columns are read into the dataframe. (default: `nil`)
* `:infer_schema_length` - Maximum number of rows read for schema inference. Setting this to `nil` will do a full table scan and will be slow. (default: `1000`)
Expand All @@ -633,7 +633,7 @@ defmodule Explorer.DataFrame do
encoding: "utf8",
header: true,
max_rows: nil,
null_character: "NA",
nil_values: [],
skip_rows: 0,
columns: nil,
infer_schema_length: @default_infer_schema_length,
Expand All @@ -647,7 +647,7 @@ defmodule Explorer.DataFrame do
contents,
check_dtypes!(opts[:dtypes]),
opts[:delimiter],
opts[:null_character],
opts[:nil_values],
opts[:skip_rows],
opts[:header],
opts[:encoding],
Expand Down
8 changes: 4 additions & 4 deletions lib/explorer/polars_backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
%Local.Entry{} = entry,
dtypes,
<<delimiter::utf8>>,
null_character,
nil_values,
skip_rows,
header?,
encoding,
Expand Down Expand Up @@ -91,7 +91,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
columns,
dtypes,
encoding,
null_character,
nil_values,
parse_dates,
char_byte(eol_delimiter)
)
Expand Down Expand Up @@ -153,7 +153,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
contents,
dtypes,
<<delimiter::utf8>>,
null_character,
nil_values,
skip_rows,
header?,
encoding,
Expand Down Expand Up @@ -188,7 +188,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
columns,
dtypes,
encoding,
null_character,
nil_values,
parse_dates,
char_byte(eol_delimiter)
)
Expand Down
8 changes: 4 additions & 4 deletions lib/explorer/polars_backend/lazy_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
%Local.Entry{} = entry,
dtypes,
<<delimiter::utf8>>,
null_character,
nil_values,
skip_rows,
header?,
encoding,
Expand Down Expand Up @@ -131,7 +131,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
true,
dtypes,
encoding,
null_character,
nil_values,
parse_dates,
char_byte(eol_delimiter)
)
Expand Down Expand Up @@ -206,7 +206,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
contents,
dtypes,
delimiter,
null_character,
nil_values,
skip_rows,
header?,
encoding,
Expand All @@ -220,7 +220,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
contents,
dtypes,
delimiter,
null_character,
nil_values,
skip_rows,
header?,
encoding,
Expand Down
6 changes: 3 additions & 3 deletions lib/explorer/polars_backend/native.ex
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ defmodule Explorer.PolarsBackend.Native do
_columns,
_dtypes,
_encoding,
_null_char,
_nil_vals,
_parse_dates,
_eol_delimiter
),
Expand Down Expand Up @@ -95,7 +95,7 @@ defmodule Explorer.PolarsBackend.Native do
_columns,
_dtypes,
_encoding,
_null_char,
_nil_vals,
_parse_dates,
_eol_delimiter
),
Expand Down Expand Up @@ -191,7 +191,7 @@ defmodule Explorer.PolarsBackend.Native do
_rechunk,
_dtypes,
_encoding,
_null_char,
_nil_vals,
_parse_dates,
_eol_delimiter
),
Expand Down
8 changes: 4 additions & 4 deletions native/explorer/src/dataframe/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ pub fn df_from_csv(
column_names: Option<Vec<String>>,
dtypes: Option<Vec<(&str, &str)>>,
encoding: &str,
null_char: String,
null_vals: Vec<String>,
parse_dates: bool,
eol_delimiter: Option<u8>,
) -> Result<ExDataFrame, ExplorerError> {
Expand All @@ -77,7 +77,7 @@ pub fn df_from_csv(
.with_encoding(encoding)
.with_columns(column_names)
.with_dtypes(schema)
.with_null_values(Some(NullValues::AllColumns(vec![null_char])))
.with_null_values(Some(NullValues::AllColumns(null_vals)))
.with_end_of_line_char(eol_delimiter.unwrap_or(b'\n'));

finish_reader(reader)
Expand Down Expand Up @@ -158,7 +158,7 @@ pub fn df_load_csv(
column_names: Option<Vec<String>>,
dtypes: Option<Vec<(&str, &str)>>,
encoding: &str,
null_char: String,
null_vals: Vec<String>,
parse_dates: bool,
eol_delimiter: Option<u8>,
) -> Result<ExDataFrame, ExplorerError> {
Expand Down Expand Up @@ -187,7 +187,7 @@ pub fn df_load_csv(
.with_encoding(encoding)
.with_columns(column_names)
.with_dtypes(schema)
.with_null_values(Some(NullValues::AllColumns(vec![null_char])))
.with_null_values(Some(NullValues::AllColumns(null_vals)))
.with_end_of_line_char(eol_delimiter.unwrap_or(b'\n'));

finish_reader(reader)
Expand Down
4 changes: 2 additions & 2 deletions native/explorer/src/lazyframe/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ pub fn lf_from_csv(
do_rechunk: bool,
dtypes: Option<Vec<(&str, &str)>>,
encoding: &str,
null_char: String,
null_vals: Vec<String>,
parse_dates: bool,
eol_delimiter: Option<u8>,
) -> Result<ExLazyFrame, ExplorerError> {
Expand All @@ -183,7 +183,7 @@ pub fn lf_from_csv(
.with_rechunk(do_rechunk)
.with_encoding(encoding)
.with_dtype_overwrite(schema.as_deref())
.with_null_values(Some(NullValues::AllColumns(vec![null_char])))
.with_null_values(Some(NullValues::AllColumns(null_vals)))
.with_end_of_line_char(eol_delimiter.unwrap_or(b'\n'))
.finish()?;

Expand Down
2 changes: 1 addition & 1 deletion notebooks/exploring_explorer.livemd
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ For CSV, your 'usual suspects' of options are available:
* `dtypes` - A keyword list of `[column_name: dtype]`. If a type is not specified for a column, it is imputed from the first 1000 rows. (default: `[]`)
* `header` - Does the file have a header of column names as the first row or not? (default: `true`)
* `max_rows` - Maximum number of lines to read. (default: `nil`)
* `null_character` - The string that should be interpreted as a nil value. (default: `"NA"`)
* `nil_values` - A list of strings that should be interpreted as nil values. (default: `[]`)
* `skip_rows` - The number of lines to skip at the beginning of the file. (default: `0`)
* `columns` - A list of column names to keep. If present, only these columns are read into the dataframe. (default: `nil`)

Expand Down
4 changes: 2 additions & 2 deletions test/explorer/data_frame/csv_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,7 @@ defmodule Explorer.DataFrame.CSVTest do
end

@tag :tmp_dir
test "null_character", config do
test "nil_values", config do
csv =
tmp_csv(config.tmp_dir, """
a,b
Expand All @@ -337,7 +337,7 @@ defmodule Explorer.DataFrame.CSVTest do
c,d
""")

df = DF.from_csv!(csv, null_character: "n/a")
df = DF.from_csv!(csv, nil_values: ["n/a"])

assert DF.to_columns(df, atom_keys: true) == %{
a: [nil, "nil", "c"],
Expand Down