Skip to content

Commit

Permalink
Use nil_values instead of null_character in IO operations (#667)
Browse files Browse the repository at this point in the history
  • Loading branch information
cnpryer authored Jul 28, 2023
1 parent f9ed222 commit dab4941
Show file tree
Hide file tree
Showing 9 changed files with 28 additions and 28 deletions.
4 changes: 2 additions & 2 deletions lib/explorer/backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ defmodule Explorer.Backend.DataFrame do
entry :: fs_entry(),
dtypes,
delimiter :: String.t(),
null_character :: String.t(),
nil_values :: list(String.t()),
skip_rows :: integer(),
header? :: boolean(),
encoding :: String.t(),
Expand All @@ -61,7 +61,7 @@ defmodule Explorer.Backend.DataFrame do
contents :: String.t(),
dtypes,
delimiter :: String.t(),
null_character :: String.t(),
nil_values :: list(String.t()),
skip_rows :: integer(),
header? :: boolean(),
encoding :: String.t(),
Expand Down
12 changes: 6 additions & 6 deletions lib/explorer/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,7 @@ defmodule Explorer.DataFrame do
* `:max_rows` - Maximum number of lines to read. (default: `nil`)
* `:null_character` - The string that should be interpreted as a nil value. (default: `"NA"`)
* `:nil_values` - A list of strings that should be interpreted as nil values. (default: `[]`)
* `:skip_rows` - The number of lines to skip at the beginning of the file. (default: `0`)
Expand Down Expand Up @@ -553,7 +553,7 @@ defmodule Explorer.DataFrame do
encoding: "utf8",
header: true,
max_rows: nil,
null_character: "NA",
nil_values: [],
skip_rows: 0,
columns: nil,
infer_schema_length: @default_infer_schema_length,
Expand All @@ -568,7 +568,7 @@ defmodule Explorer.DataFrame do
entry,
check_dtypes!(opts[:dtypes]),
opts[:delimiter],
opts[:null_character],
opts[:nil_values],
opts[:skip_rows],
opts[:header],
opts[:encoding],
Expand Down Expand Up @@ -611,7 +611,7 @@ defmodule Explorer.DataFrame do
imputed from the first 1000 rows. (default: `[]`)
* `:header` - Does the file have a header of column names as the first row or not? (default: `true`)
* `:max_rows` - Maximum number of lines to read. (default: `nil`)
* `:null_character` - The string that should be interpreted as a nil value. (default: `"NA"`)
* `:nil_values` - A list of strings that should be interpreted as nil values. (default: `[]`)
* `:skip_rows` - The number of lines to skip at the beginning of the file. (default: `0`)
* `:columns` - A list of column names or indexes to keep. If present, only these columns are read into the dataframe. (default: `nil`)
* `:infer_schema_length` - Maximum number of rows read for schema inference. Setting this to `nil` will do a full table scan, which will be slow. (default: `1000`)
Expand All @@ -633,7 +633,7 @@ defmodule Explorer.DataFrame do
encoding: "utf8",
header: true,
max_rows: nil,
null_character: "NA",
nil_values: [],
skip_rows: 0,
columns: nil,
infer_schema_length: @default_infer_schema_length,
Expand All @@ -647,7 +647,7 @@ defmodule Explorer.DataFrame do
contents,
check_dtypes!(opts[:dtypes]),
opts[:delimiter],
opts[:null_character],
opts[:nil_values],
opts[:skip_rows],
opts[:header],
opts[:encoding],
Expand Down
8 changes: 4 additions & 4 deletions lib/explorer/polars_backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
%Local.Entry{} = entry,
dtypes,
<<delimiter::utf8>>,
null_character,
nil_values,
skip_rows,
header?,
encoding,
Expand Down Expand Up @@ -91,7 +91,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
columns,
dtypes,
encoding,
null_character,
nil_values,
parse_dates,
char_byte(eol_delimiter)
)
Expand Down Expand Up @@ -144,7 +144,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
contents,
dtypes,
<<delimiter::utf8>>,
null_character,
nil_values,
skip_rows,
header?,
encoding,
Expand Down Expand Up @@ -179,7 +179,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
columns,
dtypes,
encoding,
null_character,
nil_values,
parse_dates,
char_byte(eol_delimiter)
)
Expand Down
8 changes: 4 additions & 4 deletions lib/explorer/polars_backend/lazy_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
%Local.Entry{} = entry,
dtypes,
<<delimiter::utf8>>,
null_character,
nil_values,
skip_rows,
header?,
encoding,
Expand Down Expand Up @@ -131,7 +131,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
true,
dtypes,
encoding,
null_character,
nil_values,
parse_dates,
char_byte(eol_delimiter)
)
Expand Down Expand Up @@ -206,7 +206,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
contents,
dtypes,
delimiter,
null_character,
nil_values,
skip_rows,
header?,
encoding,
Expand All @@ -220,7 +220,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
contents,
dtypes,
delimiter,
null_character,
nil_values,
skip_rows,
header?,
encoding,
Expand Down
6 changes: 3 additions & 3 deletions lib/explorer/polars_backend/native.ex
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ defmodule Explorer.PolarsBackend.Native do
_columns,
_dtypes,
_encoding,
_null_char,
_nil_vals,
_parse_dates,
_eol_delimiter
),
Expand Down Expand Up @@ -95,7 +95,7 @@ defmodule Explorer.PolarsBackend.Native do
_columns,
_dtypes,
_encoding,
_null_char,
_nil_vals,
_parse_dates,
_eol_delimiter
),
Expand Down Expand Up @@ -191,7 +191,7 @@ defmodule Explorer.PolarsBackend.Native do
_rechunk,
_dtypes,
_encoding,
_null_char,
_nil_vals,
_parse_dates,
_eol_delimiter
),
Expand Down
8 changes: 4 additions & 4 deletions native/explorer/src/dataframe/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ pub fn df_from_csv(
column_names: Option<Vec<String>>,
dtypes: Option<Vec<(&str, &str)>>,
encoding: &str,
null_char: String,
null_vals: Vec<String>,
parse_dates: bool,
eol_delimiter: Option<u8>,
) -> Result<ExDataFrame, ExplorerError> {
Expand All @@ -78,7 +78,7 @@ pub fn df_from_csv(
.with_encoding(encoding)
.with_columns(column_names)
.with_dtypes(schema)
.with_null_values(Some(NullValues::AllColumns(vec![null_char])))
.with_null_values(Some(NullValues::AllColumns(null_vals)))
.with_end_of_line_char(eol_delimiter.unwrap_or(b'\n'));

finish_reader(reader)
Expand Down Expand Up @@ -159,7 +159,7 @@ pub fn df_load_csv(
column_names: Option<Vec<String>>,
dtypes: Option<Vec<(&str, &str)>>,
encoding: &str,
null_char: String,
null_vals: Vec<String>,
parse_dates: bool,
eol_delimiter: Option<u8>,
) -> Result<ExDataFrame, ExplorerError> {
Expand Down Expand Up @@ -188,7 +188,7 @@ pub fn df_load_csv(
.with_encoding(encoding)
.with_columns(column_names)
.with_dtypes(schema)
.with_null_values(Some(NullValues::AllColumns(vec![null_char])))
.with_null_values(Some(NullValues::AllColumns(null_vals)))
.with_end_of_line_char(eol_delimiter.unwrap_or(b'\n'));

finish_reader(reader)
Expand Down
4 changes: 2 additions & 2 deletions native/explorer/src/lazyframe/io.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ pub fn lf_from_csv(
do_rechunk: bool,
dtypes: Option<Vec<(&str, &str)>>,
encoding: &str,
null_char: String,
null_vals: Vec<String>,
parse_dates: bool,
eol_delimiter: Option<u8>,
) -> Result<ExLazyFrame, ExplorerError> {
Expand All @@ -185,7 +185,7 @@ pub fn lf_from_csv(
.with_rechunk(do_rechunk)
.with_encoding(encoding)
.with_dtype_overwrite(schema.as_deref())
.with_null_values(Some(NullValues::AllColumns(vec![null_char])))
.with_null_values(Some(NullValues::AllColumns(null_vals)))
.with_end_of_line_char(eol_delimiter.unwrap_or(b'\n'))
.finish()?;

Expand Down
2 changes: 1 addition & 1 deletion notebooks/exploring_explorer.livemd
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ For CSV, your 'usual suspects' of options are available:
* `dtypes` - A keyword list of `[column_name: dtype]`. If a type is not specified for a column, it is imputed from the first 1000 rows. (default: `[]`)
* `header` - Does the file have a header of column names as the first row or not? (default: `true`)
* `max_rows` - Maximum number of lines to read. (default: `nil`)
* `null_character` - The string that should be interpreted as a nil value. (default: `"NA"`)
* `nil_values` - A list of strings that should be interpreted as nil values. (default: `[]`)
* `skip_rows` - The number of lines to skip at the beginning of the file. (default: `0`)
* `columns` - A list of column names to keep. If present, only these columns are read into the dataframe. (default: `nil`)

Expand Down
4 changes: 2 additions & 2 deletions test/explorer/data_frame/csv_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ defmodule Explorer.DataFrame.CSVTest do
end

@tag :tmp_dir
test "null_character", config do
test "nil_values", config do
csv =
tmp_csv(config.tmp_dir, """
a,b
Expand All @@ -338,7 +338,7 @@ defmodule Explorer.DataFrame.CSVTest do
c,d
""")

df = DF.from_csv!(csv, null_character: "n/a")
df = DF.from_csv!(csv, nil_values: ["n/a"])

assert DF.to_columns(df, atom_keys: true) == %{
a: [nil, "nil", "c"],
Expand Down

0 comments on commit dab4941

Please sign in to comment.