diff --git a/lib/explorer/backend/data_frame.ex b/lib/explorer/backend/data_frame.ex index df1dcfa20..b7e627fd8 100644 --- a/lib/explorer/backend/data_frame.ex +++ b/lib/explorer/backend/data_frame.ex @@ -43,7 +43,7 @@ defmodule Explorer.Backend.DataFrame do entry :: fs_entry(), dtypes, delimiter :: String.t(), - null_character :: String.t(), + nil_values :: list(String.t()), skip_rows :: integer(), header? :: boolean(), encoding :: String.t(), @@ -61,7 +61,7 @@ defmodule Explorer.Backend.DataFrame do contents :: String.t(), dtypes, delimiter :: String.t(), - null_character :: String.t(), + nil_values :: list(String.t()), skip_rows :: integer(), header? :: boolean(), encoding :: String.t(), diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex index 5146b67a0..fd695b2f7 100644 --- a/lib/explorer/data_frame.ex +++ b/lib/explorer/data_frame.ex @@ -516,7 +516,7 @@ defmodule Explorer.DataFrame do * `:max_rows` - Maximum number of lines to read. (default: `nil`) - * `:null_character` - The string that should be interpreted as a nil value. (default: `"NA"`) + * `:nil_values` - A list of strings that should be interpreted as nil values. (default: `[]`) * `:skip_rows` - The number of lines to skip at the beginning of the file. (default: `0`) @@ -553,7 +553,7 @@ defmodule Explorer.DataFrame do encoding: "utf8", header: true, max_rows: nil, - null_character: "NA", + nil_values: [], skip_rows: 0, columns: nil, infer_schema_length: @default_infer_schema_length, @@ -568,7 +568,7 @@ defmodule Explorer.DataFrame do entry, check_dtypes!(opts[:dtypes]), opts[:delimiter], - opts[:null_character], + opts[:nil_values], opts[:skip_rows], opts[:header], opts[:encoding], @@ -611,7 +611,7 @@ defmodule Explorer.DataFrame do imputed from the first 1000 rows. (default: `[]`) * `:header` - Does the file have a header of column names as the first row or not? (default: `true`) * `:max_rows` - Maximum number of lines to read. 
(default: `nil`) - * `:null_character` - The string that should be interpreted as a nil value. (default: `"NA"`) + * `:nil_values` - A list of strings that should be interpreted as nil values. (default: `[]`) * `:skip_rows` - The number of lines to skip at the beginning of the file. (default: `0`) * `:columns` - A list of column names or indexes to keep. If present, only these columns are read into the dataframe. (default: `nil`) * `:infer_schema_length` Maximum number of rows read for schema inference. Setting this to nil will do a full table scan and will be slow (default: `1000`). @@ -633,7 +633,7 @@ defmodule Explorer.DataFrame do encoding: "utf8", header: true, max_rows: nil, - null_character: "NA", + nil_values: [], skip_rows: 0, columns: nil, infer_schema_length: @default_infer_schema_length, @@ -647,7 +647,7 @@ defmodule Explorer.DataFrame do contents, check_dtypes!(opts[:dtypes]), opts[:delimiter], - opts[:null_character], + opts[:nil_values], opts[:skip_rows], opts[:header], opts[:encoding], diff --git a/lib/explorer/polars_backend/data_frame.ex b/lib/explorer/polars_backend/data_frame.ex index 85e9e762d..0a221fdcd 100644 --- a/lib/explorer/polars_backend/data_frame.ex +++ b/lib/explorer/polars_backend/data_frame.ex @@ -56,7 +56,7 @@ defmodule Explorer.PolarsBackend.DataFrame do %Local.Entry{} = entry, dtypes, <>, - null_character, + nil_values, skip_rows, header?, encoding, @@ -91,7 +91,7 @@ defmodule Explorer.PolarsBackend.DataFrame do columns, dtypes, encoding, - null_character, + nil_values, parse_dates, char_byte(eol_delimiter) ) @@ -144,7 +144,7 @@ defmodule Explorer.PolarsBackend.DataFrame do contents, dtypes, <>, - null_character, + nil_values, skip_rows, header?, encoding, @@ -179,7 +179,7 @@ defmodule Explorer.PolarsBackend.DataFrame do columns, dtypes, encoding, - null_character, + nil_values, parse_dates, char_byte(eol_delimiter) ) diff --git a/lib/explorer/polars_backend/lazy_frame.ex b/lib/explorer/polars_backend/lazy_frame.ex index 
d52ae1680..c5451e384 100644 --- a/lib/explorer/polars_backend/lazy_frame.ex +++ b/lib/explorer/polars_backend/lazy_frame.ex @@ -94,7 +94,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do %Local.Entry{} = entry, dtypes, <>, - null_character, + nil_values, skip_rows, header?, encoding, @@ -131,7 +131,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do true, dtypes, encoding, - null_character, + nil_values, parse_dates, char_byte(eol_delimiter) ) @@ -206,7 +206,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do contents, dtypes, delimiter, - null_character, + nil_values, skip_rows, header?, encoding, @@ -220,7 +220,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do contents, dtypes, delimiter, - null_character, + nil_values, skip_rows, header?, encoding, diff --git a/lib/explorer/polars_backend/native.ex b/lib/explorer/polars_backend/native.ex index 7cd72fce6..8348a6cb0 100644 --- a/lib/explorer/polars_backend/native.ex +++ b/lib/explorer/polars_backend/native.ex @@ -59,7 +59,7 @@ defmodule Explorer.PolarsBackend.Native do _columns, _dtypes, _encoding, - _null_char, + _nil_vals, _parse_dates, _eol_delimiter ), @@ -95,7 +95,7 @@ defmodule Explorer.PolarsBackend.Native do _columns, _dtypes, _encoding, - _null_char, + _nil_vals, _parse_dates, _eol_delimiter ), @@ -191,7 +191,7 @@ defmodule Explorer.PolarsBackend.Native do _rechunk, _dtypes, _encoding, - _null_char, + _nil_vals, _parse_dates, _eol_delimiter ), diff --git a/native/explorer/src/dataframe/io.rs b/native/explorer/src/dataframe/io.rs index e35c0209e..2ade3cf46 100644 --- a/native/explorer/src/dataframe/io.rs +++ b/native/explorer/src/dataframe/io.rs @@ -51,7 +51,7 @@ pub fn df_from_csv( column_names: Option>, dtypes: Option>, encoding: &str, - null_char: String, + null_vals: Vec, parse_dates: bool, eol_delimiter: Option, ) -> Result { @@ -78,7 +78,7 @@ pub fn df_from_csv( .with_encoding(encoding) .with_columns(column_names) .with_dtypes(schema) - .with_null_values(Some(NullValues::AllColumns(vec![null_char]))) 
+ .with_null_values(Some(NullValues::AllColumns(null_vals))) .with_end_of_line_char(eol_delimiter.unwrap_or(b'\n')); finish_reader(reader) @@ -159,7 +159,7 @@ pub fn df_load_csv( column_names: Option>, dtypes: Option>, encoding: &str, - null_char: String, + null_vals: Vec, parse_dates: bool, eol_delimiter: Option, ) -> Result { @@ -188,7 +188,7 @@ pub fn df_load_csv( .with_encoding(encoding) .with_columns(column_names) .with_dtypes(schema) - .with_null_values(Some(NullValues::AllColumns(vec![null_char]))) + .with_null_values(Some(NullValues::AllColumns(null_vals))) .with_end_of_line_char(eol_delimiter.unwrap_or(b'\n')); finish_reader(reader) diff --git a/native/explorer/src/lazyframe/io.rs b/native/explorer/src/lazyframe/io.rs index 2422e559c..6ad3dc7db 100644 --- a/native/explorer/src/lazyframe/io.rs +++ b/native/explorer/src/lazyframe/io.rs @@ -160,7 +160,7 @@ pub fn lf_from_csv( do_rechunk: bool, dtypes: Option>, encoding: &str, - null_char: String, + null_vals: Vec, parse_dates: bool, eol_delimiter: Option, ) -> Result { @@ -185,7 +185,7 @@ pub fn lf_from_csv( .with_rechunk(do_rechunk) .with_encoding(encoding) .with_dtype_overwrite(schema.as_deref()) - .with_null_values(Some(NullValues::AllColumns(vec![null_char]))) + .with_null_values(Some(NullValues::AllColumns(null_vals))) .with_end_of_line_char(eol_delimiter.unwrap_or(b'\n')) .finish()?; diff --git a/notebooks/exploring_explorer.livemd b/notebooks/exploring_explorer.livemd index b2704a3b0..ab85236cb 100644 --- a/notebooks/exploring_explorer.livemd +++ b/notebooks/exploring_explorer.livemd @@ -28,7 +28,7 @@ For CSV, your 'usual suspects' of options are available: * `dtypes` - A keyword list of `[column_name: dtype]`. If a type is not specified for a column, it is imputed from the first 1000 rows. (default: `[]`) * `header` - Does the file have a header of column names as the first row or not? (default: `true`) * `max_rows` - Maximum number of lines to read. 
(default: `nil`) -* `null_character` - The string that should be interpreted as a nil value. (default: `"NA"`) +* `nil_values` - A list of strings that should be interpreted as nil values. (default: `[]`) * `skip_rows` - The number of lines to skip at the beginning of the file. (default: `0`) * `columns` - A list of column names to keep. If present, only these columns are read into the dataframe. (default: `nil`) diff --git a/test/explorer/data_frame/csv_test.exs b/test/explorer/data_frame/csv_test.exs index d26b53744..794e3c51c 100644 --- a/test/explorer/data_frame/csv_test.exs +++ b/test/explorer/data_frame/csv_test.exs @@ -329,7 +329,7 @@ defmodule Explorer.DataFrame.CSVTest do end @tag :tmp_dir - test "null_character", config do + test "nil_values", config do csv = tmp_csv(config.tmp_dir, """ a,b @@ -338,7 +338,7 @@ defmodule Explorer.DataFrame.CSVTest do c,d """) - df = DF.from_csv!(csv, null_character: "n/a") + df = DF.from_csv!(csv, nil_values: ["n/a"]) assert DF.to_columns(df, atom_keys: true) == %{ a: [nil, "nil", "c"],