Use nil_values instead of null_character in IO operations (#667)

elixir-explorer · Jul 28, 2023 · dab4941 · dab4941
1 parent f9ed222
commit dab4941
Show file tree

Hide file tree

Showing 9 changed files with 28 additions and 28 deletions.
diff --git a/lib/explorer/backend/data_frame.ex b/lib/explorer/backend/data_frame.ex
@@ -43,7 +43,7 @@ defmodule Explorer.Backend.DataFrame do
               entry :: fs_entry(),
               dtypes,
               delimiter :: String.t(),
-              null_character :: String.t(),
+              nil_values :: list(String.t()),
               skip_rows :: integer(),
               header? :: boolean(),
               encoding :: String.t(),
@@ -61,7 +61,7 @@ defmodule Explorer.Backend.DataFrame do
               contents :: String.t(),
               dtypes,
               delimiter :: String.t(),
-              null_character :: String.t(),
+              nil_values :: list(String.t()),
               skip_rows :: integer(),
               header? :: boolean(),
               encoding :: String.t(),

diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex
@@ -516,7 +516,7 @@ defmodule Explorer.DataFrame do
 
     * `:max_rows` - Maximum number of lines to read. (default: `nil`)
 
-    * `:null_character` - The string that should be interpreted as a nil value. (default: `"NA"`)
+    * `:nil_values` - A list of strings that should be interpreted as nil values. (default: `[]`)
 
     * `:skip_rows` - The number of lines to skip at the beginning of the file. (default: `0`)
 
@@ -553,7 +553,7 @@ defmodule Explorer.DataFrame do
         encoding: "utf8",
         header: true,
         max_rows: nil,
-        null_character: "NA",
+        nil_values: [],
         skip_rows: 0,
         columns: nil,
         infer_schema_length: @default_infer_schema_length,
@@ -568,7 +568,7 @@ defmodule Explorer.DataFrame do
         entry,
         check_dtypes!(opts[:dtypes]),
         opts[:delimiter],
-        opts[:null_character],
+        opts[:nil_values],
         opts[:skip_rows],
         opts[:header],
         opts[:encoding],
@@ -611,7 +611,7 @@ defmodule Explorer.DataFrame do
       imputed from the first 1000 rows. (default: `[]`)
     * `:header` - Does the file have a header of column names as the first row or not? (default: `true`)
     * `:max_rows` - Maximum number of lines to read. (default: `nil`)
-    * `:null_character` - The string that should be interpreted as a nil value. (default: `"NA"`)
+    * `:nil_values` - A list of strings that should be interpreted as nil values. (default: `[]`)
     * `:skip_rows` - The number of lines to skip at the beginning of the file. (default: `0`)
     * `:columns` - A list of column names or indexes to keep. If present, only these columns are read into the dataframe. (default: `nil`)
     * `:infer_schema_length` - Maximum number of rows read for schema inference. Setting this to nil will do a full table scan and will be slow. (default: `1000`)
@@ -633,7 +633,7 @@ defmodule Explorer.DataFrame do
         encoding: "utf8",
         header: true,
         max_rows: nil,
-        null_character: "NA",
+        nil_values: [],
         skip_rows: 0,
         columns: nil,
         infer_schema_length: @default_infer_schema_length,
@@ -647,7 +647,7 @@ defmodule Explorer.DataFrame do
       contents,
       check_dtypes!(opts[:dtypes]),
       opts[:delimiter],
-      opts[:null_character],
+      opts[:nil_values],
       opts[:skip_rows],
       opts[:header],
       opts[:encoding],

diff --git a/lib/explorer/polars_backend/data_frame.ex b/lib/explorer/polars_backend/data_frame.ex
@@ -56,7 +56,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
         %Local.Entry{} = entry,
         dtypes,
         <<delimiter::utf8>>,
-        null_character,
+        nil_values,
         skip_rows,
         header?,
         encoding,
@@ -91,7 +91,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
         columns,
         dtypes,
         encoding,
-        null_character,
+        nil_values,
         parse_dates,
         char_byte(eol_delimiter)
       )
@@ -144,7 +144,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
         contents,
         dtypes,
         <<delimiter::utf8>>,
-        null_character,
+        nil_values,
         skip_rows,
         header?,
         encoding,
@@ -179,7 +179,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
         columns,
         dtypes,
         encoding,
-        null_character,
+        nil_values,
         parse_dates,
         char_byte(eol_delimiter)
       )

diff --git a/lib/explorer/polars_backend/lazy_frame.ex b/lib/explorer/polars_backend/lazy_frame.ex
@@ -94,7 +94,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
         %Local.Entry{} = entry,
         dtypes,
         <<delimiter::utf8>>,
-        null_character,
+        nil_values,
         skip_rows,
         header?,
         encoding,
@@ -131,7 +131,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
         true,
         dtypes,
         encoding,
-        null_character,
+        nil_values,
         parse_dates,
         char_byte(eol_delimiter)
       )
@@ -206,7 +206,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
         contents,
         dtypes,
         delimiter,
-        null_character,
+        nil_values,
         skip_rows,
         header?,
         encoding,
@@ -220,7 +220,7 @@ defmodule Explorer.PolarsBackend.LazyFrame do
            contents,
            dtypes,
            delimiter,
-           null_character,
+           nil_values,
            skip_rows,
            header?,
            encoding,

diff --git a/lib/explorer/polars_backend/native.ex b/lib/explorer/polars_backend/native.ex
@@ -59,7 +59,7 @@ defmodule Explorer.PolarsBackend.Native do
         _columns,
         _dtypes,
         _encoding,
-        _null_char,
+        _nil_vals,
         _parse_dates,
         _eol_delimiter
       ),
@@ -95,7 +95,7 @@ defmodule Explorer.PolarsBackend.Native do
         _columns,
         _dtypes,
         _encoding,
-        _null_char,
+        _nil_vals,
         _parse_dates,
         _eol_delimiter
       ),
@@ -191,7 +191,7 @@ defmodule Explorer.PolarsBackend.Native do
         _rechunk,
         _dtypes,
         _encoding,
-        _null_char,
+        _nil_vals,
         _parse_dates,
         _eol_delimiter
       ),

diff --git a/native/explorer/src/dataframe/io.rs b/native/explorer/src/dataframe/io.rs
@@ -51,7 +51,7 @@ pub fn df_from_csv(
     column_names: Option<Vec<String>>,
     dtypes: Option<Vec<(&str, &str)>>,
     encoding: &str,
-    null_char: String,
+    null_vals: Vec<String>,
     parse_dates: bool,
     eol_delimiter: Option<u8>,
 ) -> Result<ExDataFrame, ExplorerError> {
@@ -78,7 +78,7 @@ pub fn df_from_csv(
         .with_encoding(encoding)
         .with_columns(column_names)
         .with_dtypes(schema)
-        .with_null_values(Some(NullValues::AllColumns(vec![null_char])))
+        .with_null_values(Some(NullValues::AllColumns(null_vals)))
         .with_end_of_line_char(eol_delimiter.unwrap_or(b'\n'));
 
     finish_reader(reader)
@@ -159,7 +159,7 @@ pub fn df_load_csv(
     column_names: Option<Vec<String>>,
     dtypes: Option<Vec<(&str, &str)>>,
     encoding: &str,
-    null_char: String,
+    null_vals: Vec<String>,
     parse_dates: bool,
     eol_delimiter: Option<u8>,
 ) -> Result<ExDataFrame, ExplorerError> {
@@ -188,7 +188,7 @@ pub fn df_load_csv(
         .with_encoding(encoding)
         .with_columns(column_names)
         .with_dtypes(schema)
-        .with_null_values(Some(NullValues::AllColumns(vec![null_char])))
+        .with_null_values(Some(NullValues::AllColumns(null_vals)))
         .with_end_of_line_char(eol_delimiter.unwrap_or(b'\n'));
 
     finish_reader(reader)

diff --git a/native/explorer/src/lazyframe/io.rs b/native/explorer/src/lazyframe/io.rs
@@ -160,7 +160,7 @@ pub fn lf_from_csv(
     do_rechunk: bool,
     dtypes: Option<Vec<(&str, &str)>>,
     encoding: &str,
-    null_char: String,
+    null_vals: Vec<String>,
     parse_dates: bool,
     eol_delimiter: Option<u8>,
 ) -> Result<ExLazyFrame, ExplorerError> {
@@ -185,7 +185,7 @@ pub fn lf_from_csv(
         .with_rechunk(do_rechunk)
         .with_encoding(encoding)
         .with_dtype_overwrite(schema.as_deref())
-        .with_null_values(Some(NullValues::AllColumns(vec![null_char])))
+        .with_null_values(Some(NullValues::AllColumns(null_vals)))
         .with_end_of_line_char(eol_delimiter.unwrap_or(b'\n'))
         .finish()?;
 

diff --git a/notebooks/exploring_explorer.livemd b/notebooks/exploring_explorer.livemd
@@ -28,7 +28,7 @@ For CSV, your 'usual suspects' of options are available:
 * `dtypes` - A keyword list of `[column_name: dtype]`. If a type is not specified for a column, it is imputed from the first 1000 rows. (default: `[]`)
 * `header` - Does the file have a header of column names as the first row or not? (default: `true`)
 * `max_rows` - Maximum number of lines to read. (default: `nil`)
-* `null_character` - The string that should be interpreted as a nil value. (default: `"NA"`)
+* `nil_values` - A list of strings that should be interpreted as nil values. (default: `[]`)
 * `skip_rows` - The number of lines to skip at the beginning of the file. (default: `0`)
 * `columns` - A list of column names to keep. If present, only these columns are read into the dataframe. (default: `nil`)
 

diff --git a/test/explorer/data_frame/csv_test.exs b/test/explorer/data_frame/csv_test.exs
@@ -329,7 +329,7 @@ defmodule Explorer.DataFrame.CSVTest do
     end
 
     @tag :tmp_dir
-    test "null_character", config do
+    test "nil_values", config do
       csv =
         tmp_csv(config.tmp_dir, """
         a,b
@@ -338,7 +338,7 @@ defmodule Explorer.DataFrame.CSVTest do
         c,d
         """)
 
-      df = DF.from_csv!(csv, null_character: "n/a")
+      df = DF.from_csv!(csv, nil_values: ["n/a"])
 
       assert DF.to_columns(df, atom_keys: true) == %{
                a: [nil, "nil", "c"],