Skip to content

Commit

Permalink
feat: customize CSV format (#115)
Browse files Browse the repository at this point in the history
  • Loading branch information
domoritz authored Dec 15, 2024
1 parent 55e5c20 commit 56e4b6d
Show file tree
Hide file tree
Showing 7 changed files with 211 additions and 57 deletions.
4 changes: 3 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/csv2arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ arrow-schema = { version = "53.0.0", features = ["serde"] }
serde_json = "1.0.133"
clap = { version = "4.5.21", features = ["derive"] }
arrow-tools = { version = "0.19.0", path = "../arrow-tools" }
regex = "1.5.4"

[dev-dependencies]
assert_cmd = "2.0.14"
Expand Down
39 changes: 33 additions & 6 deletions crates/csv2arrow/Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,24 +36,51 @@ cargo binstall csv2arrow
Usage: csv2arrow [OPTIONS] <CSV> [ARROW]
Arguments:
<CSV> Input CSV file, stdin if not present
[ARROW] Output file, stdout if not present
<CSV>
Input CSV file, stdin if not present
[ARROW]
Output file, stdout if not present
Options:
-s, --schema-file <SCHEMA_FILE>
File with Arrow schema in JSON format
-m, --max-read-records <MAX_READ_RECORDS>
The number of records to infer the schema from. All rows if not present. Setting max-read-records to zero will stop schema inference and all columns will be string typed
--header <HEADER>
Set whether the CSV file has headers [possible values: true, false]
-d, --delimiter <DELIMITER>
Set the CSV file's column delimiter as a byte character [default: ,]
Set whether the CSV file has headers
[default: true]
[possible values: true, false]
--delimiter <DELIMITER>
Set the CSV file's column delimiter as a byte character
--escape <ESCAPE>
Specify an escape character
--quote <QUOTE>
Specify a custom quote character
--comment <COMMENT>
Specify a comment character.
Lines starting with this character will be ignored
--null-regex <NULL_REGEX>
Provide a regex to match null values
-p, --print-schema
Print the schema to stderr
-n, --dry
Only print the schema
-h, --help
Print help
Print help (see a summary with '-h')
-V, --version
Print version
```
Expand Down
75 changes: 56 additions & 19 deletions crates/csv2arrow/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use arrow::{csv::reader::Format, csv::ReaderBuilder, error::ArrowError, ipc::writer::FileWriter};
use arrow_tools::seekable_reader::{SeekRead, SeekableReader};
use clap::{Parser, ValueHint};
use regex::Regex;
use std::io::stdout;
use std::path::PathBuf;
use std::sync::Arc;
Expand All @@ -25,13 +26,31 @@ struct Opts {
#[clap(short, long)]
max_read_records: Option<usize>,

/// Set whether the CSV file has headers
#[clap(long)]
/// Set whether the CSV file has headers.
#[clap(long, default_value = "true")]
header: Option<bool>,

/// Set the CSV file's column delimiter as a byte character.
#[clap(short, long, default_value = ",")]
delimiter: char,
#[clap(long)]
delimiter: Option<char>,

/// Specify an escape character.
#[clap(long)]
escape: Option<char>,

/// Specify a custom quote character.
#[clap(long)]
quote: Option<char>,

/// Specify a comment character.
///
/// Lines starting with this character will be ignored
#[clap(long)]
comment: Option<char>,

/// Provide a regex to match null values.
#[clap(long)]
null_regex: Option<Regex>,

/// Print the schema to stderr.
#[clap(short, long)]
Expand All @@ -56,6 +75,32 @@ fn main() -> Result<(), ArrowError> {
))
};

let mut format = Format::default();

if let Some(header) = opts.header {
format = format.with_header(header);
}

if let Some(delimiter) = opts.delimiter {
format = format.with_delimiter(delimiter as u8);
}

if let Some(escape) = opts.escape {
format = format.with_escape(escape as u8);
}

if let Some(quote) = opts.quote {
format = format.with_quote(quote as u8);
}

if let Some(comment) = opts.comment {
format = format.with_comment(comment as u8);
}

if let Some(regex) = opts.null_regex {
format = format.with_null_regex(regex);
}

let schema = match opts.schema_file {
Some(schema_def_file_path) => {
let schema_file = match File::open(&schema_def_file_path) {
Expand All @@ -76,18 +121,12 @@ fn main() -> Result<(), ArrowError> {
))),
}
}
_ => {
let format = Format::default()
.with_delimiter(opts.delimiter as u8)
.with_header(opts.header.unwrap_or(true));

match format.infer_schema(&mut input, opts.max_read_records) {
Ok((schema, _size)) => Ok(schema),
Err(error) => Err(ArrowError::SchemaError(format!(
"Error inferring schema: {error}"
))),
}
}
_ => match format.infer_schema(&mut input, opts.max_read_records) {
Ok((schema, _size)) => Ok(schema),
Err(error) => Err(ArrowError::SchemaError(format!(
"Error inferring schema: {error}"
))),
},
}?;

if opts.print_schema || opts.dry {
Expand All @@ -100,9 +139,7 @@ fn main() -> Result<(), ArrowError> {
}

let schema_ref = Arc::new(schema);
let builder = ReaderBuilder::new(schema_ref)
.with_header(opts.header.unwrap_or(true))
.with_delimiter(opts.delimiter as u8);
let builder = ReaderBuilder::new(schema_ref).with_format(format);

input.rewind()?;

Expand Down
1 change: 1 addition & 0 deletions crates/csv2parquet/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ arrow-schema = { version = "53.0.0", features = ["serde"] }
serde_json = "1.0.133"
clap = { version = "4.5.21", features = ["derive"] }
arrow-tools = { version = "0.19.0", path = "../arrow-tools" }
regex = "1.5.4"

[dev-dependencies]
assert_cmd = "2.0.14"
Expand Down
65 changes: 55 additions & 10 deletions crates/csv2parquet/Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,44 +36,89 @@ cargo binstall csv2parquet
Usage: csv2parquet [OPTIONS] <CSV> <PARQUET>
Arguments:
<CSV> Input CSV file, stdin if not present
<PARQUET> Output file
<CSV>
Input CSV file, stdin if not present
<PARQUET>
Output file
Options:
-s, --schema-file <SCHEMA_FILE>
File with Arrow schema in JSON format
--max-read-records <MAX_READ_RECORDS>
The number of records to infer the schema from. All rows if not present. Setting max-read-records to zero will stop schema inference and all columns will be string typed
--header <HEADER>
Set whether the CSV file has headers [possible values: true, false]
-d, --delimiter <DELIMITER>
Set the CSV file's column delimiter as a byte character [default: ,]
Set whether the CSV file has headers
[default: true]
[possible values: true, false]
--delimiter <DELIMITER>
Set the CSV file's column delimiter as a byte character
--escape <ESCAPE>
Specify an escape character
--quote <QUOTE>
Specify a custom quote character
--comment <COMMENT>
Specify a comment character.
Lines starting with this character will be ignored
--null-regex <NULL_REGEX>
Provide a regex to match null values
-c, --compression <COMPRESSION>
Set the compression [possible values: uncompressed, snappy, gzip, lzo, brotli, lz4, zstd, lz4-raw]
Set the compression
[possible values: uncompressed, snappy, gzip, lzo, brotli, lz4, zstd, lz4-raw]
-e, --encoding <ENCODING>
Sets encoding for any column [possible values: plain, plain-dictionary, rle, rle-dictionary, delta-binary-packed, delta-length-byte-array, delta-byte-array, byte-stream-split]
Sets encoding for any column
[possible values: plain, plain-dictionary, rle, rle-dictionary, delta-binary-packed, delta-length-byte-array, delta-byte-array, byte-stream-split]
--data-page-size-limit <DATA_PAGE_SIZE_LIMIT>
Sets data page size limit
--dictionary-page-size-limit <DICTIONARY_PAGE_SIZE_LIMIT>
Sets dictionary page size limit
--write-batch-size <WRITE_BATCH_SIZE>
Sets write batch size
--max-row-group-size <MAX_ROW_GROUP_SIZE>
Sets max size for a row group
--created-by <CREATED_BY>
Sets "created by" property
--dictionary
--dictionary <DICTIONARY>
Sets flag to enable/disable dictionary encoding for any column
[possible values: true, false]
--statistics <STATISTICS>
Sets flag to enable/disable statistics for any column [possible values: none, chunk, page]
Sets flag to enable/disable statistics for any column
[possible values: none, chunk, page]
--max-statistics-size <MAX_STATISTICS_SIZE>
Sets max statistics size for any column. Applicable only if statistics are enabled
-p, --print-schema
Print the schema to stderr
-n, --dry
Only print the schema
-h, --help
Print help
Print help (see a summary with '-h')
-V, --version
Print version
```
Expand Down
Loading

0 comments on commit 56e4b6d

Please sign in to comment.