Skip to content

Commit

Permalink
feat: customize CSV format (#115)
Browse files Browse the repository at this point in the history
  • Loading branch information
domoritz authored Dec 15, 2024
1 parent 55e5c20 commit 56e4b6d
Show file tree
Hide file tree
Showing 7 changed files with 211 additions and 57 deletions.
4 changes: 3 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/csv2arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ arrow-schema = { version = "53.0.0", features = ["serde"] }
serde_json = "1.0.133"
clap = { version = "4.5.21", features = ["derive"] }
arrow-tools = { version = "0.19.0", path = "../arrow-tools" }
regex = "1.5.4"

[dev-dependencies]
assert_cmd = "2.0.14"
Expand Down
39 changes: 33 additions & 6 deletions crates/csv2arrow/Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,24 +36,51 @@ cargo binstall csv2arrow
Usage: csv2arrow [OPTIONS] <CSV> [ARROW]
Arguments:
<CSV> Input CSV file, stdin if not present
[ARROW] Output file, stdout if not present
<CSV>
Input CSV file, stdin if not present
[ARROW]
Output file, stdout if not present
Options:
-s, --schema-file <SCHEMA_FILE>
File with Arrow schema in JSON format
-m, --max-read-records <MAX_READ_RECORDS>
The number of records to infer the schema from. All rows if not present. Setting max-read-records to zero will stop schema inference and all columns will be string typed
--header <HEADER>
Set whether the CSV file has headers [possible values: true, false]
-d, --delimiter <DELIMITER>
Set the CSV file's column delimiter as a byte character [default: ,]
Set whether the CSV file has headers
[default: true]
[possible values: true, false]
--delimiter <DELIMITER>
Set the CSV file's column delimiter as a byte character
--escape <ESCAPE>
Specify an escape character
--quote <QUOTE>
Specify a custom quote character
--comment <COMMENT>
Specify a comment character.
Lines starting with this character will be ignored
--null-regex <NULL_REGEX>
Provide a regex to match null values
-p, --print-schema
Print the schema to stderr
-n, --dry
Only print the schema
-h, --help
Print help
Print help (see a summary with '-h')
-V, --version
Print version
```
Expand Down
75 changes: 56 additions & 19 deletions crates/csv2arrow/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use arrow::{csv::reader::Format, csv::ReaderBuilder, error::ArrowError, ipc::writer::FileWriter};
use arrow_tools::seekable_reader::{SeekRead, SeekableReader};
use clap::{Parser, ValueHint};
use regex::Regex;
use std::io::stdout;
use std::path::PathBuf;
use std::sync::Arc;
Expand All @@ -25,13 +26,31 @@ struct Opts {
#[clap(short, long)]
max_read_records: Option<usize>,

/// Set whether the CSV file has headers
#[clap(long)]
/// Set whether the CSV file has headers.
#[clap(long, default_value = "true")]
header: Option<bool>,

/// Set the CSV file's column delimiter as a byte character.
#[clap(short, long, default_value = ",")]
delimiter: char,
#[clap(long)]
delimiter: Option<char>,

/// Specify an escape character.
#[clap(long)]
escape: Option<char>,

/// Specify a custom quote character.
#[clap(long)]
quote: Option<char>,

/// Specify a comment character.
///
/// Lines starting with this character will be ignored
#[clap(long)]
comment: Option<char>,

/// Provide a regex to match null values.
#[clap(long)]
null_regex: Option<Regex>,

/// Print the schema to stderr.
#[clap(short, long)]
Expand All @@ -56,6 +75,32 @@ fn main() -> Result<(), ArrowError> {
))
};

let mut format = Format::default();

if let Some(header) = opts.header {
format = format.with_header(header);
}

if let Some(delimiter) = opts.delimiter {
format = format.with_delimiter(delimiter as u8);
}

if let Some(escape) = opts.escape {
format = format.with_escape(escape as u8);
}

if let Some(quote) = opts.quote {
format = format.with_quote(quote as u8);
}

if let Some(comment) = opts.comment {
format = format.with_comment(comment as u8);
}

if let Some(regex) = opts.null_regex {
format = format.with_null_regex(regex);
}

let schema = match opts.schema_file {
Some(schema_def_file_path) => {
let schema_file = match File::open(&schema_def_file_path) {
Expand All @@ -76,18 +121,12 @@ fn main() -> Result<(), ArrowError> {
))),
}
}
_ => {
let format = Format::default()
.with_delimiter(opts.delimiter as u8)
.with_header(opts.header.unwrap_or(true));

match format.infer_schema(&mut input, opts.max_read_records) {
Ok((schema, _size)) => Ok(schema),
Err(error) => Err(ArrowError::SchemaError(format!(
"Error inferring schema: {error}"
))),
}
}
_ => match format.infer_schema(&mut input, opts.max_read_records) {
Ok((schema, _size)) => Ok(schema),
Err(error) => Err(ArrowError::SchemaError(format!(
"Error inferring schema: {error}"
))),
},
}?;

if opts.print_schema || opts.dry {
Expand All @@ -100,9 +139,7 @@ fn main() -> Result<(), ArrowError> {
}

let schema_ref = Arc::new(schema);
let builder = ReaderBuilder::new(schema_ref)
.with_header(opts.header.unwrap_or(true))
.with_delimiter(opts.delimiter as u8);
let builder = ReaderBuilder::new(schema_ref).with_format(format);

input.rewind()?;

Expand Down
1 change: 1 addition & 0 deletions crates/csv2parquet/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ arrow-schema = { version = "53.0.0", features = ["serde"] }
serde_json = "1.0.133"
clap = { version = "4.5.21", features = ["derive"] }
arrow-tools = { version = "0.19.0", path = "../arrow-tools" }
regex = "1.5.4"

[dev-dependencies]
assert_cmd = "2.0.14"
Expand Down
65 changes: 55 additions & 10 deletions crates/csv2parquet/Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,44 +36,89 @@ cargo binstall csv2parquet
Usage: csv2parquet [OPTIONS] <CSV> <PARQUET>
Arguments:
<CSV> Input CSV file, stdin if not present
<PARQUET> Output file
<CSV>
Input CSV file, stdin if not present
<PARQUET>
Output file
Options:
-s, --schema-file <SCHEMA_FILE>
File with Arrow schema in JSON format
--max-read-records <MAX_READ_RECORDS>
The number of records to infer the schema from. All rows if not present. Setting max-read-records to zero will stop schema inference and all columns will be string typed
--header <HEADER>
Set whether the CSV file has headers [possible values: true, false]
-d, --delimiter <DELIMITER>
Set the CSV file's column delimiter as a byte character [default: ,]
Set whether the CSV file has headers
[default: true]
[possible values: true, false]
--delimiter <DELIMITER>
Set the CSV file's column delimiter as a byte character
--escape <ESCAPE>
Specify an escape character
--quote <QUOTE>
Specify a custom quote character
--comment <COMMENT>
Specify a comment character.
Lines starting with this character will be ignored
--null-regex <NULL_REGEX>
Provide a regex to match null values
-c, --compression <COMPRESSION>
Set the compression [possible values: uncompressed, snappy, gzip, lzo, brotli, lz4, zstd, lz4-raw]
Set the compression
[possible values: uncompressed, snappy, gzip, lzo, brotli, lz4, zstd, lz4-raw]
-e, --encoding <ENCODING>
Sets encoding for any column [possible values: plain, plain-dictionary, rle, rle-dictionary, delta-binary-packed, delta-length-byte-array, delta-byte-array, byte-stream-split]
Sets encoding for any column
[possible values: plain, plain-dictionary, rle, rle-dictionary, delta-binary-packed, delta-length-byte-array, delta-byte-array, byte-stream-split]
--data-page-size-limit <DATA_PAGE_SIZE_LIMIT>
Sets data page size limit
--dictionary-page-size-limit <DICTIONARY_PAGE_SIZE_LIMIT>
Sets dictionary page size limit
--write-batch-size <WRITE_BATCH_SIZE>
Sets write batch size
--max-row-group-size <MAX_ROW_GROUP_SIZE>
Sets max size for a row group
--created-by <CREATED_BY>
Sets "created by" property
--dictionary
--dictionary <DICTIONARY>
Sets flag to enable/disable dictionary encoding for any column
[possible values: true, false]
--statistics <STATISTICS>
Sets flag to enable/disable statistics for any column [possible values: none, chunk, page]
Sets flag to enable/disable statistics for any column
[possible values: none, chunk, page]
--max-statistics-size <MAX_STATISTICS_SIZE>
Sets max statistics size for any column. Applicable only if statistics are enabled
-p, --print-schema
Print the schema to stderr
-n, --dry
Only print the schema
-h, --help
Print help
Print help (see a summary with '-h')
-V, --version
Print version
```
Expand Down
Loading

0 comments on commit 56e4b6d

Please sign in to comment.