feat: customize CSV format #115

Merged · 2 commits · Dec 15, 2024
4 changes: 3 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions crates/csv2arrow/Cargo.toml
@@ -16,6 +16,7 @@ arrow-schema = { version = "53.0.0", features = ["serde"] }
 serde_json = "1.0.133"
 clap = { version = "4.5.21", features = ["derive"] }
 arrow-tools = { version = "0.19.0", path = "../arrow-tools" }
+regex = "1.5.4"
 
 [dev-dependencies]
 assert_cmd = "2.0.14"
39 changes: 33 additions & 6 deletions crates/csv2arrow/Readme.md
@@ -36,24 +36,51 @@ cargo binstall csv2arrow
 Usage: csv2arrow [OPTIONS] <CSV> [ARROW]
 
 Arguments:
-  <CSV>    Input CSV file, stdin if not present
-  [ARROW]  Output file, stdout if not present
+  <CSV>
+          Input CSV file, stdin if not present
+
+  [ARROW]
+          Output file, stdout if not present
 
 Options:
   -s, --schema-file <SCHEMA_FILE>
           File with Arrow schema in JSON format
+
   -m, --max-read-records <MAX_READ_RECORDS>
           The number of records to infer the schema from. All rows if not present. Setting max-read-records to zero will stop schema inference and all columns will be string typed
+
       --header <HEADER>
-          Set whether the CSV file has headers [possible values: true, false]
-  -d, --delimiter <DELIMITER>
-          Set the CSV file's column delimiter as a byte character [default: ,]
+          Set whether the CSV file has headers
+
+          [default: true]
+          [possible values: true, false]
+
+      --delimiter <DELIMITER>
+          Set the CSV file's column delimiter as a byte character
+
+      --escape <ESCAPE>
+          Specify an escape character
+
+      --quote <QUOTE>
+          Specify a custom quote character
+
+      --comment <COMMENT>
+          Specify a comment character.
+
+          Lines starting with this character will be ignored
+
+      --null-regex <NULL_REGEX>
+          Provide a regex to match null values
+
   -p, --print-schema
           Print the schema to stderr
+
   -n, --dry
           Only print the schema
+
   -h, --help
-          Print help
+          Print help (see a summary with '-h')
+
   -V, --version
           Print version
 ```
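To make the new flags concrete, here is a hypothetical invocation (the file names, delimiter, comment character, and regex are invented for illustration): parse a semicolon-delimited file, skip `#` comment lines, treat empty fields or the literal `NULL` as null, and print the inferred schema.

```sh
# Hypothetical example; data.csv and its layout are invented.
csv2arrow data.csv data.arrow \
  --delimiter ';' \
  --comment '#' \
  --null-regex '^(NULL)?$' \
  --print-schema
```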
75 changes: 56 additions & 19 deletions crates/csv2arrow/src/main.rs
@@ -1,6 +1,7 @@
 use arrow::{csv::reader::Format, csv::ReaderBuilder, error::ArrowError, ipc::writer::FileWriter};
 use arrow_tools::seekable_reader::{SeekRead, SeekableReader};
 use clap::{Parser, ValueHint};
+use regex::Regex;
 use std::io::stdout;
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -25,13 +26,31 @@ struct Opts {
     #[clap(short, long)]
     max_read_records: Option<usize>,
 
-    /// Set whether the CSV file has headers
-    #[clap(long)]
+    /// Set whether the CSV file has headers.
+    #[clap(long, default_value = "true")]
     header: Option<bool>,
 
     /// Set the CSV file's column delimiter as a byte character.
-    #[clap(short, long, default_value = ",")]
-    delimiter: char,
+    #[clap(long)]
+    delimiter: Option<char>,
 
+    /// Specify an escape character.
+    #[clap(long)]
+    escape: Option<char>,
+
+    /// Specify a custom quote character.
+    #[clap(long)]
+    quote: Option<char>,
+
+    /// Specify a comment character.
+    ///
+    /// Lines starting with this character will be ignored
+    #[clap(long)]
+    comment: Option<char>,
+
+    /// Provide a regex to match null values.
+    #[clap(long)]
+    null_regex: Option<Regex>,
+
     /// Print the schema to stderr.
     #[clap(short, long)]
@@ -56,6 +75,32 @@ fn main() -> Result<(), ArrowError> {
         ))
     };
 
+    let mut format = Format::default();
+
+    if let Some(header) = opts.header {
+        format = format.with_header(header);
+    }
+
+    if let Some(delimiter) = opts.delimiter {
+        format = format.with_delimiter(delimiter as u8);
+    }
+
+    if let Some(escape) = opts.escape {
+        format = format.with_escape(escape as u8);
+    }
+
+    if let Some(quote) = opts.quote {
+        format = format.with_quote(quote as u8);
+    }
+
+    if let Some(comment) = opts.comment {
+        format = format.with_comment(comment as u8);
+    }
+
+    if let Some(regex) = opts.null_regex {
+        format = format.with_null_regex(regex);
+    }
+
     let schema = match opts.schema_file {
         Some(schema_def_file_path) => {
             let schema_file = match File::open(&schema_def_file_path) {
@@ -76,18 +121,12 @@
                 ))),
             }
         }
-        _ => {
-            let format = Format::default()
-                .with_delimiter(opts.delimiter as u8)
-                .with_header(opts.header.unwrap_or(true));
-
-            match format.infer_schema(&mut input, opts.max_read_records) {
-                Ok((schema, _size)) => Ok(schema),
-                Err(error) => Err(ArrowError::SchemaError(format!(
-                    "Error inferring schema: {error}"
-                ))),
-            }
-        }
+        _ => match format.infer_schema(&mut input, opts.max_read_records) {
+            Ok((schema, _size)) => Ok(schema),
+            Err(error) => Err(ArrowError::SchemaError(format!(
+                "Error inferring schema: {error}"
+            ))),
+        },
     }?;
 
     if opts.print_schema || opts.dry {
@@ -100,9 +139,7 @@
     }
 
     let schema_ref = Arc::new(schema);
-    let builder = ReaderBuilder::new(schema_ref)
-        .with_header(opts.header.unwrap_or(true))
-        .with_delimiter(opts.delimiter as u8);
+    let builder = ReaderBuilder::new(schema_ref).with_format(format);
 
     input.rewind()?;
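The substance of the refactor is that a single `Format` value now drives both schema inference and the `ReaderBuilder` (via `with_format`), so the reader's parsing options can no longer drift out of sync with the options used to infer the schema. Below is a minimal standalone sketch of that same pattern, using the arrow-csv and regex APIs the diff touches; the in-memory CSV, delimiter, comment character, and regex are invented for illustration.

```rust
use std::io::Cursor;
use std::sync::Arc;

use arrow::csv::reader::Format;
use arrow::csv::ReaderBuilder;
use regex::Regex;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Invented sample: ';'-delimited, '#' comment lines, "NULL" for missing values.
    let data = "id;name;score\n1;alice;3.5\n# a comment line\n2;NULL;4.0\n";

    // One Format value configures both inference and reading, as in the PR.
    let format = Format::default()
        .with_header(true)
        .with_delimiter(b';')
        .with_comment(b'#')
        .with_null_regex(Regex::new("^NULL$")?);

    // Infer the schema with exactly the options the reader will use.
    let (schema, _records_read) = format.infer_schema(Cursor::new(data), None)?;

    let mut reader = ReaderBuilder::new(Arc::new(schema))
        .with_format(format)
        .build(Cursor::new(data))?;

    while let Some(batch) = reader.next().transpose()? {
        println!("{} rows, {} columns", batch.num_rows(), batch.num_columns());
    }
    Ok(())
}
```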
1 change: 1 addition & 0 deletions crates/csv2parquet/Cargo.toml
@@ -17,6 +17,7 @@ arrow-schema = { version = "53.0.0", features = ["serde"] }
 serde_json = "1.0.133"
 clap = { version = "4.5.21", features = ["derive"] }
 arrow-tools = { version = "0.19.0", path = "../arrow-tools" }
+regex = "1.5.4"
 
 [dev-dependencies]
 assert_cmd = "2.0.14"
65 changes: 55 additions & 10 deletions crates/csv2parquet/Readme.md
@@ -36,44 +36,89 @@ cargo binstall csv2parquet
 Usage: csv2parquet [OPTIONS] <CSV> <PARQUET>
 
 Arguments:
-  <CSV>      Input CSV file, stdin if not present
-  <PARQUET>  Output file
+  <CSV>
+          Input CSV file, stdin if not present
+
+  <PARQUET>
+          Output file
 
 Options:
   -s, --schema-file <SCHEMA_FILE>
           File with Arrow schema in JSON format
+
       --max-read-records <MAX_READ_RECORDS>
           The number of records to infer the schema from. All rows if not present. Setting max-read-records to zero will stop schema inference and all columns will be string typed
+
       --header <HEADER>
-          Set whether the CSV file has headers [possible values: true, false]
-  -d, --delimiter <DELIMITER>
-          Set the CSV file's column delimiter as a byte character [default: ,]
+          Set whether the CSV file has headers
+
+          [default: true]
+          [possible values: true, false]
+
+      --delimiter <DELIMITER>
+          Set the CSV file's column delimiter as a byte character
+
+      --escape <ESCAPE>
+          Specify an escape character
+
+      --quote <QUOTE>
+          Specify a custom quote character
+
+      --comment <COMMENT>
+          Specify a comment character.
+
+          Lines starting with this character will be ignored
+
+      --null-regex <NULL_REGEX>
+          Provide a regex to match null values
+
   -c, --compression <COMPRESSION>
-          Set the compression [possible values: uncompressed, snappy, gzip, lzo, brotli, lz4, zstd, lz4-raw]
+          Set the compression
+
+          [possible values: uncompressed, snappy, gzip, lzo, brotli, lz4, zstd, lz4-raw]
+
   -e, --encoding <ENCODING>
-          Sets encoding for any column [possible values: plain, plain-dictionary, rle, rle-dictionary, delta-binary-packed, delta-length-byte-array, delta-byte-array, byte-stream-split]
+          Sets encoding for any column
+
+          [possible values: plain, plain-dictionary, rle, rle-dictionary, delta-binary-packed, delta-length-byte-array, delta-byte-array, byte-stream-split]
+
       --data-page-size-limit <DATA_PAGE_SIZE_LIMIT>
           Sets data page size limit
+
       --dictionary-page-size-limit <DICTIONARY_PAGE_SIZE_LIMIT>
           Sets dictionary page size limit
+
       --write-batch-size <WRITE_BATCH_SIZE>
           Sets write batch size
+
       --max-row-group-size <MAX_ROW_GROUP_SIZE>
           Sets max size for a row group
+
       --created-by <CREATED_BY>
           Sets "created by" property
-      --dictionary
+
+      --dictionary <DICTIONARY>
           Sets flag to enable/disable dictionary encoding for any column
+
+          [possible values: true, false]
+
       --statistics <STATISTICS>
-          Sets flag to enable/disable statistics for any column [possible values: none, chunk, page]
+          Sets flag to enable/disable statistics for any column
+
+          [possible values: none, chunk, page]
+
       --max-statistics-size <MAX_STATISTICS_SIZE>
           Sets max statistics size for any column. Applicable only if statistics are enabled
+
   -p, --print-schema
           Print the schema to stderr
+
   -n, --dry
           Only print the schema
+
   -h, --help
-          Print help
+          Print help (see a summary with '-h')
+
   -V, --version
           Print version
 ```
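As with csv2arrow, the new CSV flags compose with the existing Parquet options. A hypothetical invocation in a bash-compatible shell (the file names and data layout are invented): read a tab-separated file that marks nulls as `NA` and write a zstd-compressed Parquet file, printing the inferred schema along the way.

```sh
# Hypothetical example; sensors.tsv and its layout are invented.
csv2parquet sensors.tsv sensors.parquet \
  --delimiter $'\t' \
  --null-regex '^NA$' \
  --compression zstd \
  --print-schema
```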