-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support COPY TO Externally Defined File Formats, add FileType trait #11060
Changes from 12 commits
a67d73b
85e64ba
05d329b
027131b
54f5f8f
0274eec
4319881
487da13
9527f82
891afc3
128bf2b
e6d2fa3
b065cd7
485f754
da8e635
6dec6ef
013ef91
132af14
493e82f
bcc3119
77bcc54
11071ac
826721f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,7 +24,7 @@ use std::str::FromStr; | |
|
||
use crate::error::_config_err; | ||
use crate::parsers::CompressionTypeVariant; | ||
use crate::{DataFusionError, FileType, Result}; | ||
use crate::{DataFusionError, Result}; | ||
|
||
/// A macro that wraps a configuration struct and automatically derives | ||
/// [`Default`] and [`ConfigField`] for it, allowing it to be used | ||
|
@@ -1116,6 +1116,16 @@ macro_rules! extensions_options { | |
} | ||
} | ||
|
||
/// These file types have special built in behavior for configuration. | ||
/// Use TableOptions::Extensions for configuring other file types. | ||
#[derive(Debug, Clone)] | ||
pub enum ConfigFileType { | ||
CSV, | ||
#[cfg(feature = "parquet")] | ||
PARQUET, | ||
JSON, | ||
} | ||
|
||
/// Represents the configuration options available for handling different table formats within a data processing application. | ||
/// This struct encompasses options for various file formats including CSV, Parquet, and JSON, allowing for flexible configuration | ||
/// of parsing and writing behaviors specific to each format. Additionally, it supports extending functionality through custom extensions. | ||
|
@@ -1134,7 +1144,7 @@ pub struct TableOptions { | |
|
||
/// The current file format that the table operations should assume. This option allows | ||
/// for dynamic switching between the supported file types (e.g., CSV, Parquet, JSON). | ||
pub current_format: Option<FileType>, | ||
pub current_format: Option<ConfigFileType>, | ||
|
||
/// Optional extensions that can be used to extend or customize the behavior of the table | ||
/// options. Extensions can be registered using `Extensions::insert` and might include | ||
|
@@ -1152,10 +1162,9 @@ impl ConfigField for TableOptions { | |
if let Some(file_type) = &self.current_format { | ||
match file_type { | ||
#[cfg(feature = "parquet")] | ||
FileType::PARQUET => self.parquet.visit(v, "format", ""), | ||
FileType::CSV => self.csv.visit(v, "format", ""), | ||
FileType::JSON => self.json.visit(v, "format", ""), | ||
_ => {} | ||
ConfigFileType::PARQUET => self.parquet.visit(v, "format", ""), | ||
ConfigFileType::CSV => self.csv.visit(v, "format", ""), | ||
ConfigFileType::JSON => self.json.visit(v, "format", ""), | ||
} | ||
} else { | ||
self.csv.visit(v, "csv", ""); | ||
|
@@ -1188,12 +1197,9 @@ impl ConfigField for TableOptions { | |
match key { | ||
"format" => match format { | ||
#[cfg(feature = "parquet")] | ||
FileType::PARQUET => self.parquet.set(rem, value), | ||
FileType::CSV => self.csv.set(rem, value), | ||
FileType::JSON => self.json.set(rem, value), | ||
_ => { | ||
_config_err!("Config value \"{key}\" is not supported on {}", format) | ||
} | ||
ConfigFileType::PARQUET => self.parquet.set(rem, value), | ||
ConfigFileType::CSV => self.csv.set(rem, value), | ||
ConfigFileType::JSON => self.json.set(rem, value), | ||
}, | ||
_ => _config_err!("Config value \"{key}\" not found on TableOptions"), | ||
} | ||
|
@@ -1210,15 +1216,6 @@ impl TableOptions { | |
Self::default() | ||
} | ||
|
||
/// Sets the file format for the table. | ||
/// | ||
/// # Parameters | ||
/// | ||
/// * `format`: The file format to use (e.g., CSV, Parquet). | ||
pub fn set_file_format(&mut self, format: FileType) { | ||
self.current_format = Some(format); | ||
} | ||
|
||
/// Creates a new `TableOptions` instance initialized with settings from a given session config. | ||
/// | ||
/// # Parameters | ||
|
@@ -1249,6 +1246,15 @@ impl TableOptions { | |
clone | ||
} | ||
|
||
/// Sets the file format for the table. | ||
/// | ||
/// # Parameters | ||
/// | ||
/// * `format`: The file format to use (e.g., CSV, Parquet). | ||
pub fn set_file_format(&mut self, format: ConfigFileType) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the parameter is called "format" but the struct is Maybe we should call this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I changed the method name for now. It might make sense for |
||
self.current_format = Some(format); | ||
} | ||
|
||
/// Sets the extensions for this `TableOptions` instance. | ||
/// | ||
/// # Parameters | ||
|
@@ -1658,6 +1664,8 @@ config_namespace! { | |
} | ||
} | ||
|
||
pub trait FormatOptionsExt: Display {} | ||
|
||
#[derive(Debug, Clone, PartialEq)] | ||
#[allow(clippy::large_enum_variant)] | ||
pub enum FormatOptions { | ||
|
@@ -1682,28 +1690,15 @@ impl Display for FormatOptions { | |
} | ||
} | ||
|
||
impl From<FileType> for FormatOptions { | ||
fn from(value: FileType) -> Self { | ||
match value { | ||
FileType::ARROW => FormatOptions::ARROW, | ||
FileType::AVRO => FormatOptions::AVRO, | ||
#[cfg(feature = "parquet")] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it is certainly nice to avoid many of these |
||
FileType::PARQUET => FormatOptions::PARQUET(TableParquetOptions::default()), | ||
FileType::CSV => FormatOptions::CSV(CsvOptions::default()), | ||
FileType::JSON => FormatOptions::JSON(JsonOptions::default()), | ||
} | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use std::any::Any; | ||
use std::collections::HashMap; | ||
|
||
use crate::config::{ | ||
ConfigEntry, ConfigExtension, ExtensionOptions, Extensions, TableOptions, | ||
ConfigEntry, ConfigExtension, ConfigFileType, ExtensionOptions, Extensions, | ||
TableOptions, | ||
}; | ||
use crate::FileType; | ||
|
||
#[derive(Default, Debug, Clone)] | ||
pub struct TestExtensionConfig { | ||
|
@@ -1761,7 +1756,7 @@ mod tests { | |
let mut extension = Extensions::new(); | ||
extension.insert(TestExtensionConfig::default()); | ||
let mut table_config = TableOptions::new().with_extensions(extension); | ||
table_config.set_file_format(FileType::CSV); | ||
table_config.set_file_format(ConfigFileType::CSV); | ||
table_config.set("format.delimiter", ";").unwrap(); | ||
assert_eq!(table_config.csv.delimiter, b';'); | ||
table_config.set("test.bootstrap.servers", "asd").unwrap(); | ||
|
@@ -1778,7 +1773,7 @@ mod tests { | |
#[test] | ||
fn csv_u8_table_options() { | ||
let mut table_config = TableOptions::new(); | ||
table_config.set_file_format(FileType::CSV); | ||
table_config.set_file_format(ConfigFileType::CSV); | ||
table_config.set("format.delimiter", ";").unwrap(); | ||
assert_eq!(table_config.csv.delimiter as char, ';'); | ||
table_config.set("format.escape", "\"").unwrap(); | ||
|
@@ -1791,7 +1786,7 @@ mod tests { | |
#[test] | ||
fn parquet_table_options() { | ||
let mut table_config = TableOptions::new(); | ||
table_config.set_file_format(FileType::PARQUET); | ||
table_config.set_file_format(ConfigFileType::PARQUET); | ||
table_config | ||
.set("format.bloom_filter_enabled::col1", "true") | ||
.unwrap(); | ||
|
@@ -1805,7 +1800,7 @@ mod tests { | |
#[test] | ||
fn parquet_table_options_config_entry() { | ||
let mut table_config = TableOptions::new(); | ||
table_config.set_file_format(FileType::PARQUET); | ||
table_config.set_file_format(ConfigFileType::PARQUET); | ||
table_config | ||
.set("format.bloom_filter_enabled::col1", "true") | ||
.unwrap(); | ||
|
@@ -1819,7 +1814,7 @@ mod tests { | |
#[test] | ||
fn parquet_table_options_config_metadata_entry() { | ||
let mut table_config = TableOptions::new(); | ||
table_config.set_file_format(FileType::PARQUET); | ||
table_config.set_file_format(ConfigFileType::PARQUET); | ||
table_config.set("format.metadata::key1", "").unwrap(); | ||
table_config.set("format.metadata::key2", "value2").unwrap(); | ||
table_config | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the last vestige of the old
FileType
enum. It can be completely ignored if using a custom format.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🤔 perhaps we should add some Trait to unify the handling of options for built in formats and custom formats 🤔
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could potentially remove the
TableOptions
code and instead have eachFileFormatFactory
handle configuration. This is actually mostly the case in this PR already, butTableOptions
is a common helper.