Skip to content

Commit

Permalink
Merge branch 'main' into sequence-numbers
Browse files — browse the repository at this point in the history
Branch information:
SergeiPatiakin committed Dec 5, 2024
2 parents d371c70 + 69f6c58 commit e123048
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 33 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ iceberg = { git = "https://github.com/splitgraph/iceberg-rust", rev = "e7008f399
log = "0.4"
native-tls = "0.2.11"
object_store = { version = "0.11", features = ["aws"] }
opendal = { version = "0.50" }
parquet = { version = "53" }
postgres = { version = "0.19.7", git = "https://github.com/splitgraph/rust-postgres", rev = "88c2c7714a4558aed6a63e2e2b140a8359568858" }
postgres-native-tls = { version = "0.5.0", git = "https://github.com/splitgraph/rust-postgres", rev = "88c2c7714a4558aed6a63e2e2b140a8359568858" }
Expand Down
2 changes: 2 additions & 0 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,6 @@ pub enum DataLoadingError {
ObjectStoreError(#[from] object_store::Error),
#[error("join error")]
JoinError(#[from] tokio::task::JoinError),
#[error("optimistic concurrency error")]
OptimisticConcurrencyError(),
}
35 changes: 23 additions & 12 deletions src/iceberg_destination.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use core::str;
use std::collections::HashMap;
use std::error::Error;
use std::sync::Arc;
use std::time::{SystemTime, UNIX_EPOCH};

Expand Down Expand Up @@ -119,24 +120,24 @@ pub async fn record_batches_to_iceberg(
"Table exists. Pass the overwrite flag to lakehouse-loader to overwrite data",
)));
}
let x = version_hint_input.read().await?;
let y: String = String::from_utf8(x.to_vec()).map_err(|_| {
DataLoadingError::IcebergError(iceberg::Error::new(
iceberg::ErrorKind::DataInvalid,
"Could not parse UTF-8 in version-hint.text",
))
})?;
let z = y.trim().parse::<u64>().map_err(|_| {
let version_hint_bytes = version_hint_input.read().await?;
let version_hint_string: String =
String::from_utf8(version_hint_bytes.to_vec()).map_err(|_| {
DataLoadingError::IcebergError(iceberg::Error::new(
iceberg::ErrorKind::DataInvalid,
"Could not parse UTF-8 in version-hint.text",
))
})?;
let version_hint_u64 = version_hint_string.trim().parse::<u64>().map_err(|_| {
DataLoadingError::IcebergError(iceberg::Error::new(
iceberg::ErrorKind::DataInvalid,
"Could not parse integer version in version-hint.text",
))
})?;
Some(z)
Some(version_hint_u64)
} else {
None
};

let (previous_metadata, previous_metadata_location) = match old_version_hint {
Some(version_hint) => {
let old_metadata_location =
Expand Down Expand Up @@ -275,10 +276,20 @@ pub async fn record_batches_to_iceberg(
target_url, new_version_hint
);

file_io
if let Err(iceberg_error) = file_io
.new_output(&new_metadata_location)?
.write_exclusive(serde_json::to_vec(&new_metadata).unwrap().into())
.await?;
.await
{
if let Some(iceberg_error_source) = iceberg_error.source() {
if let Some(opendal_error) = iceberg_error_source.downcast_ref::<opendal::Error>() {
if opendal_error.kind() == opendal::ErrorKind::ConditionNotMatch {
return Err(DataLoadingError::OptimisticConcurrencyError());
}
}
}
return Err(iceberg_error.into());
};
info!("Wrote new metadata: {:?}", new_metadata_location);

file_io
Expand Down
79 changes: 58 additions & 21 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ enum Commands {
},
}

const OPTIMISTIC_CONCURRENCY_RETRIES: u32 = 3;

pub async fn do_main(args: Cli) -> Result<(), DataLoadingError> {
match args.command {
Commands::ParquetToDelta {
Expand Down Expand Up @@ -117,20 +119,35 @@ pub async fn do_main(args: Cli) -> Result<(), DataLoadingError> {
target_url,
overwrite,
} => {
let file = tokio::fs::File::open(source_file).await?;
let record_batch_reader = ParquetRecordBatchStreamBuilder::new(file)
.await?
.build()
.unwrap();
let schema = record_batch_reader.schema().clone();
info!("File schema: {}", schema);
record_batches_to_iceberg(
record_batch_reader.map_err(DataLoadingError::ParquetError),
schema,
target_url,
overwrite,
)
.await
for _ in 0..OPTIMISTIC_CONCURRENCY_RETRIES {
let file = tokio::fs::File::open(&source_file).await?;
let record_batch_reader = ParquetRecordBatchStreamBuilder::new(file)
.await?
.build()
.unwrap();
let arrow_schema = record_batch_reader.schema().clone();
info!("File schema: {}", arrow_schema);
match record_batches_to_iceberg(
record_batch_reader.map_err(DataLoadingError::ParquetError),
arrow_schema,
target_url.clone(),
overwrite,
)
.await
{
Err(DataLoadingError::OptimisticConcurrencyError()) => {
info!("Optimistic concurrency error. Retrying");
continue;
}
Err(e) => {
return Err(e);
}
Ok(_) => {
break;
}
}
}
Ok(())
}
Commands::PgToIceberg {
connection_string,
Expand All @@ -139,14 +156,34 @@ pub async fn do_main(args: Cli) -> Result<(), DataLoadingError> {
overwrite,
batch_size,
} => {
let mut source = PgArrowSource::new(connection_string.as_ref(), &query, batch_size)
.await
.map_err(DataLoadingError::PostgresError)?;
let arrow_schema = source.get_arrow_schema();
let record_batch_stream = source.get_record_batch_stream();
info!("Rowset schema: {}", arrow_schema);
record_batches_to_iceberg(record_batch_stream, arrow_schema, target_url, overwrite)
for _ in 0..OPTIMISTIC_CONCURRENCY_RETRIES {
let mut source = PgArrowSource::new(connection_string.as_ref(), &query, batch_size)
.await
.map_err(DataLoadingError::PostgresError)?;
let arrow_schema = source.get_arrow_schema();
let record_batch_stream = source.get_record_batch_stream();
info!("Rowset schema: {}", arrow_schema);
match record_batches_to_iceberg(
record_batch_stream,
arrow_schema,
target_url.clone(),
overwrite,
)
.await
{
Err(DataLoadingError::OptimisticConcurrencyError()) => {
info!("Optimistic concurrency error. Retrying");
continue;
}
Err(e) => {
return Err(e);
}
Ok(_) => {
break;
}
}
}
Ok(())
}
}
// TODO
Expand Down

0 comments on commit e123048

Please sign in to comment.