feat: clean up dependencies and feature flags #1014

Merged · 14 commits · Dec 30, 2022
10 changes: 5 additions & 5 deletions .github/workflows/build.yml
@@ -43,11 +43,11 @@ jobs:
override: true
- uses: Swatinem/rust-cache@v1
- name: build and lint with clippy
- run: cargo clippy --features azure,datafusion-ext,s3,gcs,glue
+ run: cargo clippy --features azure,datafusion,s3,gcs,glue
- name: Spot-check build for rustls features
run: cargo clippy --features s3-rustls
- name: Check docs
- run: cargo doc --features azure,datafusion-ext,s3,gcs,glue
+ run: cargo doc --features azure,datafusion,s3,gcs,glue

test:
strategy:
@@ -68,7 +68,7 @@ jobs:
override: true
- uses: Swatinem/rust-cache@v1
- name: Run tests
- run: cargo test --verbose --features datafusion-ext,azure
+ run: cargo test --verbose --features datafusion,azure

integration_test:
name: Integration Tests
@@ -107,10 +107,10 @@ jobs:

- name: Run tests with default ssl
run: |
- cargo test --features integration_test,azure,s3,gcs,datafusion-ext
+ cargo test --features integration_test,azure,s3,gcs,datafusion
- name: Run tests with rustls
run: |
- cargo test --features integration_test,s3-rustls,datafusion-ext
+ cargo test --features integration_test,s3-rustls,datafusion

parquet2_test:
runs-on: ubuntu-latest
2 changes: 1 addition & 1 deletion python/Cargo.toml
@@ -37,4 +37,4 @@ features = ["extension-module", "abi3", "abi3-py37"]
[dependencies.deltalake]
path = "../rust"
version = "0"
features = ["s3", "azure", "glue", "gcs", "python", "datafusion-ext"]
features = ["s3", "azure", "glue", "gcs", "python", "datafusion"]
3 changes: 2 additions & 1 deletion python/pyproject.toml
@@ -34,7 +34,8 @@ devel = [
"pytest-timeout",
"sphinx<=4.5",
"sphinx-rtd-theme",
"toml"
"toml",
"wheel"
]
pyspark = [
"pyspark",
14 changes: 9 additions & 5 deletions rust/Cargo.toml
@@ -13,7 +13,7 @@ edition = "2021"
arrow = { version = "28", optional = true }
async-trait = "0.1"
bytes = "1"
chrono = "0.4.22"
chrono = { version = "0.4.22", default-features = false, features = ["clock"] }
cfg-if = "1"
errno = "0.2"
futures = "0.3"
@@ -22,7 +22,7 @@ log = "0"
libc = ">=0.2.90, <1"
num-bigint = "0.4"
num-traits = "0.2.15"
- object_store = { version = "0.5.2", features = ["aws_profile"] }
+ object_store = "0.5.2"
once_cell = "1.16.0"
parking_lot = "0.12"
parquet = { version = "28", features = ["async"], optional = true }
@@ -77,14 +77,15 @@ glibc_version = { path = "../glibc_version", version = "0.1" }

[features]
default = ["arrow", "parquet"]
**Member:** Should we also add `aws-profile` to the default feature list? Profile-based auth is a very common practice in production environments.

**Collaborator (author):** Maybe add it as a default of the `s3*` features, since those are the only ones where it is useful?

**Collaborator:** Agreed!

- datafusion-ext = [
-   "datafusion",
+ datafusion = [
+   "dep:datafusion",
"datafusion-expr",
"datafusion-common",
"datafusion-proto",
"arrow",
"parquet",
]
+ datafusion-ext = ["datafusion"]
azure = ["object_store/azure"]
gcs = ["object_store/gcp"]
s3 = [
@@ -94,6 +95,7 @@ s3 = [
"rusoto_dynamodb/native-tls",
"dynamodb_lock/native-tls",
"object_store/aws",
"object_store/aws_profile",
]
s3-rustls = [
"rusoto_core/rustls",
@@ -102,9 +104,11 @@ s3-rustls = [
"rusoto_dynamodb/rustls",
"dynamodb_lock/rustls",
"object_store/aws",
"object_store/aws_profile",
]
glue = ["s3", "rusoto_glue"]
python = ["arrow/pyarrow"]

# used only for integration testing
integration_test = ["fs_extra", "tempdir"]

@@ -114,4 +118,4 @@ harness = false

[[example]]
name = "basic_operations"
required-features = ["datafusion-ext"]
required-features = ["datafusion"]
18 changes: 6 additions & 12 deletions rust/README.md
@@ -1,14 +1,11 @@
- Deltalake
- =========
+ # Deltalake

[![crates.io](https://img.shields.io/crates/v/deltalake.svg?style=flat-square)](https://crates.io/crates/deltalake)
[![api_doc](https://img.shields.io/badge/doc-api-blue)](https://docs.rs/deltalake)

Native Delta Lake implementation in Rust


- Usage
- -----
+ ## Usage

### API

@@ -17,7 +14,6 @@ let table = deltalake::open_table("./tests/data/simple_table").await.unwrap();
println!("{}", table.get_files());
```


### CLI

```bash
@@ -43,20 +39,18 @@ Examples can be run using the `cargo run --example` command. For example:
cargo run --example read_delta_table
```

- Optional cargo package features
- -----------------------
+ ## Optional cargo package features

- `s3` - enable the S3 storage backend to work with Delta Tables in AWS S3.
- `s3-rustls` - enable the S3 storage backend but rely on [rustls](https://github.com/ctz/rustls) rather than OpenSSL (`native-tls`).
- `glue` - enable the Glue data catalog to work with Delta Tables with AWS Glue.
- `azure` - enable the Azure storage backend to work with Delta Tables in Azure Data Lake Storage Gen2 accounts.
- `gcs` - enable the Google storage backend to work with Delta Tables in Google Cloud Storage.
- - `datafusion-ext` - enable the `datafusion::datasource::TableProvider` trait implementation for Delta Tables, allowing them to be queried using [DataFusion](https://github.com/apache/arrow-datafusion).
+ - `datafusion` - enable the `datafusion::datasource::TableProvider` trait implementation for Delta Tables, allowing them to be queried using [DataFusion](https://github.com/apache/arrow-datafusion).
**Member:** 👍
+ - `datafusion-ext` - DEPRECATED: alias for `datafusion` feature
- `parquet2` - use parquet2 for checkpoint deserialization. Since `arrow` and `parquet` features are enabled by default for backwards compatibility, this feature needs to be used with `--no-default-features`.


- Development
- -----------
+ ## Development

To run s3 integration tests from local machine, we use docker-compose to stand
up AWS local stack. To spin up the test environment run `docker-compose up` in
1 change: 1 addition & 0 deletions rust/src/builder/mod.rs
@@ -696,6 +696,7 @@ pub fn get_s3_builder_from_options(
}
builder = builder.with_region(s3_options.region.name());

#[cfg(feature = "aws-profile")]
if let Some(profile) = &s3_options.profile {
builder = builder.with_profile(profile);
}
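For context, the gated block above forwards the profile to `object_store`'s builder. A sketch of the underlying `object_store` 0.5 API that the feature unlocks (the region, bucket, and profile names here are hypothetical):

```rust
use object_store::aws::{AmazonS3, AmazonS3Builder};

// Requires object_store's `aws_profile` feature, which deltalake's `s3` and
// `s3-rustls` features now enable by default.
fn build_store() -> Result<AmazonS3, object_store::Error> {
    AmazonS3Builder::new()
        .with_region("us-east-1")       // hypothetical region
        .with_bucket_name("my-bucket")  // hypothetical bucket
        .with_profile("dev")            // resolves credentials from the named AWS profile
        .build()
}
```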
28 changes: 16 additions & 12 deletions rust/src/lib.rs
@@ -41,8 +41,11 @@
//! or Azure Blob Storage / Azure Data Lake Storage Gen2 (ADLS2). Use `s3-rustls` to use Rust TLS
//! instead of native TLS implementation.
//! - `glue` - enable the Glue data catalog to work with Delta Tables with AWS Glue.
- //! - `datafusion-ext` - enable the `datafusion::datasource::TableProvider` trait implementation
+ //! - `datafusion` - enable the `datafusion::datasource::TableProvider` trait implementation
//! for Delta Tables, allowing them to be queried using [DataFusion](https://github.com/apache/arrow-datafusion).
+ //! - `datafusion-ext` - DEPRECATED: alias for `datafusion` feature.
+ //! - `parquet2` - use parquet2 for checkpoint deserialization. Since `arrow` and `parquet` features
+ //!   are enabled by default for backwards compatibility, this feature needs to be used with `--no-default-features`.
//!
//! # Querying Delta Tables with Datafusion
//!
@@ -64,23 +67,24 @@
//! .await.unwrap();
//! };
//! ```
//!
//! It's important to note that the DataFusion library is evolving quickly, often with breaking api
//! changes, and this may cause compilation issues as a result. If you are having issues with the most
//! recently released `delta-rs` you can set a specific branch or commit in your `Cargo.toml`.
//!
//! ```toml
//! datafusion = { git = "https://github.com/apache/arrow-datafusion.git", rev = "07bc2c754805f536fe1cd873dbe6adfc0a21cbb3" }
//! ```

#![deny(warnings)]
#![deny(missing_docs)]

#[cfg(all(feature = "parquet", feature = "parquet2"))]
compile_error!(
"Feature parquet and parquet2 are mutually exclusive and cannot be enabled together"
"Features parquet and parquet2 are mutually exclusive and cannot be enabled together"
);

+ #[cfg(all(
+     feature = "aws-profile",
+     not(any(feature = "s3", feature = "s3-rustls"))
+ ))]
+ compile_error!("Feature aws-profile must be used together with s3 or s3-rustls feature");

#[cfg(all(feature = "s3", feature = "s3-rustls"))]
**Member:** 👍
compile_error!("Features s3 and s3-rustls are mutually exclusive and cannot be enabled together");

pub mod action;
pub mod builder;
pub mod data_catalog;
@@ -98,7 +102,7 @@ pub mod vacuum;
pub mod checkpoints;
#[cfg(all(feature = "arrow", feature = "parquet"))]
pub mod delta_arrow;
#[cfg(feature = "datafusion-ext")]
#[cfg(feature = "datafusion")]
pub mod delta_datafusion;
#[cfg(all(feature = "arrow", feature = "parquet"))]
pub mod operations;
Expand All @@ -117,7 +121,7 @@ pub use object_store::{path::Path, Error as ObjectStoreError, ObjectMeta, Object
// convenience exports for consumers to avoid aligning crate versions
#[cfg(feature = "arrow")]
pub use arrow;
#[cfg(feature = "datafusion-ext")]
#[cfg(feature = "datafusion")]
pub use datafusion;
#[cfg(all(feature = "arrow", feature = "parquet"))]
pub use operations::DeltaOps;
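With the feature renamed, querying a table through DataFusion is unchanged. A minimal sketch along the lines of the crate docs (built with `--features datafusion`; error handling collapsed to `Box<dyn Error>`):

```rust
use std::sync::Arc;
use datafusion::prelude::SessionContext;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // DeltaTable implements datafusion's TableProvider when the
    // `datafusion` feature is enabled, so it registers directly.
    let table = deltalake::open_table("./tests/data/simple_table").await?;
    let ctx = SessionContext::new();
    ctx.register_table("demo", Arc::new(table))?;

    let df = ctx.sql("SELECT * FROM demo LIMIT 5").await?;
    df.show().await?;
    Ok(())
}
```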
16 changes: 8 additions & 8 deletions rust/src/operations/mod.rs
@@ -14,20 +14,20 @@ use crate::{DeltaResult, DeltaTable, DeltaTableError};
pub mod create;
pub mod transaction;

#[cfg(feature = "datafusion-ext")]
#[cfg(feature = "datafusion")]
use self::{load::LoadBuilder, write::WriteBuilder};
- #[cfg(feature = "datafusion-ext")]
+ #[cfg(feature = "datafusion")]
use arrow::record_batch::RecordBatch;
- #[cfg(feature = "datafusion-ext")]
+ #[cfg(feature = "datafusion")]
pub use datafusion::physical_plan::common::collect as collect_sendable_stream;

- #[cfg(feature = "datafusion-ext")]
+ #[cfg(feature = "datafusion")]
mod load;
- #[cfg(feature = "datafusion-ext")]
+ #[cfg(feature = "datafusion")]
pub mod write;
// TODO the writer module does not actually depend on datafusion,
// eventually we should consolidate with the record batch writer
- #[cfg(feature = "datafusion-ext")]
+ #[cfg(feature = "datafusion")]
mod writer;

/// Maximum supported writer version
@@ -93,14 +93,14 @@ impl DeltaOps {
}

/// Load data from a DeltaTable
- #[cfg(feature = "datafusion-ext")]
+ #[cfg(feature = "datafusion")]
#[must_use]
pub fn load(self) -> LoadBuilder {
LoadBuilder::default().with_object_store(self.0.object_store())
}

/// Write data to Delta table
- #[cfg(feature = "datafusion-ext")]
+ #[cfg(feature = "datafusion")]
#[must_use]
pub fn write(self, batches: impl IntoIterator<Item = RecordBatch>) -> WriteBuilder {
WriteBuilder::default()
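The `load`/`write` builders above compose into a round trip. A hedged sketch of that flow, assuming (as in the crate's examples of this era) that a write against a fresh in-memory store creates the table:

```rust
use arrow::record_batch::RecordBatch;
use deltalake::operations::collect_sendable_stream;
use deltalake::DeltaOps;

async fn roundtrip(batch: RecordBatch) -> Result<(), Box<dyn std::error::Error>> {
    // Writing to an empty in-memory store creates the table.
    let table = DeltaOps::new_in_memory().write(vec![batch]).await?;

    // `load` yields the table plus a stream of record batches;
    // `collect_sendable_stream` is the datafusion re-export shown above.
    let (_table, stream) = DeltaOps(table).load().await?;
    let batches = collect_sendable_stream(stream).await?;
    println!("read {} batches back", batches.len());
    Ok(())
}
```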
3 changes: 3 additions & 0 deletions rust/src/operations/write.rs
@@ -285,6 +285,9 @@ impl std::future::IntoFuture for WriteBuilder {
let schema = batches[0].schema();

if let Ok(meta) = table.get_metadata() {
+ // NOTE the schema generated from the delta schema will have the delta field metadata included,
+ // so we need to compare the field names and datatypes instead.
+ // TODO update comparison logic, once we have column mappings supported.
let curr_schema: ArrowSchemaRef = Arc::new((&meta.schema).try_into()?);

if !schema_eq(curr_schema, schema.clone()) {
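A hypothetical illustration of the name-and-datatype comparison the new comment describes (the PR's actual `schema_eq` may differ): because the delta-to-arrow conversion attaches per-field metadata, plain `Schema` equality would report spurious mismatches.

```rust
use arrow::datatypes::SchemaRef;

// Compare only field names and data types, ignoring the field metadata
// that the delta schema conversion attaches.
fn schema_eq(a: SchemaRef, b: SchemaRef) -> bool {
    a.fields().len() == b.fields().len()
        && a.fields()
            .iter()
            .zip(b.fields().iter())
            .all(|(x, y)| x.name() == y.name() && x.data_type() == y.data_type())
}
```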
4 changes: 2 additions & 2 deletions rust/src/storage/mod.rs
@@ -20,7 +20,7 @@ use std::sync::Arc;
use tokio::io::AsyncWrite;

use crate::get_storage_backend;
- #[cfg(feature = "datafusion-ext")]
+ #[cfg(feature = "datafusion")]
use datafusion::datasource::object_store::ObjectStoreUrl;
use serde::de::Error;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
@@ -143,7 +143,7 @@ impl DeltaObjectStore {
self.config.to_uri(&Path::from(""))
}

- #[cfg(feature = "datafusion-ext")]
+ #[cfg(feature = "datafusion")]
/// generate a unique enough url to identify the store in datafusion.
pub(crate) fn object_store_url(&self) -> ObjectStoreUrl {
// we are certain, that the URL can be parsed, since
2 changes: 1 addition & 1 deletion rust/tests/common/mod.rs
@@ -16,7 +16,7 @@ use tempdir::TempDir;
#[cfg(feature = "azure")]
pub mod adls;
pub mod clock;
- #[cfg(feature = "datafusion-ext")]
+ #[cfg(feature = "datafusion")]
pub mod datafusion;
#[cfg(any(feature = "s3", feature = "s3-rustls"))]
pub mod s3;
2 changes: 1 addition & 1 deletion rust/tests/datafusion_test.rs
@@ -1,4 +1,4 @@
- #![cfg(feature = "datafusion-ext")]
+ #![cfg(feature = "datafusion")]

use std::collections::HashSet;
use std::path::PathBuf;
2 changes: 1 addition & 1 deletion rust/tests/integration_datafusion.rs
@@ -1,4 +1,4 @@
- #![cfg(all(feature = "integration_test", feature = "datafusion-ext"))]
+ #![cfg(all(feature = "integration_test", feature = "datafusion"))]

use arrow::array::Int64Array;
use common::datafusion::context_with_delta_table_factory;