fix(python): add support for pyarrow 13+ (#1804)
# Description

This builds on top of @wjones127's branch in #1602. In pyarrow v13+, the
ParquetWriter defaults to `use_compliant_nested_type=True` (see the related PR
https://github.com/apache/arrow/pull/35146/files and the docs:
https://arrow.apache.org/docs/python/generated/pyarrow.parquet.ParquetWriter.html).

arrow-rs's parquet crate then fails its schema comparison because it expects
the old non-compliant nested types. For now we can support pyarrow 13+ by
disabling the option, either in the default write options or by updating the
file options provided by a user.
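
As an illustration, here is a minimal pyarrow sketch of the difference (the table and file names are illustrative; the behavior described in the comments assumes pyarrow >= 13):

```python
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"values": pa.array([[1, 2], [3]], type=pa.list_(pa.int64()))})

# With use_compliant_nested_type=True (the pyarrow 13+ default), the inner
# field of a list column is written as "element"; the legacy layout names it
# "item", which is what arrow-rs's schema comparison currently expects.
pq.write_table(table, "legacy.parquet", use_compliant_nested_type=False)
print(pq.read_schema("legacy.parquet"))  # values: list<item: int64>
```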

# Related Issue(s)

- Closes #1744


---------

Co-authored-by: Will Jones <willjones127@gmail.com>
Co-authored-by: R. Tyler Croy <rtyler@brokenco.de>
3 people authored Nov 5, 2023
1 parent 6e9894f commit d98a0c2
Showing 5 changed files with 38 additions and 14 deletions.
1 change: 0 additions & 1 deletion crates/deltalake-core/src/operations/optimize.rs
```diff
@@ -1432,7 +1432,6 @@ pub(super) mod zorder {
         assert_eq!(result.null_count(), 0);
 
         let data: &BinaryArray = as_generic_binary_array(result.as_ref());
-        dbg!(data);
         assert_eq!(data.value_data().len(), 3 * 16 * 3);
         assert!(data.iter().all(|x| x.unwrap().len() == 3 * 16));
     }
```
7 changes: 7 additions & 0 deletions python/deltalake/writer.py
```diff
@@ -342,6 +342,13 @@ def validate_batch(batch: pa.RecordBatch) -> pa.RecordBatch:
         schema, (validate_batch(batch) for batch in batch_iter)
     )
 
+    if file_options is not None:
+        file_options.update(use_compliant_nested_type=False)
+    else:
+        file_options = ds.ParquetFileFormat().make_write_options(
+            use_compliant_nested_type=False
+        )
+
     ds.write_dataset(
         data,
         base_dir="/",
```
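
A hedged usage sketch of the effect of this change (the table path, data, and compression choice are illustrative): even when a caller passes custom `file_options`, the writer forces `use_compliant_nested_type=False` on them via `.update()`, so the written files keep the legacy nested layout the Rust side expects.

```python
import pyarrow as pa
import pyarrow.dataset as ds
from deltalake import write_deltalake

table = pa.table({"values": pa.array([[1, 2], [3]], type=pa.list_(pa.int64()))})

# Caller-supplied options are honored, but the nested-type setting is pinned.
opts = ds.ParquetFileFormat().make_write_options(compression="zstd")
write_deltalake("/tmp/nested_table", table, file_options=opts)
```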
2 changes: 1 addition & 1 deletion python/pyproject.toml
```diff
@@ -17,7 +17,7 @@ classifiers = [
     "Programming Language :: Python :: 3.11"
 ]
 dependencies = [
-    "pyarrow>=8,<13",
+    "pyarrow>=8",
     'typing-extensions;python_version<"3.8"',
 ]
 
```
36 changes: 26 additions & 10 deletions python/src/lib.rs
```diff
@@ -13,6 +13,7 @@ use std::time;
 use std::time::{SystemTime, UNIX_EPOCH};
 
 use arrow::pyarrow::PyArrowType;
+use arrow_schema::DataType;
 use chrono::{DateTime, Duration, FixedOffset, Utc};
 use deltalake::arrow::compute::concat_batches;
 use deltalake::arrow::ffi_stream::ArrowArrayStreamReader;
@@ -947,16 +948,31 @@ fn filestats_to_expression<'py>(
     let mut expressions: Vec<PyResult<&PyAny>> = Vec::new();
 
     let cast_to_type = |column_name: &String, value: PyObject, schema: &ArrowSchema| {
-        let column_type = PyArrowType(
-            schema
-                .field_with_name(column_name)
-                .map_err(|_| {
-                    PyValueError::new_err(format!("Column not found in schema: {column_name}"))
-                })?
-                .data_type()
-                .clone(),
-        )
-        .into_py(py);
+        let column_type = schema
+            .field_with_name(column_name)
+            .map_err(|_| {
+                PyValueError::new_err(format!("Column not found in schema: {column_name}"))
+            })?
+            .data_type()
+            .clone();
+
+        let value = match column_type {
+            // Since PyArrow 13.0.0, casting string -> timestamp fails if it ends with "Z"
+            // and the target type is timezone naive.
+            DataType::Timestamp(_, _) if value.extract::<String>(py).is_ok() => {
+                value.call_method1(py, "rstrip", ("Z",))?
+            }
+            // PyArrow 13.0.0 lost the ability to cast from string to date32, so
+            // we have to implement that manually.
+            DataType::Date32 if value.extract::<String>(py).is_ok() => {
+                let date = Python::import(py, "datetime")?.getattr("date")?;
+                let date = date.call_method1("fromisoformat", (value,))?;
+                date.to_object(py)
+            }
+            _ => value,
+        };
+
+        let column_type = PyArrowType(column_type).into_py(py);
         pa.call_method1("scalar", (value,))?
             .call_method1("cast", (column_type,))
     };
```
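
The two new match arms mirror behavior changes in pyarrow 13 noted in the comments above; a small Python illustration of both (assuming pyarrow >= 13; the literal values are illustrative):

```python
import datetime
import pyarrow as pa

# Casting a "Z"-suffixed string to a timezone-naive timestamp raises on
# pyarrow 13+, so the Rust side strips the trailing "Z" before casting.
pa.scalar("2022-01-01T00:00:00").cast(pa.timestamp("us"))    # ok
# pa.scalar("2022-01-01T00:00:00Z").cast(pa.timestamp("us")) # raises

# String -> date32 casts are no longer supported, so the value is parsed with
# datetime.date.fromisoformat and handed to pyarrow as a Python date instead.
pa.scalar(datetime.date.fromisoformat("2022-01-01")).cast(pa.date32())  # ok
```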
6 changes: 4 additions & 2 deletions python/tests/test_writer.py
```diff
@@ -308,7 +308,9 @@ def test_write_recordbatchreader(
     tmp_path: pathlib.Path, existing_table: DeltaTable, sample_data: pa.Table
 ):
     batches = existing_table.to_pyarrow_dataset().to_batches()
-    reader = RecordBatchReader.from_batches(sample_data.schema, batches)
+    reader = RecordBatchReader.from_batches(
+        existing_table.to_pyarrow_dataset().schema, batches
+    )
 
     write_deltalake(tmp_path, reader, mode="overwrite")
     assert DeltaTable(tmp_path).to_pyarrow_table() == sample_data
@@ -892,7 +894,7 @@ def comp():
             # concurrently, then this will fail.
             assert data.num_rows == sample_data.num_rows
             try:
-                write_deltalake(dt.table_uri, data, mode="overwrite")
+                write_deltalake(dt.table_uri, sample_data, mode="overwrite")
             except Exception as e:
                 exception = e
```
