Skip to content

Commit

Permalink
Persist arrow ext metadata in table operations (#99)
Browse files Browse the repository at this point in the history
  • Loading branch information
kylebarron authored Aug 1, 2024
1 parent b819f0c commit 30be5ca
Show file tree
Hide file tree
Showing 7 changed files with 70 additions and 10 deletions.
8 changes: 8 additions & 0 deletions arro3-core/python/arro3/core/_core.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ class Array:
"""

@property
def field(self) -> Field:
    """The field stored on this Array.

    The field usually carries no name, but it may carry metadata marking this
    array as an extension (user-defined type) array.
    """
@property
def nbytes(self) -> int: ...
def slice(self, offset: int = 0, length: int | None = None) -> Array:
"""Compute zero-copy slice of this array.
Expand Down
16 changes: 15 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 7 additions & 1 deletion pyo3-arrow/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ use crate::ffi::to_python::nanoarrow::to_nanoarrow_array;
use crate::input::AnyArray;
use crate::interop::numpy::from_numpy::from_numpy;
use crate::interop::numpy::to_numpy::to_numpy;
use crate::PyDataType;
use crate::{PyDataType, PyField};

/// A Python-facing Arrow array.
///
Expand Down Expand Up @@ -282,6 +282,12 @@ impl PyArray {
Ok(PyArray::new(new_array, new_field.into()).to_arro3(py)?)
}

/// Python getter exposed as `field`.
///
/// Clones the field stored on this array, wraps it in a [`PyField`] and
/// exports it through `to_arro3`.
#[getter]
#[pyo3(name = "field")]
fn py_field(&self, py: Python) -> PyResult<PyObject> {
    let field = PyField::new(self.field.clone());
    field.to_arro3(py)
}

#[getter]
fn nbytes(&self) -> usize {
self.array.get_array_memory_size()
Expand Down
14 changes: 8 additions & 6 deletions pyo3-arrow/src/chunked.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ use crate::ffi::to_python::nanoarrow::to_nanoarrow_array_stream;
use crate::ffi::to_python::to_stream_pycapsule;
use crate::input::AnyArray;
use crate::interop::numpy::to_numpy::chunked_to_numpy;
use crate::{PyArray, PyDataType};
use crate::{PyArray, PyDataType, PyField};

/// A Python-facing Arrow chunked array.
///
Expand Down Expand Up @@ -217,25 +217,27 @@ impl Display for PyChunkedArray {
#[pymethods]
impl PyChunkedArray {
#[new]
pub fn init(arrays: Vec<PyArray>, r#type: Option<PyDataType>) -> PyResult<Self> {
pub fn init(arrays: Vec<PyArray>, r#type: Option<PyField>) -> PyResult<Self> {
let (chunks, fields): (Vec<_>, Vec<_>) =
arrays.into_iter().map(|arr| arr.into_inner()).unzip();
if !fields
.windows(2)
.all(|w| w[0].data_type() == w[1].data_type())
.all(|w| w[0].data_type().equals_datatype(w[1].data_type()))
{
return Err(PyTypeError::new_err(
"Cannot create a ChunkedArray with differing data types.",
));
}

let data_type = r#type
let field = r#type
.map(|py_data_type| py_data_type.into_inner())
.unwrap_or_else(|| fields[0].data_type().clone());
.unwrap_or_else(|| fields[0].clone());

Ok(PyChunkedArray::new(
chunks,
Field::new("", data_type, true).into(),
Field::new("", field.data_type().clone(), true)
.with_metadata(field.metadata().clone())
.into(),
))
}

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ pytest = "^8.3.2"
pyarrow = "^17.0.0"
ipykernel = "^6.29.5"
maturin = "^1.7.0"
geoarrow-types = "^0.2.0"

[tool.poetry.group.docs.dependencies]
# We use ruff format ourselves, but mkdocstrings requires black to be installed
Expand Down
12 changes: 11 additions & 1 deletion tests/core/test_array.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
from arro3.core import Array, DataType
import pyarrow as pa
from arro3.core import Array, DataType, Table


def test_from_numpy():
Expand All @@ -8,3 +9,12 @@ def test_from_numpy():

arr = np.array([1, 2, 3, 4], dtype=np.float64)
assert Array.from_numpy(arr).type == DataType.float64()


def test_extension_array_meta_persists():
    """Field metadata set in pyarrow is still present on the arro3 chunk's field."""
    arr = pa.array([1, 2, 3])
    input_metadata = {"hello": "world"}
    schema = pa.schema([pa.field("arr", type=arr.type, metadata=input_metadata)])
    pa_table = pa.Table.from_arrays([arr], schema=schema)

    first_chunk = Table.from_arrow(pa_table)[0].chunks[0]
    assert first_chunk.field.metadata_str == input_metadata
21 changes: 20 additions & 1 deletion tests/core/test_table.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import geoarrow.types as gt
import numpy as np
import pyarrow as pa
from arro3.core import Table
from arro3.core import ChunkedArray, Table


def test_table_getitem():
Expand All @@ -25,3 +27,20 @@ def test_table_from_pydict():
arro3_table = Table.from_pydict(mapping)
pa_table = pa.Table.from_pydict(mapping)
assert pa.table(arro3_table) == pa_table


def test_table_append_array_extension_type():
    """
    Appending an extension-typed array as a column keeps its extension metadata
    on the resulting table's schema.
    """
    # Build a geoarrow point array from interleaved xy coordinates.
    point_type = gt.point(dimensions="xy", coord_type="interleaved").to_pyarrow()
    xy_values = np.array([1, 2, 3, 4], dtype=np.float64)
    points = pa.FixedSizeListArray.from_arrays(xy_values, 2).cast(point_type)

    base = Table.from_arrays([pa.array(["a", "b"])], names=["a"])
    with_geometry = base.append_column("geometry", ChunkedArray([points]))

    schema_meta = with_geometry.schema["geometry"].metadata
    extension_key = b"ARROW:extension:name"
    assert extension_key in schema_meta.keys()
    assert schema_meta[extension_key] == b"geoarrow.point"

0 comments on commit 30be5ca

Please sign in to comment.