From 339743404f3dd2696a9e9dfa91429cee34679b69 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Wed, 4 Sep 2024 22:41:25 -0400 Subject: [PATCH] Chunked WKT and WKB parsing (#761) --- .../python/geoarrow/rust/core/_rust.pyi | 48 +++++++++- .../python/geoarrow/rust/core/enums.py | 14 +++ .../python/geoarrow/rust/core/types.py | 3 + python/geoarrow-core/src/coord_type.rs | 29 ++++++ python/geoarrow-core/src/interop/wkb.rs | 44 +++++++-- python/geoarrow-core/src/interop/wkt.rs | 95 ++++++++++++++----- python/geoarrow-core/src/lib.rs | 1 + python/tests/interop/test_wkb.py | 12 +++ python/tests/interop/test_wkt.py | 27 ++++++ src/io/wkb/api.rs | 12 +++ 10 files changed, 250 insertions(+), 35 deletions(-) create mode 100644 python/geoarrow-core/src/coord_type.rs create mode 100644 python/tests/interop/test_wkb.py create mode 100644 python/tests/interop/test_wkt.py diff --git a/python/geoarrow-core/python/geoarrow/rust/core/_rust.pyi b/python/geoarrow-core/python/geoarrow/rust/core/_rust.pyi index 881763dc4..81c312a89 100644 --- a/python/geoarrow-core/python/geoarrow/rust/core/_rust.pyi +++ b/python/geoarrow-core/python/geoarrow/rust/core/_rust.pyi @@ -29,6 +29,7 @@ except ImportError: from .enums import ( AreaMethod, + CoordType, LengthMethod, RotateOrigin, SimplifyMethod, @@ -37,6 +38,7 @@ from .types import ( AffineTransform, AreaMethodT, BroadcastGeometry, + CoordTypeT, GeoInterfaceProtocol, LengthMethodT, NumpyArrayProtocolf64, @@ -945,7 +947,23 @@ def from_shapely(input, *, crs: Any | None = None) -> GeometryArray: A GeoArrow array """ -def from_wkb(input: ArrowArrayExportable) -> GeometryArray: +@overload +def from_wkb( + input: ArrowArrayExportable, + *, + coord_type: CoordType | CoordTypeT = CoordType.Interleaved, +) -> GeometryArray: ... +@overload +def from_wkb( + input: ArrowStreamExportable, + *, + coord_type: CoordType | CoordTypeT = CoordType.Interleaved, +) -> ChunkedGeometryArray: ... 
+def from_wkb( + input: ArrowArrayExportable | ArrowStreamExportable, + *, + coord_type: CoordType | CoordTypeT = CoordType.Interleaved, +) -> GeometryArray | ChunkedGeometryArray: """ Parse an Arrow BinaryArray from WKB to its GeoArrow-native counterpart. @@ -954,17 +972,39 @@ def from_wkb(input: ArrowArrayExportable) -> GeometryArray: Args: input: An Arrow array of Binary type holding WKB-formatted geometries. + Other args: + coord_type: Specify the coordinate type of the generated GeoArrow data. + Returns: A GeoArrow-native geometry array """ -def from_wkt(input: ArrowArrayExportable) -> GeometryArray: +@overload +def from_wkt( + input: ArrowArrayExportable, + *, + coord_type: CoordType | CoordTypeT = CoordType.Interleaved, +) -> GeometryArray: ... +@overload +def from_wkt( + input: ArrowStreamExportable, + *, + coord_type: CoordType | CoordTypeT = CoordType.Interleaved, +) -> ChunkedGeometryArray: ... +def from_wkt( + input: ArrowArrayExportable | ArrowStreamExportable, + *, + coord_type: CoordType | CoordTypeT = CoordType.Interleaved, +) -> GeometryArray | ChunkedGeometryArray: """ Parse an Arrow StringArray from WKT to its GeoArrow-native counterpart. Args: input: An Arrow array of string type holding WKT-formatted geometries. + Other args: + coord_type: Specify the coordinate type of the generated GeoArrow data. + Returns: A GeoArrow-native geometry array """ @@ -997,6 +1037,10 @@ def to_shapely( numpy array with Shapely objects """ +@overload +def to_wkb(input: ArrowArrayExportable) -> GeometryArray: ... +@overload +def to_wkb(input: ArrowStreamExportable) -> ChunkedGeometryArray: ... def to_wkb(input: ArrowArrayExportable) -> GeometryArray: """ Encode a GeoArrow-native geometry array to a WKBArray, holding ISO-formatted WKB geometries. 
diff --git a/python/geoarrow-core/python/geoarrow/rust/core/enums.py b/python/geoarrow-core/python/geoarrow/rust/core/enums.py index eb448e013..04cd4f798 100644 --- a/python/geoarrow-core/python/geoarrow/rust/core/enums.py +++ b/python/geoarrow-core/python/geoarrow/rust/core/enums.py @@ -67,6 +67,20 @@ class AreaMethod(StrEnum): """ +class CoordType(StrEnum): + Interleaved = auto() + """Interleaved coordinate layout. + + All coordinates are stored in a single buffer, as `XYXYXY`. + """ + + Separated = auto() + """Separated coordinate layout. + + Coordinates are stored in a separate buffer per dimension, e.g. `XXXX` and `YYYY`. + """ + + class LengthMethod(StrEnum): Ellipsoidal = auto() """Determine the length of a geometry on an ellipsoidal model of the earth. diff --git a/python/geoarrow-core/python/geoarrow/rust/core/types.py b/python/geoarrow-core/python/geoarrow/rust/core/types.py index 3038ed019..f46148e83 100644 --- a/python/geoarrow-core/python/geoarrow/rust/core/types.py +++ b/python/geoarrow-core/python/geoarrow/rust/core/types.py @@ -44,6 +44,9 @@ [`signed_area`][geoarrow.rust.core.signed_area]. """ +CoordTypeT = Literal["interleaved", "separated"] +"""Acceptable coord_type strings. 
+""" LengthMethodT = Literal["ellipsoidal", "euclidean", "haversine", "vincenty"] """Acceptable strings to be passed into the `method` parameter for diff --git a/python/geoarrow-core/src/coord_type.rs b/python/geoarrow-core/src/coord_type.rs new file mode 100644 index 000000000..cf0f63fa9 --- /dev/null +++ b/python/geoarrow-core/src/coord_type.rs @@ -0,0 +1,29 @@ +use geoarrow::array::CoordType; +use pyo3::exceptions::PyValueError; +use pyo3::prelude::*; + +#[derive(Debug, Clone, Copy)] +pub enum PyCoordType { + Interleaved, + Separated, +} + +impl<'a> FromPyObject<'a> for PyCoordType { + fn extract(ob: &'a PyAny) -> PyResult { + let s: String = ob.extract()?; + match s.to_lowercase().as_str() { + "interleaved" => Ok(Self::Interleaved), + "separated" => Ok(Self::Separated), + _ => Err(PyValueError::new_err("Unexpected coord type")), + } + } +} + +impl From for CoordType { + fn from(value: PyCoordType) -> Self { + match value { + PyCoordType::Interleaved => Self::Interleaved, + PyCoordType::Separated => Self::Separated, + } + } +} diff --git a/python/geoarrow-core/src/interop/wkb.rs b/python/geoarrow-core/src/interop/wkb.rs index 616517e5c..6e6809811 100644 --- a/python/geoarrow-core/src/interop/wkb.rs +++ b/python/geoarrow-core/src/interop/wkb.rs @@ -1,27 +1,36 @@ use std::sync::Arc; -use geoarrow::array::{AsGeometryArray, CoordType, GeometryArrayDyn}; +use geoarrow::array::{AsChunkedGeometryArray, AsGeometryArray, GeometryArrayDyn}; +use geoarrow::chunked_array::ChunkedGeometryArrayTrait; use geoarrow::datatypes::GeoDataType; use geoarrow::error::GeoArrowError; -use geoarrow::io::wkb::{to_wkb as _to_wkb, FromWKB}; +use geoarrow::io::wkb::{to_wkb as _to_wkb, FromWKB, ToWKB}; use geoarrow::GeometryArrayTrait; use pyo3::prelude::*; use crate::array::*; +use crate::coord_type::PyCoordType; use crate::error::PyGeoArrowResult; use crate::ffi::from_python::AnyGeometryInput; -use crate::ffi::to_python::geometry_array_to_pyobject; +use 
crate::ffi::to_python::{chunked_geometry_array_to_pyobject, geometry_array_to_pyobject}; #[pyfunction] -pub fn from_wkb(py: Python, input: AnyGeometryInput) -> PyGeoArrowResult { +#[pyo3( + signature = (input, *, coord_type = PyCoordType::Interleaved), + text_signature = "(input, *, coord_type = 'interleaved')") +] +pub fn from_wkb( + py: Python, + input: AnyGeometryInput, + coord_type: PyCoordType, +) -> PyGeoArrowResult { + let coord_type = coord_type.into(); match input { AnyGeometryInput::Array(arr) => { let geo_array: Arc = match arr.0.data_type() { - GeoDataType::WKB => { - FromWKB::from_wkb(arr.as_ref().as_wkb(), CoordType::Interleaved)? - } + GeoDataType::WKB => FromWKB::from_wkb(arr.as_ref().as_wkb(), coord_type)?, GeoDataType::LargeWKB => { - FromWKB::from_wkb(arr.as_ref().as_large_wkb(), CoordType::Interleaved)? + FromWKB::from_wkb(arr.as_ref().as_large_wkb(), coord_type)? } other => { return Err(GeoArrowError::IncorrectType( @@ -32,7 +41,19 @@ pub fn from_wkb(py: Python, input: AnyGeometryInput) -> PyGeoArrowResult todo!(), + AnyGeometryInput::Chunked(s) => { + let geo_array: Arc = match s.0.data_type() { + GeoDataType::WKB => FromWKB::from_wkb(s.as_ref().as_wkb(), coord_type)?, + GeoDataType::LargeWKB => FromWKB::from_wkb(s.as_ref().as_large_wkb(), coord_type)?, + other => { + return Err(GeoArrowError::IncorrectType( + format!("Unexpected array type {:?}", other).into(), + ) + .into()) + } + }; + chunked_geometry_array_to_pyobject(py, geo_array) + } } } @@ -43,6 +64,9 @@ pub fn to_wkb(py: Python, input: AnyGeometryInput) -> PyGeoArrowResult _to_wkb::(arr.as_ref()), ))) .into_py(py)), - AnyGeometryInput::Chunked(_) => todo!(), + AnyGeometryInput::Chunked(s) => { + let out = s.0.as_ref().to_wkb::(); + chunked_geometry_array_to_pyobject(py, Arc::new(out)) + } } } diff --git a/python/geoarrow-core/src/interop/wkt.rs b/python/geoarrow-core/src/interop/wkt.rs index b318aa521..779ffbb97 100644 --- a/python/geoarrow-core/src/interop/wkt.rs +++ 
b/python/geoarrow-core/src/interop/wkt.rs @@ -2,35 +2,84 @@ use std::sync::Arc; use arrow::datatypes::DataType; use arrow_array::cast::AsArray; -use geoarrow::array::CoordType; +use geoarrow::array::metadata::ArrayMetadata; +use geoarrow::array::MixedGeometryArray; +use geoarrow::chunked_array::{ChunkedArray, ChunkedMixedGeometryArray}; use geoarrow::io::geozero::FromWKT; -use geoarrow::GeometryArrayTrait; use pyo3::exceptions::PyTypeError; use pyo3::prelude::*; -use pyo3_arrow::PyArray; +use pyo3_arrow::input::AnyArray; +use crate::coord_type::PyCoordType; use crate::error::PyGeoArrowResult; -use crate::ffi::to_python::geometry_array_to_pyobject; +use crate::ffi::to_python::{chunked_geometry_array_to_pyobject, geometry_array_to_pyobject}; #[pyfunction] -pub fn from_wkt(py: Python, input: PyArray) -> PyGeoArrowResult { - let (array, _field) = input.into_inner(); - let geo_array: Arc = match array.data_type() { - DataType::Utf8 => FromWKT::from_wkt( - array.as_string::(), - CoordType::Interleaved, - Default::default(), - false, - )?, - DataType::LargeUtf8 => FromWKT::from_wkt( - array.as_string::(), - CoordType::Interleaved, - Default::default(), - false, - )?, - other => { - return Err(PyTypeError::new_err(format!("Unexpected array type {:?}", other)).into()) +#[pyo3( + signature = (input, *, coord_type = PyCoordType::Interleaved), + text_signature = "(input, *, coord_type = 'interleaved')") +] +pub fn from_wkt( + py: Python, + input: AnyArray, + coord_type: PyCoordType, +) -> PyGeoArrowResult { + let coord_type = coord_type.into(); + match input { + AnyArray::Array(arr) => { + let (array, field) = arr.into_inner(); + let metadata = Arc::new(ArrayMetadata::try_from(field.as_ref())?); + let geo_array: MixedGeometryArray = match array.data_type() { + DataType::Utf8 => { + FromWKT::from_wkt(array.as_string::(), coord_type, metadata, false)? + } + DataType::LargeUtf8 => { + FromWKT::from_wkt(array.as_string::(), coord_type, metadata, false)? 
+ } + other => { + return Err( + PyTypeError::new_err(format!("Unexpected array type {:?}", other)).into(), + ) + } + }; + geometry_array_to_pyobject(py, Arc::new(geo_array)) } - }; - geometry_array_to_pyobject(py, geo_array) + AnyArray::Stream(s) => { + let chunked_arr = s.into_chunked_array()?; + let (chunks, field) = chunked_arr.into_inner(); + let metadata = Arc::new(ArrayMetadata::try_from(field.as_ref())?); + let geo_array: ChunkedMixedGeometryArray = match field.data_type() { + DataType::Utf8 => { + let string_chunks = chunks + .iter() + .map(|chunk| chunk.as_string::().clone()) + .collect::>(); + FromWKT::from_wkt( + &ChunkedArray::new(string_chunks), + coord_type, + metadata, + false, + )? + } + DataType::LargeUtf8 => { + let string_chunks = chunks + .iter() + .map(|chunk| chunk.as_string::().clone()) + .collect::>(); + FromWKT::from_wkt( + &ChunkedArray::new(string_chunks), + coord_type, + metadata, + false, + )? + } + other => { + return Err( + PyTypeError::new_err(format!("Unexpected array type {:?}", other)).into(), + ) + } + }; + chunked_geometry_array_to_pyobject(py, Arc::new(geo_array)) + } + } } diff --git a/python/geoarrow-core/src/lib.rs b/python/geoarrow-core/src/lib.rs index 3a7c5d112..78e9dcc30 100644 --- a/python/geoarrow-core/src/lib.rs +++ b/python/geoarrow-core/src/lib.rs @@ -3,6 +3,7 @@ pub mod algorithm; pub mod array; pub mod broadcasting; pub mod chunked_array; +mod coord_type; pub(crate) mod crs; pub mod error; pub mod ffi; diff --git a/python/tests/interop/test_wkb.py b/python/tests/interop/test_wkb.py new file mode 100644 index 000000000..f4808be99 --- /dev/null +++ b/python/tests/interop/test_wkb.py @@ -0,0 +1,12 @@ +import pyarrow as pa +import shapely +from geoarrow.rust.core import from_shapely, from_wkb, to_shapely, to_wkb +from shapely.testing import assert_geometries_equal + + +def test_wkb_round_trip(): + geoms = shapely.points([0, 1, 2, 3], [4, 5, 6, 7]) + geo_arr = from_shapely(geoms) + wkb_arr = to_wkb(geo_arr) + assert 
pa.array(shapely.to_wkb(geoms, flavor="iso")) == pa.array(wkb_arr) + assert_geometries_equal(geoms, to_shapely(from_wkb(wkb_arr))) diff --git a/python/tests/interop/test_wkt.py b/python/tests/interop/test_wkt.py new file mode 100644 index 000000000..ec2477524 --- /dev/null +++ b/python/tests/interop/test_wkt.py @@ -0,0 +1,27 @@ +import pyarrow as pa +import shapely +from geoarrow.rust.core import from_wkt, to_shapely +from shapely.testing import assert_geometries_equal + + +def test_from_wkt(): + s = [ + "POINT (3 2)", + "POINT (0 2)", + "POINT (1 4)", + "POINT (3 2)", + "POINT (0 2)", + "POINT (1 4)", + ] + shapely_arr = shapely.from_wkt(s) + geo_arr = from_wkt(pa.array(s)) + assert_geometries_equal(shapely_arr, to_shapely(geo_arr)) + + +def test_from_wkt_chunked(): + s1 = ["POINT (3 2)", "POINT (0 2)", "POINT (1 4)"] + s2 = ["POINT (3 2)", "POINT (0 2)", "POINT (1 4)"] + ca = pa.chunked_array([pa.array(s1), pa.array(s2)]) + shapely_arr = shapely.from_wkt(s1 + s2) + geo_arr = from_wkt(ca) + assert_geometries_equal(shapely_arr, to_shapely(geo_arr)) diff --git a/src/io/wkb/api.rs b/src/io/wkb/api.rs index a189c86fa..5ad79b5b2 100644 --- a/src/io/wkb/api.rs +++ b/src/io/wkb/api.rs @@ -136,6 +136,18 @@ impl_chunked!(ChunkedMultiPolygonArray); impl_chunked!(ChunkedMixedGeometryArray); impl_chunked!(ChunkedGeometryCollectionArray); +impl FromWKB for Arc { + type Input = ChunkedWKBArray; + + fn from_wkb( + arr: &ChunkedWKBArray, + coord_type: CoordType, + ) -> Result { + let geom_arr = ChunkedGeometryCollectionArray::::from_wkb(arr, coord_type)?; + Ok(geom_arr.downcast(true)) + } +} + /// Parse an ISO [WKBArray] to a GeometryArray with GeoArrow native encoding. /// /// Does not downcast automatically