From 15ab557d6a36c17624373f96277c2deef01a43c8 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Mon, 12 Aug 2024 23:31:15 -0400 Subject: [PATCH] Improved rust docs and method visibility --- arro3-core/python/arro3/core/_core.pyi | 5 +- mkdocs.yml | 3 + pyo3-arrow/src/array.rs | 34 ++++--- pyo3-arrow/src/array_reader.rs | 28 +++--- pyo3-arrow/src/chunked.rs | 97 ++++++++++---------- pyo3-arrow/src/datatypes.rs | 18 ++-- pyo3-arrow/src/error.rs | 6 ++ pyo3-arrow/src/ffi/mod.rs | 3 +- pyo3-arrow/src/ffi/to_python/chunked.rs | 4 +- pyo3-arrow/src/ffi/to_python/utils.rs | 3 +- pyo3-arrow/src/field.rs | 42 ++++----- pyo3-arrow/src/input.rs | 26 ++++-- pyo3-arrow/src/lib.rs | 1 + pyo3-arrow/src/record_batch.rs | 57 ++++++------ pyo3-arrow/src/record_batch_reader.rs | 32 +++---- pyo3-arrow/src/schema.rs | 54 +++++------ pyo3-arrow/src/table.rs | 113 +++++++++++++----------- 17 files changed, 289 insertions(+), 237 deletions(-) diff --git a/arro3-core/python/arro3/core/_core.pyi b/arro3-core/python/arro3/core/_core.pyi index e16dee8..66d3f87 100644 --- a/arro3-core/python/arro3/core/_core.pyi +++ b/arro3-core/python/arro3/core/_core.pyi @@ -10,6 +10,7 @@ from .types import ( class Array: """An Arrow Array.""" + def __init__(self, obj: Sequence[Any], /, type: ArrowSchemaExportable) -> None: """Create arro3.core.Array instance from a sequence of Python objects. @@ -109,8 +110,8 @@ class ArrayReader: This dunder method should not be called directly, but enables zero-copy data transfer to other Python libraries that understand Arrow memory. - For example, you can call [`pyarrow.table()`][pyarrow.table] to convert this - ArrayReader to a pyarrow table, without copying memory. + For example, you can call [`pyarrow.chunked_array()`][pyarrow.chunked_array] to + convert this ArrayReader to a pyarrow ChunkedArray, without copying memory. """ def __iter__(self) -> ArrayReader: ... def __next__(self) -> Array: ... diff --git a/mkdocs.yml b/mkdocs.yml index 956c69e..02b99d9 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -93,6 +93,9 @@ plugins: python: paths: [arro3-compute/python, arro3-core/python, arro3-io/python] options: + # We set allow_inspection: false to ensure that all docstrings come + # from the pyi files, not the Rust-facing doc comments. + allow_inspection: false docstring_section_style: list docstring_style: google line_length: 80 diff --git a/pyo3-arrow/src/array.rs b/pyo3-arrow/src/array.rs index 89b1a0f..58714be 100644 --- a/pyo3-arrow/src/array.rs +++ b/pyo3-arrow/src/array.rs @@ -7,7 +7,7 @@ use arrow::datatypes::{ UInt64Type, UInt8Type, }; use arrow_array::{ - make_array, Array, ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, LargeBinaryArray, + Array, ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, LargeBinaryArray, LargeStringArray, PrimitiveArray, StringArray, StringViewArray, }; use arrow_schema::{ArrowError, DataType, Field, FieldRef}; @@ -26,6 +26,9 @@ use crate::interop::numpy::from_numpy::from_numpy; use crate::interop::numpy::to_numpy::to_numpy; use crate::{PyDataType, PyField}; +/// A Python-facing Arrow array. +/// +/// This is a wrapper around an [ArrayRef] and a [FieldRef]. #[pyclass(module = "arro3.core._core", name = "Array", subclass)] pub struct PyArray { array: ArrayRef, @@ -52,11 +55,6 @@ impl PyArray { Ok(Self { array, field }) } - pub fn from_array(array: A) -> Self { - let array = make_array(array.into_data()); - Self::from_array_ref(array) - } - /// Create a new PyArray from an [ArrayRef], inferring its data type automatically. pub fn from_array_ref(array: ArrayRef) -> Self { let field = Field::new("", array.data_type().clone(), true); @@ -132,7 +130,7 @@ impl Display for PyArray { impl PyArray { #[new] #[pyo3(signature = (obj, /, r#type, *))] - pub fn init(py: Python, obj: PyObject, r#type: PyDataType) -> PyResult { + fn init(py: Python, obj: PyObject, r#type: PyDataType) -> PyResult { macro_rules! impl_primitive { ($rust_type:ty, $arrow_type:ty) => {{ let values: Vec<$rust_type> = obj.extract(py)?; @@ -194,7 +192,7 @@ impl PyArray { #[pyo3(signature = (dtype=None, copy=None))] #[allow(unused_variables)] - pub fn __array__( + fn __array__( &self, py: Python, dtype: Option, @@ -204,7 +202,7 @@ impl PyArray { } #[allow(unused_variables)] - pub fn __arrow_c_array__<'py>( + fn __arrow_c_array__<'py>( &'py self, py: Python<'py>, requested_schema: Option>, @@ -212,20 +210,20 @@ impl PyArray { to_array_pycapsules(py, self.field.clone(), &self.array, requested_schema) } - pub fn __eq__(&self, other: &PyArray) -> bool { + fn __eq__(&self, other: &PyArray) -> bool { self.array.as_ref() == other.array.as_ref() && self.field == other.field } - pub fn __len__(&self) -> usize { + fn __len__(&self) -> usize { self.array.len() } - pub fn __repr__(&self) -> String { + fn __repr__(&self) -> String { self.to_string() } #[classmethod] - pub fn from_arrow(_cls: &Bound, input: AnyArray) -> PyArrowResult { + fn from_arrow(_cls: &Bound, input: AnyArray) -> PyArrowResult { match input { AnyArray::Array(array) => Ok(array), AnyArray::Stream(stream) => { @@ -239,7 +237,7 @@ impl PyArray { } #[classmethod] - pub fn from_arrow_pycapsule( + pub(crate) fn from_arrow_pycapsule( _cls: &Bound, schema_capsule: &Bound, array_capsule: &Bound, @@ -249,7 +247,7 @@ impl PyArray { } #[classmethod] - pub fn from_numpy( + fn from_numpy( _cls: &Bound, py: Python, array: Bound<'_, PyAny>, @@ -282,7 +280,7 @@ impl PyArray { } #[pyo3(signature = (offset=0, length=None))] - pub fn slice(&self, py: Python, offset: usize, length: Option) -> PyResult { + fn slice(&self, py: Python, offset: usize, length: Option) -> PyResult { let length = length.unwrap_or_else(|| self.array.len() - offset); let new_array = self.array.slice(offset, length); PyArray::new(new_array, self.field().clone()).to_arro3(py) @@ -293,12 +291,12 @@ impl PyArray { Ok(PyArray::new(new_array, self.field.clone()).to_arro3(py)?) } - pub fn to_numpy(&self, py: Python) -> PyResult { + fn to_numpy(&self, py: Python) -> PyResult { self.__array__(py, None, None) } #[getter] - pub fn r#type(&self, py: Python) -> PyResult { + fn r#type(&self, py: Python) -> PyResult { PyDataType::new(self.field.data_type().clone()).to_arro3(py) } } diff --git a/pyo3-arrow/src/array_reader.rs b/pyo3-arrow/src/array_reader.rs index c47202b..5fd2986 100644 --- a/pyo3-arrow/src/array_reader.rs +++ b/pyo3-arrow/src/array_reader.rs @@ -15,10 +15,14 @@ use crate::ffi::{ArrayIterator, ArrayReader}; use crate::input::AnyArray; use crate::{PyArray, PyChunkedArray, PyField}; +/// A Python-facing Arrow array reader. +/// +/// This is a wrapper around a [ArrayReader]. #[pyclass(module = "arro3.core._core", name = "ArrayReader", subclass)] pub struct PyArrayReader(pub(crate) Option>); impl PyArrayReader { + /// Construct a new [PyArrayReader] from an existing [ArrayReader]. pub fn new(reader: Box) -> Self { Self(Some(reader)) } @@ -45,7 +49,7 @@ impl PyArrayReader { for array in stream { arrays.push(array?); } - Ok(PyChunkedArray::new(arrays, field)) + Ok(PyChunkedArray::try_new(arrays, field)?) } /// Access the [FieldRef] of this ArrayReader. @@ -100,7 +104,7 @@ impl Display for PyArrayReader { #[pymethods] impl PyArrayReader { #[allow(unused_variables)] - pub fn __arrow_c_stream__<'py>( + fn __arrow_c_stream__<'py>( &'py mut self, py: Python<'py>, requested_schema: Option>, @@ -122,23 +126,23 @@ impl PyArrayReader { self.read_next_array(py) } - pub fn __repr__(&self) -> String { + fn __repr__(&self) -> String { self.to_string() } #[getter] - pub fn closed(&self) -> bool { + fn closed(&self) -> bool { self.0.is_none() } #[classmethod] - pub fn from_arrow(_cls: &Bound, input: AnyArray) -> PyArrowResult { + fn from_arrow(_cls: &Bound, input: AnyArray) -> PyArrowResult { let reader = input.into_reader()?; Ok(Self::new(reader)) } #[classmethod] - pub fn from_arrow_pycapsule( + pub(crate) fn from_arrow_pycapsule( _cls: &Bound, capsule: &Bound, ) -> PyResult { @@ -149,7 +153,7 @@ impl PyArrayReader { } #[classmethod] - pub fn from_arrays(_cls: &Bound, field: PyField, arrays: Vec) -> Self { + fn from_arrays(_cls: &Bound, field: PyField, arrays: Vec) -> Self { let arrays = arrays .into_iter() .map(|array| { @@ -164,16 +168,16 @@ impl PyArrayReader { } #[classmethod] - pub fn from_stream(_cls: &Bound, data: &Bound) -> PyResult { + fn from_stream(_cls: &Bound, data: &Bound) -> PyResult { data.extract() } #[getter] - pub fn field(&self, py: Python) -> PyResult { + fn field(&self, py: Python) -> PyResult { PyField::new(self.field_ref()?).to_arro3(py) } - pub fn read_all(&mut self, py: Python) -> PyArrowResult { + fn read_all(&mut self, py: Python) -> PyArrowResult { let stream = self .0 .take() @@ -183,10 +187,10 @@ impl PyArrayReader { for array in stream { arrays.push(array?); } - Ok(PyChunkedArray::new(arrays, field).to_arro3(py)?) + Ok(PyChunkedArray::try_new(arrays, field)?.to_arro3(py)?) } - pub fn read_next_array(&mut self, py: Python) -> PyArrowResult { + fn read_next_array(&mut self, py: Python) -> PyArrowResult { let stream = self .0 .as_mut() diff --git a/pyo3-arrow/src/chunked.rs b/pyo3-arrow/src/chunked.rs index 328f578..e81d9b2 100644 --- a/pyo3-arrow/src/chunked.rs +++ b/pyo3-arrow/src/chunked.rs @@ -2,7 +2,7 @@ use std::fmt::Display; use std::sync::Arc; use arrow::compute::concat; -use arrow_array::{make_array, Array, ArrayRef}; +use arrow_array::{Array, ArrayRef}; use arrow_schema::{ArrowError, DataType, Field, FieldRef}; use pyo3::exceptions::{PyTypeError, PyValueError}; use pyo3::intern; @@ -19,6 +19,9 @@ use crate::input::AnyArray; use crate::interop::numpy::to_numpy::chunked_to_numpy; use crate::{PyArray, PyDataType, PyField}; +/// A Python-facing Arrow chunked array. +/// +/// This is a wrapper around a [FieldRef] and a `Vec` of [ArrayRef]. #[pyclass(module = "arro3.core._core", name = "ChunkedArray", subclass)] pub struct PyChunkedArray { chunks: Vec, @@ -26,28 +29,23 @@ pub struct PyChunkedArray { } impl PyChunkedArray { - pub fn new(chunks: Vec, field: FieldRef) -> Self { - assert!( - chunks - .iter() - .all(|chunk| chunk.data_type().equals_datatype(field.data_type())), - "All chunks must have same data type" - ); - Self { chunks, field } + /// Construct a new [PyChunkedArray] from existing chunks and a field. + pub fn try_new(chunks: Vec, field: FieldRef) -> PyResult { + if !chunks + .iter() + .all(|chunk| chunk.data_type().equals_datatype(field.data_type())) + { + return Err(PyTypeError::new_err("All chunks must have same data type")); + } + + Ok(Self { chunks, field }) } + /// Access the [DataType] of this ChunkedArray pub fn data_type(&self) -> &DataType { self.field.data_type() } - pub fn from_arrays(chunks: &[A]) -> PyArrowResult { - let arrays = chunks - .iter() - .map(|chunk| make_array(chunk.to_data())) - .collect::>(); - Self::from_array_refs(arrays) - } - /// Create a new PyChunkedArray from a vec of [ArrayRef]s, inferring their data type /// automatically. pub fn from_array_refs(chunks: Vec) -> PyArrowResult { @@ -66,30 +64,34 @@ impl PyChunkedArray { } let field = Field::new("", chunks.first().unwrap().data_type().clone(), true); - Ok(Self::new(chunks, Arc::new(field))) + Ok(Self::try_new(chunks, Arc::new(field))?) } + /// Access the underlying chunks. pub fn chunks(&self) -> &[ArrayRef] { &self.chunks } + /// Access the underlying field. pub fn field(&self) -> &FieldRef { &self.field } + /// Consume this and return its inner parts. pub fn into_inner(self) -> (Vec, FieldRef) { (self.chunks, self.field) } - pub fn is_empty(&self) -> bool { + #[allow(dead_code)] + pub(crate) fn is_empty(&self) -> bool { self.len() == 0 } - pub fn len(&self) -> usize { + pub(crate) fn len(&self) -> usize { self.chunks.iter().fold(0, |acc, arr| acc + arr.len()) } - pub fn rechunk(&self, chunk_lengths: Vec) -> PyArrowResult { + pub(crate) fn rechunk(&self, chunk_lengths: Vec) -> PyArrowResult { let total_chunk_length = chunk_lengths.iter().sum::(); if total_chunk_length != self.length() { return Err(PyValueError::new_err( @@ -104,7 +106,7 @@ impl PyChunkedArray { .zip(self.chunks()) .all(|(length, arr)| *length == arr.len()); if matches_existing_chunking { - return Ok(Self::new(self.chunks.clone(), self.field.clone())); + return Ok(Self::try_new(self.chunks.clone(), self.field.clone())?); } let mut offset = 0; @@ -119,10 +121,14 @@ impl PyChunkedArray { }) .collect::>>()?; - Ok(PyChunkedArray::new(chunks, self.field.clone())) + Ok(PyChunkedArray::try_new(chunks, self.field.clone())?) } - pub fn slice(&self, mut offset: usize, mut length: usize) -> PyArrowResult> { + pub(crate) fn slice( + &self, + mut offset: usize, + mut length: usize, + ) -> PyArrowResult> { if offset + length > self.length() { return Err( PyValueError::new_err("offset + length may not exceed length of array").into(), @@ -214,7 +220,7 @@ impl Display for PyChunkedArray { #[pymethods] impl PyChunkedArray { #[new] - pub fn init(arrays: Vec, r#type: Option) -> PyResult { + fn init(arrays: Vec, r#type: Option) -> PyArrowResult { let (chunks, fields): (Vec<_>, Vec<_>) = arrays.into_iter().map(|arr| arr.into_inner()).unzip(); if !fields @@ -223,24 +229,25 @@ impl PyChunkedArray { { return Err(PyTypeError::new_err( "Cannot create a ChunkedArray with differing data types.", - )); + ) + .into()); } let field = r#type .map(|py_data_type| py_data_type.into_inner()) .unwrap_or_else(|| fields[0].clone()); - Ok(PyChunkedArray::new( + Ok(PyChunkedArray::try_new( chunks, Field::new("", field.data_type().clone(), true) .with_metadata(field.metadata().clone()) .into(), - )) + )?) } #[pyo3(signature = (dtype=None, copy=None))] #[allow(unused_variables)] - pub fn __array__( + fn __array__( &self, py: Python, dtype: Option, @@ -255,7 +262,7 @@ impl PyChunkedArray { } #[allow(unused_variables)] - pub fn __arrow_c_stream__<'py>( + fn __arrow_c_stream__<'py>( &'py self, py: Python<'py>, requested_schema: Option>, @@ -275,17 +282,17 @@ impl PyChunkedArray { self.chunks.iter().fold(0, |acc, x| acc + x.len()) } - pub fn __repr__(&self) -> String { + fn __repr__(&self) -> String { self.to_string() } #[classmethod] - pub fn from_arrow(_cls: &Bound, input: AnyArray) -> PyArrowResult { + fn from_arrow(_cls: &Bound, input: AnyArray) -> PyArrowResult { input.into_chunked_array() } #[classmethod] - pub fn from_arrow_pycapsule( + pub(crate) fn from_arrow_pycapsule( _cls: &Bound, capsule: &Bound, ) -> PyResult { @@ -302,7 +309,7 @@ impl PyChunkedArray { chunks.push(array); } - Ok(PyChunkedArray::new(chunks, field)) + PyChunkedArray::try_new(chunks, field) } fn cast(&self, py: Python, target_type: PyDataType) -> PyArrowResult { @@ -313,10 +320,10 @@ impl PyChunkedArray { .map(|chunk| arrow::compute::cast(&chunk, &target_type)) .collect::, ArrowError>>()?; let new_field = self.field.as_ref().clone().with_data_type(target_type); - Ok(PyChunkedArray::new(new_chunks, new_field.into()).to_arro3(py)?) + Ok(PyChunkedArray::try_new(new_chunks, new_field.into())?.to_arro3(py)?) } - pub fn chunk(&self, py: Python, i: usize) -> PyResult { + fn chunk(&self, py: Python, i: usize) -> PyResult { let field = self.field().clone(); let array = self .chunks @@ -328,7 +335,7 @@ impl PyChunkedArray { #[getter] #[pyo3(name = "chunks")] - pub fn chunks_py(&self, py: Python) -> PyResult> { + fn chunks_py(&self, py: Python) -> PyResult> { let field = self.field().clone(); self.chunks .iter() @@ -336,13 +343,13 @@ impl PyChunkedArray { .collect() } - pub fn combine_chunks(&self, py: Python) -> PyArrowResult { + fn combine_chunks(&self, py: Python) -> PyArrowResult { let field = self.field().clone(); let arrays: Vec<&dyn Array> = self.chunks.iter().map(|arr| arr.as_ref()).collect(); Ok(PyArray::new(concat(&arrays)?, field).to_arro3(py)?) } - pub fn equals(&self, other: PyChunkedArray) -> bool { + fn equals(&self, other: PyChunkedArray) -> bool { self.field == other.field && self.chunks == other.chunks } @@ -358,20 +365,20 @@ impl PyChunkedArray { } #[getter] - pub fn null_count(&self) -> usize { + fn null_count(&self) -> usize { self.chunks .iter() .fold(0, |acc, arr| acc + arr.null_count()) } #[getter] - pub fn num_chunks(&self) -> usize { + fn num_chunks(&self) -> usize { self.chunks.len() } #[pyo3(signature = (offset=0, length=None))] #[pyo3(name = "slice")] - pub fn slice_py( + fn slice_py( &self, py: Python, offset: usize, @@ -379,15 +386,15 @@ impl PyChunkedArray { ) -> PyArrowResult { let length = length.unwrap_or_else(|| self.len() - offset); let sliced_chunks = self.slice(offset, length)?; - Ok(PyChunkedArray::new(sliced_chunks, self.field.clone()).to_arro3(py)?) + Ok(PyChunkedArray::try_new(sliced_chunks, self.field.clone())?.to_arro3(py)?) } - pub fn to_numpy(&self, py: Python) -> PyResult { + fn to_numpy(&self, py: Python) -> PyResult { self.__array__(py, None, None) } #[getter] - pub fn r#type(&self, py: Python) -> PyResult { + fn r#type(&self, py: Python) -> PyResult { PyDataType::new(self.field.data_type().clone()).to_arro3(py) } } diff --git a/pyo3-arrow/src/datatypes.rs b/pyo3-arrow/src/datatypes.rs index 0fb24b0..95cdb00 100644 --- a/pyo3-arrow/src/datatypes.rs +++ b/pyo3-arrow/src/datatypes.rs @@ -29,15 +29,18 @@ impl<'a> FromPyObject<'a> for PyTimeUnit { } } +/// A Python-facing wrapper around [DataType]. #[derive(PartialEq, Eq, Debug)] #[pyclass(module = "arro3.core._core", name = "DataType", subclass)] pub struct PyDataType(DataType); impl PyDataType { + /// Construct a new PyDataType around a [DataType]. pub fn new(data_type: DataType) -> Self { Self(data_type) } + /// Consume this and return its inner part. pub fn into_inner(self) -> DataType { self.0 } @@ -104,28 +107,25 @@ impl Display for PyDataType { #[pymethods] impl PyDataType { - pub fn __arrow_c_schema__<'py>( - &'py self, - py: Python<'py>, - ) -> PyArrowResult> { + fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult> { to_schema_pycapsule(py, &self.0) } - pub fn __eq__(&self, other: PyDataType) -> bool { + fn __eq__(&self, other: PyDataType) -> bool { self.equals(other, false) } - pub fn __repr__(&self) -> String { + fn __repr__(&self) -> String { self.to_string() } #[classmethod] - pub fn from_arrow(_cls: &Bound, input: Self) -> Self { + fn from_arrow(_cls: &Bound, input: Self) -> Self { input } #[classmethod] - pub fn from_arrow_pycapsule( + pub(crate) fn from_arrow_pycapsule( _cls: &Bound, capsule: &Bound, ) -> PyResult { @@ -136,7 +136,7 @@ impl PyDataType { } #[getter] - pub fn bit_width(&self) -> Option { + fn bit_width(&self) -> Option { self.0.primitive_width() } diff --git a/pyo3-arrow/src/error.rs b/pyo3-arrow/src/error.rs index 959feb8..3b58b61 100644 --- a/pyo3-arrow/src/error.rs +++ b/pyo3-arrow/src/error.rs @@ -1,9 +1,14 @@ +//! Contains the [`PyArrowError`], the Error returned by most fallible functions in this crate. + use pyo3::exceptions::{PyException, PyTypeError, PyValueError}; use pyo3::prelude::*; use pyo3::PyDowncastError; +/// The Error variants returned by this crate. pub enum PyArrowError { + /// A wrapped [arrow::error::ArrowError] ArrowError(arrow::error::ArrowError), + /// A wrapped [PyErr] PyErr(PyErr), } @@ -49,4 +54,5 @@ impl From for PyArrowError { } } +/// A type wrapper around `Result`. pub type PyArrowResult = Result; diff --git a/pyo3-arrow/src/ffi/mod.rs b/pyo3-arrow/src/ffi/mod.rs index d462b78..c8b97e3 100644 --- a/pyo3-arrow/src/ffi/mod.rs +++ b/pyo3-arrow/src/ffi/mod.rs @@ -1,6 +1,7 @@ +//! Utilities for managing Arrow FFI between Python and Rust. + pub(crate) mod from_python; pub(crate) mod to_python; pub use to_python::chunked::{ArrayIterator, ArrayReader}; pub use to_python::{to_array_pycapsules, to_schema_pycapsule, to_stream_pycapsule}; -// pub use diff --git a/pyo3-arrow/src/ffi/to_python/chunked.rs b/pyo3-arrow/src/ffi/to_python/chunked.rs index 27f8ad8..b1e87c4 100644 --- a/pyo3-arrow/src/ffi/to_python/chunked.rs +++ b/pyo3-arrow/src/ffi/to_python/chunked.rs @@ -8,8 +8,8 @@ use arrow_array::ArrayRef; pub trait ArrayReader: Iterator> { /// Returns the field of this `ArrayReader`. /// - /// Implementation of this trait should guarantee that all `RecordBatch`'s returned by this - /// reader should have the same schema as returned from this method. + /// Implementation of this trait should guarantee that all `ArrayRef`'s returned by this + /// reader should have the same field as returned from this method. fn field(&self) -> FieldRef; } diff --git a/pyo3-arrow/src/ffi/to_python/utils.rs b/pyo3-arrow/src/ffi/to_python/utils.rs index 00e4fe3..ac48c01 100644 --- a/pyo3-arrow/src/ffi/to_python/utils.rs +++ b/pyo3-arrow/src/ffi/to_python/utils.rs @@ -43,7 +43,8 @@ pub fn to_array_pycapsules<'py>( Ok(tuple) } -/// Export an [`ArrayIterator`] to a PyCapsule holding an Arrow C Stream pointer. +/// Export an [`ArrayIterator`][crate::ffi::ArrayIterator] to a PyCapsule holding an Arrow C Stream +/// pointer. pub fn to_stream_pycapsule<'py>( py: Python<'py>, array_reader: Box, diff --git a/pyo3-arrow/src/field.rs b/pyo3-arrow/src/field.rs index 4287e0c..efddefb 100644 --- a/pyo3-arrow/src/field.rs +++ b/pyo3-arrow/src/field.rs @@ -15,14 +15,19 @@ use crate::ffi::to_python::to_schema_pycapsule; use crate::input::MetadataInput; use crate::PyDataType; +/// A Python-facing Arrow field. +/// +/// This is a wrapper around a [FieldRef]. #[pyclass(module = "arro3.core._core", name = "Field", subclass)] pub struct PyField(FieldRef); impl PyField { + /// Construct a new PyField around a [FieldRef] pub fn new(field: FieldRef) -> Self { Self(field) } + /// Consume this and return its internal [FieldRef] pub fn into_inner(self) -> FieldRef { self.0 } @@ -90,7 +95,7 @@ impl Display for PyField { impl PyField { #[new] #[pyo3(signature = (name, r#type, nullable=true, *, metadata=None))] - pub fn init( + fn init( name: String, r#type: PyDataType, nullable: bool, @@ -101,28 +106,25 @@ impl PyField { Ok(PyField::new(field.into())) } - pub fn __arrow_c_schema__<'py>( - &'py self, - py: Python<'py>, - ) -> PyArrowResult> { + fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult> { to_schema_pycapsule(py, self.0.as_ref()) } - pub fn __eq__(&self, other: &PyField) -> bool { + fn __eq__(&self, other: &PyField) -> bool { self.0 == other.0 } - pub fn __repr__(&self) -> String { + fn __repr__(&self) -> String { self.to_string() } #[classmethod] - pub fn from_arrow(_cls: &Bound, input: Self) -> Self { + fn from_arrow(_cls: &Bound, input: Self) -> Self { input } #[classmethod] - pub fn from_arrow_pycapsule( + pub(crate) fn from_arrow_pycapsule( _cls: &Bound, capsule: &Bound, ) -> PyResult { @@ -132,14 +134,14 @@ impl PyField { Ok(Self::new(Arc::new(field))) } - pub fn equals(&self, other: PyField) -> bool { + fn equals(&self, other: PyField) -> bool { self.0 == other.0 } // Note: we can't return HashMap, Vec> because that will coerce keys and values to // a list, not bytes #[getter] - pub fn metadata<'py>(&'py self, py: Python<'py>) -> PyResult> { + fn metadata<'py>(&'py self, py: Python<'py>) -> PyResult> { let d = PyDict::new_bound(py); self.0.metadata().iter().try_for_each(|(key, val)| { d.set_item( @@ -151,21 +153,21 @@ impl PyField { } #[getter] - pub fn metadata_str(&self) -> HashMap { + fn metadata_str(&self) -> HashMap { self.0.metadata().clone() } #[getter] - pub fn name(&self) -> String { + fn name(&self) -> String { self.0.name().clone() } #[getter] - pub fn nullable(&self) -> bool { + fn nullable(&self) -> bool { self.0.is_nullable() } - pub fn remove_metadata(&self, py: Python) -> PyResult { + fn remove_metadata(&self, py: Python) -> PyResult { PyField::new( self.0 .as_ref() @@ -177,11 +179,11 @@ impl PyField { } #[getter] - pub fn r#type(&self, py: Python) -> PyResult { + fn r#type(&self, py: Python) -> PyResult { PyDataType::new(self.0.data_type().clone()).to_arro3(py) } - pub fn with_metadata(&self, py: Python, metadata: MetadataInput) -> PyResult { + fn with_metadata(&self, py: Python, metadata: MetadataInput) -> PyResult { PyField::new( self.0 .as_ref() @@ -192,15 +194,15 @@ impl PyField { .to_arro3(py) } - pub fn with_name(&self, py: Python, name: String) -> PyResult { + fn with_name(&self, py: Python, name: String) -> PyResult { PyField::new(self.0.as_ref().clone().with_name(name).into()).to_arro3(py) } - pub fn with_nullable(&self, py: Python, nullable: bool) -> PyResult { + fn with_nullable(&self, py: Python, nullable: bool) -> PyResult { PyField::new(self.0.as_ref().clone().with_nullable(nullable).into()).to_arro3(py) } - pub fn with_type(&self, py: Python, new_type: PyDataType) -> PyResult { + fn with_type(&self, py: Python, new_type: PyDataType) -> PyResult { PyField::new( self.0 .as_ref() diff --git a/pyo3-arrow/src/input.rs b/pyo3-arrow/src/input.rs index 45288f4..047e565 100644 --- a/pyo3-arrow/src/input.rs +++ b/pyo3-arrow/src/input.rs @@ -20,11 +20,14 @@ use crate::{PyArray, PyChunkedArray, PyField, PyRecordBatch, PyRecordBatchReader /// An enum over [PyRecordBatch] and [PyRecordBatchReader], used when a function accepts either /// Arrow object as input. pub enum AnyRecordBatch { + /// A single RecordBatch, held in a [PyRecordBatch]. RecordBatch(PyRecordBatch), + /// A stream of possibly multiple RecordBatches, held in a [PyRecordBatchReader]. Stream(PyRecordBatchReader), } impl AnyRecordBatch { + /// Consume this and convert it into a [RecordBatchReader]. pub fn into_reader(self) -> PyResult> { match self { Self::RecordBatch(batch) => { @@ -36,13 +39,17 @@ impl AnyRecordBatch { } } + /// Consume this and convert it into a [PyTable]. + /// + /// All record batches from the stream will be materialized in memory. pub fn into_table(self) -> PyArrowResult { let reader = self.into_reader()?; let schema = reader.schema(); let batches = reader.collect::>()?; - Ok(PyTable::new(batches, schema)) + Ok(PyTable::try_new(batches, schema)?) } + /// Access the underlying [SchemaRef] of this object. pub fn schema(&self) -> PyResult { match self { Self::RecordBatch(batch) => Ok(batch.as_ref().schema()), @@ -54,18 +61,24 @@ impl AnyRecordBatch { /// An enum over [PyArray] and [PyArrayReader], used when a function accepts either /// Arrow object as input. pub enum AnyArray { + /// A single Array, held in a [PyArray]. Array(PyArray), + /// A stream of possibly multiple Arrays, held in a [PyArrayReader]. Stream(PyArrayReader), } impl AnyArray { + /// Consume this and convert it into a [PyChunkedArray]. + /// + /// All arrays from the stream will be materialized in memory. pub fn into_chunked_array(self) -> PyArrowResult { let reader = self.into_reader()?; let field = reader.field(); let chunks = reader.collect::>()?; - Ok(PyChunkedArray::new(chunks, field)) + Ok(PyChunkedArray::try_new(chunks, field)?) } + /// Consume this and convert it into a [ArrayReader]. pub fn into_reader(self) -> PyResult> { match self { Self::Array(array) => { @@ -76,6 +89,7 @@ impl AnyArray { } } + /// Access the underlying [FieldRef] of this object. pub fn field(&self) -> PyResult { match self { Self::Array(array) => Ok(array.field().clone()), @@ -85,7 +99,7 @@ impl AnyArray { } #[derive(FromPyObject)] -pub enum MetadataInput { +pub(crate) enum MetadataInput { String(HashMap), Bytes(HashMap, Vec>), } @@ -113,7 +127,7 @@ impl Default for MetadataInput { } #[derive(FromPyObject)] -pub enum FieldIndexInput { +pub(crate) enum FieldIndexInput { Name(String), Position(usize), } @@ -128,7 +142,7 @@ impl FieldIndexInput { } #[derive(FromPyObject)] -pub enum NameOrField { +pub(crate) enum NameOrField { Name(String), Field(PyField), } @@ -150,7 +164,7 @@ impl NameOrField { } #[derive(FromPyObject)] -pub enum SelectIndices { +pub(crate) enum SelectIndices { Names(Vec), Positions(Vec), } diff --git a/pyo3-arrow/src/lib.rs b/pyo3-arrow/src/lib.rs index 57f4721..8310f7d 100644 --- a/pyo3-arrow/src/lib.rs +++ b/pyo3-arrow/src/lib.rs @@ -1,4 +1,5 @@ #![doc = include_str!("../README.md")] +#![deny(missing_docs)] mod array; mod array_reader; diff --git a/pyo3-arrow/src/record_batch.rs b/pyo3-arrow/src/record_batch.rs index 635b8a8..c9941dd 100644 --- a/pyo3-arrow/src/record_batch.rs +++ b/pyo3-arrow/src/record_batch.rs @@ -19,15 +19,20 @@ use crate::input::{AnyRecordBatch, FieldIndexInput, MetadataInput, NameOrField, use crate::schema::display_schema; use crate::{PyArray, PyField, PySchema}; +/// A Python-facing Arrow record batch. +/// +/// This is a wrapper around a [RecordBatch]. #[pyclass(module = "arro3.core._core", name = "RecordBatch", subclass)] #[derive(Debug)] pub struct PyRecordBatch(RecordBatch); impl PyRecordBatch { + /// Construct a new PyRecordBatch from a [RecordBatch]. pub fn new(batch: RecordBatch) -> Self { Self(batch) } + /// Consume this, returning its internal [RecordBatch]. pub fn into_inner(self) -> RecordBatch { self.0 } @@ -91,7 +96,7 @@ impl Display for PyRecordBatch { impl PyRecordBatch { #[new] #[pyo3(signature = (data, *, metadata=None))] - pub fn init( + fn init( py: Python, data: &Bound, metadata: Option, @@ -109,7 +114,7 @@ impl PyRecordBatch { } #[allow(unused_variables)] - pub fn __arrow_c_array__<'py>( + fn __arrow_c_array__<'py>( &'py self, py: Python<'py>, requested_schema: Option>, @@ -119,7 +124,7 @@ impl PyRecordBatch { to_array_pycapsules(py, field.into(), &array, requested_schema) } - pub fn __eq__(&self, other: &PyRecordBatch) -> bool { + fn __eq__(&self, other: &PyRecordBatch) -> bool { self.0 == other.0 } @@ -127,13 +132,13 @@ impl PyRecordBatch { self.column(py, key) } - pub fn __repr__(&self) -> String { + fn __repr__(&self) -> String { self.to_string() } #[classmethod] #[pyo3(signature = (arrays, *, schema))] - pub fn from_arrays( + fn from_arrays( _cls: &Bound, arrays: Vec, schema: PySchema, @@ -153,7 +158,7 @@ impl PyRecordBatch { #[classmethod] #[pyo3(signature = (mapping, *, metadata=None))] - pub fn from_pydict( + fn from_pydict( _cls: &Bound, mapping: IndexMap, metadata: Option, @@ -172,7 +177,7 @@ impl PyRecordBatch { } #[classmethod] - pub fn from_struct_array(_cls: &Bound, struct_array: PyArray) -> PyArrowResult { + fn from_struct_array(_cls: &Bound, struct_array: PyArray) -> PyArrowResult { let (array, field) = struct_array.into_inner(); match field.data_type() { DataType::Struct(fields) => { @@ -187,7 +192,7 @@ impl PyRecordBatch { } #[classmethod] - pub fn from_arrow(_cls: &Bound, input: AnyRecordBatch) -> PyArrowResult { + fn from_arrow(_cls: &Bound, input: AnyRecordBatch) -> PyArrowResult { match input { AnyRecordBatch::RecordBatch(rb) => Ok(rb), AnyRecordBatch::Stream(stream) => { @@ -199,7 +204,7 @@ impl PyRecordBatch { } #[classmethod] - pub fn from_arrow_pycapsule( + pub(crate) fn from_arrow_pycapsule( _cls: &Bound, schema_capsule: &Bound, array_capsule: &Bound, @@ -229,7 +234,7 @@ impl PyRecordBatch { } } - pub fn add_column( + fn add_column( &self, py: Python, i: usize, @@ -247,7 +252,7 @@ impl PyRecordBatch { Ok(PyRecordBatch::new(new_rb).to_arro3(py)?) } - pub fn append_column( + fn append_column( &self, py: Python, field: NameOrField, @@ -264,7 +269,7 @@ impl PyRecordBatch { Ok(PyRecordBatch::new(new_rb).to_arro3(py)?) } - pub fn column(&self, py: Python, i: FieldIndexInput) -> PyResult { + fn column(&self, py: Python, i: FieldIndexInput) -> PyResult { let column_index = i.into_position(self.0.schema_ref())?; let field = self.0.schema().field(column_index).clone(); let array = self.0.column(column_index).clone(); @@ -272,7 +277,7 @@ impl PyRecordBatch { } #[getter] - pub fn column_names(&self) -> Vec { + fn column_names(&self) -> Vec { self.0 .schema() .fields() @@ -282,17 +287,17 @@ impl PyRecordBatch { } #[getter] - pub fn columns(&self, py: Python) -> PyResult> { + fn columns(&self, py: Python) -> PyResult> { (0..self.num_columns()) .map(|i| self.column(py, FieldIndexInput::Position(i))) .collect() } - pub fn equals(&self, other: PyRecordBatch) -> bool { + fn equals(&self, other: PyRecordBatch) -> bool { self.0 == other.0 } - pub fn field(&self, py: Python, i: FieldIndexInput) -> PyResult { + fn field(&self, py: Python, i: FieldIndexInput) -> PyResult { let schema_ref = self.0.schema_ref(); let field = schema_ref.field(i.into_position(schema_ref)?); PyField::new(field.clone().into()).to_arro3(py) @@ -304,33 +309,33 @@ impl PyRecordBatch { } #[getter] - pub fn num_columns(&self) -> usize { + fn num_columns(&self) -> usize { self.0.num_columns() } #[getter] - pub fn num_rows(&self) -> usize { + fn num_rows(&self) -> usize { self.0.num_rows() } - pub fn remove_column(&self, py: Python, i: usize) -> PyResult { + fn remove_column(&self, py: Python, i: usize) -> PyResult { let mut rb = self.0.clone(); rb.remove_column(i); PyRecordBatch::new(rb).to_arro3(py) } #[getter] - pub fn schema(&self, py: Python) -> PyResult { + fn schema(&self, py: Python) -> PyResult { PySchema::new(self.0.schema()).to_arro3(py) } - pub fn select(&self, py: Python, columns: SelectIndices) -> PyArrowResult { + fn select(&self, py: Python, columns: SelectIndices) -> PyArrowResult { let positions = columns.into_positions(self.0.schema_ref().fields())?; let new_rb = self.0.project(&positions)?; Ok(PyRecordBatch::new(new_rb).to_arro3(py)?) } - pub fn set_column( + fn set_column( &self, py: Python, i: usize, @@ -349,12 +354,12 @@ impl PyRecordBatch { } #[getter] - pub fn shape(&self) -> (usize, usize) { + fn shape(&self) -> (usize, usize) { (self.num_rows(), self.num_columns()) } #[pyo3(signature = (offset=0, length=None))] - pub fn slice(&self, py: Python, offset: usize, length: Option) -> PyResult { + fn slice(&self, py: Python, offset: usize, length: Option) -> PyResult { let length = length.unwrap_or_else(|| self.num_rows() - offset); PyRecordBatch::new(self.0.slice(offset, length)).to_arro3(py) } @@ -364,14 +369,14 @@ impl PyRecordBatch { Ok(PyRecordBatch::new(new_batch).to_arro3(py)?) } - pub fn to_struct_array(&self, py: Python) -> PyArrowResult { + fn to_struct_array(&self, py: Python) -> PyArrowResult { let struct_array: StructArray = self.0.clone().into(); let field = Field::new_struct("", self.0.schema_ref().fields().clone(), false) .with_metadata(self.0.schema_ref().metadata.clone()); Ok(PyArray::new(Arc::new(struct_array), field.into()).to_arro3(py)?) } - pub fn with_schema(&self, py: Python, schema: PySchema) -> PyArrowResult { + fn with_schema(&self, py: Python, schema: PySchema) -> PyArrowResult { let new_schema = schema.into_inner(); let new_batch = RecordBatch::try_new(new_schema.clone(), self.0.columns().to_vec())?; Ok(PyRecordBatch::new(new_batch).to_arro3(py)?) diff --git a/pyo3-arrow/src/record_batch_reader.rs b/pyo3-arrow/src/record_batch_reader.rs index a79b6f1..a695283 100644 --- a/pyo3-arrow/src/record_batch_reader.rs +++ b/pyo3-arrow/src/record_batch_reader.rs @@ -17,10 +17,14 @@ use crate::input::AnyRecordBatch; use crate::schema::display_schema; use crate::{PyRecordBatch, PySchema, PyTable}; +/// A Python-facing Arrow record batch reader. +/// +/// This is a wrapper around a [RecordBatchReader]. #[pyclass(module = "arro3.core._core", name = "RecordBatchReader", subclass)] pub struct PyRecordBatchReader(pub(crate) Option>); impl PyRecordBatchReader { + /// Construct a new PyRecordBatchReader from an existing [RecordBatchReader]. pub fn new(reader: Box) -> Self { Self(Some(reader)) } @@ -47,7 +51,7 @@ impl PyRecordBatchReader { for batch in stream { batches.push(batch?); } - Ok(PyTable::new(batches, schema)) + Ok(PyTable::try_new(batches, schema)?) } /// Access the [SchemaRef] of this RecordBatchReader. @@ -113,7 +117,7 @@ impl Display for PyRecordBatchReader { #[pymethods] impl PyRecordBatchReader { #[allow(unused_variables)] - pub fn __arrow_c_stream__<'py>( + fn __arrow_c_stream__<'py>( &'py mut self, py: Python<'py>, requested_schema: Option>, @@ -145,18 +149,18 @@ impl PyRecordBatchReader { self.read_next_batch(py) } - pub fn __repr__(&self) -> String { + fn __repr__(&self) -> String { self.to_string() } #[classmethod] - pub fn from_arrow(_cls: &Bound, input: AnyRecordBatch) -> PyArrowResult { + fn from_arrow(_cls: &Bound, input: AnyRecordBatch) -> PyArrowResult { let reader = input.into_reader()?; Ok(Self::new(reader)) } #[classmethod] - pub fn from_arrow_pycapsule( + pub(crate) fn from_arrow_pycapsule( _cls: &Bound, capsule: &Bound, ) -> PyResult { @@ -168,11 +172,7 @@ impl PyRecordBatchReader { } #[classmethod] - pub fn from_batches( - _cls: &Bound, - schema: PySchema, - batches: Vec, - ) -> Self { + fn from_batches(_cls: &Bound, schema: PySchema, batches: Vec) -> Self { let batches = batches .into_iter() .map(|batch| batch.into_inner()) @@ -184,16 +184,16 @@ impl PyRecordBatchReader { } #[classmethod] - pub fn from_stream(_cls: &Bound, data: &Bound) -> PyResult { + fn from_stream(_cls: &Bound, data: &Bound) -> PyResult { data.extract() } #[getter] - pub fn closed(&self) -> bool { + fn closed(&self) -> bool { self.0.is_none() } - pub fn read_all(&mut self, py: Python) -> PyArrowResult { + fn read_all(&mut self, py: Python) -> PyArrowResult { let stream = self .0 .take() @@ -203,10 +203,10 @@ impl PyRecordBatchReader { for batch in stream { batches.push(batch?); } - Ok(PyTable::new(batches, schema).to_arro3(py)?) + Ok(PyTable::try_new(batches, schema)?.to_arro3(py)?) } - pub fn read_next_batch(&mut self, py: Python) -> PyArrowResult { + fn read_next_batch(&mut self, py: Python) -> PyArrowResult { let stream = self .0 .as_mut() @@ -220,7 +220,7 @@ impl PyRecordBatchReader { } #[getter] - pub fn schema(&self, py: Python) -> PyResult { + fn schema(&self, py: Python) -> PyResult { PySchema::new(self.schema_ref()?.clone()).to_arro3(py) } } diff --git a/pyo3-arrow/src/schema.rs b/pyo3-arrow/src/schema.rs index c8f51ed..8bc3041 100644 --- a/pyo3-arrow/src/schema.rs +++ b/pyo3-arrow/src/schema.rs @@ -15,14 +15,19 @@ use crate::ffi::to_python::to_schema_pycapsule; use crate::input::{FieldIndexInput, MetadataInput}; use crate::{PyDataType, PyField, PyTable}; +/// A Python-facing Arrow schema. +/// +/// This is a wrapper around a [SchemaRef]. #[pyclass(module = "arro3.core._core", name = "Schema", subclass)] pub struct PySchema(SchemaRef); impl PySchema { + /// Construct a new PySchema pub fn new(schema: SchemaRef) -> Self { Self(schema) } + /// Consume this and return its internal [SchemaRef] pub fn into_inner(self) -> SchemaRef { self.0 } @@ -101,7 +106,7 @@ pub(crate) fn display_schema(schema: &Schema, f: &mut std::fmt::Formatter<'_>) - impl PySchema { #[new] #[pyo3(signature = (fields, *, metadata=None))] - pub fn init(fields: Vec, metadata: Option) -> PyResult { + fn init(fields: Vec, metadata: Option) -> PyResult { let fields = fields .into_iter() .map(|field| field.into_inner()) @@ -113,14 +118,11 @@ impl PySchema { Ok(schema) } - pub fn __arrow_c_schema__<'py>( - &'py self, - py: Python<'py>, - ) -> PyArrowResult> { + fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult> { to_schema_pycapsule(py, self.0.as_ref()) } - pub fn __eq__(&self, other: &PySchema) -> bool { + fn __eq__(&self, other: &PySchema) -> bool { self.0 == other.0 } @@ -128,21 +130,21 @@ impl PySchema { self.field(py, key) } - pub fn __len__(&self) -> usize { + fn __len__(&self) -> usize { self.0.fields().len() } - pub fn __repr__(&self) -> String { + fn __repr__(&self) -> String { self.to_string() } #[classmethod] - pub fn from_arrow(_cls: &Bound, input: Self) -> Self { + fn from_arrow(_cls: &Bound, input: Self) -> Self { input } #[classmethod] - pub fn from_arrow_pycapsule( + pub(crate) fn from_arrow_pycapsule( _cls: &Bound, capsule: &Bound, ) -> PyResult { @@ -152,28 +154,28 @@ impl PySchema { Ok(Self::new(Arc::new(schema))) } - pub fn append(&self, py: Python, field: PyField) -> PyResult { + fn append(&self, py: Python, field: PyField) -> PyResult { let mut fields = self.0.fields().to_vec(); fields.push(field.into_inner()); let schema = Schema::new_with_metadata(fields, self.0.metadata().clone()); PySchema::new(schema.into()).to_arro3(py) } - pub fn empty_table(&self, py: Python) -> PyResult { - PyTable::new(vec![], self.into()).to_arro3(py) + fn empty_table(&self, py: Python) -> PyResult { + PyTable::try_new(vec![], self.into())?.to_arro3(py) } - pub fn equals(&self, other: PySchema) -> bool { + fn equals(&self, other: PySchema) -> bool { self.0 == other.0 } - pub fn field(&self, py: Python, i: FieldIndexInput) -> PyArrowResult { + fn field(&self, py: Python, i: FieldIndexInput) -> PyArrowResult { let index = i.into_position(&self.0)?; let field = self.0.field(index); Ok(PyField::new(field.clone().into()).to_arro3(py)?) } - pub fn get_all_field_indices(&self, name: String) -> Vec { + fn get_all_field_indices(&self, name: String) -> Vec { let mut indices = self .0 .fields() @@ -186,7 +188,7 @@ impl PySchema { indices } - pub fn get_field_index(&self, name: String) -> PyArrowResult { + fn get_field_index(&self, name: String) -> PyArrowResult { let indices = self .0 .fields() @@ -202,7 +204,7 @@ impl PySchema { } } - pub fn insert(&self, py: Python, i: usize, field: PyField) -> PyResult { + fn insert(&self, py: Python, i: usize, field: PyField) -> PyResult { let mut fields = self.0.fields().to_vec(); fields.insert(i, field.into_inner()); let schema = Schema::new_with_metadata(fields, self.0.metadata().clone()); @@ -212,7 +214,7 @@ impl PySchema { // Note: we can't return HashMap, Vec> because that will coerce keys and values to // a list, not bytes #[getter] - pub fn metadata<'py>(&'py self, py: Python<'py>) -> PyResult> { + fn metadata<'py>(&'py self, py: Python<'py>) -> PyResult> { let d = PyDict::new_bound(py); self.0.metadata().iter().try_for_each(|(key, val)| { d.set_item( @@ -224,23 +226,23 @@ impl PySchema { } #[getter] - pub fn metadata_str(&self) -> HashMap { + fn metadata_str(&self) -> HashMap { self.0.metadata().clone() } #[getter] - pub fn names(&self) -> Vec { + fn names(&self) -> Vec { self.0.fields().iter().map(|f| f.name().clone()).collect() } - pub fn remove(&self, py: Python, i: usize) -> PyResult { + fn remove(&self, py: Python, i: usize) -> PyResult { let mut fields = self.0.fields().to_vec(); fields.remove(i); let schema = Schema::new_with_metadata(fields, self.0.metadata().clone()); PySchema::new(schema.into()).to_arro3(py) } - pub fn remove_metadata(&self, py: Python) -> PyResult { + fn remove_metadata(&self, py: Python) -> PyResult { PySchema::new( self.0 .as_ref() @@ -251,7 +253,7 @@ impl PySchema { .to_arro3(py) } - pub fn set(&self, py: Python, i: usize, field: PyField) -> PyResult { + fn set(&self, py: Python, i: usize, field: PyField) -> PyResult { let mut fields = self.0.fields().to_vec(); fields[i] = field.into_inner(); let schema = Schema::new_with_metadata(fields, self.0.metadata().clone()); @@ -259,7 +261,7 @@ impl PySchema { } #[getter] - pub fn types(&self, py: Python) -> PyArrowResult> { + fn types(&self, py: Python) -> PyArrowResult> { Ok(self .0 .fields() @@ -268,7 +270,7 @@ impl PySchema { .collect::>()?) } - pub fn with_metadata(&self, py: Python, metadata: MetadataInput) -> PyResult { + fn with_metadata(&self, py: Python, metadata: MetadataInput) -> PyResult { let schema = self .0 .as_ref() diff --git a/pyo3-arrow/src/table.rs b/pyo3-arrow/src/table.rs index 8098bbe..456d7ef 100644 --- a/pyo3-arrow/src/table.rs +++ b/pyo3-arrow/src/table.rs @@ -24,6 +24,9 @@ use crate::schema::display_schema; use crate::utils::schema_equals; use crate::{PyChunkedArray, PyField, PyRecordBatch, PyRecordBatchReader, PySchema}; +/// A Python-facing Arrow table. +/// +/// This is a wrapper around a [SchemaRef] and a `Vec` of [RecordBatch]. #[pyclass(module = "arro3.core._core", name = "Table", subclass)] #[derive(Debug)] pub struct PyTable { @@ -32,20 +35,24 @@ pub struct PyTable { } impl PyTable { - pub fn new(batches: Vec, schema: SchemaRef) -> Self { - assert!( - batches - .iter() - .all(|rb| schema_equals(rb.schema_ref(), &schema)), - "All batches must have same schema" - ); - Self { schema, batches } + /// Create a new table from batches and a schema. + pub fn try_new(batches: Vec, schema: SchemaRef) -> PyResult { + if !batches + .iter() + .all(|rb| schema_equals(rb.schema_ref(), &schema)) + { + return Err(PyTypeError::new_err("All batches must have same schema")); + } + + Ok(Self { schema, batches }) } + /// Access the underlying batches pub fn batches(&self) -> &[RecordBatch] { &self.batches } + /// Consume this and return its internal batches and schema. pub fn into_inner(self) -> (Vec, SchemaRef) { (self.batches, self.schema) } @@ -88,7 +95,7 @@ impl Display for PyTable { #[pymethods] impl PyTable { #[allow(unused_variables)] - pub fn __arrow_c_stream__<'py>( + fn __arrow_c_stream__<'py>( &'py self, py: Python<'py>, requested_schema: Option>, @@ -105,7 +112,7 @@ impl PyTable { to_stream_pycapsule(py, array_reader, requested_schema) } - pub fn __eq__(&self, other: &PyTable) -> bool { + fn __eq__(&self, other: &PyTable) -> bool { self.batches == other.batches && self.schema == other.schema } @@ -113,21 +120,21 @@ impl PyTable { self.column(py, key) } - pub fn __len__(&self) -> usize { + fn __len__(&self) -> usize { self.batches.iter().fold(0, |acc, x| acc + x.num_rows()) } - pub fn __repr__(&self) -> String { + fn __repr__(&self) -> String { self.to_string() } #[classmethod] - pub fn from_arrow(_cls: &Bound, input: AnyRecordBatch) -> PyArrowResult { + fn from_arrow(_cls: &Bound, input: AnyRecordBatch) -> PyArrowResult { input.into_table() } #[classmethod] - pub fn from_arrow_pycapsule( + pub(crate) fn from_arrow_pycapsule( _cls: &Bound, capsule: &Bound, ) -> PyResult { @@ -142,12 +149,12 @@ impl PyTable { batches.push(batch); } - Ok(Self::new(batches, schema)) + Self::try_new(batches, schema) } #[classmethod] #[pyo3(signature = (batches, *, schema=None))] - pub fn from_batches( + fn from_batches( _cls: &Bound, batches: Vec, schema: Option, @@ -156,7 +163,7 @@ impl PyTable { let schema = schema.ok_or(PyValueError::new_err( "schema must be passed for an empty list of batches", ))?; - return Ok(Self::new(vec![], schema.into_inner())); + return Ok(Self::try_new(vec![], schema.into_inner())?); } let batches = batches @@ -166,12 +173,12 @@ impl PyTable { let schema = schema .map(|s| s.into_inner()) .unwrap_or(batches.first().unwrap().schema()); - Ok(Self::new(batches, schema)) + Ok(Self::try_new(batches, schema)?) } #[classmethod] #[pyo3(signature = (mapping, *, schema=None, metadata=None))] - pub fn from_pydict( + fn from_pydict( cls: &Bound, mapping: IndexMap, schema: Option, @@ -183,7 +190,7 @@ impl PyTable { #[classmethod] #[pyo3(signature = (arrays, *, names=None, schema=None, metadata=None))] - pub fn from_arrays( + fn from_arrays( _cls: &Bound, arrays: Vec, names: Option>, @@ -214,7 +221,7 @@ impl PyTable { }; if columns.is_empty() { - return Ok(Self::new(vec![], schema)); + return Ok(Self::try_new(vec![], schema)?); } let column_chunk_lengths = columns @@ -247,10 +254,10 @@ impl PyTable { batches.push(batch); } - Ok(Self::new(batches, schema)) + Ok(Self::try_new(batches, schema)?) } - pub fn add_column( + fn add_column( &self, py: Python, i: usize, @@ -289,10 +296,10 @@ impl PyTable { }) .collect::, PyArrowError>>()?; - Ok(PyTable::new(new_batches, new_schema).to_arro3(py)?) + Ok(PyTable::try_new(new_batches, new_schema)?.to_arro3(py)?) } - pub fn append_column( + fn append_column( &self, py: Python, field: NameOrField, @@ -330,15 +337,15 @@ impl PyTable { }) .collect::, PyArrowError>>()?; - Ok(PyTable::new(new_batches, new_schema).to_arro3(py)?) + Ok(PyTable::try_new(new_batches, new_schema)?.to_arro3(py)?) } #[getter] - pub fn chunk_lengths(&self) -> Vec { + fn chunk_lengths(&self) -> Vec { self.batches.iter().map(|batch| batch.num_rows()).collect() } - pub fn column(&self, py: Python, i: FieldIndexInput) -> PyArrowResult { + fn column(&self, py: Python, i: FieldIndexInput) -> PyArrowResult { let column_index = i.into_position(&self.schema)?; let field = self.schema.field(column_index).clone(); let chunks = self @@ -346,11 +353,11 @@ impl PyTable { .iter() .map(|batch| batch.column(column_index).clone()) .collect(); - Ok(PyChunkedArray::new(chunks, field.into()).to_arro3(py)?) + Ok(PyChunkedArray::try_new(chunks, field.into())?.to_arro3(py)?) } #[getter] - pub fn column_names(&self) -> Vec { + fn column_names(&self) -> Vec { self.schema .fields() .iter() @@ -359,18 +366,18 @@ impl PyTable { } #[getter] - pub fn columns(&self, py: Python) -> PyArrowResult> { + fn columns(&self, py: Python) -> PyArrowResult> { (0..self.num_columns()) .map(|i| self.column(py, FieldIndexInput::Position(i))) .collect() } - pub fn combine_chunks(&self, py: Python) -> PyArrowResult { + fn combine_chunks(&self, py: Python) -> PyArrowResult { let batch = concat_batches(&self.schema, &self.batches)?; - Ok(PyTable::new(vec![batch], self.schema.clone()).to_arro3(py)?) + Ok(PyTable::try_new(vec![batch], self.schema.clone())?.to_arro3(py)?) } - pub fn field(&self, py: Python, i: FieldIndexInput) -> PyArrowResult { + fn field(&self, py: Python, i: FieldIndexInput) -> PyArrowResult { let field = self.schema.field(i.into_position(&self.schema)?); Ok(PyField::new(field.clone().into()).to_arro3(py)?) } @@ -383,20 +390,20 @@ impl PyTable { } #[getter] - pub fn num_columns(&self) -> usize { + fn num_columns(&self) -> usize { self.schema.fields().len() } #[getter] - pub fn num_rows(&self) -> usize { + fn num_rows(&self) -> usize { self.batches() .iter() .fold(0, |acc, batch| acc + batch.num_rows()) } - // pub fn rechunk(&self, py: Python, max_chunksize: usize) {} + // fn rechunk(&self, py: Python, max_chunksize: usize) {} - pub fn remove_column(&self, py: Python, i: usize) -> PyArrowResult { + fn remove_column(&self, py: Python, i: usize) -> PyArrowResult { let mut fields = self.schema.fields().to_vec(); fields.remove(i); let new_schema = Arc::new(Schema::new_with_metadata( @@ -414,10 +421,10 @@ impl PyTable { }) .collect::, PyArrowError>>()?; - Ok(PyTable::new(new_batches, new_schema).to_arro3(py)?) + Ok(PyTable::try_new(new_batches, new_schema)?.to_arro3(py)?) } - pub fn rename_columns(&self, py: Python, names: Vec) -> PyArrowResult { + fn rename_columns(&self, py: Python, names: Vec) -> PyArrowResult { if names.len() != self.num_columns() { return Err(PyValueError::new_err("When names is a list[str], must pass the same number of names as there are columns.").into()); } @@ -433,15 +440,15 @@ impl PyTable { new_fields, self.schema.metadata().clone(), )); - Ok(PyTable::new(self.batches.clone(), new_schema).to_arro3(py)?) + Ok(PyTable::try_new(self.batches.clone(), new_schema)?.to_arro3(py)?) } #[getter] - pub fn schema(&self, py: Python) -> PyResult { + fn schema(&self, py: Python) -> PyResult { PySchema::new(self.schema.clone()).to_arro3(py) } - pub fn select(&self, py: Python, columns: SelectIndices) -> PyArrowResult { + fn select(&self, py: Python, columns: SelectIndices) -> PyArrowResult { let positions = columns.into_positions(self.schema.fields())?; let new_schema = Arc::new(self.schema.project(&positions)?); @@ -450,10 +457,10 @@ impl PyTable { .iter() .map(|batch| batch.project(&positions)) .collect::, ArrowError>>()?; - Ok(PyTable::new(new_batches, new_schema).to_arro3(py)?) + Ok(PyTable::try_new(new_batches, new_schema)?.to_arro3(py)?) } - pub fn set_column( + fn set_column( &self, py: Python, i: usize, @@ -492,22 +499,22 @@ impl PyTable { }) .collect::, PyArrowError>>()?; - Ok(PyTable::new(new_batches, new_schema).to_arro3(py)?) + Ok(PyTable::try_new(new_batches, new_schema)?.to_arro3(py)?) } #[getter] - pub fn shape(&self) -> (usize, usize) { + fn shape(&self) -> (usize, usize) { (self.num_rows(), self.num_columns()) } - pub fn to_batches(&self, py: Python) -> PyResult> { + fn to_batches(&self, py: Python) -> PyResult> { self.batches .iter() .map(|batch| PyRecordBatch::new(batch.clone()).to_arro3(py)) .collect() } - pub fn to_reader(&self, py: Python) -> PyResult { + fn to_reader(&self, py: Python) -> PyResult { let reader = Box::new(RecordBatchIterator::new( self.batches.clone().into_iter().map(Ok), self.schema.clone(), @@ -515,7 +522,7 @@ impl PyTable { PyRecordBatchReader::new(reader).to_arro3(py) } - pub fn to_struct_array(&self, py: Python) -> PyArrowResult { + fn to_struct_array(&self, py: Python) -> PyArrowResult { let chunks = self .batches .iter() @@ -526,16 +533,16 @@ impl PyTable { .collect::>(); let field = Field::new_struct("", self.schema.fields().clone(), false) .with_metadata(self.schema.metadata.clone()); - Ok(PyChunkedArray::new(chunks, field.into()).to_arro3(py)?) + Ok(PyChunkedArray::try_new(chunks, field.into())?.to_arro3(py)?) } - pub fn with_schema(&self, py: Python, schema: PySchema) -> PyArrowResult { + fn with_schema(&self, py: Python, schema: PySchema) -> PyArrowResult { let new_schema = schema.into_inner(); let new_batches = self .batches .iter() .map(|batch| RecordBatch::try_new(new_schema.clone(), batch.columns().to_vec())) .collect::, ArrowError>>()?; - Ok(PyTable::new(new_batches, new_schema).to_arro3(py)?) + Ok(PyTable::try_new(new_batches, new_schema)?.to_arro3(py)?) } }