diff --git a/arro3-core/python/arro3/core/_core.pyi b/arro3-core/python/arro3/core/_core.pyi index ccbf79d..e16dee8 100644 --- a/arro3-core/python/arro3/core/_core.pyi +++ b/arro3-core/python/arro3/core/_core.pyi @@ -9,6 +9,7 @@ from .types import ( ) class Array: + """An Arrow Array.""" def __init__(self, obj: Sequence[Any], /, type: ArrowSchemaExportable) -> None: """Create arro3.core.Array instance from a sequence of Python objects. @@ -16,10 +17,23 @@ class Array: obj: A sequence of input objects. type: Explicit type to attempt to coerce to. """ - def __array__(self, dtype=None, copy=None) -> NDArray: ... + def __array__(self, dtype=None, copy=None) -> NDArray: + """ + An implementation of the Array interface, for interoperability with numpy and + other array libraries. + """ def __arrow_c_array__( self, requested_schema: object | None = None - ) -> tuple[object, object]: ... + ) -> tuple[object, object]: + """ + An implementation of the [Arrow PyCapsule + Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). + This dunder method should not be called directly, but enables zero-copy data + transfer to other Python libraries that understand Arrow memory. + + For example, you can call [`pyarrow.array()`][pyarrow.array] to convert this + array into a pyarrow array, without copying memory. + """ def __eq__(self, other) -> bool: ... def __len__(self) -> int: ... def __repr__(self) -> str: ... @@ -82,14 +96,34 @@ class Array: """The data type of this array.""" class ArrayReader: - def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: ... + """A stream of Arrow `Array`s. + + This is similar to the [`RecordBatchReader`][arro3.core.RecordBatchReader] but each + item yielded from the stream is an [`Array`][arro3.core.Array], not a + [`RecordBatch`][arro3.core.RecordBatch]. + """ + def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: + """ + An implementation of the [Arrow PyCapsule + Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). + This dunder method should not be called directly, but enables zero-copy data + transfer to other Python libraries that understand Arrow memory. + + For example, you can call [`pyarrow.table()`][pyarrow.table] to convert this + ArrayReader to a pyarrow table, without copying memory. + """ def __iter__(self) -> ArrayReader: ... def __next__(self) -> Array: ... def __repr__(self) -> str: ... @classmethod def from_arrow( cls, input: ArrowArrayExportable | ArrowStreamExportable - ) -> ArrayReader: ... + ) -> ArrayReader: + """Construct this from an existing Arrow object. + + It can be called on anything that exports the Arrow stream interface + (has an `__arrow_c_stream__` method), such as a `Table` or `ArrayReader`. + """ @classmethod def from_arrow_pycapsule(cls, capsule) -> ArrayReader: """Construct this object from a bare Arrow PyCapsule""" @@ -98,28 +132,60 @@ class ArrayReader: cls, schema: ArrowSchemaExportable, arrays: Sequence[ArrowArrayExportable] ) -> ArrayReader: ... @classmethod - def from_stream(cls, data: ArrowStreamExportable) -> ArrayReader: ... + def from_stream(cls, data: ArrowStreamExportable) -> ArrayReader: + """Construct this from an existing Arrow object. + + This is an alias of and has the same behavior as + [`from_arrow`][arro3.core.ArrayReader.from_arrow], but is included for parity + with [`pyarrow.RecordBatchReader`][pyarrow.RecordBatchReader]. 
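+
+        Example (a minimal sketch; assumes pyarrow v14 or higher is also installed):
+
+            import pyarrow as pa
+            from arro3.core import ArrayReader
+
+            # pa.table(...) exports __arrow_c_stream__, so it can be consumed here
+            reader = ArrayReader.from_stream(pa.table({"a": [1, 2, 3]}))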
+ """ + @property + def closed(self) -> bool: + """Returns `true` if this reader has already been consumed.""" + def read_all(self) -> ChunkedArray: + """Read all batches from this stream into a ChunkedArray.""" + def read_next_array(self) -> Array: + """Read the next array from this stream.""" @property - def closed(self) -> bool: ... - def read_all(self) -> ChunkedArray: ... - def read_next_array(self) -> Array: ... - def field(self) -> Field: ... + def field(self) -> Field: + """Access the field of this reader.""" class ChunkedArray: + """An Arrow ChunkedArray.""" def __init__( self, arrays: Sequence[ArrowArrayExportable], type: ArrowSchemaExportable | None = None, ) -> None: ... - def __array__(self, dtype=None, copy=None) -> NDArray: ... - def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: ... + def __array__(self, dtype=None, copy=None) -> NDArray: + """ + An implementation of the Array interface, for interoperability with numpy and + other array libraries. + """ + def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: + """ + An implementation of the [Arrow PyCapsule + Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). + This dunder method should not be called directly, but enables zero-copy data + transfer to other Python libraries that understand Arrow memory. + + For example (as of pyarrow v16), you can call + [`pyarrow.chunked_array()`][pyarrow.chunked_array] to convert this array into a + pyarrow array, without copying memory. + """ def __eq__(self, other) -> bool: ... def __len__(self) -> int: ... def __repr__(self) -> str: ... @classmethod def from_arrow( cls, input: ArrowArrayExportable | ArrowStreamExportable - ) -> ChunkedArray: ... + ) -> ChunkedArray: + """Construct this from an existing Arrow object. + + It can be called on anything that exports the Arrow stream interface (has an + `__arrow_c_stream__` method). All batches from the stream will be materialized + in memory. + """ @classmethod def from_arrow_pycapsule(cls, capsule) -> ChunkedArray: """Construct this object from a bare Arrow PyCapsule""" @@ -129,29 +195,70 @@ class ChunkedArray: Args: target_type: Type to cast array to. """ - def chunk(self, i: int) -> Array: ... + def chunk(self, i: int) -> Array: + """Select a chunk by its index. + + Args: + i: chunk index. + + Returns: + new Array. + """ @property - def chunks(self) -> list[Array]: ... - def combine_chunks(self) -> Array: ... - def equals(self, other: ArrowStreamExportable) -> bool: ... - def length(self) -> int: ... + def chunks(self) -> list[Array]: + """Convert to a list of single-chunked arrays.""" + def combine_chunks(self) -> Array: + """Flatten this ChunkedArray into a single non-chunked array.""" + def equals(self, other: ArrowStreamExportable) -> bool: + """Return whether the contents of two chunked arrays are equal.""" + def length(self) -> int: + """Return length of a ChunkedArray.""" @property - def nbytes(self) -> int: ... + def nbytes(self) -> int: + """Total number of bytes consumed by the elements of the chunked array.""" @property - def null_count(self) -> int: ... + def null_count(self) -> int: + """Number of null entries""" @property - def num_chunks(self) -> int: ... - def slice(self, offset: int = 0, length: int | None = None) -> ChunkedArray: ... - def to_numpy(self) -> NDArray: ... 
+    def num_chunks(self) -> int:
+        """Number of underlying chunks."""
+    def slice(self, offset: int = 0, length: int | None = None) -> ChunkedArray:
+        """Compute a zero-copy slice of this ChunkedArray.
+
+        Args:
+            offset: Offset from start of array to slice. Defaults to 0.
+            length: Length of slice (default is until end of batch starting from offset).
+
+        Returns:
+            New ChunkedArray.
+        """
+    def to_numpy(self) -> NDArray:
+        """Copy this array to a `numpy` NDArray."""
     @property
-    def type(self) -> DataType: ...
+    def type(self) -> DataType:
+        """Return the data type of a ChunkedArray."""
 
 class DataType:
-    def __arrow_c_schema__(self) -> object: ...
+    """An Arrow DataType."""
+    def __arrow_c_schema__(self) -> object:
+        """
+        An implementation of the [Arrow PyCapsule
+        Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
+        This dunder method should not be called directly, but enables zero-copy data
+        transfer to other Python libraries that understand Arrow memory.
+
+        For example, you can call [`pyarrow.field()`][pyarrow.field] to convert this
+        data type into a pyarrow field, without copying memory.
+        """
     def __eq__(self, other) -> bool: ...
     def __repr__(self) -> str: ...
     @classmethod
-    def from_arrow(cls, input: ArrowSchemaExportable) -> DataType: ...
+    def from_arrow(cls, input: ArrowSchemaExportable) -> DataType:
+        """Construct this from an existing Arrow object.
+
+        It can be called on anything that exports the Arrow schema interface
+        (has an `__arrow_c_schema__` method).
+        """
     @classmethod
     def from_arrow_pycapsule(cls, capsule) -> DataType:
         """Construct this object from a bare Arrow PyCapsule"""
@@ -564,6 +671,7 @@ class DataType:
 def is_dictionary_key_type(t: ArrowSchemaExportable) -> bool: ...
 
 class Field:
+    """An Arrow Field."""
     def __init__(
         self,
         name: str,
@@ -572,33 +680,68 @@ class Field:
         *,
         metadata: dict[str, str] | dict[bytes, bytes] | None = None,
     ) -> None: ...
-    def __arrow_c_schema__(self) -> object: ...
+    def __arrow_c_schema__(self) -> object:
+        """
+        An implementation of the [Arrow PyCapsule
+        Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
+        This dunder method should not be called directly, but enables zero-copy data
+        transfer to other Python libraries that understand Arrow memory.
+
+        For example, you can call [`pyarrow.field()`][pyarrow.field] to convert this
+        field into a pyarrow field, without copying memory.
+        """
     def __eq__(self, other) -> bool: ...
     def __repr__(self) -> str: ...
     @classmethod
-    def from_arrow(cls, input: ArrowSchemaExportable) -> Field: ...
+    def from_arrow(cls, input: ArrowSchemaExportable) -> Field:
+        """Construct this from an existing Arrow object.
+
+        It can be called on anything that exports the Arrow schema interface
+        (has an `__arrow_c_schema__` method).
+        """
     @classmethod
     def from_arrow_pycapsule(cls, capsule) -> Field:
         """Construct this object from a bare Arrow PyCapsule"""
-    def equals(self, other: ArrowSchemaExportable) -> bool: ...
+    def equals(self, other: ArrowSchemaExportable) -> bool:
+        """Test if this field is equal to the other."""
     @property
-    def metadata(self) -> dict[bytes, bytes]: ...
+    def metadata(self) -> dict[bytes, bytes]:
+        """The field's metadata."""
     @property
-    def metadata_str(self) -> dict[str, str]: ...
+    def metadata_str(self) -> dict[str, str]:
+        """The field's metadata where keys and values are `str`, not `bytes`."""
     @property
-    def name(self) -> str: ...
+    def name(self) -> str:
+        """The field name."""
     @property
-    def nullable(self) -> bool: ...
-    def remove_metadata(self) -> Field: ...
+    def nullable(self) -> bool:
+        """The field nullability."""
+    def remove_metadata(self) -> Field:
+        """Create a new field without metadata, if any."""
     @property
-    def type(self) -> DataType: ...
-    def with_metadata(self, metadata: dict[str, str] | dict[bytes, bytes]) -> Field: ...
-    def with_name(self, name: str) -> Field: ...
-    def with_nullable(self, nullable: bool) -> Field: ...
-    def with_type(self, new_type: ArrowSchemaExportable) -> Field: ...
+    def type(self) -> DataType:
+        """Access the data type of this field."""
+    def with_metadata(self, metadata: dict[str, str] | dict[bytes, bytes]) -> Field:
+        """Add metadata as a dict of string keys and values to this Field."""
+    def with_name(self, name: str) -> Field:
+        """A copy of this field with the replaced name."""
+    def with_nullable(self, nullable: bool) -> Field:
+        """A copy of this field with the replaced nullability."""
+    def with_type(self, new_type: ArrowSchemaExportable) -> Field:
+        """A copy of this field with the replaced type."""
 
 class RecordBatch:
+    """
+    A two-dimensional batch of column-oriented data with a defined
+    [schema][arro3.core.Schema].
+
+    A `RecordBatch` is a two-dimensional dataset of a number of contiguous arrays,
+    each of the same length. A record batch has a schema which must match its
+    arrays' datatypes.
+
+    Record batches are a convenient unit of work for various serialization and
+    computation functions, possibly incremental.
+    """
     def __init__(
         self,
         data: ArrowArrayExportable | dict[str, ArrowArrayExportable],
@@ -607,7 +750,16 @@
     ) -> None: ...
     def __arrow_c_array__(
         self, requested_schema: object | None = None
-    ) -> tuple[object, object]: ...
+    ) -> tuple[object, object]:
+        """
+        An implementation of the [Arrow PyCapsule
+        Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
+        This dunder method should not be called directly, but enables zero-copy data
+        transfer to other Python libraries that understand Arrow memory.
+
+        For example, you can call [`pyarrow.record_batch()`][pyarrow.record_batch] to
+        convert this RecordBatch into a pyarrow RecordBatch, without copying memory.
+        """
     def __eq__(self, other) -> bool: ...
     def __getitem__(self, key: int | str) -> Array: ...
     def __repr__(self) -> str: ...
     @classmethod
     def from_arrays(
         cls, arrays: Sequence[ArrowArrayExportable], *, schema: ArrowSchemaExportable
     ) -> RecordBatch:
-        """Construct a RecordBatch from multiple pyarrow.Arrays
+        """Construct a RecordBatch from multiple Arrays
 
         Args:
             arrays: One for each field in RecordBatch
@@ -655,7 +807,23 @@
     @classmethod
     def from_arrow(
         cls, input: ArrowArrayExportable | ArrowStreamExportable
-    ) -> RecordBatch: ...
+    ) -> RecordBatch:
+        """Construct this from an existing Arrow RecordBatch.
+
+        It can be called on anything that exports the Arrow data interface
+        (has a `__arrow_c_array__` method) and returns a StructArray.
+
+        Args:
+            input: Arrow array to use for constructing this object.
+
+        Returns:
+            New RecordBatch.
+        """
     @classmethod
     def from_arrow_pycapsule(cls, schema_capsule, array_capsule) -> RecordBatch:
         """Construct this object from bare Arrow PyCapsules"""
@@ -710,7 +878,8 @@
             _description_
         """
     @property
-    def nbytes(self) -> int: ...
+    def nbytes(self) -> int:
+        """Total number of bytes consumed by the elements of the record batch."""
     @property
     def num_columns(self) -> int:
         """Number of columns."""
@@ -733,26 +902,97 @@ class RecordBatch:
     @property
     def schema(self) -> Schema:
         """Access the schema of this RecordBatch"""
-    def select(self, columns: list[int] | list[str]) -> RecordBatch: ...
+    def select(self, columns: list[int] | list[str]) -> RecordBatch:
+        """Select columns of the RecordBatch.
+
+        Returns a new RecordBatch with the specified columns, and metadata preserved.
+
+        Args:
+            columns: The column names or integer indices to select.
+
+        Returns:
+            New RecordBatch.
+        """
     def set_column(
         self, i: int, field: str | ArrowSchemaExportable, column: ArrowArrayExportable
-    ) -> RecordBatch: ...
+    ) -> RecordBatch:
+        """Replace a column in the RecordBatch at the given position.
+
+        Args:
+            i: Index to place the column at.
+            field: If a string is passed then the type is deduced from the column data.
+            column: Column data.
+
+        Returns:
+            New RecordBatch.
+        """
     @property
-    def shape(self) -> tuple[int, int]: ...
-    def slice(self, offset: int = 0, length: int | None = None) -> RecordBatch: ...
-    def take(self, indices: ArrowArrayExportable) -> RecordBatch: ...
-    def to_struct_array(self) -> Array: ...
-    def with_schema(self, schema: ArrowSchemaExportable) -> RecordBatch: ...
+    def shape(self) -> tuple[int, int]:
+        """
+        Dimensions of the table or record batch: (number of rows, number of columns).
+        """
+    def slice(self, offset: int = 0, length: int | None = None) -> RecordBatch:
+        """Compute a zero-copy slice of this RecordBatch.
+
+        Args:
+            offset: Offset from start of record batch to slice. Defaults to 0.
+            length: Length of slice (default is until end of batch starting from offset).
+
+        Returns:
+            New RecordBatch.
+        """
+    def take(self, indices: ArrowArrayExportable) -> RecordBatch:
+        """Select rows from a Table or RecordBatch.
+
+        Args:
+            indices: The indices in the tabular object whose rows will be returned.
+
+        Returns:
+            New RecordBatch containing the taken rows.
+        """
+    def to_struct_array(self) -> Array:
+        """Convert to a struct array.
+
+        Returns:
+            A struct Array whose fields are this record batch's columns.
+        """
+    def with_schema(self, schema: ArrowSchemaExportable) -> RecordBatch:
+        """Return a RecordBatch with the provided schema."""
 
 class RecordBatchReader:
-    def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: ...
+    """An Arrow RecordBatchReader.
+
+    A RecordBatchReader holds a stream of [`RecordBatch`][arro3.core.RecordBatch]
+    objects.
+    """
+    def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
+        """
+        An implementation of the [Arrow PyCapsule
+        Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
+        This dunder method should not be called directly, but enables zero-copy data
+        transfer to other Python libraries that understand Arrow memory.
+
+        For example, you can call
+        [`pyarrow.RecordBatchReader.from_stream`][pyarrow.RecordBatchReader.from_stream]
+        to convert this stream to a pyarrow `RecordBatchReader`. Alternatively, you can
+        call [`pyarrow.table()`][pyarrow.table] to consume this stream to a pyarrow
+        table or [`Table.from_arrow()`][arro3.core.Table] to consume this stream to an
+        arro3 Table.
+        """
     def __iter__(self) -> RecordBatchReader: ...
     def __next__(self) -> RecordBatch: ...
     def __repr__(self) -> str: ...
     @classmethod
     def from_arrow(
         cls, input: ArrowArrayExportable | ArrowStreamExportable
-    ) -> RecordBatchReader: ...
+    ) -> RecordBatchReader:
+        """
+        Construct this from an existing Arrow object.
+
+        It can be called on anything that exports the Arrow stream interface
+        (has an `__arrow_c_stream__` method), such as a `Table` or `RecordBatchReader`.
+        """
     @classmethod
     def from_arrow_pycapsule(cls, capsule) -> RecordBatchReader:
         """Construct this object from a bare Arrow PyCapsule"""
@@ -763,12 +1003,16 @@ class RecordBatchReader:
     @classmethod
     def from_stream(cls, data: ArrowStreamExportable) -> RecordBatchReader: ...
     @property
-    def closed(self) -> bool: ...
+    def closed(self) -> bool:
+        """Returns `True` if this reader has already been consumed."""
     def read_all(self) -> Table: ...
     def read_next_batch(self) -> RecordBatch: ...
-    def schema(self) -> Schema: ...
+    @property
+    def schema(self) -> Schema:
+        """Access the schema of this reader."""
 
 class Schema:
+    """An Arrow Schema."""
     def __init__(
         self,
         fields: Sequence[ArrowSchemaExportable],
@@ -784,10 +1028,6 @@ class Schema:
 
         For example, you can call [`pyarrow.schema()`][pyarrow.schema] to convert this
         array into a pyarrow schema, without copying memory.
-
-
-        Returns:
-            _description_
         """
 
     def __eq__(self, other) -> bool: ...
@@ -936,6 +1176,7 @@ class Schema:
         """
 
 class Table:
+    """A collection of top-level named, equal-length Arrow arrays."""
     def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
         """
         An implementation of the [Arrow PyCapsule
@@ -945,12 +1186,6 @@ class Table:
 
         For example, you can call [`pyarrow.table()`][pyarrow.table] to convert this
         array into a pyarrow table, without copying memory.
-
-        Args:
-            requested_schema: _description_. Defaults to None.
-
-        Returns:
-            _description_
         """
     def __eq__(self, other) -> bool: ...
     def __getitem__(self, key: int | str) -> ChunkedArray: ...
@@ -1029,7 +1264,16 @@ class Table:
         batches: Sequence[ArrowArrayExportable],
         *,
         schema: ArrowSchemaExportable | None = None,
-    ) -> Table: ...
+    ) -> Table:
+        """Construct a Table from a sequence of Arrow RecordBatches.
+
+        Args:
+            batches: Sequence of RecordBatch to be converted; all schemas must be equal.
+            schema: If not passed, will be inferred from the first RecordBatch. Defaults to None.
+
+        Returns:
+            New Table.
+        """
     @overload
     @classmethod
     def from_pydict(
@@ -1138,7 +1382,8 @@ class Table:
             _description_
         """
     @property
-    def nbytes(self) -> int: ...
+    def nbytes(self) -> int:
+        """Total number of bytes consumed by the elements of the table."""
     @property
     def num_columns(self) -> int:
         """Number of columns in this table."""
diff --git a/arro3-core/python/arro3/core/types.py b/arro3-core/python/arro3/core/types.py
index 546a904..476b6e4 100644
--- a/arro3-core/python/arro3/core/types.py
+++ b/arro3-core/python/arro3/core/types.py
@@ -4,13 +4,39 @@
 
 class ArrowSchemaExportable(Protocol):
-    """A C-level reference to an Arrow Schema or Field."""
+    """
+    An object with an `__arrow_c_schema__` method.
+
+    Supported objects include:
+
+    - arro3 `Schema`, `Field`, or `DataType` objects.
+    - pyarrow `Schema`, `Field`, or `DataType` objects.
+
+    Such an object implements the [Arrow C Data
+    Interface](https://arrow.apache.org/docs/format/CDataInterface.html) via the
+    [Arrow PyCapsule
+    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
+    This allows for zero-copy Arrow data interchange across libraries.
+    """
 
     def __arrow_c_schema__(self) -> object: ...
 class ArrowArrayExportable(Protocol):
-    """A C-level reference to an Arrow Array or RecordBatch."""
+    """
+    An object with an `__arrow_c_array__` method.
+
+    Supported objects include:
+
+    - arro3 `Array` or `RecordBatch` objects.
+    - pyarrow `Array` or `RecordBatch` objects.
+
+    Such an object implements the [Arrow C Data
+    Interface](https://arrow.apache.org/docs/format/CDataInterface.html) via the
+    [Arrow PyCapsule
+    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
+    This allows for zero-copy Arrow data interchange across libraries.
+    """
 
     def __arrow_c_array__(
         self, requested_schema: object | None = None
@@ -18,6 +44,26 @@
 
 class ArrowStreamExportable(Protocol):
-    """A C-level reference to an Arrow RecordBatchReader, Table, or ChunkedArray."""
+    """
+    An object with an `__arrow_c_stream__` method.
+
+    Supported objects include:
+
+    - arro3 `Table`, `RecordBatchReader`, `ChunkedArray`, or `ArrayReader` objects.
+    - Polars `Series` or `DataFrame` objects (polars v1.2 or higher).
+    - pyarrow `RecordBatchReader`, `Table`, or `ChunkedArray` objects (pyarrow v14 or
+      higher).
+    - pandas `DataFrame`s (pandas v2.2 or higher).
+    - ibis `Table` objects.
+
+    For an up-to-date list of supported objects, see [this
+    issue](https://github.com/apache/arrow/issues/39195#issuecomment-2245718008).
+
+    Such an object implements the [Arrow C Stream
+    interface](https://arrow.apache.org/docs/format/CStreamInterface.html) via the
+    [Arrow PyCapsule
+    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
+    This allows for zero-copy Arrow data interchange across libraries.
+    """
 
     def __arrow_c_stream__(self, requested_schema: object | None = None) -> object: ...
diff --git a/docs/api/core/array-reader.md b/docs/api/core/array-reader.md
new file mode 100644
index 0000000..ba57dd0
--- /dev/null
+++ b/docs/api/core/array-reader.md
@@ -0,0 +1,8 @@
+# ArrayReader
+
+::: arro3.core.ArrayReader
+    options:
+      filters:
+        - "!^_"
+        - "^__arrow"
+      members:
diff --git a/docs/api/core/types.md b/docs/api/core/types.md
index bc1ddb2..9cb9e09 100644
--- a/docs/api/core/types.md
+++ b/docs/api/core/types.md
@@ -5,4 +5,4 @@
       filters:
         - "!^_"
         - "^__arrow"
-      members:
+      show_if_no_docstring: true
diff --git a/mkdocs.yml b/mkdocs.yml
index f683ffa..956c69e 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -22,11 +22,12 @@ nav:
   - API Reference:
       - arro3.core:
           - api/core/array.md
+          - api/core/array-reader.md
           - api/core/chunked-array.md
           - api/core/datatype.md
           - api/core/field.md
-          - api/core/record-batch-reader.md
           - api/core/record-batch.md
+          - api/core/record-batch-reader.md
           - api/core/schema.md
           - api/core/table.md
           - api/core/types.md
diff --git a/pyo3-arrow/src/array.rs b/pyo3-arrow/src/array.rs
index 61203b9..89b1a0f 100644
--- a/pyo3-arrow/src/array.rs
+++ b/pyo3-arrow/src/array.rs
@@ -26,9 +26,6 @@ use crate::interop::numpy::from_numpy::from_numpy;
 use crate::interop::numpy::to_numpy::to_numpy;
 use crate::{PyDataType, PyField};
 
-/// A Python-facing Arrow array.
-///
-/// This is a wrapper around an [ArrayRef] and a [FieldRef].
 #[pyclass(module = "arro3.core._core", name = "Array", subclass)]
 pub struct PyArray {
     array: ArrayRef,
@@ -195,8 +192,6 @@ impl PyArray {
         Ok(Self::new(array, Field::new("", data_type, true).into()))
     }
 
-    /// An implementation of the Array interface, for interoperability with numpy and other
-    /// array libraries.
#[pyo3(signature = (dtype=None, copy=None))] #[allow(unused_variables)] pub fn __array__( @@ -208,13 +203,6 @@ impl PyArray { to_numpy(py, &self.array) } - /// An implementation of the [Arrow PyCapsule - /// Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). - /// This dunder method should not be called directly, but enables zero-copy - /// data transfer to other Python libraries that understand Arrow memory. - /// - /// For example, you can call [`pyarrow.array()`][pyarrow.array] to convert this array - /// into a pyarrow array, without copying memory. #[allow(unused_variables)] pub fn __arrow_c_array__<'py>( &'py self, @@ -305,7 +293,6 @@ impl PyArray { Ok(PyArray::new(new_array, self.field.clone()).to_arro3(py)?) } - /// Copy this array to a `numpy` NDArray pub fn to_numpy(&self, py: Python) -> PyResult { self.__array__(py, None, None) } diff --git a/pyo3-arrow/src/array_reader.rs b/pyo3-arrow/src/array_reader.rs index e4aa202..c47202b 100644 --- a/pyo3-arrow/src/array_reader.rs +++ b/pyo3-arrow/src/array_reader.rs @@ -15,9 +15,6 @@ use crate::ffi::{ArrayIterator, ArrayReader}; use crate::input::AnyArray; use crate::{PyArray, PyChunkedArray, PyField}; -/// A Python-facing Arrow array reader. -/// -/// This is a wrapper around a [ArrayReader]. #[pyclass(module = "arro3.core._core", name = "ArrayReader", subclass)] pub struct PyArrayReader(pub(crate) Option>); @@ -102,13 +99,6 @@ impl Display for PyArrayReader { #[pymethods] impl PyArrayReader { - /// An implementation of the [Arrow PyCapsule - /// Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). - /// This dunder method should not be called directly, but enables zero-copy - /// data transfer to other Python libraries that understand Arrow memory. - /// - /// For example, you can call [`pyarrow.table()`][pyarrow.table] to convert this array - /// into a pyarrow table, without copying memory. #[allow(unused_variables)] pub fn __arrow_c_stream__<'py>( &'py mut self, @@ -136,23 +126,17 @@ impl PyArrayReader { self.to_string() } - /// Returns `true` if this reader has already been consumed. #[getter] pub fn closed(&self) -> bool { self.0.is_none() } - /// Construct this from an existing Arrow object. - /// - /// It can be called on anything that exports the Arrow stream interface - /// (`__arrow_c_stream__`), such as a `Table` or `ArrayReader`. #[classmethod] pub fn from_arrow(_cls: &Bound, input: AnyArray) -> PyArrowResult { let reader = input.into_reader()?; Ok(Self::new(reader)) } - /// Construct this object from a bare Arrow PyCapsule. #[classmethod] pub fn from_arrow_pycapsule( _cls: &Bound, @@ -184,7 +168,6 @@ impl PyArrayReader { data.extract() } - /// Access the field of this reader #[getter] pub fn field(&self, py: Python) -> PyResult { PyField::new(self.field_ref()?).to_arro3(py) diff --git a/pyo3-arrow/src/chunked.rs b/pyo3-arrow/src/chunked.rs index e56748c..328f578 100644 --- a/pyo3-arrow/src/chunked.rs +++ b/pyo3-arrow/src/chunked.rs @@ -19,9 +19,6 @@ use crate::input::AnyArray; use crate::interop::numpy::to_numpy::chunked_to_numpy; use crate::{PyArray, PyDataType, PyField}; -/// A Python-facing Arrow chunked array. -/// -/// This is a wrapper around a [FieldRef] and a `Vec` of [ArrayRef]. 
#[pyclass(module = "arro3.core._core", name = "ChunkedArray", subclass)] pub struct PyChunkedArray { chunks: Vec, @@ -241,8 +238,6 @@ impl PyChunkedArray { )) } - /// An implementation of the Array interface, for interoperability with numpy and other - /// array libraries. #[pyo3(signature = (dtype=None, copy=None))] #[allow(unused_variables)] pub fn __array__( @@ -259,14 +254,6 @@ impl PyChunkedArray { chunked_to_numpy(py, chunk_refs.as_slice()) } - /// An implementation of the [Arrow PyCapsule - /// Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). - /// This dunder method should not be called directly, but enables zero-copy - /// data transfer to other Python libraries that understand Arrow memory. - /// - /// For example (as of the upcoming pyarrow v16), you can call - /// [`pyarrow.chunked_array()`][pyarrow.chunked_array] to convert this array into a - /// pyarrow array, without copying memory. #[allow(unused_variables)] pub fn __arrow_c_stream__<'py>( &'py self, @@ -292,16 +279,11 @@ impl PyChunkedArray { self.to_string() } - /// Construct this from an existing Arrow object. - /// - /// It can be called on anything that exports the Arrow stream interface - /// (`__arrow_c_stream__`). All batches will be materialized in memory. #[classmethod] pub fn from_arrow(_cls: &Bound, input: AnyArray) -> PyArrowResult { input.into_chunked_array() } - /// Construct this object from a bare Arrow PyCapsule #[classmethod] pub fn from_arrow_pycapsule( _cls: &Bound, @@ -400,7 +382,6 @@ impl PyChunkedArray { Ok(PyChunkedArray::new(sliced_chunks, self.field.clone()).to_arro3(py)?) } - /// Copy this array to a `numpy` NDArray pub fn to_numpy(&self, py: Python) -> PyResult { self.__array__(py, None, None) } diff --git a/pyo3-arrow/src/datatypes.rs b/pyo3-arrow/src/datatypes.rs index de5d6d6..0fb24b0 100644 --- a/pyo3-arrow/src/datatypes.rs +++ b/pyo3-arrow/src/datatypes.rs @@ -104,13 +104,6 @@ impl Display for PyDataType { #[pymethods] impl PyDataType { - /// An implementation of the [Arrow PyCapsule - /// Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). - /// This dunder method should not be called directly, but enables zero-copy - /// data transfer to other Python libraries that understand Arrow memory. - /// - /// For example, you can call [`pyarrow.field()`][pyarrow.field] to convert this array - /// into a pyarrow field, without copying memory. pub fn __arrow_c_schema__<'py>( &'py self, py: Python<'py>, @@ -126,16 +119,11 @@ impl PyDataType { self.to_string() } - /// Construct this from an existing Arrow object. - /// - /// It can be called on anything that exports the Arrow schema interface - /// (`__arrow_c_schema__`). #[classmethod] pub fn from_arrow(_cls: &Bound, input: Self) -> Self { input } - /// Construct this object from a bare Arrow PyCapsule #[classmethod] pub fn from_arrow_pycapsule( _cls: &Bound, diff --git a/pyo3-arrow/src/field.rs b/pyo3-arrow/src/field.rs index 7c8582b..4287e0c 100644 --- a/pyo3-arrow/src/field.rs +++ b/pyo3-arrow/src/field.rs @@ -15,9 +15,6 @@ use crate::ffi::to_python::to_schema_pycapsule; use crate::input::MetadataInput; use crate::PyDataType; -/// A Python-facing Arrow field. -/// -/// This is a wrapper around a [FieldRef]. 
#[pyclass(module = "arro3.core._core", name = "Field", subclass)] pub struct PyField(FieldRef); @@ -104,13 +101,6 @@ impl PyField { Ok(PyField::new(field.into())) } - /// An implementation of the [Arrow PyCapsule - /// Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). - /// This dunder method should not be called directly, but enables zero-copy - /// data transfer to other Python libraries that understand Arrow memory. - /// - /// For example, you can call [`pyarrow.field()`][pyarrow.field] to convert this array - /// into a pyarrow field, without copying memory. pub fn __arrow_c_schema__<'py>( &'py self, py: Python<'py>, @@ -126,16 +116,11 @@ impl PyField { self.to_string() } - /// Construct this from an existing Arrow object. - /// - /// It can be called on anything that exports the Arrow schema interface - /// (`__arrow_c_schema__`). #[classmethod] pub fn from_arrow(_cls: &Bound, input: Self) -> Self { input } - /// Construct this object from a bare Arrow PyCapsule #[classmethod] pub fn from_arrow_pycapsule( _cls: &Bound, @@ -147,13 +132,10 @@ impl PyField { Ok(Self::new(Arc::new(field))) } - /// Test if this field is equal to the other - // TODO: add option to check field metadata pub fn equals(&self, other: PyField) -> bool { self.0 == other.0 } - /// The schema's metadata. // Note: we can't return HashMap, Vec> because that will coerce keys and values to // a list, not bytes #[getter] @@ -168,25 +150,21 @@ impl PyField { Ok(d) } - /// The schema's metadata where keys and values are `str`, not `bytes`. #[getter] pub fn metadata_str(&self) -> HashMap { self.0.metadata().clone() } - /// The field name. #[getter] pub fn name(&self) -> String { self.0.name().clone() } - /// The field nullability. #[getter] pub fn nullable(&self) -> bool { self.0.is_nullable() } - /// Create new field without metadata, if any pub fn remove_metadata(&self, py: Python) -> PyResult { PyField::new( self.0 @@ -198,7 +176,6 @@ impl PyField { .to_arro3(py) } - /// Create new field without metadata, if any #[getter] pub fn r#type(&self, py: Python) -> PyResult { PyDataType::new(self.0.data_type().clone()).to_arro3(py) diff --git a/pyo3-arrow/src/record_batch.rs b/pyo3-arrow/src/record_batch.rs index 581bb0b..635b8a8 100644 --- a/pyo3-arrow/src/record_batch.rs +++ b/pyo3-arrow/src/record_batch.rs @@ -19,9 +19,6 @@ use crate::input::{AnyRecordBatch, FieldIndexInput, MetadataInput, NameOrField, use crate::schema::display_schema; use crate::{PyArray, PyField, PySchema}; -/// A Python-facing Arrow record batch. -/// -/// This is a wrapper around a [RecordBatch]. #[pyclass(module = "arro3.core._core", name = "RecordBatch", subclass)] #[derive(Debug)] pub struct PyRecordBatch(RecordBatch); @@ -111,13 +108,6 @@ impl PyRecordBatch { } } - /// An implementation of the [Arrow PyCapsule - /// Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). - /// This dunder method should not be called directly, but enables zero-copy - /// data transfer to other Python libraries that understand Arrow memory. - /// - /// For example, you can call [`pyarrow.array()`][pyarrow.array] to convert this array - /// into a pyarrow array, without copying memory. 
#[allow(unused_variables)] pub fn __arrow_c_array__<'py>( &'py self, @@ -141,7 +131,6 @@ impl PyRecordBatch { self.to_string() } - /// Construct a RecordBatch from multiple Arrays #[classmethod] #[pyo3(signature = (arrays, *, schema))] pub fn from_arrays( @@ -182,9 +171,6 @@ impl PyRecordBatch { Ok(Self::new(rb)) } - /// Construct a RecordBatch from a StructArray. - /// - /// Each field in the StructArray will become a column in the resulting RecordBatch. #[classmethod] pub fn from_struct_array(_cls: &Bound, struct_array: PyArray) -> PyArrowResult { let (array, field) = struct_array.into_inner(); @@ -200,16 +186,6 @@ impl PyRecordBatch { } } - /// Construct this from an existing Arrow RecordBatch. - /// - /// It can be called on anything that exports the Arrow data interface - /// (`__arrow_c_array__`) and returns a StructArray.. - /// - /// Args: - /// input: Arrow array to use for constructing this object - /// - /// Returns: - /// Self #[classmethod] pub fn from_arrow(_cls: &Bound, input: AnyRecordBatch) -> PyArrowResult { match input { @@ -222,7 +198,6 @@ impl PyRecordBatch { } } - /// Construct this object from a bare Arrow PyCapsule #[classmethod] pub fn from_arrow_pycapsule( _cls: &Bound, @@ -289,7 +264,6 @@ impl PyRecordBatch { Ok(PyRecordBatch::new(new_rb).to_arro3(py)?) } - /// Select single column from RecordBatch pub fn column(&self, py: Python, i: FieldIndexInput) -> PyResult { let column_index = i.into_position(self.0.schema_ref())?; let field = self.0.schema().field(column_index).clone(); @@ -297,7 +271,6 @@ impl PyRecordBatch { PyArray::new(array, field.into()).to_arro3(py) } - /// Names of the Table or RecordBatch columns. #[getter] pub fn column_names(&self) -> Vec { self.0 @@ -308,7 +281,6 @@ impl PyRecordBatch { .collect() } - /// List of all columns in numerical order. #[getter] pub fn columns(&self, py: Python) -> PyResult> { (0..self.num_columns()) @@ -320,7 +292,6 @@ impl PyRecordBatch { self.0 == other.0 } - /// Select a schema field by its numeric index. pub fn field(&self, py: Python, i: FieldIndexInput) -> PyResult { let schema_ref = self.0.schema_ref(); let field = schema_ref.field(i.into_position(schema_ref)?); @@ -332,13 +303,11 @@ impl PyRecordBatch { self.0.get_array_memory_size() } - /// Number of columns in this RecordBatch. #[getter] pub fn num_columns(&self) -> usize { self.0.num_columns() } - /// Number of rows in this RecordBatch. #[getter] pub fn num_rows(&self) -> usize { self.0.num_rows() @@ -379,7 +348,6 @@ impl PyRecordBatch { Ok(PyRecordBatch::new(new_rb).to_arro3(py)?) } - /// Dimensions of the table or record batch: (#rows, #columns). #[getter] pub fn shape(&self) -> (usize, usize) { (self.num_rows(), self.num_columns()) diff --git a/pyo3-arrow/src/record_batch_reader.rs b/pyo3-arrow/src/record_batch_reader.rs index 66bd3ab..a79b6f1 100644 --- a/pyo3-arrow/src/record_batch_reader.rs +++ b/pyo3-arrow/src/record_batch_reader.rs @@ -17,9 +17,6 @@ use crate::input::AnyRecordBatch; use crate::schema::display_schema; use crate::{PyRecordBatch, PySchema, PyTable}; -/// A Python-facing Arrow record batch reader. -/// -/// This is a wrapper around a [RecordBatchReader]. #[pyclass(module = "arro3.core._core", name = "RecordBatchReader", subclass)] pub struct PyRecordBatchReader(pub(crate) Option>); @@ -115,13 +112,6 @@ impl Display for PyRecordBatchReader { #[pymethods] impl PyRecordBatchReader { - /// An implementation of the [Arrow PyCapsule - /// Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). 
- /// This dunder method should not be called directly, but enables zero-copy - /// data transfer to other Python libraries that understand Arrow memory. - /// - /// For example, you can call [`pyarrow.table()`][pyarrow.table] to convert this array - /// into a pyarrow table, without copying memory. #[allow(unused_variables)] pub fn __arrow_c_stream__<'py>( &'py mut self, @@ -159,17 +149,12 @@ impl PyRecordBatchReader { self.to_string() } - /// Construct this from an existing Arrow object. - /// - /// It can be called on anything that exports the Arrow stream interface - /// (`__arrow_c_stream__`), such as a `Table` or `RecordBatchReader`. #[classmethod] pub fn from_arrow(_cls: &Bound, input: AnyRecordBatch) -> PyArrowResult { let reader = input.into_reader()?; Ok(Self::new(reader)) } - /// Construct this object from a bare Arrow PyCapsule. #[classmethod] pub fn from_arrow_pycapsule( _cls: &Bound, @@ -203,7 +188,6 @@ impl PyRecordBatchReader { data.extract() } - /// Returns `true` if this reader has already been consumed. #[getter] pub fn closed(&self) -> bool { self.0.is_none() @@ -235,7 +219,6 @@ impl PyRecordBatchReader { } } - /// Access the schema of this table #[getter] pub fn schema(&self, py: Python) -> PyResult { PySchema::new(self.schema_ref()?.clone()).to_arro3(py) diff --git a/pyo3-arrow/src/schema.rs b/pyo3-arrow/src/schema.rs index 8a6217b..c8f51ed 100644 --- a/pyo3-arrow/src/schema.rs +++ b/pyo3-arrow/src/schema.rs @@ -15,9 +15,6 @@ use crate::ffi::to_python::to_schema_pycapsule; use crate::input::{FieldIndexInput, MetadataInput}; use crate::{PyDataType, PyField, PyTable}; -/// A Python-facing Arrow schema. -/// -/// This is a wrapper around a [SchemaRef]. #[pyclass(module = "arro3.core._core", name = "Schema", subclass)] pub struct PySchema(SchemaRef); diff --git a/pyo3-arrow/src/table.rs b/pyo3-arrow/src/table.rs index 4795b71..8098bbe 100644 --- a/pyo3-arrow/src/table.rs +++ b/pyo3-arrow/src/table.rs @@ -24,9 +24,6 @@ use crate::schema::display_schema; use crate::utils::schema_equals; use crate::{PyChunkedArray, PyField, PyRecordBatch, PyRecordBatchReader, PySchema}; -/// A Python-facing Arrow table. -/// -/// This is a wrapper around a [SchemaRef] and a `Vec` of [RecordBatch]. #[pyclass(module = "arro3.core._core", name = "Table", subclass)] #[derive(Debug)] pub struct PyTable {
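
Example of the zero-copy interchange these docstrings describe: a minimal sketch, assuming arro3-core and a recent pyarrow (v16 or higher, whose constructors accept Arrow PyCapsule objects) are both installed:

    import pyarrow as pa

    from arro3.core import Array, Table

    # arro3 -> pyarrow: pa.array() accepts any object exposing __arrow_c_array__.
    arr = Array([1, 2, 3], pa.int64())  # pa.int64() exports __arrow_c_schema__
    pa_arr = pa.array(arr)              # zero-copy transfer via PyCapsules

    # pyarrow -> arro3: Table.from_arrow() accepts any __arrow_c_stream__ exporter.
    tbl = Table.from_arrow(pa.table({"a": [1.0, 2.0], "b": ["x", "y"]}))
    pa_tbl = pa.table(tbl)              # and back again, still without copying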