From 6e937715c3852fbd244e8eaf507b8d1b29ce5748 Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Mon, 29 Jul 2024 19:05:57 -0400 Subject: [PATCH] Export DataType constructors (#77) --- arro3-core/python/arro3/core/_core.pyi | 259 ++++++++++++++- pyo3-arrow/src/datatypes.rs | 428 +++++++++++++------------ 2 files changed, 487 insertions(+), 200 deletions(-) diff --git a/arro3-core/python/arro3/core/_core.pyi b/arro3-core/python/arro3/core/_core.pyi index 6eb78b2..a31e6b1 100644 --- a/arro3-core/python/arro3/core/_core.pyi +++ b/arro3-core/python/arro3/core/_core.pyi @@ -1,4 +1,4 @@ -from typing import Any, Sequence +from typing import Any, Literal, Sequence import numpy as np from numpy.typing import NDArray @@ -125,6 +125,263 @@ class DataType: def from_arrow_pycapsule(cls, capsule) -> DataType: """Construct this object from a bare Arrow PyCapsule""" def bit_width(self) -> int | None: ... + #### Constructors + @classmethod + def null(cls) -> DataType: + """Create instance of null type.""" + @classmethod + def bool(cls) -> DataType: + """Create instance of boolean type.""" + @classmethod + def int8(cls) -> DataType: + """Create instance of signed int8 type.""" + @classmethod + def int16(cls) -> DataType: + """Create instance of signed int16 type.""" + @classmethod + def int32(cls) -> DataType: + """Create instance of signed int32 type.""" + @classmethod + def int64(cls) -> DataType: + """Create instance of signed int64 type.""" + @classmethod + def uint8(cls) -> DataType: + """Create instance of unsigned int8 type.""" + @classmethod + def uint16(cls) -> DataType: + """Create instance of unsigned int16 type.""" + @classmethod + def uint32(cls) -> DataType: + """Create instance of unsigned int32 type.""" + @classmethod + def uint64(cls) -> DataType: + """Create instance of unsigned int64 type.""" + @classmethod + def float16(cls) -> DataType: + """Create half-precision floating point type.""" + @classmethod + def float32(cls) -> DataType: + """Create single-precision 
floating point type.""" + @classmethod + def float64(cls) -> DataType: + """Create double-precision floating point type.""" + @classmethod + def time32(cls, unit: Literal["s", "ms"]) -> DataType: + """Create instance of 32-bit time (time of day) type with unit resolution. + + Args: + unit: one of `'s'` [second], or `'ms'` [millisecond] + + Returns: + A `DataType` for 32-bit time of day with the requested unit. + """ + @classmethod + def time64(cls, unit: Literal["us", "ns"]) -> DataType: + """Create instance of 64-bit time (time of day) type with unit resolution. + + Args: + unit: One of `'us'` [microsecond], or `'ns'` [nanosecond]. + + Returns: + A `DataType` for 64-bit time of day with the requested unit. + """ + @classmethod + def timestamp( + cls, unit: Literal["s", "ms", "us", "ns"], *, tz: str | None = None + ) -> DataType: + """Create instance of timestamp type with resolution and optional time zone. + + Args: + unit: one of `'s'` [second], `'ms'` [millisecond], `'us'` [microsecond], or `'ns'` [nanosecond] + tz: Time zone name. None indicates time zone naive. Defaults to None. + + Returns: + A timestamp `DataType` with the requested unit and time zone. + """ + @classmethod + def date32(cls) -> DataType: + """Create instance of 32-bit date (days since UNIX epoch 1970-01-01).""" + @classmethod + def date64(cls) -> DataType: + """Create instance of 64-bit date (milliseconds since UNIX epoch 1970-01-01).""" + @classmethod + def duration(cls, unit: Literal["s", "ms", "us", "ns"]) -> DataType: + """Create instance of a duration type with unit resolution. + + Args: + unit: one of `'s'` [second], `'ms'` [millisecond], `'us'` [microsecond], or `'ns'` [nanosecond] + + Returns: + A duration `DataType` with the requested unit. + """ + @classmethod + def month_day_nano_interval(cls) -> DataType: + """ + Create instance of an interval type representing months, days and nanoseconds + between two dates. + """ + @classmethod + def binary(cls, length: int | None = None) -> DataType: + """Create variable-length or fixed size binary type. + + Args: + length: If length is `None` then return a variable length binary type. 
If length is provided, then return a fixed size binary type of width `length`. Defaults to None. + + Returns: + A variable-length binary `DataType`, or a fixed size binary `DataType` when `length` is given. + """ + @classmethod + def string(cls) -> DataType: + """Create UTF8 variable-length string type.""" + @classmethod + def utf8(cls) -> DataType: + """Alias for string().""" + @classmethod + def large_binary(cls) -> DataType: + """Create large variable-length binary type.""" + @classmethod + def large_string(cls) -> DataType: + """Create large UTF8 variable-length string type.""" + @classmethod + def large_utf8(cls) -> DataType: + """Alias for large_string().""" + @classmethod + def binary_view(cls) -> DataType: + """Create a variable-length binary view type.""" + @classmethod + def string_view(cls) -> DataType: + """Create UTF8 variable-length string view type.""" + @classmethod + def decimal128(cls, precision: int, scale: int) -> DataType: + """Create decimal type with precision and scale and 128-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled integer. The + precision is the number of significant digits that the decimal type can + represent; the scale is the number of digits after the decimal point (note the + scale can be negative). + + As an example, `decimal128(7, 3)` can exactly represent the numbers 1234.567 and + -1234.567 (encoded internally as the 128-bit integers 1234567 and -1234567, + respectively), but neither 12345.67 nor 123.4567. + + `decimal128(5, -3)` can exactly represent the number 12345000 (encoded + internally as the 128-bit integer 12345), but neither 123450000 nor 1234500. + + If you need a precision higher than 38 significant digits, consider using + `decimal256`. 
+ + Args: + precision: Must be between 1 and 38 + scale: Number of digits after the decimal point; may be negative. + """ + @classmethod + def decimal256(cls, precision: int, scale: int) -> DataType: + """Create decimal type with precision and scale and 256-bit width.""" + @classmethod + def list(cls, value_type: ArrowSchemaExportable, list_size: int | None) -> DataType: + """Create ListType instance from child data type or field. + + Args: + value_type: The child data type or field of the list. + list_size: If `list_size` is `None` then return a variable length list type. If `list_size` is provided then return a fixed size list type. + + Returns: + A variable-length list `DataType`, or a fixed size list `DataType` when `list_size` is given. + """ + @classmethod + def large_list(cls, value_type: ArrowSchemaExportable) -> DataType: + """Create LargeListType instance from child data type or field. + + This data type may not be supported by all Arrow implementations. Unless you + need to represent data larger than 2**31 elements, you should prefer `list()`. + + Args: + value_type: The child data type or field of the list. + + Returns: + A large list `DataType`. + """ + @classmethod + def list_view(cls, value_type: ArrowSchemaExportable) -> DataType: + """ + Create ListViewType instance from child data type or field. + + This data type may not be supported by all Arrow implementations because it is + an alternative to the ListType. + + """ + @classmethod + def large_list_view(cls, value_type: ArrowSchemaExportable) -> DataType: + """Create LargeListViewType instance from child data type or field. + + This data type may not be supported by all Arrow implementations because it is + an alternative to the ListType. + + Args: + value_type: The child data type or field of the list view. + + Returns: + A large list view `DataType`. + """ + + @classmethod + def map( + cls, + key_type: ArrowSchemaExportable, + item_type: ArrowSchemaExportable, + keys_sorted: bool, + ) -> DataType: + """Create MapType instance from key and item data types or fields. 
+ + Args: + key_type: The data type or field of the map keys. + item_type: The data type or field of the map items. + keys_sorted: Whether the map type is flagged as having sorted keys. + + Returns: + A map `DataType`. + """ + + @classmethod + def struct(cls, fields: Sequence[ArrowSchemaExportable]) -> DataType: + """Create StructType instance from fields. + + A struct is a nested type parameterized by an ordered sequence of types (which + can all be distinct), called its fields. + + Args: + fields: Each field must have a UTF8-encoded name, and these field names are part of the type metadata. + + Returns: + A struct `DataType` containing `fields`. + """ + + @classmethod + def dictionary( + cls, index_type: ArrowSchemaExportable, value_type: ArrowSchemaExportable + ) -> DataType: + """Dictionary (categorical, or simply encoded) type. + + Args: + index_type: The data type of the dictionary indices. + value_type: The data type of the dictionary values. + + Returns: + A dictionary `DataType`. + """ + + @classmethod + def run_end_encoded( + cls, run_end_type: ArrowSchemaExportable, value_type: ArrowSchemaExportable + ) -> DataType: + """Create RunEndEncodedType from run-end and value types. + + Args: + run_end_type: The integer type of the run_ends array. Must be `'int16'`, `'int32'`, or `'int64'`. + value_type: The type of the values array. 
+ + Returns: + A run-end encoded `DataType`. + """ class Field: def __init__( diff --git a/pyo3-arrow/src/datatypes.rs b/pyo3-arrow/src/datatypes.rs index bab97b2..73ac550 100644 --- a/pyo3-arrow/src/datatypes.rs +++ b/pyo3-arrow/src/datatypes.rs @@ -1,7 +1,8 @@ use std::fmt::Display; +use std::sync::Arc; use arrow::datatypes::DataType; -use arrow_schema::TimeUnit; +use arrow_schema::{Field, IntervalUnit, TimeUnit}; use pyo3::exceptions::{PyTypeError, PyValueError}; use pyo3::intern; use pyo3::prelude::*; @@ -11,12 +12,12 @@ use crate::error::PyArrowResult; use crate::ffi::from_python::utils::import_schema_pycapsule; use crate::ffi::to_python::nanoarrow::to_nanoarrow_schema; use crate::ffi::to_schema_pycapsule; +use crate::PyField; -#[allow(dead_code)] -pub struct PyTimeUnit(arrow_schema::TimeUnit); +struct PyTimeUnit(arrow_schema::TimeUnit); impl<'a> FromPyObject<'a> for PyTimeUnit { - fn extract(ob: &'a PyAny) -> PyResult { + fn extract_bound(ob: &Bound<'a, PyAny>) -> PyResult { let s: String = ob.extract()?; match s.to_lowercase().as_str() { "s" => Ok(Self(TimeUnit::Second)), @@ -150,199 +151,228 @@ impl PyDataType { self.0.primitive_width() } - // TODO: decide whether to make this public - - // #[classmethod] - // fn null(_: &Bound) -> Self { - // Self(DataType::Null) - // } - - // #[classmethod] - // fn bool(_: &Bound) -> Self { - // Self(DataType::Boolean) - // } - - // #[classmethod] - // fn int8(_: &Bound) -> Self { - // Self(DataType::Int8) - // } - - // #[classmethod] - // fn int16(_: &Bound) -> Self { - // Self(DataType::Int16) - // } - - // #[classmethod] - // fn int32(_: &Bound) -> Self { - // Self(DataType::Int32) - // } - - // #[classmethod] - // fn int64(_: &Bound) -> Self { - // Self(DataType::Int64) - // } - - // #[classmethod] - // fn uint8(_: &Bound) -> Self { - // Self(DataType::UInt8) - // } - - // #[classmethod] - // fn uint16(_: &Bound) -> Self { - // Self(DataType::UInt16) - // } - - // #[classmethod] - // fn uint32(_: &Bound) -> Self { - // 
Self(DataType::UInt32) - // } - - // #[classmethod] - // fn uint64(_: &Bound) -> Self { - // Self(DataType::UInt64) - // } - - // #[classmethod] - // fn float16(_: &Bound) -> Self { - // Self(DataType::Float16) - // } - - // #[classmethod] - // fn float32(_: &Bound) -> Self { - // Self(DataType::Float32) - // } - - // #[classmethod] - // fn float64(_: &Bound) -> Self { - // Self(DataType::Float64) - // } - - // #[classmethod] - // fn time32(_: &Bound, unit: PyTimeUnit) -> PyArrowResult { - // if unit.0 == TimeUnit::Microsecond || unit.0 == TimeUnit::Nanosecond { - // return Err(PyValueError::new_err("Unexpected timeunit for time32").into()); - // } - - // Ok(Self(DataType::Time32(unit.0))) - // } - - // #[classmethod] - // fn time64(_: &Bound, unit: PyTimeUnit) -> PyArrowResult { - // if unit.0 == TimeUnit::Second || unit.0 == TimeUnit::Millisecond { - // return Err(PyValueError::new_err("Unexpected timeunit for time64").into()); - // } - - // Ok(Self(DataType::Time64(unit.0))) - // } - - // #[classmethod] - // fn timestamp(_: &Bound, unit: PyTimeUnit, tz: Option) -> Self { - // Self(DataType::Timestamp(unit.0, tz.map(|s| s.into()))) - // } - - // #[classmethod] - // fn date32(_: &Bound) -> Self { - // Self(DataType::Date32) - // } - - // #[classmethod] - // fn date64(_: &Bound) -> Self { - // Self(DataType::Date64) - // } - - // #[classmethod] - // fn duration(_: &Bound, unit: PyTimeUnit) -> Self { - // Self(DataType::Duration(unit.0)) - // } - - // #[classmethod] - // fn month_day_nano_interval(_: &Bound) -> Self { - // Self(DataType::Interval(IntervalUnit::MonthDayNano)) - // } - - // #[classmethod] - // fn binary(_: &Bound) -> Self { - // Self(DataType::Binary) - // } - - // #[classmethod] - // fn string(_: &Bound) -> Self { - // Self(DataType::Utf8) - // } - - // #[classmethod] - // fn utf8(_: &Bound) -> Self { - // Self(DataType::Utf8) - // } - - // #[classmethod] - // fn large_binary(_: &Bound) -> Self { - // Self(DataType::LargeBinary) - // } - - // 
#[classmethod] - // fn large_string(_: &Bound) -> Self { - // Self(DataType::LargeUtf8) - // } - - // #[classmethod] - // fn large_utf8(_: &Bound) -> Self { - // Self(DataType::LargeUtf8) - // } - - // #[classmethod] - // fn binary_view(_: &Bound) -> Self { - // Self(DataType::BinaryView) - // } - - // #[classmethod] - // fn string_view(_: &Bound) -> Self { - // Self(DataType::Utf8View) - // } - - // #[classmethod] - // fn decimal128(_: &Bound, precision: u8, scale: i8) -> Self { - // Self(DataType::Decimal128(precision, scale)) - // } - - // #[classmethod] - // fn decimal256(_: &Bound, precision: u8, scale: i8) -> Self { - // Self(DataType::Decimal256(precision, scale)) - // } - - // #[classmethod] - // fn list(_: &Bound, value_type: PyField, list_size: Option) -> Self { - // if let Some(list_size) = list_size { - // Self(DataType::FixedSizeList(value_type.into(), list_size)) - // } else { - // Self(DataType::List(value_type.into())) - // } - // } - - // #[classmethod] - // fn large_list(_: &Bound, value_type: PyField) -> Self { - // Self(DataType::LargeList(value_type.into())) - // } - - // #[classmethod] - // fn list_view(_: &Bound, value_type: PyField) -> Self { - // Self(DataType::ListView(value_type.into())) - // } - - // #[classmethod] - // fn large_list_view(_: &Bound, value_type: PyField) -> Self { - // Self(DataType::LargeListView(value_type.into())) - // } - - // TODO: fix this. 
-    // #[classmethod]
-    // fn map(_: &PyType, key_type: PyField, item_type: PyField, keys_sorted: bool) -> Self {
-    //     let field = Field::new(
-    //         "entries",
-    //         DataType::Struct(vec![Arc::new(key_type.0), Arc::new(item_type.0)].into()),
-    //         true,
-    //     );
-    //     // ::new_struct("entries", , true);
-    //     Self(DataType::Map(field.into(), keys_sorted))
-    // }
+    /// Create the null type.
+    #[classmethod]
+    fn null(_: &Bound<PyType>) -> Self {
+        Self(DataType::Null)
+    }
+
+    /// Create the boolean type.
+    #[classmethod]
+    fn bool(_: &Bound<PyType>) -> Self {
+        Self(DataType::Boolean)
+    }
+
+    /// Create the signed 8-bit integer type.
+    #[classmethod]
+    fn int8(_: &Bound<PyType>) -> Self {
+        Self(DataType::Int8)
+    }
+
+    /// Create the signed 16-bit integer type.
+    #[classmethod]
+    fn int16(_: &Bound<PyType>) -> Self {
+        Self(DataType::Int16)
+    }
+
+    /// Create the signed 32-bit integer type.
+    #[classmethod]
+    fn int32(_: &Bound<PyType>) -> Self {
+        Self(DataType::Int32)
+    }
+
+    /// Create the signed 64-bit integer type.
+    #[classmethod]
+    fn int64(_: &Bound<PyType>) -> Self {
+        Self(DataType::Int64)
+    }
+
+    /// Create the unsigned 8-bit integer type.
+    #[classmethod]
+    fn uint8(_: &Bound<PyType>) -> Self {
+        Self(DataType::UInt8)
+    }
+
+    /// Create the unsigned 16-bit integer type.
+    #[classmethod]
+    fn uint16(_: &Bound<PyType>) -> Self {
+        Self(DataType::UInt16)
+    }
+
+    /// Create the unsigned 32-bit integer type.
+    #[classmethod]
+    fn uint32(_: &Bound<PyType>) -> Self {
+        Self(DataType::UInt32)
+    }
+
+    /// Create the unsigned 64-bit integer type.
+    #[classmethod]
+    fn uint64(_: &Bound<PyType>) -> Self {
+        Self(DataType::UInt64)
+    }
+
+    /// Create the half-precision floating point type.
+    #[classmethod]
+    fn float16(_: &Bound<PyType>) -> Self {
+        Self(DataType::Float16)
+    }
+
+    /// Create the single-precision floating point type.
+    #[classmethod]
+    fn float32(_: &Bound<PyType>) -> Self {
+        Self(DataType::Float32)
+    }
+
+    /// Create the double-precision floating point type.
+    #[classmethod]
+    fn float64(_: &Bound<PyType>) -> Self {
+        Self(DataType::Float64)
+    }
+
+    /// Create a 32-bit time-of-day type.
+    ///
+    /// Raises `ValueError` when `unit` is microsecond or nanosecond.
+    #[classmethod]
+    fn time32(_: &Bound<PyType>, unit: PyTimeUnit) -> PyArrowResult<Self> {
+        if matches!(unit.0, TimeUnit::Microsecond | TimeUnit::Nanosecond) {
+            return Err(PyValueError::new_err("Unexpected timeunit for time32").into());
+        }
+
+        Ok(Self(DataType::Time32(unit.0)))
+    }
+
+    /// Create a 64-bit time-of-day type.
+    ///
+    /// Raises `ValueError` when `unit` is second or millisecond.
+    #[classmethod]
+    fn time64(_: &Bound<PyType>, unit: PyTimeUnit) -> PyArrowResult<Self> {
+        if matches!(unit.0, TimeUnit::Second | TimeUnit::Millisecond) {
+            return Err(PyValueError::new_err("Unexpected timeunit for time64").into());
+        }
+
+        Ok(Self(DataType::Time64(unit.0)))
+    }
+
+    /// Create a timestamp type with the given resolution and optional time zone.
+    ///
+    /// `tz` is keyword-only and defaults to `None`.
+    #[classmethod]
+    #[pyo3(signature = (unit, *, tz=None))]
+    fn timestamp(_: &Bound<PyType>, unit: PyTimeUnit, tz: Option<String>) -> Self {
+        Self(DataType::Timestamp(unit.0, tz.map(|s| s.into())))
+    }
+
+    /// Create the 32-bit date type.
+    #[classmethod]
+    fn date32(_: &Bound<PyType>) -> Self {
+        Self(DataType::Date32)
+    }
+
+    /// Create the 64-bit date type.
+    #[classmethod]
+    fn date64(_: &Bound<PyType>) -> Self {
+        Self(DataType::Date64)
+    }
+
+    /// Create a duration type with the given resolution.
+    #[classmethod]
+    fn duration(_: &Bound<PyType>, unit: PyTimeUnit) -> Self {
+        Self(DataType::Duration(unit.0))
+    }
+
+    /// Create the month/day/nanosecond interval type.
+    #[classmethod]
+    fn month_day_nano_interval(_: &Bound<PyType>) -> Self {
+        Self(DataType::Interval(IntervalUnit::MonthDayNano))
+    }
+
+    /// Create a variable-length binary type, or a fixed size binary type when `length` is given.
+    // The default is spelled out instead of relying on PyO3's implicit `None` for a trailing
+    // `Option`, matching how `timestamp` declares its signature.
+    #[classmethod]
+    #[pyo3(signature = (length=None))]
+    fn binary(_: &Bound<PyType>, length: Option<i32>) -> Self {
+        match length {
+            Some(length) => Self(DataType::FixedSizeBinary(length)),
+            None => Self(DataType::Binary),
+        }
+    }
+
+    /// Create the UTF8 variable-length string type.
+    #[classmethod]
+    fn string(_: &Bound<PyType>) -> Self {
+        Self(DataType::Utf8)
+    }
+
+    /// Alias for `string`.
+    #[classmethod]
+    fn utf8(_: &Bound<PyType>) -> Self {
+        Self(DataType::Utf8)
+    }
+
+    /// Create the large variable-length binary type.
+    #[classmethod]
+    fn large_binary(_: &Bound<PyType>) -> Self {
+        Self(DataType::LargeBinary)
+    }
+
+    /// Create the large UTF8 variable-length string type.
+    #[classmethod]
+    fn large_string(_: &Bound<PyType>) -> Self {
+        Self(DataType::LargeUtf8)
+    }
+
+    /// Alias for `large_string`.
+    #[classmethod]
+    fn large_utf8(_: &Bound<PyType>) -> Self {
+        Self(DataType::LargeUtf8)
+    }
+
+    /// Create the variable-length binary view type.
+    #[classmethod]
+    fn binary_view(_: &Bound<PyType>) -> Self {
+        Self(DataType::BinaryView)
+    }
+
+    /// Create the UTF8 variable-length string view type.
+    #[classmethod]
+    fn string_view(_: &Bound<PyType>) -> Self {
+        Self(DataType::Utf8View)
+    }
+
+    /// Create a 128-bit decimal type with the given precision and scale.
+    ///
+    /// Raises `ValueError` when `precision` is outside `1..=DECIMAL128_MAX_PRECISION`.
+    #[classmethod]
+    fn decimal128(_: &Bound<PyType>, precision: u8, scale: i8) -> PyArrowResult<Self> {
+        if !(1..=arrow_schema::DECIMAL128_MAX_PRECISION).contains(&precision) {
+            return Err(PyValueError::new_err(format!(
+                "precision must be between 1 and {}",
+                arrow_schema::DECIMAL128_MAX_PRECISION
+            ))
+            .into());
+        }
+
+        Ok(Self(DataType::Decimal128(precision, scale)))
+    }
+
+    /// Create a 256-bit decimal type with the given precision and scale.
+    ///
+    /// Raises `ValueError` when `precision` is outside `1..=DECIMAL256_MAX_PRECISION`.
+    #[classmethod]
+    fn decimal256(_: &Bound<PyType>, precision: u8, scale: i8) -> PyArrowResult<Self> {
+        if !(1..=arrow_schema::DECIMAL256_MAX_PRECISION).contains(&precision) {
+            return Err(PyValueError::new_err(format!(
+                "precision must be between 1 and {}",
+                arrow_schema::DECIMAL256_MAX_PRECISION
+            ))
+            .into());
+        }
+
+        Ok(Self(DataType::Decimal256(precision, scale)))
+    }
+
+    /// Create a list type, or a fixed size list type when `list_size` is given.
+    // The default is spelled out instead of relying on PyO3's implicit `None` for a trailing
+    // `Option`, so `list_size` stays optional at call sites.
+    #[classmethod]
+    #[pyo3(signature = (value_type, list_size=None))]
+    fn list(_: &Bound<PyType>, value_type: PyField, list_size: Option<i32>) -> Self {
+        match list_size {
+            Some(list_size) => Self(DataType::FixedSizeList(value_type.into(), list_size)),
+            None => Self(DataType::List(value_type.into())),
+        }
+    }
+
+    /// Create a large list type from the child field.
+    #[classmethod]
+    fn large_list(_: &Bound<PyType>, value_type: PyField) -> Self {
+        Self(DataType::LargeList(value_type.into()))
+    }
+
+    /// Create a list view type from the child field.
+    #[classmethod]
+    fn list_view(_: &Bound<PyType>, value_type: PyField) -> Self {
+        Self(DataType::ListView(value_type.into()))
+    }
+
+    /// Create a large list view type from the child field.
+    #[classmethod]
+    fn large_list_view(_: &Bound<PyType>, value_type: PyField) -> Self {
+        Self(DataType::LargeListView(value_type.into()))
+    }
+
+    /// Create a map type from key and item fields.
+    ///
+    /// The key and item fields are wrapped in a non-nullable struct field named `entries`.
+    #[classmethod]
+    fn map(_: &Bound<PyType>, key_type: PyField, item_type: PyField, keys_sorted: bool) -> Self {
+        // Note: copied from source of `Field::new_map`
+        // https://github.com/apache/arrow-rs/blob/bf9ce475df82d362631099d491d3454d64d50217/arrow-schema/src/field.rs#L251-L258
+        let entries = Field::new(
+            "entries",
+            DataType::Struct(vec![key_type.into_inner(), item_type.into_inner()].into()),
+            false, // The inner map field is always non-nullable (arrow-rs #1697),
+        );
+        Self(DataType::Map(Arc::new(entries), keys_sorted))
+    }
+
+    /// Create a struct type from an ordered sequence of fields.
+    #[classmethod]
+    fn r#struct(_: &Bound<PyType>, fields: Vec<PyField>) -> Self {
+        Self(DataType::Struct(
+            fields.into_iter().map(|field| field.into_inner()).collect(),
+        ))
+    }
+
+    /// Create a dictionary type.
+    ///
+    /// Only the data types of `index_type` and `value_type` are kept.
+    #[classmethod]
+    fn dictionary(_: &Bound<PyType>, index_type: PyField, value_type: PyField) -> Self {
+        Self(DataType::Dictionary(
+            Box::new(index_type.into_inner().data_type().clone()),
+            Box::new(value_type.into_inner().data_type().clone()),
+        ))
+    }
+
+    /// Create a run-end encoded type from run-end and value fields.
+    ///
+    /// Raises `ValueError` unless the run-end type is int16, int32 or int64.
+    #[classmethod]
+    fn run_end_encoded(
+        _: &Bound<PyType>,
+        run_end_type: PyField,
+        value_type: PyField,
+    ) -> PyArrowResult<Self> {
+        let run_ends = run_end_type.into_inner();
+        if !matches!(
+            run_ends.data_type(),
+            DataType::Int16 | DataType::Int32 | DataType::Int64
+        ) {
+            return Err(PyValueError::new_err("run_end_type must be int16, int32, or int64").into());
+        }
+
+        Ok(Self(DataType::RunEndEncoded(
+            run_ends,
+            value_type.into_inner(),
+        )))
+    }
 }