From b83f08eb328e960f501da6b69dfb1cad1c9d174e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 17 Jan 2024 10:01:02 +0100 Subject: [PATCH] GH-39651: [Python] Basic pyarrow bindings for Binary/StringView classes --- docs/source/python/api/datatypes.rst | 4 ++ python/pyarrow/__init__.py | 7 ++- python/pyarrow/array.pxi | 14 +++++ python/pyarrow/builder.pxi | 66 ++++++++++++++++++++++ python/pyarrow/includes/libarrow.pxd | 9 +++ python/pyarrow/lib.pxd | 8 +++ python/pyarrow/lib.pyx | 2 + python/pyarrow/scalar.pxi | 10 ++++ python/pyarrow/src/arrow/python/helpers.cc | 2 + python/pyarrow/tests/test_builder.py | 21 ++++++- python/pyarrow/tests/test_misc.py | 4 ++ python/pyarrow/tests/test_scalars.py | 28 ++++++++- python/pyarrow/tests/test_types.py | 8 +++ python/pyarrow/types.pxi | 32 +++++++++++ python/pyarrow/types.py | 10 ++++ 15 files changed, 219 insertions(+), 6 deletions(-) diff --git a/docs/source/python/api/datatypes.rst b/docs/source/python/api/datatypes.rst index 4066ef314234d..642c243b21af0 100644 --- a/docs/source/python/api/datatypes.rst +++ b/docs/source/python/api/datatypes.rst @@ -55,6 +55,8 @@ These should be used to create Arrow data types and schemas. 
large_binary large_string large_utf8 + binary_view + string_view decimal128 list_ large_list @@ -168,6 +170,8 @@ represents a given data type (such as ``int32``) or general category is_large_binary is_large_unicode is_large_string + is_binary_view + is_string_view is_fixed_size_binary is_map is_dictionary diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 9da94885ec6b2..4dbd1258d3cea 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -163,7 +163,7 @@ def print_entry(label, value): time32, time64, timestamp, date32, date64, duration, month_day_nano_interval, float16, float32, float64, - binary, string, utf8, + binary, string, utf8, binary_view, string_view, large_binary, large_string, large_utf8, decimal128, decimal256, list_, large_list, map_, struct, @@ -205,6 +205,7 @@ def print_entry(label, value): FixedSizeListArray, UnionArray, BinaryArray, StringArray, LargeBinaryArray, LargeStringArray, + BinaryViewArray, StringViewArray, FixedSizeBinaryArray, DictionaryArray, Date32Array, Date64Array, TimestampArray, @@ -223,8 +224,8 @@ def print_entry(label, value): Time32Scalar, Time64Scalar, TimestampScalar, DurationScalar, MonthDayNanoIntervalScalar, - BinaryScalar, LargeBinaryScalar, - StringScalar, LargeStringScalar, + BinaryScalar, LargeBinaryScalar, BinaryViewScalar, + StringScalar, LargeStringScalar, StringViewScalar, FixedSizeBinaryScalar, DictionaryScalar, MapScalar, StructScalar, UnionScalar, RunEndEncodedScalar, ExtensionScalar) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 5c2d22aef1895..addf4e3584499 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -2943,6 +2943,12 @@ cdef class LargeStringArray(Array): null_count, offset) +cdef class StringViewArray(Array): + """ + Concrete class for Arrow arrays of string (or utf8) view data type. + """ + + cdef class BinaryArray(Array): """ Concrete class for Arrow arrays of variable-sized binary data type. 
@@ -2969,6 +2975,12 @@ cdef class LargeBinaryArray(Array): return (<CLargeBinaryArray*> self.ap).total_values_length() +cdef class BinaryViewArray(Array): + """ + Concrete class for Arrow arrays of variable-sized binary view data type. + """ + + cdef class DictionaryArray(Array): """ Concrete class for dictionary-encoded Arrow arrays. @@ -3670,6 +3682,8 @@ cdef dict _array_classes = { _Type_STRING: StringArray, _Type_LARGE_BINARY: LargeBinaryArray, _Type_LARGE_STRING: LargeStringArray, + _Type_BINARY_VIEW: BinaryViewArray, + _Type_STRING_VIEW: StringViewArray, _Type_DICTIONARY: DictionaryArray, _Type_FIXED_SIZE_BINARY: FixedSizeBinaryArray, _Type_DECIMAL128: Decimal128Array, diff --git a/python/pyarrow/builder.pxi b/python/pyarrow/builder.pxi index a34ea5412e14a..2af39e2c589e6 100644 --- a/python/pyarrow/builder.pxi +++ b/python/pyarrow/builder.pxi @@ -80,3 +80,69 @@ cdef class StringBuilder(_Weakrefable): def __len__(self): return self.builder.get().length() + + +cdef class StringViewBuilder(_Weakrefable): + """ + Builder class for UTF8 string views. + + This class exposes facilities for incrementally adding string values and + building the null bitmap for a pyarrow.Array (type='string_view'). + """ + cdef: + unique_ptr[CStringViewBuilder] builder + + def __cinit__(self, MemoryPool memory_pool=None): + cdef CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + self.builder.reset(new CStringViewBuilder(pool)) + + def append(self, value): + """ + Append a single value to the builder. + + The value can either be a string/bytes object or a null value + (np.nan or None). + + Parameters + ---------- + value : string/bytes or np.nan/None + The value to append to the string array builder. 
+ """ + if value is None or value is np.nan: + self.builder.get().AppendNull() + elif isinstance(value, (bytes, str)): + self.builder.get().Append(tobytes(value)) + else: + raise TypeError('StringViewBuilder only accepts string objects') + + def append_values(self, values): + """ + Append all the values from an iterable. + + Parameters + ---------- + values : iterable of string/bytes or np.nan/None values + The values to append to the string array builder. + """ + for value in values: + self.append(value) + + def finish(self): + """ + Return result of builder as an Array object; also resets the builder. + + Returns + ------- + array : pyarrow.Array + """ + cdef shared_ptr[CArray] out + with nogil: + self.builder.get().Finish(&out) + return pyarrow_wrap_array(out) + + @property + def null_count(self): + return self.builder.get().null_count() + + def __len__(self): + return self.builder.get().length() diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 74e92594b04e5..d92f09da779b6 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -126,6 +126,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: _Type_LARGE_BINARY" arrow::Type::LARGE_BINARY" _Type_LARGE_STRING" arrow::Type::LARGE_STRING" _Type_FIXED_SIZE_BINARY" arrow::Type::FIXED_SIZE_BINARY" + _Type_BINARY_VIEW" arrow::Type::BINARY_VIEW" + _Type_STRING_VIEW" arrow::Type::STRING_VIEW" _Type_LIST" arrow::Type::LIST" _Type_LARGE_LIST" arrow::Type::LARGE_LIST" @@ -1295,7 +1297,14 @@ cdef extern from "arrow/builder.h" namespace "arrow" nogil: cdef cppclass CStringBuilder" arrow::StringBuilder"(CBinaryBuilder): CStringBuilder(CMemoryPool* pool) + CStatus Append(const c_string& value) + + cdef cppclass CBinaryViewBuilder" arrow::BinaryViewBuilder"(CArrayBuilder): + CBinaryViewBuilder(shared_ptr[CDataType], CMemoryPool* pool) + CStatus Append(const char* value, int32_t length) + cdef cppclass CStringViewBuilder" 
arrow::StringViewBuilder"(CBinaryViewBuilder): + CStringViewBuilder(CMemoryPool* pool) CStatus Append(const c_string& value) cdef cppclass CTimestampBuilder "arrow::TimestampBuilder"(CArrayBuilder): diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 58ec34addbc0a..c1104864066e9 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -445,6 +445,14 @@ cdef class BinaryArray(Array): pass +cdef class StringViewArray(Array): + pass + + +cdef class BinaryViewArray(Array): + pass + + cdef class DictionaryArray(Array): cdef: object _indices, _dictionary diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 29a0bed55949c..b0368b67f790e 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -106,6 +106,8 @@ Type_STRING = _Type_STRING Type_LARGE_BINARY = _Type_LARGE_BINARY Type_LARGE_STRING = _Type_LARGE_STRING Type_FIXED_SIZE_BINARY = _Type_FIXED_SIZE_BINARY +Type_BINARY_VIEW = _Type_BINARY_VIEW +Type_STRING_VIEW = _Type_STRING_VIEW Type_LIST = _Type_LIST Type_LARGE_LIST = _Type_LARGE_LIST Type_MAP = _Type_MAP diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 9a66dc81226d4..2772acf81861c 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -665,6 +665,14 @@ cdef class LargeStringScalar(StringScalar): pass +cdef class BinaryViewScalar(BinaryScalar): + pass + + +cdef class StringViewScalar(StringScalar): + pass + + cdef class ListScalar(Scalar): """ Concrete class for list-like scalars. 
@@ -1051,8 +1059,10 @@ cdef dict _scalar_classes = { _Type_BINARY: BinaryScalar, _Type_LARGE_BINARY: LargeBinaryScalar, _Type_FIXED_SIZE_BINARY: FixedSizeBinaryScalar, + _Type_BINARY_VIEW: BinaryViewScalar, _Type_STRING: StringScalar, _Type_LARGE_STRING: LargeStringScalar, + _Type_STRING_VIEW: StringViewScalar, _Type_LIST: ListScalar, _Type_LARGE_LIST: LargeListScalar, _Type_FIXED_SIZE_LIST: FixedSizeListScalar, diff --git a/python/pyarrow/src/arrow/python/helpers.cc b/python/pyarrow/src/arrow/python/helpers.cc index c266abc169d49..2c86c86a919be 100644 --- a/python/pyarrow/src/arrow/python/helpers.cc +++ b/python/pyarrow/src/arrow/python/helpers.cc @@ -63,6 +63,8 @@ std::shared_ptr<DataType> GetPrimitiveType(Type::type type) { GET_PRIMITIVE_TYPE(STRING, utf8); GET_PRIMITIVE_TYPE(LARGE_BINARY, large_binary); GET_PRIMITIVE_TYPE(LARGE_STRING, large_utf8); + GET_PRIMITIVE_TYPE(BINARY_VIEW, binary_view); + GET_PRIMITIVE_TYPE(STRING_VIEW, utf8_view); GET_PRIMITIVE_TYPE(INTERVAL_MONTH_DAY_NANO, month_day_nano_interval); default: return nullptr; diff --git a/python/pyarrow/tests/test_builder.py b/python/pyarrow/tests/test_builder.py index 50d801026b7d8..abc8a0013df37 100644 --- a/python/pyarrow/tests/test_builder.py +++ b/python/pyarrow/tests/test_builder.py @@ -20,7 +20,7 @@ import numpy as np import pyarrow as pa -from pyarrow.lib import StringBuilder +from pyarrow.lib import StringBuilder, StringViewBuilder def test_weakref(): @@ -65,3 +65,22 @@ def test_string_builder_append_after_finish(): sbuilder.append("No effect") expected = [None, None, "text", None, "other text"] assert arr.to_pylist() == expected + + +def test_string_view_builder(): + builder = StringViewBuilder() + builder.append(b"a byte string") + builder.append("a string") + builder.append("a longer not-inlined string") + builder.append(np.nan) + builder.append_values([None, "text"]) + assert len(builder) == 6 + assert builder.null_count == 2 + arr = builder.finish() + assert isinstance(arr, pa.Array) + assert 
arr.null_count == 2 + assert arr.type == 'string_view' + expected = [ + "a byte string", "a string", "a longer not-inlined string", None, None, "text" + ] + assert arr.to_pylist() == expected diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 8b8c50882b749..8cec8783280dd 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -185,6 +185,8 @@ def test_set_timezone_db_path_non_windows(): pa.UnionArray, pa.BinaryArray, pa.StringArray, + pa.BinaryViewArray, + pa.StringViewArray, pa.FixedSizeBinaryArray, pa.DictionaryArray, pa.Date32Array, @@ -221,6 +223,8 @@ def test_set_timezone_db_path_non_windows(): pa.StringScalar, pa.BinaryScalar, pa.FixedSizeBinaryScalar, + pa.BinaryViewScalar, + pa.StringViewScalar, pa.ListScalar, pa.LargeListScalar, pa.MapScalar, diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 74dee59558239..4a239b23d5676 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -51,6 +51,9 @@ (b"bytes", None, pa.BinaryScalar), ("largestring", pa.large_string(), pa.LargeStringScalar), (b"largebytes", pa.large_binary(), pa.LargeBinaryScalar), + # TODO(GH-39633) pa.scalar(..) 
requires python->arrow conversion to be implemented + # ("string_view", pa.string_view(), pa.StringViewScalar), + # (b"bytes_view", pa.binary_view(), pa.BinaryViewScalar), (b"abc", pa.binary(3), pa.FixedSizeBinaryScalar), ([1, 2, 3], None, pa.ListScalar), ([1, 2, 3, 4], pa.large_list(pa.int8()), pa.LargeListScalar), @@ -488,7 +491,8 @@ def test_month_day_nano_interval(): @pytest.mark.parametrize('value', ['foo', 'mañana']) @pytest.mark.parametrize(('ty', 'scalar_typ'), [ (pa.string(), pa.StringScalar), - (pa.large_string(), pa.LargeStringScalar) + (pa.large_string(), pa.LargeStringScalar), + # (pa.string_view(), pa.StringViewScalar), ]) def test_string(value, ty, scalar_typ): s = pa.scalar(value, type=ty) @@ -503,10 +507,30 @@ def test_string(value, ty, scalar_typ): assert buf.to_pybytes() == value.encode() +@pytest.mark.parametrize('value', ['foo', 'mañana']) +def test_string_view(value): + # TODO: replace with normal scalar construction + builder = pa.lib.StringViewBuilder() + builder.append(value) + arr = builder.finish() + + s = arr[0] + assert isinstance(s, pa.StringViewScalar) + assert s.as_py() == value + assert s.as_py() != 'something' + assert repr(value) in repr(s) + assert str(s) == str(value) + + buf = s.as_buffer() + assert isinstance(buf, pa.Buffer) + assert buf.to_pybytes() == value.encode() + + @pytest.mark.parametrize('value', [b'foo', b'bar']) @pytest.mark.parametrize(('ty', 'scalar_typ'), [ (pa.binary(), pa.BinaryScalar), - (pa.large_binary(), pa.LargeBinaryScalar) + (pa.large_binary(), pa.LargeBinaryScalar), + # (pa.binary_view(), pa.BinaryViewScalar), ]) def test_binary(value, ty, scalar_typ): s = pa.scalar(value, type=ty) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index c8a52c6b626c2..a5ab3128dc874 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -61,6 +61,8 @@ def get_many_types(): pa.binary(10), pa.large_string(), pa.large_binary(), + pa.string_view(), + 
pa.binary_view(), pa.list_(pa.int32()), pa.list_(pa.int32(), 2), pa.large_list(pa.uint16()), @@ -244,6 +246,12 @@ def test_is_binary_string(): assert types.is_fixed_size_binary(pa.binary(5)) assert not types.is_fixed_size_binary(pa.binary()) + assert types.is_string_view(pa.string_view()) + assert not types.is_string_view(pa.string()) + assert types.is_binary_view(pa.binary_view()) + assert not types.is_binary_view(pa.binary()) + assert not types.is_binary_view(pa.string_view()) + def test_is_temporal_date_time_timestamp(): date_types = [pa.date32(), pa.date64()] diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 912ee39f7d712..94c0e947422eb 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -4375,6 +4375,36 @@ def large_utf8(): return large_string() +def binary_view(): + """ + Create a variable-length binary view type. + + Examples + -------- + Create an instance of a binary view type: + + >>> import pyarrow as pa + >>> pa.binary_view() + DataType(binary_view) + """ + return primitive_type(_Type_BINARY_VIEW) + + +def string_view(): + """ + Create UTF8 variable-length string view type. + + Examples + -------- + Create an instance of a string view type: + + >>> import pyarrow as pa + >>> pa.string_view() + DataType(string_view) + """ + return primitive_type(_Type_STRING_VIEW) + + def list_(value_type, int list_size=-1): """ Create ListType instance from child data type or field. 
@@ -4991,6 +5021,8 @@ cdef dict _type_aliases = { 'large_str': large_string, 'large_utf8': large_string, 'large_binary': large_binary, + 'binary_view': binary_view, + 'string_view': string_view, 'date32': date32, 'date64': date64, 'date32[day]': date32, diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index 5d7dbe4b451b9..32398dac9c5f5 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -243,6 +243,16 @@ def is_fixed_size_binary(t): return t.id == lib.Type_FIXED_SIZE_BINARY +@doc(is_null, datatype="variable-length binary view") +def is_binary_view(t): + return t.id == lib.Type_BINARY_VIEW + + +@doc(is_null, datatype="variable-length string (utf-8) view") +def is_string_view(t): + return t.id == lib.Type_STRING_VIEW + + @doc(is_null, datatype="date") def is_date(t): return t.id in _DATE_TYPES