diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index fb785e1e9571b..fb7253b6fd69d 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -906,6 +906,7 @@ endif() if(ARROW_JSON) arrow_add_object_library(ARROW_JSON + extension/bool8.cc extension/fixed_shape_tensor.cc extension/opaque.cc json/options.cc diff --git a/cpp/src/arrow/extension/CMakeLists.txt b/cpp/src/arrow/extension/CMakeLists.txt index 6741ab602f50b..fcd5fa529ab56 100644 --- a/cpp/src/arrow/extension/CMakeLists.txt +++ b/cpp/src/arrow/extension/CMakeLists.txt @@ -15,6 +15,12 @@ # specific language governing permissions and limitations # under the License. +add_arrow_test(test + SOURCES + bool8_test.cc + PREFIX + "arrow-extension-bool8") + add_arrow_test(test SOURCES fixed_shape_tensor_test.cc diff --git a/cpp/src/arrow/extension/bool8.cc b/cpp/src/arrow/extension/bool8.cc new file mode 100644 index 0000000000000..c081f0c2b2866 --- /dev/null +++ b/cpp/src/arrow/extension/bool8.cc @@ -0,0 +1,61 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include "arrow/extension/bool8.h" +#include "arrow/util/logging.h" + +namespace arrow::extension { + +bool Bool8Type::ExtensionEquals(const ExtensionType& other) const { + return extension_name() == other.extension_name(); +} + +std::string Bool8Type::ToString(bool show_metadata) const { + std::stringstream ss; + ss << "extension<" << this->extension_name() << ">"; + return ss.str(); +} + +std::string Bool8Type::Serialize() const { return ""; } + +Result> Bool8Type::Deserialize( + std::shared_ptr storage_type, const std::string& serialized_data) const { + if (storage_type->id() != Type::INT8) { + return Status::Invalid("Expected INT8 storage type, got ", storage_type->ToString()); + } + if (serialized_data != "") { + return Status::Invalid("Serialize data must be empty, got ", serialized_data); + } + return bool8(); +} + +std::shared_ptr Bool8Type::MakeArray(std::shared_ptr data) const { + DCHECK_EQ(data->type->id(), Type::EXTENSION); + DCHECK_EQ("arrow.bool8", + internal::checked_cast(*data->type).extension_name()); + return std::make_shared(data); +} + +Result> Bool8Type::Make() { + return std::make_shared(); +} + +std::shared_ptr bool8() { return std::make_shared(); } + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/bool8.h b/cpp/src/arrow/extension/bool8.h new file mode 100644 index 0000000000000..02e629b28a867 --- /dev/null +++ b/cpp/src/arrow/extension/bool8.h @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/extension_type.h" + +namespace arrow::extension { + +/// \brief Array class for the Bool8 extension type, which represents +/// booleans using 8 bits instead of 1 bit per value. The underlying +/// storage type is int8. +class ARROW_EXPORT Bool8Array : public ExtensionArray { + public: + using ExtensionArray::ExtensionArray; +}; + +/// \brief Bool8 is an alternate representation for boolean +/// arrays using 8 bits instead of 1 bit per value. The underlying +/// storage type is int8. +class ARROW_EXPORT Bool8Type : public ExtensionType { + public: + /// \brief Construct a Bool8Type. + Bool8Type() : ExtensionType(int8()) {} + + std::string extension_name() const override { return "arrow.bool8"; } + std::string ToString(bool show_metadata = false) const override; + + bool ExtensionEquals(const ExtensionType& other) const override; + + std::string Serialize() const override; + + Result> Deserialize( + std::shared_ptr storage_type, + const std::string& serialized_data) const override; + + /// Create a Bool8Array from ArrayData + std::shared_ptr MakeArray(std::shared_ptr data) const override; + + static Result> Make(); +}; + +/// \brief Return a Bool8Type instance. 
+ARROW_EXPORT std::shared_ptr bool8(); + +} // namespace arrow::extension diff --git a/cpp/src/arrow/extension/bool8_test.cc b/cpp/src/arrow/extension/bool8_test.cc new file mode 100644 index 0000000000000..eabcfcf62d32c --- /dev/null +++ b/cpp/src/arrow/extension/bool8_test.cc @@ -0,0 +1,91 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/extension/bool8.h" +#include "arrow/io/memory.h" +#include "arrow/ipc/reader.h" +#include "arrow/ipc/writer.h" +#include "arrow/testing/extension_type.h" +#include "arrow/testing/gtest_util.h" + +namespace arrow { + +TEST(Bool8Type, Basics) { + auto type = internal::checked_pointer_cast(extension::bool8()); + auto type2 = internal::checked_pointer_cast(extension::bool8()); + ASSERT_EQ("arrow.bool8", type->extension_name()); + ASSERT_EQ(*type, *type); + ASSERT_NE(*arrow::null(), *type); + ASSERT_EQ(*type, *type2); + ASSERT_EQ(*arrow::int8(), *type->storage_type()); + ASSERT_EQ("", type->Serialize()); + ASSERT_EQ("extension", type->ToString(false)); +} + +TEST(Bool8Type, CreateFromArray) { + auto type = internal::checked_pointer_cast(extension::bool8()); + auto storage = ArrayFromJSON(int8(), "[-1,0,1,2,null]"); + auto array = ExtensionType::WrapArray(type, storage); + ASSERT_EQ(5, array->length()); + ASSERT_EQ(1, array->null_count()); +} + +TEST(Bool8Type, Deserialize) { + auto type = internal::checked_pointer_cast(extension::bool8()); + ASSERT_OK_AND_ASSIGN(auto deserialized, type->Deserialize(type->storage_type(), "")); + ASSERT_EQ(*type, *deserialized); + ASSERT_NOT_OK(type->Deserialize(type->storage_type(), "must be empty")); + ASSERT_EQ(*type, *deserialized); + ASSERT_NOT_OK(type->Deserialize(uint8(), "")); + ASSERT_EQ(*type, *deserialized); +} + +TEST(Bool8Type, MetadataRoundTrip) { + auto type = internal::checked_pointer_cast(extension::bool8()); + std::string serialized = type->Serialize(); + ASSERT_OK_AND_ASSIGN(auto deserialized, + type->Deserialize(type->storage_type(), serialized)); + ASSERT_EQ(*type, *deserialized); +} + +TEST(Bool8Type, BatchRoundTrip) { + auto type = internal::checked_pointer_cast(extension::bool8()); + + auto storage = ArrayFromJSON(int8(), "[-1,0,1,2,null]"); + auto array = ExtensionType::WrapArray(type, storage); + auto batch = + RecordBatch::Make(schema({field("field", type)}), array->length(), {array}); + + 
std::shared_ptr written; + { + ASSERT_OK_AND_ASSIGN(auto out_stream, io::BufferOutputStream::Create()); + ASSERT_OK(ipc::WriteRecordBatchStream({batch}, ipc::IpcWriteOptions::Defaults(), + out_stream.get())); + + ASSERT_OK_AND_ASSIGN(auto complete_ipc_stream, out_stream->Finish()); + + io::BufferReader reader(complete_ipc_stream); + std::shared_ptr batch_reader; + ASSERT_OK_AND_ASSIGN(batch_reader, ipc::RecordBatchStreamReader::Open(&reader)); + ASSERT_OK(batch_reader->ReadNext(&written)); + } + + ASSERT_EQ(*batch->schema(), *written->schema()); + ASSERT_BATCHES_EQUAL(*batch, *written); +} + +} // namespace arrow diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc index cf8dda7a85df4..685018f7de7b8 100644 --- a/cpp/src/arrow/extension_type.cc +++ b/cpp/src/arrow/extension_type.cc @@ -28,6 +28,7 @@ #include "arrow/chunked_array.h" #include "arrow/config.h" #ifdef ARROW_JSON +#include "arrow/extension/bool8.h" #include "arrow/extension/fixed_shape_tensor.h" #endif #include "arrow/status.h" @@ -146,10 +147,12 @@ static void CreateGlobalRegistry() { #ifdef ARROW_JSON // Register canonical extension types - auto ext_type = + auto fst_ext_type = checked_pointer_cast(extension::fixed_shape_tensor(int64(), {})); + ARROW_CHECK_OK(g_registry->RegisterType(fst_ext_type)); - ARROW_CHECK_OK(g_registry->RegisterType(ext_type)); + auto bool8_ext_type = checked_pointer_cast(extension::bool8()); + ARROW_CHECK_OK(g_registry->RegisterType(bool8_ext_type)); #endif } diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index aa7bab9f97e05..807bcdc315036 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -174,6 +174,7 @@ def print_entry(label, value): run_end_encoded, fixed_shape_tensor, opaque, + bool8, field, type_for_alias, DataType, DictionaryType, StructType, @@ -184,7 +185,7 @@ def print_entry(label, value): FixedSizeBinaryType, Decimal128Type, Decimal256Type, BaseExtensionType, ExtensionType, RunEndEncodedType, 
FixedShapeTensorType, OpaqueType, - PyExtensionType, UnknownExtensionType, + Bool8Type, PyExtensionType, UnknownExtensionType, register_extension_type, unregister_extension_type, DictionaryMemo, KeyValueMetadata, @@ -218,7 +219,7 @@ def print_entry(label, value): MonthDayNanoIntervalArray, Decimal128Array, Decimal256Array, StructArray, ExtensionArray, RunEndEncodedArray, FixedShapeTensorArray, OpaqueArray, - scalar, NA, _NULL as NULL, Scalar, + Bool8Array, scalar, NA, _NULL as NULL, Scalar, NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar, UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar, @@ -235,7 +236,7 @@ def print_entry(label, value): FixedSizeBinaryScalar, DictionaryScalar, MapScalar, StructScalar, UnionScalar, RunEndEncodedScalar, ExtensionScalar, - FixedShapeTensorScalar, OpaqueScalar) + FixedShapeTensorScalar, OpaqueScalar, Bool8Scalar) # Buffers, allocation from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 6c40a21db96ca..4c3eb93232634 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1581,7 +1581,7 @@ cdef class Array(_PandasConvertible): def to_numpy(self, zero_copy_only=True, writable=False): """ - Return a NumPy view or copy of this array (experimental). + Return a NumPy view or copy of this array. By default, tries to return a view of this array. This is only supported for primitive arrays with the same memory layout as NumPy @@ -4476,6 +4476,118 @@ cdef class OpaqueArray(ExtensionArray): """ +cdef class Bool8Array(ExtensionArray): + """ + Concrete class for bool8 extension arrays. 
+ + Examples + -------- + Define the extension type for a bool8 array + + >>> import pyarrow as pa + >>> bool8_type = pa.bool8() + + Create an extension array + + >>> arr = [-1, 0, 1, 2, None] + >>> storage = pa.array(arr, pa.int8()) + >>> pa.ExtensionArray.from_storage(bool8_type, storage) + + [ + -1, + 0, + 1, + 2, + null + ] + """ + + def to_numpy(self, zero_copy_only=True, writable=False): + """ + Return a NumPy bool view or copy of this array. + + By default, tries to return a view of this array. This is only + supported for arrays without any nulls. + + Parameters + ---------- + zero_copy_only : bool, default True + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls). + writable : bool, default False + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + + Returns + ------- + array : numpy.ndarray + """ + if not writable: + try: + return self.storage.to_numpy().view(np.bool_) + except ArrowInvalid as e: + if zero_copy_only: + raise e + + return _pc().not_equal(self.storage, 0).to_numpy(zero_copy_only=zero_copy_only, writable=writable) + + @staticmethod + def from_storage(Int8Array storage): + """ + Construct Bool8Array from Int8Array storage. + + Parameters + ---------- + storage : Int8Array + The underlying storage for the result array. + + Returns + ------- + bool8_array : Bool8Array + """ + return ExtensionArray.from_storage(bool8(), storage) + + @staticmethod + def from_numpy(obj): + """ + Convert numpy array to a bool8 extension array without making a copy. + The input array must be 1-dimensional, with either bool_ or int8 dtype. 
+ + Parameters + ---------- + obj : numpy.ndarray + + Returns + ------- + bool8_array : Bool8Array + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = np.array([True, False, True], dtype=np.bool_) + >>> pa.Bool8Array.from_numpy(arr) + + [ + 1, + 0, + 1 + ] + """ + + if obj.ndim != 1: + raise ValueError(f"Cannot convert {obj.ndim}-D array to bool8 array") + + if obj.dtype not in [np.bool_, np.int8]: + raise TypeError(f"Array dtype {obj.dtype} incompatible with bool8 storage") + + storage_arr = array(obj.view(np.int8), type=int8()) + return Bool8Array.from_storage(storage_arr) + + cdef dict _array_classes = { _Type_NA: NullArray, _Type_BOOL: BooleanArray, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 9b008d150f1f1..a54a1db292f70 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2895,6 +2895,15 @@ cdef extern from "arrow/extension/opaque.h" namespace "arrow::extension" nogil: pass +cdef extern from "arrow/extension/bool8.h" namespace "arrow::extension" nogil: + cdef cppclass CBool8Type" arrow::extension::Bool8Type"(CExtensionType): + + @staticmethod + CResult[shared_ptr[CDataType]] Make() + + cdef cppclass CBool8Array" arrow::extension::Bool8Array"(CExtensionArray): + pass + cdef extern from "arrow/util/compression.h" namespace "arrow" nogil: cdef enum CCompressionType" arrow::Compression::type": CCompressionType_UNCOMPRESSED" arrow::Compression::UNCOMPRESSED" diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 2cb302d20a8ac..e3625c1815274 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -214,6 +214,9 @@ cdef class FixedShapeTensorType(BaseExtensionType): cdef: const CFixedShapeTensorType* tensor_ext_type +cdef class Bool8Type(BaseExtensionType): + cdef: + const CBool8Type* bool8_ext_type cdef class OpaqueType(BaseExtensionType): cdef: diff --git a/python/pyarrow/public-api.pxi 
b/python/pyarrow/public-api.pxi index 2f9fc1c554209..19a26bd6c683d 100644 --- a/python/pyarrow/public-api.pxi +++ b/python/pyarrow/public-api.pxi @@ -126,6 +126,8 @@ cdef api object pyarrow_wrap_data_type( out = FixedShapeTensorType.__new__(FixedShapeTensorType) elif ext_type.extension_name() == b"arrow.opaque": out = OpaqueType.__new__(OpaqueType) + elif ext_type.extension_name() == b"arrow.bool8": + out = Bool8Type.__new__(Bool8Type) else: out = BaseExtensionType.__new__(BaseExtensionType) else: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 12a99c2aece63..72ae2aee5f8b3 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1091,6 +1091,18 @@ cdef class OpaqueScalar(ExtensionScalar): """ +cdef class Bool8Scalar(ExtensionScalar): + """ + Concrete class for bool8 extension scalar. + """ + + def as_py(self): + """ + Return this scalar as a Python object. + """ + py_val = super().as_py() + return None if py_val is None else py_val != 0 + cdef dict _scalar_classes = { _Type_BOOL: BooleanScalar, _Type_UINT8: UInt8Scalar, @@ -1199,6 +1211,11 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None): type = ensure_type(type, allow_none=True) pool = maybe_unbox_memory_pool(memory_pool) + extension_type = None + if type is not None and type.id == _Type_EXTENSION: + extension_type = type + type = type.storage_type + if _is_array_like(value): value = get_values(value, &is_pandas_object) @@ -1223,4 +1240,8 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None): # retrieve the scalar from the first position scalar = GetResultValue(array.get().GetScalar(0)) - return Scalar.wrap(scalar) + result = Scalar.wrap(scalar) + + if extension_type is not None: + result = ExtensionScalar.from_storage(extension_type, result) + return result diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 58c54189f223e..b04ee85ec99ad 100644 --- 
a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1707,3 +1707,155 @@ def test_opaque_type(pickle_module, storage_type, storage): # cast extension type -> storage type inner = arr.cast(storage_type) assert inner == storage + + +def test_bool8_type(pickle_module): + bool8_type = pa.bool8() + storage_type = pa.int8() + assert bool8_type.extension_name == "arrow.bool8" + assert bool8_type.storage_type == storage_type + assert str(bool8_type) == "extension" + + assert bool8_type == bool8_type + assert bool8_type == pa.bool8() + assert bool8_type != storage_type + + # Pickle roundtrip + result = pickle_module.loads(pickle_module.dumps(bool8_type)) + assert result == bool8_type + + # IPC roundtrip + storage = pa.array([-1, 0, 1, 2, None], storage_type) + arr = pa.ExtensionArray.from_storage(bool8_type, storage) + assert isinstance(arr, pa.Bool8Array) + + # extension is registered by default + buf = ipc_write_batch(pa.RecordBatch.from_arrays([arr], ["ext"])) + batch = ipc_read_batch(buf) + + assert batch.column(0).type.extension_name == "arrow.bool8" + assert isinstance(batch.column(0), pa.Bool8Array) + + # cast storage -> extension type + result = storage.cast(bool8_type) + assert result == arr + + # cast extension type -> storage type + inner = arr.cast(storage_type) + assert inner == storage + + +def test_bool8_to_bool_conversion(): + bool_arr = pa.array([True, False, True, True, None], pa.bool_()) + bool8_arr = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([-1, 0, 1, 2, None], pa.int8()), + ) + + # cast extension type -> arrow boolean type + assert bool8_arr.cast(pa.bool_()) == bool_arr + + # cast arrow boolean type -> extension type, expecting canonical values + canonical_storage = pa.array([1, 0, 1, 1, None], pa.int8()) + canonical_bool8_arr = pa.ExtensionArray.from_storage(pa.bool8(), canonical_storage) + assert bool_arr.cast(pa.bool8()) == canonical_bool8_arr + + +def test_bool8_to_numpy_conversion(): + 
arr = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([-1, 0, 1, 2, None], pa.int8()), + ) + + # cannot zero-copy with nulls + with pytest.raises( + pa.ArrowInvalid, + match="Needed to copy 1 chunks with 1 nulls, but zero_copy_only was True", + ): + arr.to_numpy() + + # nullable conversion possible with a copy, but dest dtype is object + assert np.array_equal( + arr.to_numpy(zero_copy_only=False), + np.array([True, False, True, True, None], dtype=np.object_), + ) + + # zero-copy possible with non-null array + np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_) + arr_no_nulls = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([-1, 0, 1, 2], pa.int8()), + ) + + arr_to_np = arr_no_nulls.to_numpy() + assert np.array_equal(arr_to_np, np_arr_no_nulls) + + # same underlying buffer + assert arr_to_np.ctypes.data == arr_no_nulls.buffers()[1].address + + # if the user requests a writable array, a copy should be performed + arr_to_np_writable = arr_no_nulls.to_numpy(zero_copy_only=False, writable=True) + assert np.array_equal(arr_to_np_writable, np_arr_no_nulls) + + # different underlying buffer + assert arr_to_np_writable.ctypes.data != arr_no_nulls.buffers()[1].address + + +def test_bool8_from_numpy_conversion(): + np_arr_no_nulls = np.array([True, False, True, True], dtype=np.bool_) + canonical_bool8_arr_no_nulls = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([1, 0, 1, 1], pa.int8()), + ) + + arr_from_np = pa.Bool8Array.from_numpy(np_arr_no_nulls) + assert arr_from_np == canonical_bool8_arr_no_nulls + + # same underlying buffer + assert arr_from_np.buffers()[1].address == np_arr_no_nulls.ctypes.data + + # conversion only valid for 1-D arrays + with pytest.raises( + ValueError, + match="Cannot convert 2-D array to bool8 array", + ): + pa.Bool8Array.from_numpy( + np.array([[True, False], [False, True]], dtype=np.bool_), + ) + + with pytest.raises( + ValueError, + match="Cannot convert 0-D array to bool8 array", + ): + 
pa.Bool8Array.from_numpy(np.bool_()) + + # must use compatible storage type + with pytest.raises( + TypeError, + match="Array dtype float64 incompatible with bool8 storage", + ): + pa.Bool8Array.from_numpy(np.array([1, 2, 3], dtype=np.float64)) + + +def test_bool8_scalar(): + assert pa.ExtensionScalar.from_storage(pa.bool8(), -1).as_py() is True + assert pa.ExtensionScalar.from_storage(pa.bool8(), 0).as_py() is False + assert pa.ExtensionScalar.from_storage(pa.bool8(), 1).as_py() is True + assert pa.ExtensionScalar.from_storage(pa.bool8(), 2).as_py() is True + assert pa.ExtensionScalar.from_storage(pa.bool8(), None).as_py() is None + + arr = pa.ExtensionArray.from_storage( + pa.bool8(), + pa.array([-1, 0, 1, 2, None], pa.int8()), + ) + assert arr[0].as_py() is True + assert arr[1].as_py() is False + assert arr[2].as_py() is True + assert arr[3].as_py() is True + assert arr[4].as_py() is None + + assert pa.scalar(-1, type=pa.bool8()).as_py() is True + assert pa.scalar(0, type=pa.bool8()).as_py() is False + assert pa.scalar(1, type=pa.bool8()).as_py() is True + assert pa.scalar(2, type=pa.bool8()).as_py() is True + assert pa.scalar(None, type=pa.bool8()).as_py() is None diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 9a55a38177fc8..5d3471c7c35db 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -250,6 +250,9 @@ def test_set_timezone_db_path_non_windows(): pa.OpaqueArray, pa.OpaqueScalar, pa.OpaqueType, + pa.Bool8Array, + pa.Bool8Scalar, + pa.Bool8Type, ]) def test_extension_type_constructor_errors(klass): # ARROW-2638: prevent calling extension class constructors directly diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index dcd2b61c33411..563782f0c2643 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -1837,6 +1837,37 @@ cdef class FixedShapeTensorType(BaseExtensionType): return FixedShapeTensorScalar +cdef class Bool8Type(BaseExtensionType): + """ 
+ Concrete class for bool8 extension type. + + Bool8 is an alternate representation for boolean + arrays using 8 bits instead of 1 bit per value. The underlying + storage type is int8. + + Examples + -------- + Create an instance of bool8 extension type: + + >>> import pyarrow as pa + >>> pa.bool8() + Bool8Type(extension) + """ + + cdef void init(self, const shared_ptr[CDataType]& type) except *: + BaseExtensionType.init(self, type) + self.bool8_ext_type = type.get() + + def __arrow_ext_class__(self): + return Bool8Array + + def __reduce__(self): + return bool8, () + + def __arrow_ext_scalar_class__(self): + return Bool8Scalar + + cdef class OpaqueType(BaseExtensionType): """ Concrete class for opaque extension type. @@ -5278,6 +5309,49 @@ def fixed_shape_tensor(DataType value_type, shape, dim_names=None, permutation=N return out +def bool8(): + """ + Create instance of bool8 extension type. + + Examples + -------- + Create an instance of bool8 extension type: + + >>> import pyarrow as pa + >>> type = pa.bool8() + >>> type + Bool8Type(extension) + + Inspect the data type: + + >>> type.storage_type + DataType(int8) + + Create a table with a bool8 array: + + >>> arr = [-1, 0, 1, 2, None] + >>> storage = pa.array(arr, pa.int8()) + >>> other = pa.ExtensionArray.from_storage(type, storage) + >>> pa.table([other], names=["unknown_col"]) + pyarrow.Table + unknown_col: extension + ---- + unknown_col: [[-1,0,1,2,null]] + + Returns + ------- + type : Bool8Type + """ + + cdef Bool8Type out = Bool8Type.__new__(Bool8Type) + + c_type = GetResultValue(CBool8Type.Make()) + + out.init(c_type) + + return out + + def opaque(DataType storage_type, str type_name not None, str vendor_name not None): """ Create instance of opaque extension type.