Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-44066: [Python] Add Python wrapper for JsonExtensionType #44070

Merged
merged 20 commits into from
Oct 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions python/pyarrow/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ def print_entry(label, value):
union, sparse_union, dense_union,
dictionary,
run_end_encoded,
bool8, fixed_shape_tensor, opaque, uuid,
bool8, fixed_shape_tensor, json_, opaque, uuid,
field,
type_for_alias,
DataType, DictionaryType, StructType,
Expand All @@ -183,7 +183,7 @@ def print_entry(label, value):
FixedSizeBinaryType, Decimal128Type, Decimal256Type,
BaseExtensionType, ExtensionType,
RunEndEncodedType, Bool8Type, FixedShapeTensorType,
OpaqueType, UuidType,
JsonType, OpaqueType, UuidType,
PyExtensionType, UnknownExtensionType,
register_extension_type, unregister_extension_type,
DictionaryMemo,
Expand Down Expand Up @@ -218,7 +218,7 @@ def print_entry(label, value):
MonthDayNanoIntervalArray,
Decimal128Array, Decimal256Array, StructArray, ExtensionArray,
RunEndEncodedArray, Bool8Array, FixedShapeTensorArray,
OpaqueArray, UuidArray,
JsonArray, OpaqueArray, UuidArray,
scalar, NA, _NULL as NULL, Scalar,
NullScalar, BooleanScalar,
Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
Expand All @@ -236,7 +236,7 @@ def print_entry(label, value):
FixedSizeBinaryScalar, DictionaryScalar,
MapScalar, StructScalar, UnionScalar,
RunEndEncodedScalar, Bool8Scalar, ExtensionScalar,
FixedShapeTensorScalar, OpaqueScalar, UuidScalar)
FixedShapeTensorScalar, JsonScalar, OpaqueScalar, UuidScalar)

# Buffers, allocation
from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager,
Expand Down
27 changes: 27 additions & 0 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -4344,6 +4344,33 @@ cdef class ExtensionArray(Array):
return result


class JsonArray(ExtensionArray):
    """
    Concrete class for Arrow arrays of the JSON extension data type.

    No guarantee is made that the stored values are actually
    valid JSON.

    Examples
    --------
    Define the extension type for JSON array

    >>> import pyarrow as pa
    >>> json_type = pa.json_(pa.large_utf8())

    Create an extension array

    >>> arr = [None, '{ "id":30, "values":["a", "b"] }']
    >>> storage = pa.array(arr, pa.large_utf8())
    >>> pa.ExtensionArray.from_storage(json_type, storage)
    <pyarrow.lib.JsonArray object at ...>
    [
      null,
      "{ "id":30, "values":["a", "b"] }"
    ]
    """


class UuidArray(ExtensionArray):
"""
Concrete class for Arrow arrays of UUID data type.
Expand Down
7 changes: 7 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -2867,6 +2867,13 @@ cdef extern from "arrow/extension_type.h" namespace "arrow":
shared_ptr[CArray] storage()


# Cython binding for the C++ JSON extension type declared in
# "arrow/extension/json.h" (namespace arrow::extension).
cdef extern from "arrow/extension/json.h" namespace "arrow::extension" nogil:
# Exposed to Cython as CJsonType; the quoted name is the C++ class it maps to.
cdef cppclass CJsonType" arrow::extension::JsonExtensionType"(CExtensionType):

# Static factory: takes the storage data type and returns a CResult
# wrapping a shared_ptr to the resulting data type.
@staticmethod
CResult[shared_ptr[CDataType]] Make(shared_ptr[CDataType]& storage_type)


cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension" nogil:
cdef cppclass CUuidType" arrow::extension::UuidType"(CExtensionType):

Expand Down
5 changes: 5 additions & 0 deletions python/pyarrow/lib.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,11 @@ cdef class UuidType(BaseExtensionType):
cdef:
const CUuidType* uuid_ext_type

# Cython-level declaration of the JsonType extension type wrapper.
cdef class JsonType(BaseExtensionType):
cdef:
# Typed raw pointer to the underlying C++ JSON extension type object.
const CJsonType* json_ext_type

rok marked this conversation as resolved.
Show resolved Hide resolved

cdef class PyExtensionType(ExtensionType):
pass

Expand Down
2 changes: 2 additions & 0 deletions python/pyarrow/public-api.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,8 @@ cdef api object pyarrow_wrap_data_type(
out = OpaqueType.__new__(OpaqueType)
elif extension_name == b"arrow.uuid":
out = UuidType.__new__(UuidType)
elif extension_name == b"arrow.json":
out = JsonType.__new__(JsonType)
else:
out = BaseExtensionType.__new__(BaseExtensionType)
else:
Expand Down
6 changes: 6 additions & 0 deletions python/pyarrow/scalar.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1044,6 +1044,12 @@ cdef class ExtensionScalar(Scalar):
return pyarrow_wrap_scalar(<shared_ptr[CScalar]> sp_scalar)


class JsonScalar(ExtensionScalar):
    """
    Concrete scalar class for values of the JSON extension type.
    """


class UuidScalar(ExtensionScalar):
"""
Concrete class for Uuid extension scalar.
Expand Down
11 changes: 11 additions & 0 deletions python/pyarrow/tests/parquet/test_data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,3 +510,14 @@ def test_large_binary_overflow():
pa.ArrowInvalid,
match="Parquet cannot store strings with size 2GB or more"):
_write_table(table, writer, use_dictionary=use_dictionary)


@pytest.mark.parametrize("storage_type", [pa.string(), pa.large_string()])
def test_json_extension_type(storage_type):
    """Round-trip a single JSON extension column (including a null) using
    this module's ``_simple_table_roundtrip`` helper."""
    json_values = ['{"a": 1}', '{"b": 2}', None]
    json_column = pa.array(json_values, type=pa.json_(storage_type))

    _simple_table_roundtrip(pa.table([json_column], names=["ext"]))
53 changes: 53 additions & 0 deletions python/pyarrow/tests/test_extension_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -1926,3 +1926,56 @@ def test_bool8_scalar():
assert pa.scalar(1, type=pa.bool8()).as_py() is True
assert pa.scalar(2, type=pa.bool8()).as_py() is True
assert pa.scalar(None, type=pa.bool8()).as_py() is None


@pytest.mark.parametrize("storage_type", [
    pa.string(), pa.large_string(), pa.string_view()])
def test_json(storage_type, pickle_module):
    """Exercise the JSON extension type together with its array and scalar
    wrappers: construction, equality, pickling, IPC and casting."""
    values = ['{"a": 1}', '{"b": 2}', None]
    json_type = pa.json_(storage_type)
    storage = pa.array(values, type=storage_type)
    json_array = pa.array(values, type=json_type)
    json_arr_class = json_type.__arrow_ext_class__()

    # Called without arguments, pa.json_() must default to utf8 storage.
    assert pa.json_() == pa.json_(pa.utf8())

    # Type-level properties
    assert json_type.extension_name == "arrow.json"
    assert json_type.storage_type == storage_type
    assert json_type.__class__ is pa.JsonType

    assert json_type == pa.json_(storage_type)
    assert json_type != storage_type

    # Array-level behaviour
    assert isinstance(json_array, pa.JsonArray)

    assert json_array.to_pylist() == values
    assert json_array[0].as_py() == values[0]
    assert json_array[2].as_py() is None

    # Pickle roundtrip
    unpickled = pickle_module.loads(pickle_module.dumps(json_type))
    assert unpickled == json_type

    # IPC roundtrip
    batch_in = pa.RecordBatch.from_arrays([json_array], ["ext"])
    batch_out = ipc_read_batch(ipc_write_batch(batch_in))
    reconstructed_array = batch_out.column(0)
    assert reconstructed_array.type == json_type
    assert reconstructed_array == json_array
    assert isinstance(json_array, json_arr_class)

    assert json_type.__arrow_ext_scalar_class__() == pa.JsonScalar
    assert isinstance(json_array[0], pa.JsonScalar)

    # cast storage -> extension type
    assert storage.cast(json_type) == json_array

    # cast extension type -> storage type
    assert json_array.cast(storage_type) == storage

    # Unsupported storage types are rejected with ArrowInvalid.
    # NOTE(review): TypeError was considered nicer, but it would have to be
    # raised at the C++ level instead.
    for storage_type in (pa.int32(), pa.large_binary(), pa.float32()):
        with pytest.raises(
                pa.ArrowInvalid,
                match=f"Invalid storage type for JsonExtensionType: {storage_type}"):
            pa.json_(storage_type)
3 changes: 3 additions & 0 deletions python/pyarrow/tests/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,9 @@ def test_set_timezone_db_path_non_windows():
pa.Bool8Array,
pa.Bool8Scalar,
pa.Bool8Type,
pa.JsonArray,
pa.JsonScalar,
pa.JsonType,
])
def test_extension_type_constructor_errors(klass):
# ARROW-2638: prevent calling extension class constructors directly
Expand Down
75 changes: 75 additions & 0 deletions python/pyarrow/types.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1812,6 +1812,43 @@ cdef class ExtensionType(BaseExtensionType):
return ExtensionScalar


cdef class JsonType(BaseExtensionType):
    """
    Concrete class for the JSON extension type.

    Examples
    --------
    Define the extension type for JSON array

    >>> import pyarrow as pa
    >>> json_type = pa.json_(pa.large_utf8())

    Create an extension array

    >>> arr = [None, '{ "id":30, "values":["a", "b"] }']
    >>> storage = pa.array(arr, pa.large_utf8())
    >>> pa.ExtensionArray.from_storage(json_type, storage)
    <pyarrow.lib.JsonArray object at ...>
    [
      null,
      "{ "id":30, "values":["a", "b"] }"
    ]
    """

    cdef void init(self, const shared_ptr[CDataType]& type) except *:
        # Run the base-class initialisation first, then keep a typed
        # pointer to the same underlying C++ object.
        BaseExtensionType.init(self, type)
        self.json_ext_type = <const CJsonType*> type.get()

    def __reduce__(self):
        # Pickle support: rebuild through json_() from the storage type.
        return (json_, (self.storage_type,))

    def __arrow_ext_class__(self):
        # Array class used for arrays of this type.
        return JsonArray

    def __arrow_ext_scalar_class__(self):
        # Scalar class used for scalars of this type.
        return JsonScalar


cdef class UuidType(BaseExtensionType):
"""
Concrete class for UUID extension type.
Expand Down Expand Up @@ -5296,6 +5333,44 @@ def run_end_encoded(run_end_type, value_type):
return pyarrow_wrap_data_type(ree_type)


def json_(DataType storage_type=utf8()):
    """
    Create an instance of the JSON extension type.

    Parameters
    ----------
    storage_type : DataType, default pyarrow.utf8()
        The underlying data type. Can be one of the following types:
        string, large_string, string_view.

    Returns
    -------
    type : JsonType

    Raises
    ------
    ArrowInvalid
        If ``storage_type`` is not a valid storage type for the
        JSON extension type.

    Examples
    --------
    Create an instance of JSON extension type:

    >>> import pyarrow as pa
    >>> pa.json_(pa.utf8())
    JsonType(extension<arrow.json>)

    Use the JSON type to create an array:

    >>> pa.array(['{"a": 1}', '{"b": 2}'], type=pa.json_(pa.utf8()))
    <pyarrow.lib.JsonArray object at ...>
    [
      "{"a": 1}",
      "{"b": 2}"
    ]
    """
    cdef:
        JsonType json_type = JsonType.__new__(JsonType)
        shared_ptr[CDataType] c_type

    # GetResultValue unwraps the CResult returned by the C++ factory.
    c_type = GetResultValue(CJsonType.Make(storage_type.sp_type))
    json_type.init(c_type)
    return json_type


def uuid():
"""
Create UuidType instance.
Expand Down
Loading