Skip to content

Commit

Permalink
GH-29781: [C++][Parquet] Switch to use compliant nested types by default (#35146)
Browse files Browse the repository at this point in the history

### Rationale for this change

This has been a long-standing TODO.

### What changes are included in this PR?

### Are these changes tested?

### Are there any user-facing changes?

**This PR includes breaking changes to public APIs.**
* Closes: #29781

Lead-authored-by: Will Jones <willjones127@gmail.com>
Co-authored-by: Antoine Pitrou <pitrou@free.fr>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
wjones127 and pitrou authored May 12, 2023
1 parent 1624d5a commit e324f9a
Show file tree
Hide file tree
Showing 8 changed files with 24 additions and 27 deletions.
8 changes: 4 additions & 4 deletions cpp/src/parquet/arrow/arrow_schema_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -988,7 +988,7 @@ TEST_F(TestConvertArrowSchema, ParquetLists) {
// }
// }
{
auto element = PrimitiveNode::Make("string", Repetition::OPTIONAL,
auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
parquet_fields.push_back(
Expand All @@ -1005,7 +1005,7 @@ TEST_F(TestConvertArrowSchema, ParquetLists) {
// }
// }
{
auto element = PrimitiveNode::Make("string", Repetition::REQUIRED,
auto element = PrimitiveNode::Make("element", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
parquet_fields.push_back(
Expand Down Expand Up @@ -1086,7 +1086,7 @@ TEST_F(TestConvertArrowSchema, ParquetOtherLists) {
// }
// }
{
auto element = PrimitiveNode::Make("string", Repetition::OPTIONAL,
auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
parquet_fields.push_back(
Expand All @@ -1102,7 +1102,7 @@ TEST_F(TestConvertArrowSchema, ParquetOtherLists) {
// }
// }
{
auto element = PrimitiveNode::Make("string", Repetition::OPTIONAL,
auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
parquet_fields.push_back(
Expand Down
9 changes: 4 additions & 5 deletions cpp/src/parquet/properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -879,8 +879,7 @@ class PARQUET_EXPORT ArrowWriterProperties {
coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
truncated_timestamps_allowed_(false),
store_schema_(false),
// TODO: At some point we should flip this.
compliant_nested_types_(false),
compliant_nested_types_(true),
engine_version_(V2),
use_threads_(kArrowDefaultUseThreads),
executor_(NULLPTR) {}
Expand Down Expand Up @@ -935,16 +934,16 @@ class PARQUET_EXPORT ArrowWriterProperties {
/// \brief When enabled, will not preserve Arrow field names for list types.
///
/// Instead of using the field names Arrow uses for the values array of
/// list types (default "item"), will use "entries", as is specified in
/// list types (default "item"), will use "element", as is specified in
/// the Parquet spec.
///
/// This is disabled by default, but will be enabled by default in future.
/// This is enabled by default.
Builder* enable_compliant_nested_types() {
compliant_nested_types_ = true;
return this;
}

/// Preserve Arrow list field name (default behavior).
/// Preserve Arrow list field name.
Builder* disable_compliant_nested_types() {
compliant_nested_types_ = false;
return this;
Expand Down
1 change: 0 additions & 1 deletion docs/source/cpp/parquet.rst
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,6 @@ There are also Arrow-specific settings that can be configured with
std::shared_ptr<ArrowWriterProperties> arrow_props = ArrowWriterProperties::Builder()
.enable_deprecated_int96_timestamps() // default False
->store_schema() // default False
->enable_compliant_nested_types() // default False
->build();
These options mostly dictate how Arrow types are converted to Parquet types.
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/_dataset_parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -596,7 +596,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
use_deprecated_int96_timestamps=False,
coerce_timestamps=None,
allow_truncated_timestamps=False,
use_compliant_nested_type=False,
use_compliant_nested_type=True,
)
self._set_properties()
self._set_arrow_properties()
Expand Down
4 changes: 2 additions & 2 deletions python/pyarrow/_parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1607,7 +1607,7 @@ cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties(
coerce_timestamps=None,
allow_truncated_timestamps=False,
writer_engine_version=None,
use_compliant_nested_type=False,
use_compliant_nested_type=True,
store_schema=True) except *:
"""Arrow writer properties"""
cdef:
Expand Down Expand Up @@ -1704,7 +1704,7 @@ cdef class ParquetWriter(_Weakrefable):
column_encoding=None,
writer_engine_version=None,
data_page_version=None,
use_compliant_nested_type=False,
use_compliant_nested_type=True,
encryption_properties=None,
write_batch_size=None,
dictionary_pagesize_limit=None,
Expand Down
6 changes: 3 additions & 3 deletions python/pyarrow/parquet/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -822,7 +822,7 @@ def _sanitize_table(table, new_schema, flavor):
The serialized Parquet data page format version to write, defaults to
1.0. This does not impact the file schema logical types and Arrow to
Parquet type casting behavior; for that use the "version" option.
use_compliant_nested_type : bool, default False
use_compliant_nested_type : bool, default True
Whether to write compliant Parquet nested type (lists) as defined
`here <https://github.com/apache/parquet-format/blob/master/
LogicalTypes.md#nested-types>`_, defaults to ``True``.
Expand Down Expand Up @@ -954,7 +954,7 @@ def __init__(self, where, schema, filesystem=None,
column_encoding=None,
writer_engine_version=None,
data_page_version='1.0',
use_compliant_nested_type=False,
use_compliant_nested_type=True,
encryption_properties=None,
write_batch_size=None,
dictionary_pagesize_limit=None,
Expand Down Expand Up @@ -3072,7 +3072,7 @@ def write_table(table, where, row_group_size=None, version='2.4',
use_byte_stream_split=False,
column_encoding=None,
data_page_version='1.0',
use_compliant_nested_type=False,
use_compliant_nested_type=True,
encryption_properties=None,
write_batch_size=None,
dictionary_pagesize_limit=None,
Expand Down
19 changes: 9 additions & 10 deletions python/pyarrow/tests/parquet/test_compliant_nested_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,17 +64,15 @@ def test_write_compliant_nested_type_enable(tempdir,
use_legacy_dataset, test_data):
# prepare dataframe for testing
df = pd.DataFrame(data=test_data)
# verify that we can read/write pandas df with new flag
# verify that we can read/write pandas df with new flag (default behaviour)
_roundtrip_pandas_dataframe(df,
write_kwargs={
'use_compliant_nested_type': True},
write_kwargs={},
use_legacy_dataset=use_legacy_dataset)

# Write to a parquet file with compliant nested type
table = pa.Table.from_pandas(df, preserve_index=False)
path = str(tempdir / 'data.parquet')
with pq.ParquetWriter(path, table.schema,
use_compliant_nested_type=True,
version='2.6') as writer:
writer.write_table(table)
# Read back as a table
Expand All @@ -86,8 +84,7 @@ def test_write_compliant_nested_type_enable(tempdir,

# Verify that the new table can be read/written correctly
_check_roundtrip(new_table,
use_legacy_dataset=use_legacy_dataset,
use_compliant_nested_type=True)
use_legacy_dataset=use_legacy_dataset)


@pytest.mark.pandas
Expand All @@ -97,14 +94,16 @@ def test_write_compliant_nested_type_disable(tempdir,
use_legacy_dataset, test_data):
# prepare dataframe for testing
df = pd.DataFrame(data=test_data)
# verify that we can read/write with new flag disabled (default behaviour)
_roundtrip_pandas_dataframe(df, write_kwargs={},
use_legacy_dataset=use_legacy_dataset)
# verify that we can read/write with new flag disabled
_roundtrip_pandas_dataframe(df, write_kwargs={
'use_compliant_nested_type': False},
use_legacy_dataset=use_legacy_dataset)

# Write to a parquet file while disabling compliant nested type
table = pa.Table.from_pandas(df, preserve_index=False)
path = str(tempdir / 'data.parquet')
with pq.ParquetWriter(path, table.schema, version='2.6') as writer:
with pq.ParquetWriter(path, table.schema, version='2.6',
use_compliant_nested_type=False) as writer:
writer.write_table(table)
new_table = _read_table(path)

Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/tests/parquet/test_data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def test_direct_read_dictionary_subfield(use_legacy_dataset):
pq.write_table(table, bio)
contents = bio.getvalue()
result = pq.read_table(pa.BufferReader(contents),
read_dictionary=['f0.list.item'],
read_dictionary=['f0.list.element'],
use_legacy_dataset=use_legacy_dataset)

arr = pa.array(data[0])
Expand Down

0 comments on commit e324f9a

Please sign in to comment.