Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-29781: [C++][Parquet] Switch to use compliant nested types by default #35146

Merged
merged 4 commits into from
May 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions cpp/src/parquet/arrow/arrow_schema_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -988,7 +988,7 @@ TEST_F(TestConvertArrowSchema, ParquetLists) {
// }
// }
{
auto element = PrimitiveNode::Make("string", Repetition::OPTIONAL,
auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
parquet_fields.push_back(
Expand All @@ -1005,7 +1005,7 @@ TEST_F(TestConvertArrowSchema, ParquetLists) {
// }
// }
{
auto element = PrimitiveNode::Make("string", Repetition::REQUIRED,
auto element = PrimitiveNode::Make("element", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
parquet_fields.push_back(
Expand Down Expand Up @@ -1086,7 +1086,7 @@ TEST_F(TestConvertArrowSchema, ParquetOtherLists) {
// }
// }
{
auto element = PrimitiveNode::Make("string", Repetition::OPTIONAL,
auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
parquet_fields.push_back(
Expand All @@ -1102,7 +1102,7 @@ TEST_F(TestConvertArrowSchema, ParquetOtherLists) {
// }
// }
{
auto element = PrimitiveNode::Make("string", Repetition::OPTIONAL,
auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
parquet_fields.push_back(
Expand Down
9 changes: 4 additions & 5 deletions cpp/src/parquet/properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -879,8 +879,7 @@ class PARQUET_EXPORT ArrowWriterProperties {
coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
truncated_timestamps_allowed_(false),
store_schema_(false),
// TODO: At some point we should flip this.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/// This is disabled by default, but will be enabled by default in future.

Should we change this in Loc 880?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The line 320 in the parquet.rst doc should also be changed.

   std::shared_ptr<ArrowWriterProperties> arrow_props = ArrowWriterProperties::Builder()
      .enable_deprecated_int96_timestamps() // default False
      ->store_schema() // default False
      ->enable_compliant_nested_types() // default False
      ->build();

compliant_nested_types_(false),
compliant_nested_types_(true),
engine_version_(V2),
use_threads_(kArrowDefaultUseThreads),
executor_(NULLPTR) {}
Expand Down Expand Up @@ -935,16 +934,16 @@ class PARQUET_EXPORT ArrowWriterProperties {
/// \brief When enabled, will not preserve Arrow field names for list types.
///
/// Instead of using the field names Arrow uses for the values array of
/// list types (default "item"), will use "entries", as is specified in
/// list types (default "item"), will use "element", as is specified in
/// the Parquet spec.
///
/// This is disabled by default, but will be enabled by default in future.
/// This is enabled by default.
Builder* enable_compliant_nested_types() {
compliant_nested_types_ = true;
return this;
}

/// Preserve Arrow list field name (default behavior).
/// Preserve Arrow list field name.
Builder* disable_compliant_nested_types() {
compliant_nested_types_ = false;
return this;
Expand Down
1 change: 0 additions & 1 deletion docs/source/cpp/parquet.rst
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,6 @@ There are also Arrow-specific settings that can be configured with
std::shared_ptr<ArrowWriterProperties> arrow_props = ArrowWriterProperties::Builder()
.enable_deprecated_int96_timestamps() // default False
->store_schema() // default False
->enable_compliant_nested_types() // default False
->build();
These options mostly dictate how Arrow types are converted to Parquet types.
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/_dataset_parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -596,7 +596,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
use_deprecated_int96_timestamps=False,
coerce_timestamps=None,
allow_truncated_timestamps=False,
use_compliant_nested_type=False,
use_compliant_nested_type=True,
)
self._set_properties()
self._set_arrow_properties()
Expand Down
4 changes: 2 additions & 2 deletions python/pyarrow/_parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1607,7 +1607,7 @@ cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties(
coerce_timestamps=None,
allow_truncated_timestamps=False,
writer_engine_version=None,
use_compliant_nested_type=False,
use_compliant_nested_type=True,
store_schema=True) except *:
"""Arrow writer properties"""
cdef:
Expand Down Expand Up @@ -1704,7 +1704,7 @@ cdef class ParquetWriter(_Weakrefable):
column_encoding=None,
writer_engine_version=None,
data_page_version=None,
use_compliant_nested_type=False,
use_compliant_nested_type=True,
encryption_properties=None,
write_batch_size=None,
dictionary_pagesize_limit=None,
Expand Down
6 changes: 3 additions & 3 deletions python/pyarrow/parquet/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -822,7 +822,7 @@ def _sanitize_table(table, new_schema, flavor):
The serialized Parquet data page format version to write, defaults to
1.0. This does not impact the file schema logical types and Arrow to
Parquet type casting behavior; for that use the "version" option.
use_compliant_nested_type : bool, default True
Whether to write compliant Parquet nested type (lists) as defined
`here <https://github.com/apache/parquet-format/blob/master/
LogicalTypes.md#nested-types>`_, defaults to ``True``.
Expand Down Expand Up @@ -954,7 +954,7 @@ def __init__(self, where, schema, filesystem=None,
column_encoding=None,
writer_engine_version=None,
data_page_version='1.0',
use_compliant_nested_type=False,
use_compliant_nested_type=True,
encryption_properties=None,
write_batch_size=None,
dictionary_pagesize_limit=None,
Expand Down Expand Up @@ -3072,7 +3072,7 @@ def write_table(table, where, row_group_size=None, version='2.4',
use_byte_stream_split=False,
column_encoding=None,
data_page_version='1.0',
use_compliant_nested_type=False,
use_compliant_nested_type=True,
encryption_properties=None,
write_batch_size=None,
dictionary_pagesize_limit=None,
Expand Down
19 changes: 9 additions & 10 deletions python/pyarrow/tests/parquet/test_compliant_nested_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,17 +64,15 @@ def test_write_compliant_nested_type_enable(tempdir,
use_legacy_dataset, test_data):
# prepare dataframe for testing
df = pd.DataFrame(data=test_data)
# verify that we can read/write pandas df with new flag
# verify that we can read/write pandas df with new flag (default behaviour)
_roundtrip_pandas_dataframe(df,
write_kwargs={
'use_compliant_nested_type': True},
write_kwargs={},
use_legacy_dataset=use_legacy_dataset)

# Write to a parquet file with compliant nested type
table = pa.Table.from_pandas(df, preserve_index=False)
path = str(tempdir / 'data.parquet')
with pq.ParquetWriter(path, table.schema,
use_compliant_nested_type=True,
version='2.6') as writer:
writer.write_table(table)
# Read back as a table
Expand All @@ -86,8 +84,7 @@ def test_write_compliant_nested_type_enable(tempdir,

# Verify that the new table can be read/written correctly
_check_roundtrip(new_table,
use_legacy_dataset=use_legacy_dataset,
use_compliant_nested_type=True)
use_legacy_dataset=use_legacy_dataset)


@pytest.mark.pandas
Expand All @@ -97,14 +94,16 @@ def test_write_compliant_nested_type_disable(tempdir,
use_legacy_dataset, test_data):
# prepare dataframe for testing
df = pd.DataFrame(data=test_data)
# verify that we can read/write with new flag disabled (default behaviour)
_roundtrip_pandas_dataframe(df, write_kwargs={},
use_legacy_dataset=use_legacy_dataset)
# verify that we can read/write with new flag disabled
_roundtrip_pandas_dataframe(df, write_kwargs={
'use_compliant_nested_type': False},
use_legacy_dataset=use_legacy_dataset)

# Write to a parquet file while disabling compliant nested type
table = pa.Table.from_pandas(df, preserve_index=False)
path = str(tempdir / 'data.parquet')
with pq.ParquetWriter(path, table.schema, version='2.6') as writer:
with pq.ParquetWriter(path, table.schema, version='2.6',
use_compliant_nested_type=False) as writer:
writer.write_table(table)
new_table = _read_table(path)

Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/tests/parquet/test_data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def test_direct_read_dictionary_subfield(use_legacy_dataset):
pq.write_table(table, bio)
contents = bio.getvalue()
result = pq.read_table(pa.BufferReader(contents),
read_dictionary=['f0.list.item'],
read_dictionary=['f0.list.element'],
use_legacy_dataset=use_legacy_dataset)

arr = pa.array(data[0])
Expand Down