diff --git a/cpp/src/parquet/arrow/arrow_schema_test.cc b/cpp/src/parquet/arrow/arrow_schema_test.cc
index 3ceadc4ebca81..621f2a0e76f5f 100644
--- a/cpp/src/parquet/arrow/arrow_schema_test.cc
+++ b/cpp/src/parquet/arrow/arrow_schema_test.cc
@@ -988,7 +988,7 @@ TEST_F(TestConvertArrowSchema, ParquetLists) {
   //   }
   // }
   {
-    auto element = PrimitiveNode::Make("string", Repetition::OPTIONAL,
+    auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
                                        ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
     auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
     parquet_fields.push_back(
@@ -1005,7 +1005,7 @@ TEST_F(TestConvertArrowSchema, ParquetLists) {
   //   }
   // }
   {
-    auto element = PrimitiveNode::Make("string", Repetition::REQUIRED,
+    auto element = PrimitiveNode::Make("element", Repetition::REQUIRED,
                                        ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
     auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
     parquet_fields.push_back(
@@ -1086,7 +1086,7 @@ TEST_F(TestConvertArrowSchema, ParquetOtherLists) {
   //   }
   // }
   {
-    auto element = PrimitiveNode::Make("string", Repetition::OPTIONAL,
+    auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
                                        ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
     auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
     parquet_fields.push_back(
@@ -1102,7 +1102,7 @@ TEST_F(TestConvertArrowSchema, ParquetOtherLists) {
   //   }
   // }
   {
-    auto element = PrimitiveNode::Make("string", Repetition::OPTIONAL,
+    auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
                                        ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
     auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
     parquet_fields.push_back(
diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h
index 4c532235ca443..0a9864de6266a 100644
--- a/cpp/src/parquet/properties.h
+++ b/cpp/src/parquet/properties.h
@@ -879,8 +879,7 @@ class PARQUET_EXPORT ArrowWriterProperties {
         coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
         truncated_timestamps_allowed_(false),
         store_schema_(false),
-        // TODO: At some point we should flip this.
-        compliant_nested_types_(false),
+        compliant_nested_types_(true),
         engine_version_(V2),
         use_threads_(kArrowDefaultUseThreads),
         executor_(NULLPTR) {}
@@ -935,16 +934,16 @@ class PARQUET_EXPORT ArrowWriterProperties {
   /// \brief When enabled, will not preserve Arrow field names for list types.
   ///
   /// Instead of using the field names Arrow uses for the values array of
-  /// list types (default "item"), will use "entries", as is specified in
+  /// list types (default "item"), will use "element", as is specified in
   /// the Parquet spec.
   ///
-  /// This is disabled by default, but will be enabled by default in future.
+  /// This is enabled by default.
   Builder* enable_compliant_nested_types() {
     compliant_nested_types_ = true;
     return this;
   }

-  /// Preserve Arrow list field name (default behavior).
+  /// Preserve Arrow list field name.
   Builder* disable_compliant_nested_types() {
     compliant_nested_types_ = false;
     return this;
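To make the effect of the properties.h default flip concrete: with compliant nested types enabled, the writer emits the Parquet-spec three-level list encoding and names the inner field "element" instead of Arrow's default "item". A minimal pyarrow sketch for inspecting the schema a file now gets by default (the printed form in the comment is illustrative; exact formatting varies by version):

```python
import pyarrow as pa
import pyarrow.parquet as pq

# An Arrow list<string> column; Arrow names the values field "item" by default.
table = pa.table({"f0": pa.array([["a", "b"], ["c"]], type=pa.list_(pa.string()))})

# With compliant_nested_types now on by default, the written Parquet schema
# follows the spec's three-level structure, roughly:
#   optional group f0 (List) {
#     repeated group list {
#       optional binary element (String);
#     }
#   }
pq.write_table(table, "data.parquet")
print(pq.read_metadata("data.parquet").schema)
```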
diff --git a/docs/source/cpp/parquet.rst b/docs/source/cpp/parquet.rst
index ca555cbbc0d5e..16d0f523f2c4d 100644
--- a/docs/source/cpp/parquet.rst
+++ b/docs/source/cpp/parquet.rst
@@ -317,7 +317,6 @@ There are also Arrow-specific settings that can be configured with
    std::shared_ptr<ArrowWriterProperties> arrow_props = ArrowWriterProperties::Builder()
        .enable_deprecated_int96_timestamps() // default False
        ->store_schema() // default False
-       ->enable_compliant_nested_types() // default False
        ->build();

 These options mostly dictate how Arrow types are converted to Parquet types.
diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx
index f81c6fcb73b32..ad914c77bf31c 100644
--- a/python/pyarrow/_dataset_parquet.pyx
+++ b/python/pyarrow/_dataset_parquet.pyx
@@ -596,7 +596,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
             use_deprecated_int96_timestamps=False,
             coerce_timestamps=None,
             allow_truncated_timestamps=False,
-            use_compliant_nested_type=False,
+            use_compliant_nested_type=True,
         )
         self._set_properties()
         self._set_arrow_properties()
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index c54a243b991ff..2fc0494cbccaf 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -1607,7 +1607,7 @@ cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties(
         coerce_timestamps=None,
         allow_truncated_timestamps=False,
         writer_engine_version=None,
-        use_compliant_nested_type=False,
+        use_compliant_nested_type=True,
         store_schema=True) except *:
     """Arrow writer properties"""
     cdef:
@@ -1704,7 +1704,7 @@ cdef class ParquetWriter(_Weakrefable):
                   column_encoding=None,
                   writer_engine_version=None,
                   data_page_version=None,
-                  use_compliant_nested_type=False,
+                  use_compliant_nested_type=True,
                   encryption_properties=None,
                   write_batch_size=None,
                   dictionary_pagesize_limit=None,
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py
index 018902dde4a93..3fa0d6dadd25a 100644
--- a/python/pyarrow/parquet/core.py
+++ b/python/pyarrow/parquet/core.py
@@ -822,7 +822,7 @@ def _sanitize_table(table, new_schema, flavor):
     The serialized Parquet data page format version to write, defaults to
     1.0. This does not impact the file schema logical types and Arrow to
     Parquet type casting behavior; for that use the "version" option.
-use_compliant_nested_type : bool, default False
+use_compliant_nested_type : bool, default True
     Whether to write compliant Parquet nested type (lists) as defined
     `here <https://github.com/apache/parquet-format/blob/master/
-    LogicalTypes.md#nested-types>`_, defaults to ``False``.
+    LogicalTypes.md#nested-types>`_, defaults to ``True``.
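Since this flips a write-path default, downstream code that depends on the old Arrow-style list field naming now has to opt out explicitly. A short usage sketch of the opt-out (file names are illustrative; the keyword is the same `use_compliant_nested_type` touched throughout this diff):

```python
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"f0": [["a", "b"], ["c"]]})

# Keep the legacy "item" field name, e.g. to stay schema-compatible with
# files written before this default changed.
pq.write_table(table, "legacy.parquet", use_compliant_nested_type=False)

# The same opt-out through the writer class:
with pq.ParquetWriter("legacy2.parquet", table.schema,
                      use_compliant_nested_type=False) as writer:
    writer.write_table(table)
```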
@@ -954,7 +954,7 @@ def __init__(self, where, schema, filesystem=None,
                  column_encoding=None,
                  writer_engine_version=None,
                  data_page_version='1.0',
-                 use_compliant_nested_type=False,
+                 use_compliant_nested_type=True,
                  encryption_properties=None,
                  write_batch_size=None,
                  dictionary_pagesize_limit=None,
@@ -3072,7 +3072,7 @@ def write_table(table, where, row_group_size=None, version='2.4',
                 use_byte_stream_split=False,
                 column_encoding=None,
                 data_page_version='1.0',
-                use_compliant_nested_type=False,
+                use_compliant_nested_type=True,
                 encryption_properties=None,
                 write_batch_size=None,
                 dictionary_pagesize_limit=None,
diff --git a/python/pyarrow/tests/parquet/test_compliant_nested_type.py b/python/pyarrow/tests/parquet/test_compliant_nested_type.py
index b60b0273b21c0..ca1ad7ee32255 100644
--- a/python/pyarrow/tests/parquet/test_compliant_nested_type.py
+++ b/python/pyarrow/tests/parquet/test_compliant_nested_type.py
@@ -64,17 +64,15 @@ def test_write_compliant_nested_type_enable(tempdir,
                                             use_legacy_dataset, test_data):
     # prepare dataframe for testing
     df = pd.DataFrame(data=test_data)
-    # verify that we can read/write pandas df with new flag
+    # verify that we can read/write pandas df with new flag (default behaviour)
     _roundtrip_pandas_dataframe(df,
-                                write_kwargs={
-                                    'use_compliant_nested_type': True},
+                                write_kwargs={},
                                 use_legacy_dataset=use_legacy_dataset)
     # Write to a parquet file with compliant nested type
     table = pa.Table.from_pandas(df, preserve_index=False)
     path = str(tempdir / 'data.parquet')
     with pq.ParquetWriter(path, table.schema,
-                          use_compliant_nested_type=True,
                           version='2.6') as writer:
         writer.write_table(table)
     # Read back as a table
@@ -86,8 +84,7 @@ def test_write_compliant_nested_type_enable(tempdir,

     # Verify that the new table can be read/written correctly
     _check_roundtrip(new_table,
-                     use_legacy_dataset=use_legacy_dataset,
-                     use_compliant_nested_type=True)
+                     use_legacy_dataset=use_legacy_dataset)


 @pytest.mark.pandas
@@ -97,14 +94,16 @@ def test_write_compliant_nested_type_disable(tempdir,
                                              use_legacy_dataset, test_data):
     # prepare dataframe for testing
     df = pd.DataFrame(data=test_data)
-    # verify that we can read/write with new flag disabled (default behaviour)
-    _roundtrip_pandas_dataframe(df, write_kwargs={},
-                                use_legacy_dataset=use_legacy_dataset)
+    # verify that we can read/write with new flag disabled
+    _roundtrip_pandas_dataframe(df, write_kwargs={
+                                    'use_compliant_nested_type': False},
+                                use_legacy_dataset=use_legacy_dataset)

     # Write to a parquet file while disabling compliant nested type
     table = pa.Table.from_pandas(df, preserve_index=False)
     path = str(tempdir / 'data.parquet')
-    with pq.ParquetWriter(path, table.schema, version='2.6') as writer:
+    with pq.ParquetWriter(path, table.schema, version='2.6',
+                          use_compliant_nested_type=False) as writer:
         writer.write_table(table)

     new_table = _read_table(path)
diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py
index 13d94b6a2563b..109d82831c5cd 100644
--- a/python/pyarrow/tests/parquet/test_data_types.py
+++ b/python/pyarrow/tests/parquet/test_data_types.py
@@ -163,7 +163,7 @@ def test_direct_read_dictionary_subfield(use_legacy_dataset):
     pq.write_table(table, bio)
     contents = bio.getvalue()
     result = pq.read_table(pa.BufferReader(contents),
-                           read_dictionary=['f0.list.item'],
+                           read_dictionary=['f0.list.element'],
                            use_legacy_dataset=use_legacy_dataset)

     arr = pa.array(data[0])
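The test_data_types.py tweak is worth calling out: dotted column paths that address list subfields follow the written schema, so options like `read_dictionary` need the "element" segment for files written with the new default. A minimal sketch mirroring the updated test (variable names are illustrative):

```python
import io
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"f0": [["x", "y"], ["x"]]})
bio = io.BytesIO()
pq.write_table(table, bio)

# With compliant nested types, the inner list field path is "f0.list.element";
# files written with use_compliant_nested_type=False keep "f0.list.item".
result = pq.read_table(pa.BufferReader(bio.getvalue()),
                       read_dictionary=["f0.list.element"])
print(result.schema.field("f0"))
```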