Skip to content

Commit

Permalink
GH-29781: [C++][Parquet] Switch to use compliant nested types by default (#35146)
Browse files Browse the repository at this point in the history

### Rationale for this change

This has been a long-standing TODO.

### What changes are included in this PR?

### Are these changes tested?

### Are there any user-facing changes?

**This PR includes breaking changes to public APIs.**
* Closes: #29781

Lead-authored-by: Will Jones <willjones127@gmail.com>
Co-authored-by: Antoine Pitrou <pitrou@free.fr>
Signed-off-by: Antoine Pitrou <antoine@python.org>
  • Loading branch information
wjones127 and pitrou authored May 12, 2023
1 parent 1624d5a commit e324f9a
Show file tree
Hide file tree
Showing 8 changed files with 24 additions and 27 deletions.
8 changes: 4 additions & 4 deletions cpp/src/parquet/arrow/arrow_schema_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -988,7 +988,7 @@ TEST_F(TestConvertArrowSchema, ParquetLists) {
// }
// }
{
auto element = PrimitiveNode::Make("string", Repetition::OPTIONAL,
auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
parquet_fields.push_back(
Expand All @@ -1005,7 +1005,7 @@ TEST_F(TestConvertArrowSchema, ParquetLists) {
// }
// }
{
auto element = PrimitiveNode::Make("string", Repetition::REQUIRED,
auto element = PrimitiveNode::Make("element", Repetition::REQUIRED,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
parquet_fields.push_back(
Expand Down Expand Up @@ -1086,7 +1086,7 @@ TEST_F(TestConvertArrowSchema, ParquetOtherLists) {
// }
// }
{
auto element = PrimitiveNode::Make("string", Repetition::OPTIONAL,
auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
parquet_fields.push_back(
Expand All @@ -1102,7 +1102,7 @@ TEST_F(TestConvertArrowSchema, ParquetOtherLists) {
// }
// }
{
auto element = PrimitiveNode::Make("string", Repetition::OPTIONAL,
auto element = PrimitiveNode::Make("element", Repetition::OPTIONAL,
ParquetType::BYTE_ARRAY, ConvertedType::UTF8);
auto list = GroupNode::Make("list", Repetition::REPEATED, {element});
parquet_fields.push_back(
Expand Down
9 changes: 4 additions & 5 deletions cpp/src/parquet/properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -879,8 +879,7 @@ class PARQUET_EXPORT ArrowWriterProperties {
coerce_timestamps_unit_(::arrow::TimeUnit::SECOND),
truncated_timestamps_allowed_(false),
store_schema_(false),
// TODO: At some point we should flip this.
compliant_nested_types_(false),
compliant_nested_types_(true),
engine_version_(V2),
use_threads_(kArrowDefaultUseThreads),
executor_(NULLPTR) {}
Expand Down Expand Up @@ -935,16 +934,16 @@ class PARQUET_EXPORT ArrowWriterProperties {
/// \brief When enabled, will not preserve Arrow field names for list types.
///
/// Instead of using the field names Arrow uses for the values array of
/// list types (default "item"), will use "entries", as is specified in
/// list types (default "item"), will use "element", as is specified in
/// the Parquet spec.
///
/// This is disabled by default, but will be enabled by default in future.
/// This is enabled by default.
Builder* enable_compliant_nested_types() {
compliant_nested_types_ = true;
return this;
}

/// Preserve Arrow list field name (default behavior).
/// Preserve Arrow list field name.
Builder* disable_compliant_nested_types() {
compliant_nested_types_ = false;
return this;
Expand Down
1 change: 0 additions & 1 deletion docs/source/cpp/parquet.rst
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,6 @@ There are also Arrow-specific settings that can be configured with
std::shared_ptr<ArrowWriterProperties> arrow_props = ArrowWriterProperties::Builder()
.enable_deprecated_int96_timestamps() // default False
->store_schema() // default False
->enable_compliant_nested_types() // default False
->build();
These options mostly dictate how Arrow types are converted to Parquet types.
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/_dataset_parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -596,7 +596,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
use_deprecated_int96_timestamps=False,
coerce_timestamps=None,
allow_truncated_timestamps=False,
use_compliant_nested_type=False,
use_compliant_nested_type=True,
)
self._set_properties()
self._set_arrow_properties()
Expand Down
4 changes: 2 additions & 2 deletions python/pyarrow/_parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1607,7 +1607,7 @@ cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties(
coerce_timestamps=None,
allow_truncated_timestamps=False,
writer_engine_version=None,
use_compliant_nested_type=False,
use_compliant_nested_type=True,
store_schema=True) except *:
"""Arrow writer properties"""
cdef:
Expand Down Expand Up @@ -1704,7 +1704,7 @@ cdef class ParquetWriter(_Weakrefable):
column_encoding=None,
writer_engine_version=None,
data_page_version=None,
use_compliant_nested_type=False,
use_compliant_nested_type=True,
encryption_properties=None,
write_batch_size=None,
dictionary_pagesize_limit=None,
Expand Down
6 changes: 3 additions & 3 deletions python/pyarrow/parquet/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -822,7 +822,7 @@ def _sanitize_table(table, new_schema, flavor):
The serialized Parquet data page format version to write, defaults to
1.0. This does not impact the file schema logical types and Arrow to
Parquet type casting behavior; for that use the "version" option.
use_compliant_nested_type : bool, default False
use_compliant_nested_type : bool, default True
Whether to write compliant Parquet nested type (lists) as defined
`here <https://github.com/apache/parquet-format/blob/master/
LogicalTypes.md#nested-types>`_, defaults to ``True``.
Expand Down Expand Up @@ -954,7 +954,7 @@ def __init__(self, where, schema, filesystem=None,
column_encoding=None,
writer_engine_version=None,
data_page_version='1.0',
use_compliant_nested_type=False,
use_compliant_nested_type=True,
encryption_properties=None,
write_batch_size=None,
dictionary_pagesize_limit=None,
Expand Down Expand Up @@ -3072,7 +3072,7 @@ def write_table(table, where, row_group_size=None, version='2.4',
use_byte_stream_split=False,
column_encoding=None,
data_page_version='1.0',
use_compliant_nested_type=False,
use_compliant_nested_type=True,
encryption_properties=None,
write_batch_size=None,
dictionary_pagesize_limit=None,
Expand Down
19 changes: 9 additions & 10 deletions python/pyarrow/tests/parquet/test_compliant_nested_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,17 +64,15 @@ def test_write_compliant_nested_type_enable(tempdir,
use_legacy_dataset, test_data):
# prepare dataframe for testing
df = pd.DataFrame(data=test_data)
# verify that we can read/write pandas df with new flag
# verify that we can read/write pandas df with new flag (default behaviour)
_roundtrip_pandas_dataframe(df,
write_kwargs={
'use_compliant_nested_type': True},
write_kwargs={},
use_legacy_dataset=use_legacy_dataset)

# Write to a parquet file with compliant nested type
table = pa.Table.from_pandas(df, preserve_index=False)
path = str(tempdir / 'data.parquet')
with pq.ParquetWriter(path, table.schema,
use_compliant_nested_type=True,
version='2.6') as writer:
writer.write_table(table)
# Read back as a table
Expand All @@ -86,8 +84,7 @@ def test_write_compliant_nested_type_enable(tempdir,

# Verify that the new table can be read/written correctly
_check_roundtrip(new_table,
use_legacy_dataset=use_legacy_dataset,
use_compliant_nested_type=True)
use_legacy_dataset=use_legacy_dataset)


@pytest.mark.pandas
Expand All @@ -97,14 +94,16 @@ def test_write_compliant_nested_type_disable(tempdir,
use_legacy_dataset, test_data):
# prepare dataframe for testing
df = pd.DataFrame(data=test_data)
# verify that we can read/write with new flag disabled (default behaviour)
_roundtrip_pandas_dataframe(df, write_kwargs={},
use_legacy_dataset=use_legacy_dataset)
# verify that we can read/write with new flag disabled
_roundtrip_pandas_dataframe(df, write_kwargs={
'use_compliant_nested_type': False},
use_legacy_dataset=use_legacy_dataset)

# Write to a parquet file while disabling compliant nested type
table = pa.Table.from_pandas(df, preserve_index=False)
path = str(tempdir / 'data.parquet')
with pq.ParquetWriter(path, table.schema, version='2.6') as writer:
with pq.ParquetWriter(path, table.schema, version='2.6',
use_compliant_nested_type=False) as writer:
writer.write_table(table)
new_table = _read_table(path)

Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/tests/parquet/test_data_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def test_direct_read_dictionary_subfield(use_legacy_dataset):
pq.write_table(table, bio)
contents = bio.getvalue()
result = pq.read_table(pa.BufferReader(contents),
read_dictionary=['f0.list.item'],
read_dictionary=['f0.list.element'],
use_legacy_dataset=use_legacy_dataset)

arr = pa.array(data[0])
Expand Down

0 comments on commit e324f9a

Please sign in to comment.