
Commit 564000c
chore: update Python and docs
wjones127 committed Apr 17, 2023
1 parent e75a5c3 commit 564000c
Showing 6 changed files with 17 additions and 19 deletions.

cpp/src/parquet/properties.h (4 changes: 2 additions & 2 deletions)

@@ -877,13 +877,13 @@ class PARQUET_EXPORT ArrowWriterProperties {
     /// list types (default "item"), will use "entries", as is specified in
     /// the Parquet spec.
     ///
-    /// This is disabled by default, but will be enabled by default in future.
+    /// This is enabled by default.
     Builder* enable_compliant_nested_types() {
      compliant_nested_types_ = true;
      return this;
     }
 
-    /// Preserve Arrow list field name (default behavior).
+    /// Preserve Arrow list field name.
     Builder* disable_compliant_nested_types() {
      compliant_nested_types_ = false;
      return this;
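
The effect of the new C++ default is easiest to see from Python. Below is a minimal sketch (the file path is illustrative; the compliant layout names list elements per the Parquet LIST spec instead of Arrow's default "item"):

import pyarrow as pa
import pyarrow.parquet as pq

# A table with a nested (list) column.
table = pa.table({"scores": pa.array([[1, 2], [3]], type=pa.list_(pa.int64()))})

# Compliant nested types are now written by default; no keyword needed.
pq.write_table(table, "scores.parquet")  # "scores.parquet" is an illustrative path

# Inspect the physical Parquet schema: the list element is named per the
# Parquet spec rather than with Arrow's default "item".
print(pq.ParquetFile("scores.parquet").schema)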

docs/source/cpp/parquet.rst (1 change: 0 additions & 1 deletion)

@@ -317,7 +317,6 @@ There are also Arrow-specific settings that can be configured with
    std::shared_ptr<ArrowWriterProperties> arrow_props = ArrowWriterProperties::Builder()
        .enable_deprecated_int96_timestamps() // default False
        ->store_schema() // default False
-       ->enable_compliant_nested_types() // default False
        ->build();
 
 These options mostly dictate how Arrow types are converted to Parquet types.

python/pyarrow/_dataset_parquet.pyx (2 changes: 1 addition & 1 deletion)

@@ -596,7 +596,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
             use_deprecated_int96_timestamps=False,
             coerce_timestamps=None,
             allow_truncated_timestamps=False,
-            use_compliant_nested_type=False,
+            use_compliant_nested_type=True,
         )
         self._set_properties()
         self._set_arrow_properties()
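
Because `ParquetFileWriteOptions` seeds the same keyword, dataset writes follow suit. A sketch of picking up the new default and opting out at the dataset layer (assuming `make_write_options` passes the keyword through as usual):

import pyarrow.dataset as ds

fmt = ds.ParquetFileFormat()

# Dataset writes now produce compliant list names by default ...
default_opts = fmt.make_write_options()

# ... and the old Arrow naming remains one keyword away.
legacy_opts = fmt.make_write_options(use_compliant_nested_type=False)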

python/pyarrow/_parquet.pyx (4 changes: 2 additions & 2 deletions)

@@ -1607,7 +1607,7 @@ cdef shared_ptr[ArrowWriterProperties] _create_arrow_writer_properties(
         coerce_timestamps=None,
         allow_truncated_timestamps=False,
         writer_engine_version=None,
-        use_compliant_nested_type=False,
+        use_compliant_nested_type=True,
         store_schema=True) except *:
     """Arrow writer properties"""
     cdef:
@@ -1704,7 +1704,7 @@ cdef class ParquetWriter(_Weakrefable):
             column_encoding=None,
             writer_engine_version=None,
             data_page_version=None,
-            use_compliant_nested_type=False,
+            use_compliant_nested_type=True,
             encryption_properties=None,
             write_batch_size=None,
             dictionary_pagesize_limit=None,
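
`ParquetWriter` picks up the same default, so callers that passed `use_compliant_nested_type=True` explicitly can now drop the keyword. A short sketch (path is illustrative):

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"values": [[1, 2], [3]]})

# No keyword needed any more; pass use_compliant_nested_type=False to opt out.
with pq.ParquetWriter("out.parquet", table.schema) as writer:
    writer.write_table(table)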

python/pyarrow/parquet/core.py (6 changes: 3 additions & 3 deletions)

@@ -822,7 +822,7 @@ def _sanitize_table(table, new_schema, flavor):
     The serialized Parquet data page format version to write, defaults to
     1.0. This does not impact the file schema logical types and Arrow to
     Parquet type casting behavior; for that use the "version" option.
-use_compliant_nested_type : bool, default False
+use_compliant_nested_type : bool, default True
     Whether to write compliant Parquet nested type (lists) as defined
     `here <https://github.com/apache/parquet-format/blob/master/
     LogicalTypes.md#nested-types>`_, defaults to ``False``.
@@ -954,7 +954,7 @@ def __init__(self, where, schema, filesystem=None,
                  column_encoding=None,
                  writer_engine_version=None,
                  data_page_version='1.0',
-                 use_compliant_nested_type=False,
+                 use_compliant_nested_type=True,
                  encryption_properties=None,
                  write_batch_size=None,
                  dictionary_pagesize_limit=None,
@@ -3072,7 +3072,7 @@ def write_table(table, where, row_group_size=None, version='2.4',
                 use_byte_stream_split=False,
                 column_encoding=None,
                 data_page_version='1.0',
-                use_compliant_nested_type=False,
+                use_compliant_nested_type=True,
                 encryption_properties=None,
                 write_batch_size=None,
                 dictionary_pagesize_limit=None,
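
At the top-level `pyarrow.parquet` API the change looks like this (a minimal sketch; paths are illustrative):

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"values": [[1, 2], [3]]})

# New default: spec-compliant nested type naming.
pq.write_table(table, "compliant.parquet")

# Restore the previous Arrow naming ("item") explicitly.
pq.write_table(table, "arrow_names.parquet", use_compliant_nested_type=False)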

python/pyarrow/tests/parquet/test_compliant_nested_type.py (19 changes: 9 additions & 10 deletions)

@@ -64,17 +64,15 @@ def test_write_compliant_nested_type_enable(tempdir,
                                             use_legacy_dataset, test_data):
     # prepare dataframe for testing
     df = pd.DataFrame(data=test_data)
-    # verify that we can read/write pandas df with new flag
+    # verify that we can read/write pandas df with new flag (default behaviour)
     _roundtrip_pandas_dataframe(df,
-                                write_kwargs={
-                                    'use_compliant_nested_type': True},
+                                write_kwargs={},
                                 use_legacy_dataset=use_legacy_dataset)
 
     # Write to a parquet file with compliant nested type
     table = pa.Table.from_pandas(df, preserve_index=False)
     path = str(tempdir / 'data.parquet')
     with pq.ParquetWriter(path, table.schema,
-                          use_compliant_nested_type=True,
                           version='2.6') as writer:
         writer.write_table(table)
     # Read back as a table
@@ -86,8 +84,7 @@ def test_write_compliant_nested_type_enable(tempdir,
 
     # Verify that the new table can be read/written correctly
     _check_roundtrip(new_table,
-                     use_legacy_dataset=use_legacy_dataset,
-                     use_compliant_nested_type=True)
+                     use_legacy_dataset=use_legacy_dataset)
 
 
 @pytest.mark.pandas
@@ -97,14 +94,16 @@ def test_write_compliant_nested_type_disable(tempdir,
                                              use_legacy_dataset, test_data):
     # prepare dataframe for testing
     df = pd.DataFrame(data=test_data)
-    # verify that we can read/write with new flag disabled (default behaviour)
-    _roundtrip_pandas_dataframe(df, write_kwargs={},
-                                use_legacy_dataset=use_legacy_dataset)
+    # verify that we can read/write with new flag disabled
+    _roundtrip_pandas_dataframe(df, write_kwargs={
+                                    'use_compliant_nested_type': False},
+                                use_legacy_dataset=use_legacy_dataset)
 
     # Write to a parquet file while disabling compliant nested type
     table = pa.Table.from_pandas(df, preserve_index=False)
     path = str(tempdir / 'data.parquet')
-    with pq.ParquetWriter(path, table.schema, version='2.6') as writer:
+    with pq.ParquetWriter(path, table.schema, version='2.6',
+                          use_compliant_nested_type=False) as writer:
         writer.write_table(table)
     new_table = _read_table(path)
 
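
One reason the tests can drop the explicit keyword without losing coverage: `store_schema` defaults to `True` (visible in `_create_arrow_writer_properties` above), so the original Arrow schema is embedded in the file and restored on read. A sketch of that round trip, with the assertion stated as an assumption about the stored-schema behavior:

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"values": [[1, 2], [3]]})
pq.write_table(table, "roundtrip.parquet")  # illustrative path

# The physical file uses compliant list naming, but the stored Arrow schema
# lets pyarrow restore the original field names on read (assumed behavior
# given store_schema=True).
assert pq.read_table("roundtrip.parquet").schema.equals(table.schema)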
