From d87a7a84f3b77ae46b4bb674211497ad33f3906f Mon Sep 17 00:00:00 2001 From: mwish Date: Wed, 11 Oct 2023 01:56:36 +0800 Subject: [PATCH] GH-36882: [C++][Parquet] Use RLE as BOOLEAN default encoding when both data page and version is V2 (#38163) ### Rationale for this change Only use RLE as BOOLEAN default encoding when data page is V2. Previous patch ( https://github.com/apache/arrow/pull/36955 ) set RLE encoding for Boolean type by default. However, parquet-cpp might write format v2 file with page v1 by default. This might cause parquet-cpp generating RLE encoding for boolean type by default. As https://issues.apache.org/jira/browse/PARQUET-2222 says, we still need some talks about that. So, we: 1. Still allow writing RLE on DataPage V2. This keeps same as parquet rust 2. If DataPage V1 is used, don't use RLE as default Boolean encoding. ### What changes are included in this PR? Only use RLE as BOOLEAN default encoding when both data page and version is V2. ### Are these changes tested? Yes ### Are there any user-facing changes? RLE encoding change for Boolean. * Closes: #36882 Lead-authored-by: mwish Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/parquet/column_writer.cc | 3 +- cpp/src/parquet/column_writer_test.cc | 78 +++++++++++++------ cpp/src/parquet/metadata.cc | 22 +++--- python/pyarrow/tests/parquet/test_metadata.py | 2 +- 4 files changed, 66 insertions(+), 39 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index be1881d00e035..88829ef5d98f9 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -2336,7 +2336,8 @@ std::shared_ptr ColumnWriter::Make(ColumnChunkMetaDataBuilder* met Encoding::type encoding = properties->encoding(descr->path()); if (encoding == Encoding::UNKNOWN) { encoding = (descr->physical_type() == Type::BOOLEAN && - properties->version() != ParquetVersion::PARQUET_1_0) + properties->version() != ParquetVersion::PARQUET_1_0 && + properties->data_page_version() == ParquetDataPageVersion::V2) ? Encoding::RLE : Encoding::PLAIN; } diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index b2fd9cf930823..0d354f5c1ac0c 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -98,10 +98,11 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { int64_t output_size = SMALL_SIZE, const ColumnProperties& column_properties = ColumnProperties(), const ParquetVersion::type version = ParquetVersion::PARQUET_1_0, + const ParquetDataPageVersion data_page_version = ParquetDataPageVersion::V1, bool enable_checksum = false) { sink_ = CreateOutputStream(); WriterProperties::Builder wp_builder; - wp_builder.version(version); + wp_builder.version(version)->data_page_version(data_page_version); if (column_properties.encoding() == Encoding::PLAIN_DICTIONARY || column_properties.encoding() == Encoding::RLE_DICTIONARY) { wp_builder.enable_dictionary(); @@ -176,7 +177,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { ASSERT_NO_FATAL_FAILURE(this->ReadAndCompare(compression, num_rows, enable_checksum)); } - void TestDictionaryFallbackEncoding(ParquetVersion::type version) { + void TestDictionaryFallbackEncoding(ParquetVersion::type version, + ParquetDataPageVersion data_page_version) { this->GenerateData(VERY_LARGE_SIZE); ColumnProperties column_properties; column_properties.set_dictionary_enabled(true); @@ -187,7 +189,8 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { column_properties.set_encoding(Encoding::RLE_DICTIONARY); } - auto writer = this->BuildWriter(VERY_LARGE_SIZE, column_properties, version); + auto writer = + this->BuildWriter(VERY_LARGE_SIZE, column_properties, version, data_page_version); writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_); writer->Close(); @@ -204,13 +207,15 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { if (this->type_num() == Type::BOOLEAN) { // Dictionary encoding is not allowed for boolean type std::set expected; - if (version == ParquetVersion::PARQUET_1_0) { + if (version != ParquetVersion::PARQUET_1_0 && + data_page_version == ParquetDataPageVersion::V2) { + // There is only 1 encoding (RLE) in a fallback case for version 2.0 and data page + // v2 enabled. + expected = {Encoding::RLE}; + } else { // There are 2 encodings (PLAIN, RLE) in a non dictionary encoding case for - // version 1.0. Note that RLE is used for DL/RL. + // version 1.0 or data page v1. Note that RLE is used for DL/RL. expected = {Encoding::PLAIN, Encoding::RLE}; - } else { - // There is only 1 encoding (RLE) in a fallback case for version 2.0 - expected = {Encoding::RLE}; } ASSERT_EQ(encodings, expected); } else if (version == ParquetVersion::PARQUET_1_0) { @@ -231,7 +236,10 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { this->metadata_encoding_stats(); if (this->type_num() == Type::BOOLEAN) { ASSERT_EQ(encoding_stats[0].encoding, - version == ParquetVersion::PARQUET_1_0 ? Encoding::PLAIN : Encoding::RLE); + version != ParquetVersion::PARQUET_1_0 && + data_page_version == ParquetDataPageVersion::V2 + ? Encoding::RLE + : Encoding::PLAIN); ASSERT_EQ(encoding_stats[0].page_type, PageType::DATA_PAGE); } else if (version == ParquetVersion::PARQUET_1_0) { std::vector expected( @@ -262,8 +270,9 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { enable_statistics); column_properties.set_codec_options( std::make_shared(compression_level)); - std::shared_ptr> writer = this->BuildWriter( - num_rows, column_properties, ParquetVersion::PARQUET_1_0, enable_checksum); + std::shared_ptr> writer = + this->BuildWriter(num_rows, column_properties, ParquetVersion::PARQUET_1_0, + ParquetDataPageVersion::V1, enable_checksum); writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_); // The behaviour should be independent from the number of Close() calls writer->Close(); @@ -281,8 +290,9 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { enable_statistics); column_properties.set_codec_options( std::make_shared(compression_level)); - std::shared_ptr> writer = this->BuildWriter( - num_rows, column_properties, ParquetVersion::PARQUET_1_0, enable_checksum); + std::shared_ptr> writer = + this->BuildWriter(num_rows, column_properties, ParquetVersion::PARQUET_1_0, + ParquetDataPageVersion::V1, enable_checksum); writer->WriteBatchSpaced(this->values_.size(), nullptr, nullptr, valid_bits.data(), 0, this->values_ptr_); // The behaviour should be independent from the number of Close() calls @@ -693,12 +703,19 @@ TYPED_TEST(TestPrimitiveWriter, RequiredLargeChunk) { // Test cases for dictionary fallback encoding TYPED_TEST(TestPrimitiveWriter, DictionaryFallbackVersion1_0) { - this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_1_0); + this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_1_0, + ParquetDataPageVersion::V1); } TYPED_TEST(TestPrimitiveWriter, DictionaryFallbackVersion2_0) { - this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_4); - this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_6); + this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_4, + ParquetDataPageVersion::V1); + this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_4, + ParquetDataPageVersion::V2); + this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_6, + ParquetDataPageVersion::V1); + this->TestDictionaryFallbackEncoding(ParquetVersion::PARQUET_2_6, + ParquetDataPageVersion::V2); } TEST(TestWriter, NullValuesBuffer) { @@ -767,10 +784,13 @@ TEST_F(TestValuesWriterInt32Type, OptionalNullValueChunk) { class TestBooleanValuesWriter : public TestPrimitiveWriter { public: - void TestWithEncoding(ParquetVersion::type version, Encoding::type encoding) { + void TestWithEncoding(ParquetVersion::type version, + ParquetDataPageVersion data_page_version, + const std::set& expected_encodings) { this->SetUpSchema(Repetition::REQUIRED); - auto writer = this->BuildWriter(SMALL_SIZE, ColumnProperties(), version, - /*enable_checksum*/ false); + auto writer = + this->BuildWriter(SMALL_SIZE, ColumnProperties(), version, data_page_version, + /*enable_checksum*/ false); for (int i = 0; i < SMALL_SIZE; i++) { bool value = (i % 2 == 0) ? true : false; writer->WriteBatch(1, nullptr, nullptr, &value); @@ -780,21 +800,29 @@ class TestBooleanValuesWriter : public TestPrimitiveWriter { for (int i = 0; i < SMALL_SIZE; i++) { ASSERT_EQ((i % 2 == 0) ? true : false, this->values_out_[i]) << i; } - const auto& encodings = this->metadata_encodings(); - auto iter = std::find(encodings.begin(), encodings.end(), encoding); - ASSERT_TRUE(iter != encodings.end()); + auto metadata_encodings = this->metadata_encodings(); + std::set metadata_encodings_set{metadata_encodings.begin(), + metadata_encodings.end()}; + EXPECT_EQ(expected_encodings, metadata_encodings_set); } }; // PARQUET-764 // Correct bitpacking for boolean write at non-byte boundaries TEST_F(TestBooleanValuesWriter, AlternateBooleanValues) { - TestWithEncoding(ParquetVersion::PARQUET_1_0, Encoding::PLAIN); + for (auto data_page_version : + {ParquetDataPageVersion::V1, ParquetDataPageVersion::V2}) { + TestWithEncoding(ParquetVersion::PARQUET_1_0, data_page_version, + {Encoding::PLAIN, Encoding::RLE}); + } } -// Default encoding for boolean is RLE when using V2 pages +// Default encoding for boolean is RLE when both V2 format and V2 pages enabled. TEST_F(TestBooleanValuesWriter, RleEncodedBooleanValues) { - TestWithEncoding(ParquetVersion::PARQUET_2_4, Encoding::RLE); + TestWithEncoding(ParquetVersion::PARQUET_2_4, ParquetDataPageVersion::V1, + {Encoding::PLAIN, Encoding::RLE}); + TestWithEncoding(ParquetVersion::PARQUET_2_4, ParquetDataPageVersion::V2, + {Encoding::RLE}); } // PARQUET-979 diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 8697cd52515ae..f43187c2dd4e5 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -1501,8 +1502,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { thrift_encoding_stats.push_back(data_enc_stat); add_encoding(data_encoding); } - column_chunk_->meta_data.__set_encodings(thrift_encodings); - column_chunk_->meta_data.__set_encoding_stats(thrift_encoding_stats); + column_chunk_->meta_data.__set_encodings(std::move(thrift_encodings)); + column_chunk_->meta_data.__set_encoding_stats(std::move(thrift_encoding_stats)); const auto& encrypt_md = properties_->column_encryption_properties(column_->path()->ToDotString()); @@ -1521,7 +1522,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { ccmd.__isset.ENCRYPTION_WITH_COLUMN_KEY = true; ccmd.__set_ENCRYPTION_WITH_COLUMN_KEY(eck); } - column_chunk_->__set_crypto_metadata(ccmd); + column_chunk_->__set_crypto_metadata(std::move(ccmd)); bool encrypted_footer = properties_->file_encryption_properties()->encrypted_footer(); @@ -1601,16 +1602,13 @@ std::unique_ptr ColumnChunkMetaDataBuilder::Make( ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder( std::shared_ptr props, const ColumnDescriptor* column) - : impl_{std::unique_ptr( - new ColumnChunkMetaDataBuilderImpl(std::move(props), column))} {} + : impl_{std::make_unique(std::move(props), column)} {} ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilder( std::shared_ptr props, const ColumnDescriptor* column, void* contents) - : impl_{std::unique_ptr( - new ColumnChunkMetaDataBuilderImpl( - std::move(props), column, - reinterpret_cast(contents)))} {} + : impl_{std::make_unique( + std::move(props), column, reinterpret_cast(contents))} {} ColumnChunkMetaDataBuilder::~ColumnChunkMetaDataBuilder() = default; @@ -1782,7 +1780,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { key_value_metadata_(std::move(key_value_metadata)) { if (properties_->file_encryption_properties() != nullptr && properties_->file_encryption_properties()->encrypted_footer()) { - crypto_metadata_.reset(new format::FileCryptoMetaData()); + crypto_metadata_ = std::make_unique(); } } @@ -1956,8 +1954,8 @@ std::unique_ptr FileMetaDataBuilder::Make( FileMetaDataBuilder::FileMetaDataBuilder( const SchemaDescriptor* schema, std::shared_ptr props, std::shared_ptr key_value_metadata) - : impl_{std::unique_ptr(new FileMetaDataBuilderImpl( - schema, std::move(props), std::move(key_value_metadata)))} {} + : impl_{std::make_unique(schema, std::move(props), + std::move(key_value_metadata))} {} FileMetaDataBuilder::~FileMetaDataBuilder() = default; diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index 82182b0449afd..3efaf1dbf5526 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -128,7 +128,7 @@ def test_parquet_metadata_api(): assert col_meta.is_stats_set is True assert isinstance(col_meta.statistics, pq.Statistics) assert col_meta.compression == 'SNAPPY' - assert set(col_meta.encodings) == {'RLE'} + assert set(col_meta.encodings) == {'PLAIN', 'RLE'} assert col_meta.has_dictionary_page is False assert col_meta.dictionary_page_offset is None assert col_meta.data_page_offset > 0