From 42a0e099126c2e7e7c649526cda5b789e381147d Mon Sep 17 00:00:00 2001
From: Adam Reeve
Date: Wed, 30 Aug 2023 21:57:24 +1200
Subject: [PATCH] GH-37453: [C++][Parquet] Performance fix for WriteBatch (#37454)

### Rationale for this change

Reduces the time taken for `TypedColumnWriter::WriteBatch`, which regressed with #35230

### What changes are included in this PR?

This change computes the value for `pages_change_on_record_boundaries` once when a `TypedColumnWriter` is constructed rather than on every call to `WriteBatch`.

### Are these changes tested?

This doesn't change behaviour so should be covered by existing tests.

### Are there any user-facing changes?

No

* Closes: #37453

Authored-by: Adam Reeve
Signed-off-by: Antoine Pitrou
---
 cpp/src/parquet/column_writer.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index e34420b9f6e79..3fca5542a0733 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -1219,6 +1219,9 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
       page_statistics_ = MakeStatistics(descr_, allocator_);
       chunk_statistics_ = MakeStatistics(descr_, allocator_);
     }
+    pages_change_on_record_boundaries_ =
+        properties->data_page_version() == ParquetDataPageVersion::V2 ||
+        properties->page_index_enabled(descr_->path());
   }
 
   int64_t Close() override { return ColumnWriterImpl::Close(); }
@@ -1386,8 +1389,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
   const WriterProperties* properties() override { return properties_; }
 
   bool pages_change_on_record_boundaries() const {
-    return properties_->data_page_version() == ParquetDataPageVersion::V2 ||
-           properties_->page_index_enabled(descr_->path());
+    return pages_change_on_record_boundaries_;
   }
 
  private:
@@ -1402,6 +1404,7 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter<
   DictEncoder* current_dict_encoder_;
   std::shared_ptr page_statistics_;
   std::shared_ptr chunk_statistics_;
+  bool pages_change_on_record_boundaries_;
 
   // If writing a sequence of ::arrow::DictionaryArray to the writer, we keep the
   // dictionary passed to DictEncoder::PutDictionary so we can check