From 3af69c88f590152ccf7760b74dbd6f64c1b10123 Mon Sep 17 00:00:00 2001 From: Hattonuri <53221537+Hattonuri@users.noreply.github.com> Date: Tue, 9 Jan 2024 05:46:37 +0300 Subject: [PATCH] GH-39398: [C++][Parquet] Use std::count in ColumnReader ReadLevels (#39397) ### Rationale for this change I've found that for-loop here https://github.com/apache/arrow/blob/7c3480e2f028f5881242f227f42155cf833efee7/cpp/src/parquet/column_reader.cc#L1055-L1073 transforms into 0xc0c2f0 inc %rdx 0xc0c2f3 cmp %rax,%rdx 0xc0c2f6 jge 0xc0c30c 0xc0c2f8 cmp %cx,(%r14,%rdx,2) 0xc0c2fd jne 0xc0c2f0 0xc0c2ff incq 0x0(%rbp) 0xc0c303 mov (%rbx),%rax 0xc0c306 jmp 0xc0c2f0 That means that it uses iteration element by element and changes reference with incq I think that the reason is that values_to_read and num_def_levels are not set as restrict. So the compiler can not optimize this to a more efficient way(for example using simd) On my flamegraph this part showed ~10% of time spent In this file there also some for loops which could easily be changed to std::count, but they do not touch references and I don't know the reason why std::count was not used in the all cpp/src/parquet/ directory - so I didn't change much ### What changes are included in this PR? Using `std::count` in `parquet/column_reader.cc` to avoid loop not being optimized ### Are these changes tested? They are tested with unittest but not benched because I don't know what bench will show performance rise here( ### Are there any user-facing changes? * Closes: #39398 Authored-by: Dmitry Stasenko Signed-off-by: mwish --- cpp/src/parquet/column_reader.cc | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 99978e283b46a..f5d9734aa1e01 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -1062,11 +1062,8 @@ class TypedColumnReaderImpl : public TypedColumnReader, *num_def_levels = this->ReadDefinitionLevels(batch_size, def_levels); // TODO(wesm): this tallying of values-to-decode can be performed with better // cache-efficiency if fused with the level decoding. - for (int64_t i = 0; i < *num_def_levels; ++i) { - if (def_levels[i] == this->max_def_level_) { - ++(*values_to_read); - } - } + *values_to_read += + std::count(def_levels, def_levels + *num_def_levels, this->max_def_level_); } else { // Required field, read all values *values_to_read = batch_size; @@ -1195,12 +1192,8 @@ int64_t TypedColumnReaderImpl::ReadBatchSpaced( const bool has_spaced_values = HasSpacedValues(this->descr_); int64_t null_count = 0; if (!has_spaced_values) { - int values_to_read = 0; - for (int64_t i = 0; i < num_def_levels; ++i) { - if (def_levels[i] == this->max_def_level_) { - ++values_to_read; - } - } + int64_t values_to_read = + std::count(def_levels, def_levels + num_def_levels, this->max_def_level_); total_values = this->ReadValues(values_to_read, values); ::arrow::bit_util::SetBitsTo(valid_bits, valid_bits_offset, /*length=*/total_values, @@ -1929,11 +1922,8 @@ class TypedRecordReader : public TypedColumnReaderImpl, // When reading dense we need to figure out number of values to read. const int16_t* def_levels = this->def_levels(); - for (int64_t i = start_levels_position; i < levels_position_; ++i) { - if (def_levels[i] == this->max_def_level_) { - ++(*values_to_read); - } - } + *values_to_read += std::count(def_levels + start_levels_position, + def_levels + levels_position_, this->max_def_level_); ReadValuesDense(*values_to_read); }