From 3af69c88f590152ccf7760b74dbd6f64c1b10123 Mon Sep 17 00:00:00 2001
From: Hattonuri <53221537+Hattonuri@users.noreply.github.com>
Date: Tue, 9 Jan 2024 05:46:37 +0300
Subject: [PATCH] GH-39398: [C++][Parquet] Use std::count in ColumnReader
 ReadLevels (#39397)

### Rationale for this change

I've found that for-loop here
https://github.com/apache/arrow/blob/7c3480e2f028f5881242f227f42155cf833efee7/cpp/src/parquet/column_reader.cc#L1055-L1073

transforms into

0xc0c2f0 <ReadLevels()+96>      inc    %rdx
0xc0c2f3 <ReadLevels()+99>      cmp    %rax,%rdx
0xc0c2f6 <ReadLevels()+102>     jge    0xc0c30c <ReadLevels()+124>
0xc0c2f8 <ReadLevels()+104>     cmp    %cx,(%r14,%rdx,2)
0xc0c2fd <ReadLevels()+109>     jne    0xc0c2f0 <ReadLevels()+96>
0xc0c2ff <ReadLevels()+111>     incq   0x0(%rbp)
0xc0c303 <ReadLevels()+115>     mov    (%rbx),%rax
0xc0c306 <ReadLevels()+118>     jmp    0xc0c2f0 <ReadLevels()+96>

That means that it uses iteration element by element and changes reference with incq
I think that the reason is that values_to_read and num_def_levels are not set as restrict. So the compiler can not optimize this to a more efficient way(for example using simd)

On my flamegraph this part showed ~10% of time spent

In this file there also some for loops which could easily be changed to std::count, but they do not touch references and I don't know the reason why std::count was not used in the all cpp/src/parquet/ directory - so I didn't change much

### What changes are included in this PR?

Using `std::count` in `parquet/column_reader.cc` to avoid loop not being optimized

### Are these changes tested?
They are tested with unittest but not benched because I don't know what bench will show performance rise here(

### Are there any user-facing changes?

* Closes: #39398

Authored-by: Dmitry Stasenko <dmitry.stasenko@pinely.com>
Signed-off-by: mwish <maplewish117@gmail.com>
---
 cpp/src/parquet/column_reader.cc | 22 ++++++----------------
 1 file changed, 6 insertions(+), 16 deletions(-)
diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc
index 99978e283b46a..f5d9734aa1e01 100644
--- a/cpp/src/parquet/column_reader.cc
+++ b/cpp/src/parquet/column_reader.cc
@@ -1062,11 +1062,8 @@ class TypedColumnReaderImpl : public TypedColumnReader<DType>,
       *num_def_levels = this->ReadDefinitionLevels(batch_size, def_levels);
       // TODO(wesm): this tallying of values-to-decode can be performed with better
       // cache-efficiency if fused with the level decoding.
-      for (int64_t i = 0; i < *num_def_levels; ++i) {
-        if (def_levels[i] == this->max_def_level_) {
-          ++(*values_to_read);
-        }
-      }
+      *values_to_read +=
+          std::count(def_levels, def_levels + *num_def_levels, this->max_def_level_);
     } else {
       // Required field, read all values
       *values_to_read = batch_size;
@@ -1195,12 +1192,8 @@ int64_t TypedColumnReaderImpl<DType>::ReadBatchSpaced(
     const bool has_spaced_values = HasSpacedValues(this->descr_);
     int64_t null_count = 0;
     if (!has_spaced_values) {
-      int values_to_read = 0;
-      for (int64_t i = 0; i < num_def_levels; ++i) {
-        if (def_levels[i] == this->max_def_level_) {
-          ++values_to_read;
-        }
-      }
+      int64_t values_to_read =
+          std::count(def_levels, def_levels + num_def_levels, this->max_def_level_);
       total_values = this->ReadValues(values_to_read, values);
       ::arrow::bit_util::SetBitsTo(valid_bits, valid_bits_offset,
                                    /*length=*/total_values,
@@ -1929,11 +1922,8 @@ class TypedRecordReader : public TypedColumnReaderImpl<DType>,
 
     // When reading dense we need to figure out number of values to read.
     const int16_t* def_levels = this->def_levels();
-    for (int64_t i = start_levels_position; i < levels_position_; ++i) {
-      if (def_levels[i] == this->max_def_level_) {
-        ++(*values_to_read);
-      }
-    }
+    *values_to_read += std::count(def_levels + start_levels_position,
+                                  def_levels + levels_position_, this->max_def_level_);
     ReadValuesDense(*values_to_read);
   }