[cherry-pick](branch-3.0) fix parquet cases (#41506 #41526 #41683 #41816) (#41923)

## Proposed changes
Picked PRs:
#41506
#41526
#41683
#41816

---------

Co-authored-by: morningman <morningman@163.com>
suxiaogang223 and morningman authored Oct 17, 2024
1 parent 4ce294f commit e634f0d
Showing 23 changed files with 504 additions and 57 deletions.
13 changes: 11 additions & 2 deletions be/src/gutil/endian.h
@@ -60,8 +60,8 @@ inline unsigned __int128 gbswap_128(unsigned __int128 host_int) {
}

inline wide::UInt256 gbswap_256(wide::UInt256 host_int) {
wide::UInt256 result{gbswap_64(host_int.items[3]), gbswap_64(host_int.items[2]),
gbswap_64(host_int.items[1]), gbswap_64(host_int.items[0])};
wide::UInt256 result {gbswap_64(host_int.items[3]), gbswap_64(host_int.items[2]),
gbswap_64(host_int.items[1]), gbswap_64(host_int.items[0])};
return result;
}

@@ -136,6 +136,9 @@ class LittleEndian {
static unsigned __int128 FromHost128(unsigned __int128 x) { return x; }
static unsigned __int128 ToHost128(unsigned __int128 x) { return x; }

static wide::UInt256 FromHost256(wide::UInt256 x) { return x; }
static wide::UInt256 ToHost256(wide::UInt256 x) { return x; }

static bool IsLittleEndian() { return true; }

#elif defined IS_BIG_ENDIAN
@@ -149,6 +152,12 @@ class LittleEndian {
static uint64 FromHost64(uint64 x) { return gbswap_64(x); }
static uint64 ToHost64(uint64 x) { return gbswap_64(x); }

static unsigned __int128 FromHost128(unsigned __int128 x) { return gbswap_128(x); }
static unsigned __int128 ToHost128(unsigned __int128 x) { return gbswap_128(x); }

static wide::UInt256 FromHost256(wide::UInt256 x) { return gbswap_256(x); }
static wide::UInt256 ToHost256(wide::UInt256 x) { return gbswap_256(x); }

static bool IsLittleEndian() { return false; }

#endif /* ENDIAN */
8 changes: 6 additions & 2 deletions be/src/io/fs/buffered_reader.cpp
@@ -778,8 +778,12 @@ BufferedFileStreamReader::BufferedFileStreamReader(io::FileReaderSPtr file, uint

Status BufferedFileStreamReader::read_bytes(const uint8_t** buf, uint64_t offset,
const size_t bytes_to_read, const IOContext* io_ctx) {
if (offset < _file_start_offset || offset >= _file_end_offset) {
return Status::IOError("Out-of-bounds Access");
if (offset < _file_start_offset || offset >= _file_end_offset ||
offset + bytes_to_read > _file_end_offset) {
return Status::IOError(
"Out-of-bounds Access: offset={}, bytes_to_read={}, file_start={}, "
"file_end={}",
offset, bytes_to_read, _file_start_offset, _file_end_offset);
}
int64_t end_offset = offset + bytes_to_read;
if (_buf_start_offset <= offset && _buf_end_offset >= end_offset) {
9 changes: 8 additions & 1 deletion be/src/util/bit_util.h
@@ -20,6 +20,9 @@

#pragma once

#include <type_traits>

#include "vec/core/wide_integer.h"
#ifndef __APPLE__
#include <endian.h>
#endif
@@ -209,7 +212,11 @@ class BitUtil {

template <typename T>
static T big_endian_to_host(T value) {
if constexpr (std::is_same_v<T, __int128>) {
if constexpr (std::is_same_v<T, wide::Int256>) {
return BigEndian::ToHost256(value);
} else if constexpr (std::is_same_v<T, wide::UInt256>) {
return BigEndian::ToHost256(value);
} else if constexpr (std::is_same_v<T, __int128>) {
return BigEndian::ToHost128(value);
} else if constexpr (std::is_same_v<T, unsigned __int128>) {
return BigEndian::ToHost128(value);
119 changes: 119 additions & 0 deletions be/src/util/byte_stream_split.cpp
@@ -0,0 +1,119 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "byte_stream_split.h"

#include <glog/logging.h>

#include <array>
#include <cstring>
#include <vector>

#include "gutil/port.h"

namespace doris {

inline void do_merge_streams(const uint8_t** src_streams, int width, int64_t nvalues,
uint8_t* dest) {
// Value empirically chosen to provide the best performance on the author's machine
constexpr int kBlockSize = 128;

while (nvalues >= kBlockSize) {
for (int stream = 0; stream < width; ++stream) {
// Take kBlockSize bytes from the given stream and spread them
// to their logical places in destination.
const uint8_t* src = src_streams[stream];
for (int i = 0; i < kBlockSize; i += 8) {
uint64_t v;
std::memcpy(&v, src + i, sizeof(v));
#ifdef IS_LITTLE_ENDIAN
dest[stream + i * width] = static_cast<uint8_t>(v);
dest[stream + (i + 1) * width] = static_cast<uint8_t>(v >> 8);
dest[stream + (i + 2) * width] = static_cast<uint8_t>(v >> 16);
dest[stream + (i + 3) * width] = static_cast<uint8_t>(v >> 24);
dest[stream + (i + 4) * width] = static_cast<uint8_t>(v >> 32);
dest[stream + (i + 5) * width] = static_cast<uint8_t>(v >> 40);
dest[stream + (i + 6) * width] = static_cast<uint8_t>(v >> 48);
dest[stream + (i + 7) * width] = static_cast<uint8_t>(v >> 56);
#elif defined IS_BIG_ENDIAN
dest[stream + i * width] = static_cast<uint8_t>(v >> 56);
dest[stream + (i + 1) * width] = static_cast<uint8_t>(v >> 48);
dest[stream + (i + 2) * width] = static_cast<uint8_t>(v >> 40);
dest[stream + (i + 3) * width] = static_cast<uint8_t>(v >> 32);
dest[stream + (i + 4) * width] = static_cast<uint8_t>(v >> 24);
dest[stream + (i + 5) * width] = static_cast<uint8_t>(v >> 16);
dest[stream + (i + 6) * width] = static_cast<uint8_t>(v >> 8);
dest[stream + (i + 7) * width] = static_cast<uint8_t>(v);
#endif
}
src_streams[stream] += kBlockSize;
}
dest += width * kBlockSize;
nvalues -= kBlockSize;
}

// Epilog
for (int stream = 0; stream < width; ++stream) {
const uint8_t* src = src_streams[stream];
for (int64_t i = 0; i < nvalues; ++i) {
dest[stream + i * width] = src[i];
}
}
}

template <int kNumStreams>
void byte_stream_split_decode_scalar(const uint8_t* src, int width, int64_t offset,
int64_t num_values, int64_t stride, uint8_t* dest) {
DCHECK(width == kNumStreams);
std::array<const uint8_t*, kNumStreams> src_streams;
for (int stream = 0; stream < kNumStreams; ++stream) {
src_streams[stream] = &src[stream * stride + offset];
}
do_merge_streams(src_streams.data(), kNumStreams, num_values, dest);
}

inline void byte_stream_split_decode_scalar_dynamic(const uint8_t* src, int width, int64_t offset,
int64_t num_values, int64_t stride,
uint8_t* dest) {
std::vector<const uint8_t*> src_streams;
src_streams.resize(width);
for (int stream = 0; stream < width; ++stream) {
src_streams[stream] = &src[stream * stride + offset];
}
do_merge_streams(src_streams.data(), width, num_values, dest);
}

// TODO: optimize using simd: https://github.com/apache/arrow/pull/38529
void byte_stream_split_decode(const uint8_t* src, int width, int64_t offset, int64_t num_values,
int64_t stride, uint8_t* dest) {
switch (width) {
case 1:
memcpy(dest, src + offset * width, num_values);
return;
case 2:
return byte_stream_split_decode_scalar<2>(src, width, offset, num_values, stride, dest);
case 4:
return byte_stream_split_decode_scalar<4>(src, width, offset, num_values, stride, dest);
case 8:
return byte_stream_split_decode_scalar<8>(src, width, offset, num_values, stride, dest);
case 16:
return byte_stream_split_decode_scalar<16>(src, width, offset, num_values, stride, dest);
}
return byte_stream_split_decode_scalar_dynamic(src, width, offset, num_values, stride, dest);
}

} // namespace doris
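For illustration only — a minimal, self-contained sketch (not part of this patch) of the byte-stream-split layout that do_merge_streams reverses: byte k of every value is written contiguously into stream k, and decoding interleaves the streams back into whole values. The scalar decode loop below mirrors the epilog loop above.

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

int main() {
    const std::vector<float> values = {1.5f, -2.25f, 3.0f};
    const int width = sizeof(float);            // 4 byte streams for float
    const int64_t n = static_cast<int64_t>(values.size());

    // Encode: stream k holds byte k of every value; streams are laid out back to back.
    std::vector<uint8_t> encoded(width * n);
    for (int64_t i = 0; i < n; ++i) {
        uint8_t bytes[sizeof(float)];
        std::memcpy(bytes, &values[i], sizeof(float));
        for (int k = 0; k < width; ++k) {
            encoded[k * n + i] = bytes[k];
        }
    }

    // Decode: the scalar equivalent of do_merge_streams' epilog loop.
    std::vector<uint8_t> decoded(width * n);
    for (int k = 0; k < width; ++k) {
        for (int64_t i = 0; i < n; ++i) {
            decoded[i * width + k] = encoded[k * n + i];
        }
    }

    assert(std::memcmp(decoded.data(), values.data(), decoded.size()) == 0);
    return 0;
}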
37 changes: 37 additions & 0 deletions be/src/util/byte_stream_split.h
@@ -0,0 +1,37 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>

namespace doris {

/**
* @brief Decode byte-stream-split encoded data back into contiguous values.
*
* @param src The encoded data in byte-stream-split layout.
* @param width The byte width of the value type.
* @param offset The offset (in values) into each stream at which decoding starts.
* @param num_values The number of values to decode.
* @param stride The length of each stream in bytes (i.e. the total number of values encoded).
* @param dest The buffer to store the decoded data.
*/
void byte_stream_split_decode(const uint8_t* src, int width, int64_t offset, int64_t num_values,
int64_t stride, uint8_t* dest);

} // namespace doris
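A hedged usage sketch (assumed, not taken from the patch) mirroring how ByteStreamSplitDecoder in this commit drives the function: stride is the length of each stream, i.e. the total number of values in the encoded page, and offset is the number of values already consumed from each stream.

#include <cstdint>

#include "util/byte_stream_split.h" // declares doris::byte_stream_split_decode

// Hypothetical helper: decode a whole page of byte-stream-split encoded floats.
void decode_float_page(const uint8_t* encoded, int64_t values_in_page, uint8_t* out) {
    const int width = 4;                    // sizeof(float): four one-byte-per-value streams
    const int64_t stride = values_in_page;  // each stream holds one byte per value in the page
    doris::byte_stream_split_decode(encoded, width, /*offset=*/0, values_in_page, stride, out);
}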
95 changes: 95 additions & 0 deletions be/src/vec/exec/format/parquet/byte_stream_split_decoder.cpp
@@ -0,0 +1,95 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "byte_stream_split_decoder.h"

#include <cstdint>

#include "util/byte_stream_split.h"

namespace doris::vectorized {

Status ByteStreamSplitDecoder::decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector,
bool is_dict_filter) {
if (select_vector.has_filter()) {
return _decode_values<true>(doris_column, data_type, select_vector, is_dict_filter);
} else {
return _decode_values<false>(doris_column, data_type, select_vector, is_dict_filter);
}
}

template <bool has_filter>
Status ByteStreamSplitDecoder::_decode_values(MutableColumnPtr& doris_column,
DataTypePtr& data_type,
ColumnSelectVector& select_vector,
bool is_dict_filter) {
size_t non_null_size = select_vector.num_values() - select_vector.num_nulls();
if (UNLIKELY(_offset + non_null_size > _data->size)) {
return Status::IOError(
"Out-of-bounds access in parquet data decoder: offset = {}, non_null_size = "
"{},size = {}",
_offset, non_null_size, _data->size);
}

size_t primitive_length = remove_nullable(data_type)->get_size_of_value_in_memory();
size_t data_index = doris_column->size() * primitive_length;
size_t scale_size = (select_vector.num_values() - select_vector.num_filtered()) *
(_type_length / primitive_length);
doris_column->resize(doris_column->size() + scale_size);
char* raw_data = const_cast<char*>(doris_column->get_raw_data().data);
ColumnSelectVector::DataReadType read_type;
DCHECK(_data->get_size() % _type_length == 0);
int64_t stride = _data->get_size() / _type_length;

while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) {
switch (read_type) {
case ColumnSelectVector::CONTENT: {
byte_stream_split_decode(reinterpret_cast<const uint8_t*>(_data->get_data()),
_type_length, _offset / _type_length, run_length, stride,
reinterpret_cast<uint8_t*>(raw_data) + data_index);
_offset += run_length * _type_length;
data_index += run_length * _type_length;
break;
}
case ColumnSelectVector::NULL_DATA: {
data_index += run_length * _type_length;
break;
}
case ColumnSelectVector::FILTERED_CONTENT: {
_offset += _type_length * run_length;
break;
}
case ColumnSelectVector::FILTERED_NULL: {
// do nothing
break;
}
}
}
return Status::OK();
}

Status ByteStreamSplitDecoder::skip_values(size_t num_values) {
_offset += _type_length * num_values;
if (UNLIKELY(_offset > _data->size)) {
return Status::IOError(
"Out-of-bounds access in parquet data decoder: offset = {}, size = {}", _offset,
_data->size);
}
return Status::OK();
}
} // namespace doris::vectorized
38 changes: 38 additions & 0 deletions be/src/vec/exec/format/parquet/byte_stream_split_decoder.h
@@ -0,0 +1,38 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "vec/exec/format/parquet/decoder.h"

namespace doris::vectorized {
class ByteStreamSplitDecoder final : public Decoder {
public:
ByteStreamSplitDecoder() = default;
~ByteStreamSplitDecoder() override = default;

Status decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector, bool is_dict_filter) override;

template <bool has_filter>
Status _decode_values(MutableColumnPtr& doris_column, DataTypePtr& data_type,
ColumnSelectVector& select_vector, bool is_dict_filter);

Status skip_values(size_t num_values) override;
};

} // namespace doris::vectorized
17 changes: 16 additions & 1 deletion be/src/vec/exec/format/parquet/decoder.cpp
@@ -24,10 +24,10 @@
#include "vec/exec/format/parquet/bool_rle_decoder.h"
#include "vec/exec/format/parquet/byte_array_dict_decoder.h"
#include "vec/exec/format/parquet/byte_array_plain_decoder.h"
#include "vec/exec/format/parquet/byte_stream_split_decoder.h"
#include "vec/exec/format/parquet/delta_bit_pack_decoder.h"
#include "vec/exec/format/parquet/fix_length_dict_decoder.hpp"
#include "vec/exec/format/parquet/fix_length_plain_decoder.h"
#include "vec/exec/format/parquet/schema_desc.h"

namespace doris::vectorized {

@@ -118,6 +118,21 @@ Status Decoder::get_decoder(tparquet::Type::type type, tparquet::Encoding::type
return Status::InternalError("DELTA_LENGTH_BYTE_ARRAY only supports BYTE_ARRAY.");
}
break;
case tparquet::Encoding::BYTE_STREAM_SPLIT:
switch (type) {
case tparquet::Type::INT32:
case tparquet::Type::INT64:
case tparquet::Type::INT96:
case tparquet::Type::FLOAT:
case tparquet::Type::DOUBLE:
case tparquet::Type::FIXED_LEN_BYTE_ARRAY:
decoder.reset(new ByteStreamSplitDecoder());
break;
default:
return Status::InternalError("Unsupported type {}(encoding={}) in parquet decoder",
tparquet::to_string(type), tparquet::to_string(encoding));
}
break;
default:
return Status::InternalError("Unsupported encoding {}(type={}) in parquet decoder",
tparquet::to_string(encoding), tparquet::to_string(type));