From 16fb1d94923399c2184e595aa401bedf48174395 Mon Sep 17 00:00:00 2001 From: Lloyd-Pottiger <60744015+Lloyd-Pottiger@users.noreply.github.com> Date: Wed, 7 Aug 2024 16:32:10 +0800 Subject: [PATCH] Compression: add microbenchmark (#9293) close pingcap/tiflash#8982 Signed-off-by: Lloyd-Pottiger --- dbms/src/IO/Compression/CompressionSettings.h | 6 +- .../IO/Compression/tests/CodecTestSequence.h | 313 ++++++++++++++++++ dbms/src/IO/Compression/tests/bench_codec.cpp | 275 +++++++++++++++ .../tests/gtest_codec_compression.cpp | 309 +---------------- 4 files changed, 607 insertions(+), 296 deletions(-) create mode 100644 dbms/src/IO/Compression/tests/CodecTestSequence.h create mode 100644 dbms/src/IO/Compression/tests/bench_codec.cpp diff --git a/dbms/src/IO/Compression/CompressionSettings.h b/dbms/src/IO/Compression/CompressionSettings.h index 5363b0aca5d..d1db183fe45 100644 --- a/dbms/src/IO/Compression/CompressionSettings.h +++ b/dbms/src/IO/Compression/CompressionSettings.h @@ -1,4 +1,4 @@ -// Copyright 2023 PingCAP, Inc. +// Copyright 2024 PingCAP, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -101,6 +101,10 @@ struct CompressionSettings : settings(settings_) {} + explicit CompressionSettings(CompressionSetting setting) + : settings(1, std::move(setting)) + {} + std::vector settings; }; diff --git a/dbms/src/IO/Compression/tests/CodecTestSequence.h b/dbms/src/IO/Compression/tests/CodecTestSequence.h new file mode 100644 index 00000000000..049ed7c3c2e --- /dev/null +++ b/dbms/src/IO/Compression/tests/CodecTestSequence.h @@ -0,0 +1,313 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include +#include + +namespace DB::tests +{ + +template +const char * type_name() +{ +#define MAKE_TYPE_NAME(TYPE) \ + if constexpr (std::is_same_v) \ + return #TYPE + + MAKE_TYPE_NAME(UInt8); + MAKE_TYPE_NAME(UInt16); + MAKE_TYPE_NAME(UInt32); + MAKE_TYPE_NAME(UInt64); + MAKE_TYPE_NAME(Int8); + MAKE_TYPE_NAME(Int16); + MAKE_TYPE_NAME(Int32); + MAKE_TYPE_NAME(Int64); + MAKE_TYPE_NAME(Float32); + MAKE_TYPE_NAME(Float64); + +#undef MAKE_TYPE_NAME + + return typeid(T).name(); +} + +template +DataTypePtr makeDataType() +{ +#define MAKE_DATA_TYPE(TYPE) \ + if constexpr (std::is_same_v) \ + return std::make_shared() + + MAKE_DATA_TYPE(UInt8); + MAKE_DATA_TYPE(UInt16); + MAKE_DATA_TYPE(UInt32); + MAKE_DATA_TYPE(UInt64); + MAKE_DATA_TYPE(Int8); + MAKE_DATA_TYPE(Int16); + MAKE_DATA_TYPE(Int32); + MAKE_DATA_TYPE(Int64); + MAKE_DATA_TYPE(Float32); + MAKE_DATA_TYPE(Float64); + +#undef MAKE_DATA_TYPE + + assert(false && "unknown datatype"); + return nullptr; +} + +struct CodecTestSequence +{ + std::string name; + std::vector serialized_data; + DataTypePtr data_type; + UInt8 type_byte; + + CodecTestSequence(std::string name_, std::vector serialized_data_, DataTypePtr data_type_, UInt8 type_byte_) + : name(name_) + , serialized_data(serialized_data_) + , data_type(data_type_) + , type_byte(type_byte_) + {} + + CodecTestSequence & append(const CodecTestSequence & other) + { + assert(data_type->equals(*other.data_type)); + + serialized_data.insert(serialized_data.end(), other.serialized_data.begin(), other.serialized_data.end()); + if (!name.empty()) + name += " + "; + name += other.name; + + return *this; + } +}; + +CodecTestSequence operator+(CodecTestSequence && left, const CodecTestSequence & right) +{ + return left.append(right); +} + +template +CodecTestSequence operator*(CodecTestSequence && left, T times) +{ + std::vector data(std::move(left.serialized_data)); + const size_t initial_size = data.size(); + const size_t final_size = initial_size * times; + + data.reserve(final_size); + + for (T i = 0; i < times; ++i) + { + data.insert(data.end(), data.begin(), data.begin() + initial_size); + } + + return CodecTestSequence{ + left.name + " x " + std::to_string(times), + std::move(data), + std::move(left.data_type), + sizeof(T)}; +} + +std::ostream & operator<<(std::ostream & ostr, const CompressionMethodByte method_byte) +{ + ostr << "Codec{name: " << magic_enum::enum_name(method_byte) << "}"; + return ostr; +} + +std::ostream & operator<<(std::ostream & ostr, const CodecTestSequence & seq) +{ + return ostr << "CodecTestSequence{" + << "name: " << seq.name << ", type name: " << seq.data_type->getName() + << ", data size: " << seq.serialized_data.size() << " bytes" + << "}"; +} + +template +CodecTestSequence makeSeq(Args &&... args) +{ + std::initializer_list vals{static_cast(args)...}; + std::vector data(sizeof(T) * std::size(vals)); + + char * write_pos = data.data(); + for (const auto & v : vals) + { + unalignedStore(write_pos, v); + write_pos += sizeof(v); + } + + return CodecTestSequence{ + (fmt::format("{} values of {}", std::size(vals), type_name())), + std::move(data), + makeDataType(), + sizeof(T)}; +} + +template +CodecTestSequence generateSeq(Generator gen, const char * gen_name, int Begin = 0, int End = 10000) +{ + const auto direction = std::signbit(End - Begin) ? -1 : 1; + std::vector data(sizeof(T) * (End - Begin)); + char * write_pos = data.data(); + + for (auto i = Begin; std::less<>{}(i, End); i += direction) + { + const T v = gen(static_cast(i)); + + unalignedStore(write_pos, v); + write_pos += sizeof(v); + } + + return CodecTestSequence{ + (fmt::format("{} values of {} from {}", (End - Begin), type_name(), gen_name)), + std::move(data), + makeDataType(), + sizeof(T)}; +} + +/////////////////////////////////////////////////////////////////////////////////////////////////// +// Here we use generators to produce test payload for codecs. +// Generator is a callable that can produce infinite number of values, +// output value MUST be of the same type as input value. +/////////////////////////////////////////////////////////////////////////////////////////////////// + +auto SameValueGenerator = [](auto value) { + return [=](auto i) { + return static_cast(value); + }; +}; + +auto SequentialGenerator = [](auto stride = 1) { + return [=](auto i) { + using ValueType = decltype(i); + return static_cast(stride * i); + }; +}; + +template +using uniform_distribution = typename std::conditional_t< + std::is_floating_point_v, + std::uniform_real_distribution, + typename std::conditional_t, std::uniform_int_distribution, void>>; + + +template +struct MonotonicGenerator +{ + explicit MonotonicGenerator(T stride_ = 1, T max_step = 10) + : prev_value(0) + , stride(stride_) + , random_engine(0) + , distribution(0, max_step) + {} + + template + U operator()(U) + { + prev_value = prev_value + stride * distribution(random_engine); + return static_cast(prev_value); + } + +private: + T prev_value; + const T stride; + std::default_random_engine random_engine; + uniform_distribution distribution; +}; + +template +struct RandomGenerator +{ + explicit RandomGenerator( + T seed = 0, + T value_min = std::numeric_limits::min(), + T value_max = std::numeric_limits::max()) + : random_engine(static_cast(seed)) + , distribution(value_min, value_max) + {} + + template + U operator()(U) + { + return static_cast(distribution(random_engine)); + } + +private: + std::default_random_engine random_engine; + uniform_distribution distribution; +}; + +// auto RandomishGenerator = [](auto i) { +// using T = decltype(i); +// double sin_value = sin(static_cast(i * i)) * i; +// if (sin_value < std::numeric_limits::lowest() || sin_value > static_cast(std::numeric_limits::max())) +// return T{}; +// return static_cast(sin_value); +// }; + +auto MinMaxGenerator = []() { + return [step = 0](auto i) mutable { + if (step++ % 2 == 0) + { + return std::numeric_limits::min(); + } + else + { + return std::numeric_limits::max(); + } + }; +}; + +template +struct RepeatGenerator +{ + explicit RepeatGenerator(T seed = 0, size_t min_repeat_count = 4, size_t max_repeat_count = 16) + : random_engine(static_cast(seed)) + , value_distribution(std::numeric_limits::min(), std::numeric_limits::max()) + , repeat_distribution(min_repeat_count, max_repeat_count) + { + generate_next_value(); + } + + template + U operator()(U) + { + if (repeat_count == 0) + { + generate_next_value(); + } + --repeat_count; + return current_value; + } + +private: + void generate_next_value() + { + current_value = value_distribution(random_engine); + repeat_count = repeat_distribution(random_engine); + } + + std::default_random_engine random_engine; + std::uniform_int_distribution value_distribution; + std::uniform_int_distribution repeat_distribution; + T current_value; + size_t repeat_count = 0; +}; + +} // namespace DB::tests diff --git a/dbms/src/IO/Compression/tests/bench_codec.cpp b/dbms/src/IO/Compression/tests/bench_codec.cpp new file mode 100644 index 00000000000..eba3959f5e7 --- /dev/null +++ b/dbms/src/IO/Compression/tests/bench_codec.cpp @@ -0,0 +1,275 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB::bench +{ + +template +static void singleWrite(benchmark::State & state, Args &&... args) +{ + auto args_tuple = std::make_tuple(std::move(args)...); + auto generator = std::get<0>(args_tuple); + auto sequence = tests::generateSeq(generator, "", 0, 8192); + auto file_name = fmt::format("/tmp/tiflash_codec_bench_{}_{}", sequence.name, magic_enum::enum_name(method_byte)); + for (auto _ : state) + { + auto file = std::make_shared(file_name, true, -1, 0755); + auto write_buffer = std::make_shared(file); + CompressionSetting setting(method_byte); + setting.data_type = magic_enum::enum_cast(sizeof(T)).value(); + CompressedWriteBuffer<> compressed(*write_buffer, CompressionSettings(setting)); + compressed.write(sequence.serialized_data.data(), sequence.serialized_data.size()); + compressed.next(); + write_buffer->next(); + write_buffer->sync(); + Poco::File(file_name).remove(); + } +} + +template +static void singleRead(benchmark::State & state, Args &&... args) +{ + auto args_tuple = std::make_tuple(std::move(args)...); + auto generator = std::get<0>(args_tuple); + auto sequence = tests::generateSeq(generator, "", 0, 8192); + auto file_name = fmt::format("/tmp/tiflash_codec_bench_{}_{}", sequence.name, magic_enum::enum_name(method_byte)); + { + auto file = std::make_shared(file_name, true, -1, 0755); + auto write_buffer = std::make_shared(file); + CompressionSetting setting(method_byte); + setting.data_type = magic_enum::enum_cast(sizeof(T)).value(); + CompressedWriteBuffer<> compressed(*write_buffer, CompressionSettings(setting)); + compressed.write(sequence.serialized_data.data(), sequence.serialized_data.size()); + compressed.next(); + write_buffer->next(); + write_buffer->sync(); + } + for (auto _ : state) + { + auto read_buffer = std::make_shared(file_name); + CompressedReadBuffer<> compressed(*read_buffer); + const size_t buffer_size = 32 * 1024; // 32KB + while (!compressed.eof()) + { + char buffer[buffer_size]; + compressed.readBig(buffer, buffer_size); + benchmark::DoNotOptimize(buffer); + } + } + Poco::File(file_name).remove(); +} + +#define BENCH_SINGLE_WRITE_METHOD_GENERATOR_TYPE(name, method, generator, T) \ + template \ + static void name(benchmark::State & state, Args &&... args) \ + { \ + singleWrite(state, args...); \ + } \ + BENCHMARK_CAPTURE(name, generator, generator); + +#define BENCH_SINGLE_READ_METHOD_GENERATOR_TYPE(name, method, generator, T) \ + template \ + static void name(benchmark::State & state, Args &&... args) \ + { \ + singleRead(state, args...); \ + } \ + BENCHMARK_CAPTURE(name, generator, generator); + +#define BENCH_SINGLE_WRITE_GENERATOR_TYPE(name, generator, type) \ + BENCH_SINGLE_WRITE_METHOD_GENERATOR_TYPE(name##LZ4, CompressionMethodByte::LZ4, generator, type); \ + BENCH_SINGLE_WRITE_METHOD_GENERATOR_TYPE(name##Lightweight, CompressionMethodByte::Lightweight, generator, type); + +#define BENCH_SINGLE_READ_GENERATOR_TYPE(name, generator, type) \ + BENCH_SINGLE_READ_METHOD_GENERATOR_TYPE(name##LZ4, CompressionMethodByte::LZ4, generator, type); \ + BENCH_SINGLE_READ_METHOD_GENERATOR_TYPE(name##Lightweight, CompressionMethodByte::Lightweight, generator, type); + +#define BENCH_SINGLE_WRITE_GENERATOR(name, generator) \ + BENCH_SINGLE_WRITE_GENERATOR_TYPE(name##UInt8, generator, UInt8); \ + BENCH_SINGLE_WRITE_GENERATOR_TYPE(name##UInt16, generator, UInt16); \ + BENCH_SINGLE_WRITE_GENERATOR_TYPE(name##UInt32, generator, UInt32); \ + BENCH_SINGLE_WRITE_GENERATOR_TYPE(name##UInt64, generator, UInt64); + +#define BENCH_SINGLE_READ_GENERATOR(name, generator) \ + BENCH_SINGLE_READ_GENERATOR_TYPE(name##UInt8, generator, UInt8); \ + BENCH_SINGLE_READ_GENERATOR_TYPE(name##UInt16, generator, UInt16); \ + BENCH_SINGLE_READ_GENERATOR_TYPE(name##UInt32, generator, UInt32); \ + BENCH_SINGLE_READ_GENERATOR_TYPE(name##UInt64, generator, UInt64); + +#define BENCH_SINGLE_WRITE(name) \ + BENCH_SINGLE_WRITE_GENERATOR(name##SameValue, tests::SameValueGenerator(128)) \ + BENCH_SINGLE_WRITE_GENERATOR(name##Sequential, tests::SequentialGenerator(2)) \ + BENCH_SINGLE_WRITE_GENERATOR(name##SequentialReverse, tests::SequentialGenerator(-2)) \ + BENCH_SINGLE_WRITE_GENERATOR(name##Monotonic, tests::MonotonicGenerator()) \ + BENCH_SINGLE_WRITE_GENERATOR(name##MonotonicReverse, tests::MonotonicGenerator(-1)) \ + BENCH_SINGLE_WRITE_GENERATOR(name##MinMax, tests::MinMaxGenerator()) \ + BENCH_SINGLE_WRITE_GENERATOR_TYPE(name##RandomUInt8, tests::RandomGenerator(0), UInt8) \ + BENCH_SINGLE_WRITE_GENERATOR_TYPE(name##RandomUInt16, tests::RandomGenerator(0), UInt16) \ + BENCH_SINGLE_WRITE_GENERATOR_TYPE(name##RandomUInt32, tests::RandomGenerator(0), UInt32) \ + BENCH_SINGLE_WRITE_GENERATOR_TYPE(name##RandomUInt64, tests::RandomGenerator(0), UInt64) \ + BENCH_SINGLE_WRITE_GENERATOR_TYPE(name##SmallRandomUInt8, tests::RandomGenerator(0, 0, 16), UInt8) \ + BENCH_SINGLE_WRITE_GENERATOR_TYPE(name##SmallRandomUInt16, tests::RandomGenerator(0, 0, 16), UInt16) \ + BENCH_SINGLE_WRITE_GENERATOR_TYPE(name##SmallRandomUInt32, tests::RandomGenerator(0, 0, 16), UInt32) \ + BENCH_SINGLE_WRITE_GENERATOR_TYPE(name##SmallRandomUInt64, tests::RandomGenerator(0, 0, 16), UInt64) \ + BENCH_SINGLE_WRITE_GENERATOR_TYPE(name##RepeatUInt8, tests::RepeatGenerator(0), UInt8) \ + BENCH_SINGLE_WRITE_GENERATOR_TYPE(name##RepeatUInt16, tests::RepeatGenerator(0), UInt16) \ + BENCH_SINGLE_WRITE_GENERATOR_TYPE(name##RepeatUInt32, tests::RepeatGenerator(0), UInt32) \ + BENCH_SINGLE_WRITE_GENERATOR_TYPE(name##RepeatUInt64, tests::RepeatGenerator(0), UInt64) + +#define BENCH_SINGLE_READ(name) \ + BENCH_SINGLE_READ_GENERATOR(name##SameValue, tests::SameValueGenerator(128)) \ + BENCH_SINGLE_READ_GENERATOR(name##Sequential, tests::SequentialGenerator(2)) \ + BENCH_SINGLE_READ_GENERATOR(name##SequentialReverse, tests::SequentialGenerator(-2)) \ + BENCH_SINGLE_READ_GENERATOR(name##Monotonic, tests::MonotonicGenerator()) \ + BENCH_SINGLE_READ_GENERATOR(name##MonotonicReverse, tests::MonotonicGenerator(-1)) \ + BENCH_SINGLE_READ_GENERATOR(name##MinMax, tests::MinMaxGenerator()) \ + BENCH_SINGLE_READ_GENERATOR_TYPE(name##RandomUInt8, tests::RandomGenerator(0), UInt8) \ + BENCH_SINGLE_READ_GENERATOR_TYPE(name##RandomUInt16, tests::RandomGenerator(0), UInt16) \ + BENCH_SINGLE_READ_GENERATOR_TYPE(name##RandomUInt32, tests::RandomGenerator(0), UInt32) \ + BENCH_SINGLE_READ_GENERATOR_TYPE(name##RandomUInt64, tests::RandomGenerator(0), UInt64) \ + BENCH_SINGLE_READ_GENERATOR_TYPE(name##SmallRandomUInt8, tests::RandomGenerator(0, 0, 16), UInt8) \ + BENCH_SINGLE_READ_GENERATOR_TYPE(name##SmallRandomUInt16, tests::RandomGenerator(0, 0, 16), UInt16) \ + BENCH_SINGLE_READ_GENERATOR_TYPE(name##SmallRandomUInt32, tests::RandomGenerator(0, 0, 16), UInt32) \ + BENCH_SINGLE_READ_GENERATOR_TYPE(name##SmallRandomUInt64, tests::RandomGenerator(0, 0, 16), UInt64) \ + BENCH_SINGLE_READ_GENERATOR_TYPE(name##RepeatUInt8, tests::RepeatGenerator(0), UInt8) \ + BENCH_SINGLE_READ_GENERATOR_TYPE(name##RepeatUInt16, tests::RepeatGenerator(0), UInt16) \ + BENCH_SINGLE_READ_GENERATOR_TYPE(name##RepeatUInt32, tests::RepeatGenerator(0), UInt32) \ + BENCH_SINGLE_READ_GENERATOR_TYPE(name##RepeatUInt64, tests::RepeatGenerator(0), UInt64) + +BENCH_SINGLE_WRITE(CodecSingleWrite) +BENCH_SINGLE_READ(CodecSingleRead) + +#define WRITE_SEQUENCE(generator) \ + { \ + auto sequence = tests::generateSeq(generator, "", 0, 8192); \ + compressed.write(sequence.serialized_data.data(), sequence.serialized_data.size()); \ + compressed.next(); \ + } + +template +static void multipleWrite(benchmark::State & state) +{ + auto file_name = fmt::format("/tmp/tiflash_codec_bench_{}", magic_enum::enum_name(method_byte)); + for (auto _ : state) + { + auto file = std::make_shared(file_name, true, -1, 0755); + auto write_buffer = std::make_shared(file); + CompressionSetting setting(method_byte); + setting.data_type = magic_enum::enum_cast(sizeof(T)).value(); + CompressedWriteBuffer<> compressed(*write_buffer, CompressionSettings(setting)); + + WRITE_SEQUENCE(tests::SameValueGenerator(128)); // Constant + WRITE_SEQUENCE(tests::SequentialGenerator(2)); // ConstantDelta + WRITE_SEQUENCE(tests::SequentialGenerator(-2)); // ConstantDelta + WRITE_SEQUENCE(tests::MonotonicGenerator()); // DeltaFOR + WRITE_SEQUENCE(tests::MonotonicGenerator(-1)); // DeltaFOR + WRITE_SEQUENCE(tests::MinMaxGenerator()); // DeltaFOR, (max - min = -1) + WRITE_SEQUENCE(tests::RandomGenerator(0)); // LZ4 + WRITE_SEQUENCE(tests::RandomGenerator(0, 0, 100)); // FOR + WRITE_SEQUENCE(tests::RepeatGenerator(0)); // RLE + + write_buffer->next(); + write_buffer->sync(); + Poco::File(file_name).remove(); + } +} + +template +static void multipleRead(benchmark::State & state) +{ + auto file_name = fmt::format("/tmp/tiflash_codec_bench_{}", magic_enum::enum_name(method_byte)); + { + auto file = std::make_shared(file_name, true, -1, 0755); + auto write_buffer = std::make_shared(file); + CompressionSetting setting(method_byte); + setting.data_type = magic_enum::enum_cast(sizeof(T)).value(); + CompressedWriteBuffer<> compressed(*write_buffer, CompressionSettings(setting)); + + WRITE_SEQUENCE(tests::SameValueGenerator(128)); + WRITE_SEQUENCE(tests::SequentialGenerator(2)); + WRITE_SEQUENCE(tests::SequentialGenerator(-2)); + WRITE_SEQUENCE(tests::MonotonicGenerator()); + WRITE_SEQUENCE(tests::MonotonicGenerator(-1)); + WRITE_SEQUENCE(tests::MinMaxGenerator()); + WRITE_SEQUENCE(tests::RandomGenerator(0)); + WRITE_SEQUENCE(tests::RandomGenerator(0, 0, 100)); + WRITE_SEQUENCE(tests::RepeatGenerator(0)); + + write_buffer->next(); + write_buffer->sync(); + } + for (auto _ : state) + { + auto read_buffer = std::make_shared(file_name); + CompressedReadBuffer<> compressed(*read_buffer); + constexpr size_t buffer_size = 32 * 1024; // 32KB + while (!compressed.eof()) + { + char buffer[buffer_size]; + compressed.readBig(buffer, buffer_size); + benchmark::DoNotOptimize(buffer); + } + } + Poco::File(file_name).remove(); +} + +#define BENCH_MULTIPLE_WRITE_METHOD_TYPE(name, method, T) \ + static void name(benchmark::State & state) \ + { \ + multipleWrite(state); \ + } \ + BENCHMARK(name); + +#define BENCH_MULTIPLE_READ_METHOD_TYPE(name, method, T) \ + static void name(benchmark::State & state) \ + { \ + multipleRead(state); \ + } \ + BENCHMARK(name); + +#define BENCH_MULTIPLE_WRITE_TYPE(name, T) \ + BENCH_MULTIPLE_WRITE_METHOD_TYPE(name##LZ4, CompressionMethodByte::LZ4, T) \ + BENCH_MULTIPLE_WRITE_METHOD_TYPE(name##Lightweight, CompressionMethodByte::Lightweight, T) + +#define BENCH_MULTIPLE_READ_TYPE(name, T) \ + BENCH_MULTIPLE_READ_METHOD_TYPE(name##LZ4, CompressionMethodByte::LZ4, T) \ + BENCH_MULTIPLE_READ_METHOD_TYPE(name##Lightweight, CompressionMethodByte::Lightweight, T) + +#define BENCH_MULTIPLE_WRITE(name) \ + BENCH_MULTIPLE_WRITE_TYPE(name##UInt8, UInt8) \ + BENCH_MULTIPLE_WRITE_TYPE(name##UInt16, UInt16) \ + BENCH_MULTIPLE_WRITE_TYPE(name##UInt32, UInt32) \ + BENCH_MULTIPLE_WRITE_TYPE(name##UInt64, UInt64) + +#define BENCH_MULTIPLE_READ(name) \ + BENCH_MULTIPLE_READ_TYPE(name##UInt8, UInt8) \ + BENCH_MULTIPLE_READ_TYPE(name##UInt16, UInt16) \ + BENCH_MULTIPLE_READ_TYPE(name##UInt32, UInt32) \ + BENCH_MULTIPLE_READ_TYPE(name##UInt64, UInt64) + +BENCH_MULTIPLE_WRITE(CodecMultipleWrite) +BENCH_MULTIPLE_READ(CodecMultipleRead) + +} // namespace DB::bench diff --git a/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp b/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp index 35f80b2a296..74cf9774cf9 100644 --- a/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp +++ b/dbms/src/IO/Compression/tests/gtest_codec_compression.cpp @@ -1,4 +1,4 @@ -// Copyright 2023 PingCAP, Inc. +// Copyright 2024 PingCAP, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,16 +14,10 @@ #include #include -#include -#include -#include +#include #include -#include #include -#include -#include - namespace DB::tests { @@ -31,90 +25,6 @@ namespace DB::tests template inline constexpr bool is_pod_v = std::is_trivial_v>; -template -struct AsHexStringHelper -{ - const T & container; -}; - -template -std::ostream & operator<<(std::ostream & ostr, const AsHexStringHelper & helper) -{ - ostr << std::hex; - for (const auto & e : helper.container) - { - ostr << "\\x" << std::setw(2) << std::setfill('0') << (static_cast(e) & 0xFF); - } - - return ostr; -} - -template -AsHexStringHelper AsHexString(const T & container) -{ - static_assert( - sizeof(container[0]) == 1 && is_pod_v>, - "Only works on containers of byte-size PODs."); - - return AsHexStringHelper{container}; -} - -template -std::string bin(const T & value, size_t bits = sizeof(T) * 8) -{ - static const uint8_t MAX_BITS = sizeof(T) * 8; - assert(bits <= MAX_BITS); - - return std::bitset(static_cast(value)).to_string().substr(MAX_BITS - bits, bits); -} - -template -const char * type_name() -{ -#define MAKE_TYPE_NAME(TYPE) \ - if constexpr (std::is_same_v) \ - return #TYPE - - MAKE_TYPE_NAME(UInt8); - MAKE_TYPE_NAME(UInt16); - MAKE_TYPE_NAME(UInt32); - MAKE_TYPE_NAME(UInt64); - MAKE_TYPE_NAME(Int8); - MAKE_TYPE_NAME(Int16); - MAKE_TYPE_NAME(Int32); - MAKE_TYPE_NAME(Int64); - MAKE_TYPE_NAME(Float32); - MAKE_TYPE_NAME(Float64); - -#undef MAKE_TYPE_NAME - - return typeid(T).name(); -} - -template -DataTypePtr makeDataType() -{ -#define MAKE_DATA_TYPE(TYPE) \ - if constexpr (std::is_same_v) \ - return std::make_shared() - - MAKE_DATA_TYPE(UInt8); - MAKE_DATA_TYPE(UInt16); - MAKE_DATA_TYPE(UInt32); - MAKE_DATA_TYPE(UInt64); - MAKE_DATA_TYPE(Int8); - MAKE_DATA_TYPE(Int16); - MAKE_DATA_TYPE(Int32); - MAKE_DATA_TYPE(Int64); - MAKE_DATA_TYPE(Float32); - MAKE_DATA_TYPE(Float64); - -#undef MAKE_DATA_TYPE - - assert(false && "unknown datatype"); - return nullptr; -} - template class BinaryDataAsSequenceOfValuesIterator { @@ -238,116 +148,6 @@ ::testing::AssertionResult EqualByteContainers( } } - -struct CodecTestSequence -{ - std::string name; - std::vector serialized_data; - DataTypePtr data_type; - UInt8 type_byte; - - CodecTestSequence(std::string name_, std::vector serialized_data_, DataTypePtr data_type_, UInt8 type_byte_) - : name(name_) - , serialized_data(serialized_data_) - , data_type(data_type_) - , type_byte(type_byte_) - {} - - CodecTestSequence & append(const CodecTestSequence & other) - { - assert(data_type->equals(*other.data_type)); - - serialized_data.insert(serialized_data.end(), other.serialized_data.begin(), other.serialized_data.end()); - if (!name.empty()) - name += " + "; - name += other.name; - - return *this; - } -}; - -CodecTestSequence operator+(CodecTestSequence && left, const CodecTestSequence & right) -{ - return left.append(right); -} - -template -CodecTestSequence operator*(CodecTestSequence && left, T times) -{ - std::vector data(std::move(left.serialized_data)); - const size_t initial_size = data.size(); - const size_t final_size = initial_size * times; - - data.reserve(final_size); - - for (T i = 0; i < times; ++i) - { - data.insert(data.end(), data.begin(), data.begin() + initial_size); - } - - return CodecTestSequence{ - left.name + " x " + std::to_string(times), - std::move(data), - std::move(left.data_type), - sizeof(T)}; -} - -std::ostream & operator<<(std::ostream & ostr, const CompressionMethodByte method_byte) -{ - ostr << "Codec{name: " << magic_enum::enum_name(method_byte) << "}"; - return ostr; -} - -std::ostream & operator<<(std::ostream & ostr, const CodecTestSequence & seq) -{ - return ostr << "CodecTestSequence{" - << "name: " << seq.name << ", type name: " << seq.data_type->getName() - << ", data size: " << seq.serialized_data.size() << " bytes" - << "}"; -} - -template -CodecTestSequence makeSeq(Args &&... args) -{ - std::initializer_list vals{static_cast(args)...}; - std::vector data(sizeof(T) * std::size(vals)); - - char * write_pos = data.data(); - for (const auto & v : vals) - { - unalignedStore(write_pos, v); - write_pos += sizeof(v); - } - - return CodecTestSequence{ - (fmt::format("{} values of {}", std::size(vals), type_name())), - std::move(data), - makeDataType(), - sizeof(T)}; -} - -template -CodecTestSequence generateSeq(Generator gen, const char * gen_name, B Begin = 0, E End = 10000) -{ - const auto direction = std::signbit(End - Begin) ? -1 : 1; - std::vector data(sizeof(T) * (End - Begin)); - char * write_pos = data.data(); - - for (auto i = Begin; std::less<>{}(i, End); i += direction) - { - const T v = static_cast(gen(i)); - - unalignedStore(write_pos, v); - write_pos += sizeof(v); - } - - return CodecTestSequence{ - (fmt::format("{} values of {} from {}", (End - Begin), type_name(), gen_name)), - std::move(data), - makeDataType(), - sizeof(T)}; -} - CompressionCodecPtr makeCodec(const CompressionMethodByte method_byte, UInt8 type_byte) { CompressionSetting setting(method_byte); @@ -417,99 +217,6 @@ try } CATCH -/////////////////////////////////////////////////////////////////////////////////////////////////// -// Here we use generators to produce test payload for codecs. -// Generator is a callable that can produce infinite number of values, -// output value MUST be of the same type as input value. -/////////////////////////////////////////////////////////////////////////////////////////////////// - -auto SameValueGenerator = [](auto value) { - return [=](auto i) { - return static_cast(value); - }; -}; - -auto SequentialGenerator = [](auto stride = 1) { - return [=](auto i) { - using ValueType = decltype(i); - return static_cast(stride * i); - }; -}; - -template -using uniform_distribution = typename std::conditional_t< - std::is_floating_point_v, - std::uniform_real_distribution, - typename std::conditional_t, std::uniform_int_distribution, void>>; - - -template -struct MonotonicGenerator -{ - explicit MonotonicGenerator(T stride_ = 1, T max_step = 10) - : prev_value(0) - , stride(stride_) - , random_engine(0) - , distribution(0, max_step) - {} - - template - U operator()(U) - { - prev_value = prev_value + stride * distribution(random_engine); - return static_cast(prev_value); - } - -private: - T prev_value; - const T stride; - std::default_random_engine random_engine; - uniform_distribution distribution; -}; - -template -struct RandomGenerator -{ - explicit RandomGenerator( - T seed = 0, - T value_min = std::numeric_limits::min(), - T value_max = std::numeric_limits::max()) - : random_engine(static_cast(seed)) - , distribution(value_min, value_max) - {} - - template - U operator()(U) - { - return static_cast(distribution(random_engine)); - } - -private: - std::default_random_engine random_engine; - uniform_distribution distribution; -}; - -// auto RandomishGenerator = [](auto i) { -// using T = decltype(i); -// double sin_value = sin(static_cast(i * i)) * i; -// if (sin_value < std::numeric_limits::lowest() || sin_value > static_cast(std::numeric_limits::max())) -// return T{}; -// return static_cast(sin_value); -// }; - -auto MinMaxGenerator = []() { - return [step = 0](auto i) mutable { - if (step++ % 2 == 0) - { - return std::numeric_limits::min(); - } - else - { - return std::numeric_limits::max(); - } - }; -}; - // Makes many sequences with generator, first sequence length is 0, second is 1..., third is 2 up to `sequences_count`. template std::vector generatePyramidOfSequences( @@ -683,6 +390,18 @@ INSTANTIATE_TEST_CASE_P( generateSeq(G(RandomGenerator(0, 0, 1000'000'000))), generateSeq(G(RandomGenerator(0, 0, 1000'000'000)))))); + +INSTANTIATE_TEST_CASE_P( + RepeatInt, + CodecTest, + ::testing::Combine( + IntegerCodecsToTest, + ::testing::Values( + generateSeq(G(RepeatGenerator(0))), + generateSeq(G(RepeatGenerator(0))), + generateSeq(G(RepeatGenerator(0))), + generateSeq(G(RepeatGenerator(0)))))); + // INSTANTIATE_TEST_CASE_P( // RandomishInt, // CodecTest,