From 6d7147589a2e29d45c64144fb2e5d36253e77f71 Mon Sep 17 00:00:00 2001 From: fscheibner Date: Mon, 31 Oct 2016 22:27:41 -0400 Subject: [PATCH] PARQUET-745: TypedRowGroupStatistics fails to PlainDecode min and max in ByteArrayType I'm not sure how this code is supposed to work, this seems to solve the issue. The code was added in 176b08c305919551ecfd5fc2f741f7bff4deefdd by @lomereiter I think. Author: fscheibner Closes #176 from flode/stats and squashes the following commits: 7a24906 [fscheibner] fix format f283edb [fscheibner] Pass wether minmax is set b50c985 [fscheibner] format f969a9f [fscheibner] Specialize PlainEncode and PlainDecode 6b5884d [fscheibner] format f17926d [fscheibner] fix typedrowgroup Change-Id: Iafbb09044f31978997436bbd30549b8b22b0a54c --- cpp/src/parquet/column/statistics-test.cc | 27 ++++++++++++++++++++++- cpp/src/parquet/column/statistics.cc | 18 +++++++++++++-- cpp/src/parquet/column/statistics.h | 11 ++++++++- cpp/src/parquet/file/metadata.cc | 3 ++- cpp/src/parquet/file/reader-internal.cc | 8 ------- 5 files changed, 54 insertions(+), 13 deletions(-) diff --git a/cpp/src/parquet/column/statistics-test.cc b/cpp/src/parquet/column/statistics-test.cc index d1e1eebd6f4c2..3ed7aab0785b6 100644 --- a/cpp/src/parquet/column/statistics-test.cc +++ b/cpp/src/parquet/column/statistics-test.cc @@ -66,7 +66,8 @@ class TestRowGroupStatistics : public PrimitiveTypedTest { std::string encoded_max = statistics1.EncodeMax(); TypedStats statistics2( - this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), 0, 0); + this->schema_.Column(0), encoded_min, encoded_max, + this->values_.size(), 0, 0, true); ASSERT_EQ(encoded_min, statistics2.EncodeMin()); ASSERT_EQ(encoded_max, statistics2.EncodeMax()); @@ -235,6 +236,30 @@ void TestRowGroupStatistics::DeepFree(std::vector& val } } +template<> +void TestRowGroupStatistics::TestMinMaxEncode() { + this->GenerateData(1000); + // Test that we encode min max strings correctly + TypedRowGroupStatistics statistics1(this->schema_.Column(0)); + statistics1.Update(this->values_ptr_, this->values_.size(), 0); + std::string encoded_min = statistics1.EncodeMin(); + std::string encoded_max = statistics1.EncodeMax(); + + // encoded is same as unencoded + ASSERT_EQ(encoded_min, std::string((const char*)statistics1.min().ptr, + statistics1.min().len)); + ASSERT_EQ(encoded_max, std::string((const char*)statistics1.max().ptr, + statistics1.max().len)); + + TypedRowGroupStatistics statistics2( + this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), 0, 0, true); + + ASSERT_EQ(encoded_min, statistics2.EncodeMin()); + ASSERT_EQ(encoded_max, statistics2.EncodeMax()); + ASSERT_EQ(statistics1.min(), statistics2.min()); + ASSERT_EQ(statistics1.max(), statistics2.max()); +} + using TestTypes = ::testing::Types; diff --git a/cpp/src/parquet/column/statistics.cc b/cpp/src/parquet/column/statistics.cc index d8f87859ecd42..d761571c09ea1 100644 --- a/cpp/src/parquet/column/statistics.cc +++ b/cpp/src/parquet/column/statistics.cc @@ -54,7 +54,8 @@ TypedRowGroupStatistics::TypedRowGroupStatistics(const typename DType::c_ template TypedRowGroupStatistics::TypedRowGroupStatistics(const ColumnDescriptor* schema, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, - int64_t null_count, int64_t distinct_count, MemoryAllocator* allocator) + int64_t null_count, int64_t distinct_count, bool has_min_max, + MemoryAllocator* allocator) : allocator_(allocator), min_buffer_(0, allocator_), max_buffer_(0, allocator_) { IncrementNumValues(num_values); IncrementNullCount(null_count); @@ -64,7 +65,7 @@ TypedRowGroupStatistics::TypedRowGroupStatistics(const ColumnDescriptor* if (!encoded_min.empty()) { PlainDecode(encoded_min, &min_); } if (!encoded_max.empty()) { PlainDecode(encoded_max, &max_); } - has_min_max_ = !encoded_min.empty() && !encoded_max.empty(); + has_min_max_ = has_min_max; } template @@ -160,6 +161,19 @@ void TypedRowGroupStatistics::PlainDecode(const std::string& src, T* dst) decoder.Decode(dst, 1); } +template <> +void TypedRowGroupStatistics::PlainEncode( + const T& src, std::string* dst) { + dst->assign(reinterpret_cast(src.ptr), src.len); +} + +template <> +void TypedRowGroupStatistics::PlainDecode( + const std::string& src, T* dst) { + dst->len = src.size(); + dst->ptr = reinterpret_cast(src.c_str()); +} + template class TypedRowGroupStatistics; template class TypedRowGroupStatistics; template class TypedRowGroupStatistics; diff --git a/cpp/src/parquet/column/statistics.h b/cpp/src/parquet/column/statistics.h index dc576c838c546..a3b28213155c4 100644 --- a/cpp/src/parquet/column/statistics.h +++ b/cpp/src/parquet/column/statistics.h @@ -142,7 +142,8 @@ class TypedRowGroupStatistics : public RowGroupStatistics { TypedRowGroupStatistics(const ColumnDescriptor* schema, const std::string& encoded_min, const std::string& encoded_max, int64_t num_values, int64_t null_count, - int64_t distinct_count, MemoryAllocator* allocator = default_allocator()); + int64_t distinct_count, bool has_min_max, + MemoryAllocator* allocator = default_allocator()); bool HasMinMax() const override; void Reset() override; @@ -195,6 +196,14 @@ inline void TypedRowGroupStatistics::Copy( *dst = ByteArray(src.len, buffer.data()); } +template <> +void TypedRowGroupStatistics::PlainEncode( + const T& src, std::string* dst); + +template <> +void TypedRowGroupStatistics::PlainDecode( + const std::string& src, T* dst); + using BoolStatistics = TypedRowGroupStatistics; using Int32Statistics = TypedRowGroupStatistics; using Int64Statistics = TypedRowGroupStatistics; diff --git a/cpp/src/parquet/file/metadata.cc b/cpp/src/parquet/file/metadata.cc index d50c9155afcbb..25807061e92b1 100644 --- a/cpp/src/parquet/file/metadata.cc +++ b/cpp/src/parquet/file/metadata.cc @@ -29,7 +29,8 @@ static std::shared_ptr MakeTypedColumnStats( const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) { return std::make_shared>(descr, metadata.statistics.min, metadata.statistics.max, metadata.num_values - metadata.statistics.null_count, - metadata.statistics.null_count, metadata.statistics.distinct_count); + metadata.statistics.null_count, metadata.statistics.distinct_count, + metadata.statistics.__isset.max || metadata.statistics.__isset.min); } std::shared_ptr MakeColumnStats( diff --git a/cpp/src/parquet/file/reader-internal.cc b/cpp/src/parquet/file/reader-internal.cc index fa193904e64af..5eda79b649932 100644 --- a/cpp/src/parquet/file/reader-internal.cc +++ b/cpp/src/parquet/file/reader-internal.cc @@ -175,14 +175,6 @@ std::unique_ptr SerializedRowGroup::GetColumnPageReader(int i) { std::move(stream), col->compression(), properties_.allocator())); } -template -static std::shared_ptr MakeColumnStats( - const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) { - return std::make_shared>(descr, metadata.statistics.min, - metadata.statistics.max, metadata.num_values, metadata.statistics.null_count, - metadata.statistics.distinct_count); -} - // ---------------------------------------------------------------------- // SerializedFile: Parquet on-disk layout