Skip to content

Commit

Permalink
PARQUET-745: TypedRowGroupStatistics fails to PlainDecode min and max…
Browse files Browse the repository at this point in the history
… in ByteArrayType

I'm not sure how this code is supposed to work, this seems to solve the issue.

The code was added in 176b08c305919551ecfd5fc2f741f7bff4deefdd by @lomereiter I think.

Author: fscheibner <florian.scheibner@snowflake.net>

Closes apache#176 from flode/stats and squashes the following commits:

7a24906 [fscheibner] fix format
f283edb [fscheibner] Pass wether minmax is set
b50c985 [fscheibner] format
f969a9f [fscheibner] Specialize PlainEncode and PlainDecode
6b5884d [fscheibner] format
f17926d [fscheibner] fix typedrowgroup

Change-Id: Iafbb09044f31978997436bbd30549b8b22b0a54c
  • Loading branch information
flode authored and wesm committed Nov 1, 2016
1 parent b3fa6d7 commit 6d71475
Show file tree
Hide file tree
Showing 5 changed files with 54 additions and 13 deletions.
27 changes: 26 additions & 1 deletion cpp/src/parquet/column/statistics-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ class TestRowGroupStatistics : public PrimitiveTypedTest<TestType> {
std::string encoded_max = statistics1.EncodeMax();

TypedStats statistics2(
this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), 0, 0);
this->schema_.Column(0), encoded_min, encoded_max,
this->values_.size(), 0, 0, true);

ASSERT_EQ(encoded_min, statistics2.EncodeMin());
ASSERT_EQ(encoded_max, statistics2.EncodeMax());
Expand Down Expand Up @@ -235,6 +236,30 @@ void TestRowGroupStatistics<ByteArrayType>::DeepFree(std::vector<ByteArray>& val
}
}

template<>
void TestRowGroupStatistics<ByteArrayType>::TestMinMaxEncode() {
this->GenerateData(1000);
// Test that we encode min max strings correctly
TypedRowGroupStatistics<ByteArrayType> statistics1(this->schema_.Column(0));
statistics1.Update(this->values_ptr_, this->values_.size(), 0);
std::string encoded_min = statistics1.EncodeMin();
std::string encoded_max = statistics1.EncodeMax();

// encoded is same as unencoded
ASSERT_EQ(encoded_min, std::string((const char*)statistics1.min().ptr,
statistics1.min().len));
ASSERT_EQ(encoded_max, std::string((const char*)statistics1.max().ptr,
statistics1.max().len));

TypedRowGroupStatistics<ByteArrayType> statistics2(
this->schema_.Column(0), encoded_min, encoded_max, this->values_.size(), 0, 0, true);

ASSERT_EQ(encoded_min, statistics2.EncodeMin());
ASSERT_EQ(encoded_max, statistics2.EncodeMax());
ASSERT_EQ(statistics1.min(), statistics2.min());
ASSERT_EQ(statistics1.max(), statistics2.max());
}

using TestTypes = ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType,
ByteArrayType, FLBAType, BooleanType>;

Expand Down
18 changes: 16 additions & 2 deletions cpp/src/parquet/column/statistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ TypedRowGroupStatistics<DType>::TypedRowGroupStatistics(const typename DType::c_
template <typename DType>
TypedRowGroupStatistics<DType>::TypedRowGroupStatistics(const ColumnDescriptor* schema,
const std::string& encoded_min, const std::string& encoded_max, int64_t num_values,
int64_t null_count, int64_t distinct_count, MemoryAllocator* allocator)
int64_t null_count, int64_t distinct_count, bool has_min_max,
MemoryAllocator* allocator)
: allocator_(allocator), min_buffer_(0, allocator_), max_buffer_(0, allocator_) {
IncrementNumValues(num_values);
IncrementNullCount(null_count);
Expand All @@ -64,7 +65,7 @@ TypedRowGroupStatistics<DType>::TypedRowGroupStatistics(const ColumnDescriptor*

if (!encoded_min.empty()) { PlainDecode(encoded_min, &min_); }
if (!encoded_max.empty()) { PlainDecode(encoded_max, &max_); }
has_min_max_ = !encoded_min.empty() && !encoded_max.empty();
has_min_max_ = has_min_max;
}

template <typename DType>
Expand Down Expand Up @@ -160,6 +161,19 @@ void TypedRowGroupStatistics<DType>::PlainDecode(const std::string& src, T* dst)
decoder.Decode(dst, 1);
}

template <>
void TypedRowGroupStatistics<ByteArrayType>::PlainEncode(
const T& src, std::string* dst) {
dst->assign(reinterpret_cast<const char*>(src.ptr), src.len);
}

template <>
void TypedRowGroupStatistics<ByteArrayType>::PlainDecode(
const std::string& src, T* dst) {
dst->len = src.size();
dst->ptr = reinterpret_cast<const uint8_t*>(src.c_str());
}

template class TypedRowGroupStatistics<BooleanType>;
template class TypedRowGroupStatistics<Int32Type>;
template class TypedRowGroupStatistics<Int64Type>;
Expand Down
11 changes: 10 additions & 1 deletion cpp/src/parquet/column/statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,8 @@ class TypedRowGroupStatistics : public RowGroupStatistics {

TypedRowGroupStatistics(const ColumnDescriptor* schema, const std::string& encoded_min,
const std::string& encoded_max, int64_t num_values, int64_t null_count,
int64_t distinct_count, MemoryAllocator* allocator = default_allocator());
int64_t distinct_count, bool has_min_max,
MemoryAllocator* allocator = default_allocator());

bool HasMinMax() const override;
void Reset() override;
Expand Down Expand Up @@ -195,6 +196,14 @@ inline void TypedRowGroupStatistics<ByteArrayType>::Copy(
*dst = ByteArray(src.len, buffer.data());
}

template <>
void TypedRowGroupStatistics<ByteArrayType>::PlainEncode(
const T& src, std::string* dst);

template <>
void TypedRowGroupStatistics<ByteArrayType>::PlainDecode(
const std::string& src, T* dst);

using BoolStatistics = TypedRowGroupStatistics<BooleanType>;
using Int32Statistics = TypedRowGroupStatistics<Int32Type>;
using Int64Statistics = TypedRowGroupStatistics<Int64Type>;
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/parquet/file/metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ static std::shared_ptr<RowGroupStatistics> MakeTypedColumnStats(
const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) {
return std::make_shared<TypedRowGroupStatistics<DType>>(descr, metadata.statistics.min,
metadata.statistics.max, metadata.num_values - metadata.statistics.null_count,
metadata.statistics.null_count, metadata.statistics.distinct_count);
metadata.statistics.null_count, metadata.statistics.distinct_count,
metadata.statistics.__isset.max || metadata.statistics.__isset.min);
}

std::shared_ptr<RowGroupStatistics> MakeColumnStats(
Expand Down
8 changes: 0 additions & 8 deletions cpp/src/parquet/file/reader-internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -175,14 +175,6 @@ std::unique_ptr<PageReader> SerializedRowGroup::GetColumnPageReader(int i) {
std::move(stream), col->compression(), properties_.allocator()));
}

template <typename DType>
static std::shared_ptr<RowGroupStatistics> MakeColumnStats(
const format::ColumnMetaData& metadata, const ColumnDescriptor* descr) {
return std::make_shared<TypedRowGroupStatistics<DType>>(descr, metadata.statistics.min,
metadata.statistics.max, metadata.num_values, metadata.statistics.null_count,
metadata.statistics.distinct_count);
}

// ----------------------------------------------------------------------
// SerializedFile: Parquet on-disk layout

Expand Down

0 comments on commit 6d71475

Please sign in to comment.