Skip to content

Commit

Permalink
PARQUET-454: Fix inconsistencies with boolean PLAIN encoding
Browse files Browse the repository at this point in the history
Requires PARQUET-485 (apache#32)

The boolean Encoding::PLAIN code path was using RleDecoder, inconsistent with
other implementations of Parquet. This patch adds an implementation of plain
encoding and uses BitReader instead of RleDecoder to decode plain-encoded
boolean data. Unit tests to verify.

Also closes PR apache#12. Thanks to @edani for reporting.

Author: Wes McKinney <wes@cloudera.com>

Closes apache#34 from wesm/PARQUET-454 and squashes the following commits:

01cb5a7 [Wes McKinney] Use a seed in the data generation
0bf5d8a [Wes McKinney] Fix inconsistencies with boolean PLAIN encoding.

Change-Id: I1be5252c654d4864d14c3cdd70d63c507e0a9403
  • Loading branch information
wesm authored and julienledem committed Feb 2, 2016
1 parent 006f38e commit 9c0066c
Show file tree
Hide file tree
Showing 6 changed files with 144 additions and 12 deletions.
2 changes: 2 additions & 0 deletions cpp/src/parquet/encodings/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,5 @@ install(FILES
dictionary-encoding.h
plain-encoding.h
DESTINATION include/parquet/encodings)

ADD_PARQUET_TEST(plain-encoding-test)
3 changes: 1 addition & 2 deletions cpp/src/parquet/encodings/encodings.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,9 @@

#include <cstdint>

#include "parquet/exception.h"
#include "parquet/types.h"

#include "parquet/thrift/parquet_constants.h"
#include "parquet/thrift/parquet_types.h"
#include "parquet/util/rle-encoding.h"
#include "parquet/util/bit-stream-utils.inline.h"

Expand Down
65 changes: 65 additions & 0 deletions cpp/src/parquet/encodings/plain-encoding-test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <cstdint>
#include <string>
#include <vector>

#include <gtest/gtest.h>
#include "parquet/util/test-common.h"

#include "parquet/encodings/encodings.h"

using std::string;
using std::vector;

namespace parquet_cpp {

namespace test {

TEST(BooleanTest, TestEncodeDecode) {
// PARQUET-454

size_t nvalues = 100;
size_t nbytes = BitUtil::RoundUp(nvalues, 8) / 8;

// seed the prng so failure is deterministic
vector<bool> draws = flip_coins_seed(nvalues, 0.5, 0);

PlainEncoder<Type::BOOLEAN> encoder(nullptr);
PlainDecoder<Type::BOOLEAN> decoder(nullptr);

std::vector<uint8_t> encode_buffer(nbytes);

size_t encoded_bytes = encoder.Encode(draws, nvalues, &encode_buffer[0]);
ASSERT_EQ(nbytes, encoded_bytes);

std::vector<uint8_t> decode_buffer(nbytes);
const uint8_t* decode_data = &decode_buffer[0];

decoder.SetData(nvalues, &encode_buffer[0], encoded_bytes);
size_t values_decoded = decoder.Decode(&decode_buffer[0], nvalues);
ASSERT_EQ(nvalues, values_decoded);

for (size_t i = 0; i < nvalues; ++i) {
ASSERT_EQ(BitUtil::GetArrayBit(decode_data, i), draws[i]) << i;
}
}

} // namespace test

} // namespace parquet_cpp
51 changes: 41 additions & 10 deletions cpp/src/parquet/encodings/plain-encoding.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "parquet/encodings/encodings.h"

#include <algorithm>
#include <vector>

using parquet::Type;

Expand Down Expand Up @@ -103,19 +104,38 @@ class PlainDecoder<Type::BOOLEAN> : public Decoder<Type::BOOLEAN> {

virtual void SetData(int num_values, const uint8_t* data, int len) {
num_values_ = num_values;
decoder_ = RleDecoder(data, len, 1);
bit_reader_ = BitReader(data, len);
}

// Two flavors of bool decoding
int Decode(uint8_t* buffer, int max_values) {
max_values = std::min(max_values, num_values_);
bool val;
for (int i = 0; i < max_values; ++i) {
if (!bit_reader_.GetValue(1, &val)) {
ParquetException::EofException();
}
BitUtil::SetArrayBit(buffer, i, val);
}
num_values_ -= max_values;
return max_values;
}

virtual int Decode(bool* buffer, int max_values) {
max_values = std::min(max_values, num_values_);
bool val;
for (int i = 0; i < max_values; ++i) {
if (!decoder_.Get(&buffer[i])) ParquetException::EofException();
if (!bit_reader_.GetValue(1, &val)) {
ParquetException::EofException();
}
buffer[i] = val;
}
num_values_ -= max_values;
return max_values;
}

private:
RleDecoder decoder_;
BitReader bit_reader_;
};

// ----------------------------------------------------------------------
Expand All @@ -132,6 +152,24 @@ class PlainEncoder : public Encoder<TYPE> {
virtual size_t Encode(const T* src, int num_values, uint8_t* dst);
};

template <>
class PlainEncoder<Type::BOOLEAN> : public Encoder<Type::BOOLEAN> {
public:
explicit PlainEncoder(const parquet::SchemaElement* schema) :
Encoder<Type::BOOLEAN>(schema, parquet::Encoding::PLAIN) {}

virtual size_t Encode(const std::vector<bool>& src, int num_values,
uint8_t* dst) {
size_t bytes_required = BitUtil::RoundUp(num_values, 8) / 8;
BitWriter bit_writer(dst, bytes_required);
for (size_t i = 0; i < num_values; ++i) {
bit_writer.PutValue(src[i], 1);
}
bit_writer.Flush();
return bit_writer.bytes_written();
}
};

template <int TYPE>
inline size_t PlainEncoder<TYPE>::Encode(const T* buffer, int num_values,
uint8_t* dst) {
Expand All @@ -140,13 +178,6 @@ inline size_t PlainEncoder<TYPE>::Encode(const T* buffer, int num_values,
return nbytes;
}

template <>
inline size_t PlainEncoder<Type::BOOLEAN>::Encode(
const bool* src, int num_values, uint8_t* dst) {
ParquetException::NYI("bool encoding");
return 0;
}

template <>
inline size_t PlainEncoder<Type::BYTE_ARRAY>::Encode(const ByteArray* src,
int num_values, uint8_t* dst) {
Expand Down
8 changes: 8 additions & 0 deletions cpp/src/parquet/util/bit-util.h
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,14 @@ class BitUtil {
return v | (static_cast<T>(0x1) << bitpos);
}

static inline bool GetArrayBit(const uint8_t* bits, size_t i) {
return bits[i / 8] & (1 << (i % 8));
}

static inline void SetArrayBit(uint8_t* bits, size_t i, bool is_set) {
bits[i / 8] |= (1 << (i % 8)) * is_set;
}

// Set a specific bit to 0
// Behavior when bitpos is negative is undefined
template<typename T>
Expand Down
27 changes: 27 additions & 0 deletions cpp/src/parquet/util/test-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#define PARQUET_UTIL_TEST_COMMON_H

#include <iostream>
#include <random>
#include <vector>

using std::vector;
Expand Down Expand Up @@ -46,6 +47,32 @@ static inline bool vector_equal(const vector<T>& left, const vector<T>& right) {
return true;
}

static inline vector<bool> flip_coins_seed(size_t n, double p, uint32_t seed) {
std::mt19937 gen(seed);
std::bernoulli_distribution d(p);

vector<bool> draws;
for (size_t i = 0; i < n; ++i) {
draws.push_back(d(gen));
}
return draws;
}


static inline vector<bool> flip_coins(size_t n, double p) {
std::random_device rd;
std::mt19937 gen(rd());

std::bernoulli_distribution d(p);

vector<bool> draws;
for (size_t i = 0; i < n; ++i) {
draws.push_back(d(gen));
}
return draws;
}


} // namespace test

} // namespace parquet_cpp
Expand Down

0 comments on commit 9c0066c

Please sign in to comment.