Skip to content

Commit

Permalink
PARQUET-844: Schema, compression consolidation / flattening
Browse files Browse the repository at this point in the history
Will look at `encodings/`, `file/`, and `column/` directories later

Author: Wes McKinney <wes.mckinney@twosigma.com>

Closes apache#228 from wesm/PARQUET-844 and squashes the following commits:

45b2887 [Wes McKinney] Fix include rename
88f0afe [Wes McKinney] Consolidate schema code and tests into schema.h/schema-internal.h
0385381 [Wes McKinney] Consolidate compression code into a single header

Change-Id: I776f4e34bacb2217f73821fe8f606ff13c5141d4
  • Loading branch information
wesm committed Jan 26, 2017
1 parent 5a21610 commit c016b72
Show file tree
Hide file tree
Showing 46 changed files with 1,288 additions and 1,726 deletions.
4 changes: 4 additions & 0 deletions cpp/src/parquet/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,14 @@

# Headers: top level
install(FILES
compression.h
exception.h
schema.h
types.h
DESTINATION include/parquet)

# Test targets for the top-level parquet sources. ADD_PARQUET_TEST is defined
# elsewhere in the build; presumably each call registers one test executable
# named after its argument -- confirm against the macro definition.
ADD_PARQUET_TEST(compression-test)
ADD_PARQUET_TEST(public-api-test)
ADD_PARQUET_TEST(types-test)
ADD_PARQUET_TEST(reader-test)
ADD_PARQUET_TEST(schema-test)
4 changes: 1 addition & 3 deletions cpp/src/parquet/api/schema.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,6 @@
#define PARQUET_API_SCHEMA_H

// Schemas
#include "parquet/schema/descriptor.h"
#include "parquet/schema/printer.h"
#include "parquet/schema/types.h"
#include "parquet/schema.h"

#endif // PARQUET_API_SCHEMA_H
3 changes: 1 addition & 2 deletions cpp/src/parquet/column/column-reader-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@
#include "parquet/column/page.h"
#include "parquet/column/reader.h"
#include "parquet/column/test-util.h"
#include "parquet/schema/descriptor.h"
#include "parquet/schema/types.h"
#include "parquet/schema.h"
#include "parquet/types.h"
#include "parquet/util/test-common.h"

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/column/properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#include <unordered_map>

#include "parquet/exception.h"
#include "parquet/schema/types.h"
#include "parquet/schema.h"
#include "parquet/types.h"
#include "parquet/util/memory.h"
#include "parquet/util/visibility.h"
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/column/reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
#include "parquet/column/page.h"
#include "parquet/encodings/decoder.h"
#include "parquet/exception.h"
#include "parquet/schema/descriptor.h"
#include "parquet/schema.h"
#include "parquet/types.h"
#include "parquet/util/memory.h"
#include "parquet/util/visibility.h"
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/parquet/column/scanner-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,7 @@
#include "parquet/column/scanner.h"
#include "parquet/column/test-specialization.h"
#include "parquet/column/test-util.h"
#include "parquet/schema/descriptor.h"
#include "parquet/schema/types.h"
#include "parquet/schema.h"
#include "parquet/types.h"
#include "parquet/util/test-common.h"

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/column/scanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

#include "parquet/column/reader.h"
#include "parquet/exception.h"
#include "parquet/schema/descriptor.h"
#include "parquet/schema.h"
#include "parquet/types.h"
#include "parquet/util/memory.h"
#include "parquet/util/visibility.h"
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/column/statistics-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
#include "parquet/column/writer.h"
#include "parquet/file/reader.h"
#include "parquet/file/writer.h"
#include "parquet/schema/descriptor.h"
#include "parquet/schema.h"
#include "parquet/types.h"
#include "parquet/util/memory.h"

Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/column/statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#include <memory>
#include <string>

#include "parquet/schema/descriptor.h"
#include "parquet/schema.h"
#include "parquet/types.h"
#include "parquet/util/memory.h"
#include "parquet/util/visibility.h"
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/column/writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
#include "parquet/column/statistics.h"
#include "parquet/encodings/encoder.h"
#include "parquet/file/metadata.h"
#include "parquet/schema/descriptor.h"
#include "parquet/schema.h"
#include "parquet/types.h"
#include "parquet/util/memory.h"
#include "parquet/util/visibility.h"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#include <string>
#include <vector>

#include "parquet/compression/codec.h"
#include "parquet/compression.h"
#include "parquet/util/test-common.h"

using std::string;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,47 @@
// specific language governing permissions and limitations
// under the License.

#include <cstring>
#include <sstream>
#include <cstdint>
#include <memory>
#include <string>

#include "parquet/compression/codec.h"
#include <brotli/decode.h>
#include <brotli/encode.h>
#include <snappy.h>

#include "parquet/compression.h"
#include "parquet/exception.h"
#include "parquet/types.h"

namespace parquet {

// Factory for codec implementations.
//
// Returns a newly allocated codec matching `codec_type`, or an empty pointer
// for Compression::UNCOMPRESSED. LZO and any value not listed here are
// reported through ParquetException::NYI.
std::unique_ptr<Codec> Codec::Create(Compression::type codec_type) {
  switch (codec_type) {
    case Compression::SNAPPY:
      return std::unique_ptr<Codec>(new SnappyCodec());
    case Compression::GZIP:
      return std::unique_ptr<Codec>(new GZipCodec());
    case Compression::BROTLI:
      return std::unique_ptr<Codec>(new BrotliCodec());
    case Compression::UNCOMPRESSED:
      // No codec object is needed for uncompressed data.
      break;
    case Compression::LZO:
      ParquetException::NYI("LZO codec not implemented");
      break;
    default:
      ParquetException::NYI("Unrecognized codec");
      break;
  }
  // Reached for UNCOMPRESSED, and for the NYI cases should NYI() ever return.
  return std::unique_ptr<Codec>();
}

// ----------------------------------------------------------------------
// gzip implementation

// These are magic numbers from zlib.h. Not clear why they are not defined
// there.

Expand Down Expand Up @@ -172,4 +204,56 @@ int64_t GZipCodec::Compress(
return output_length - stream_.avail_out;
}

// ----------------------------------------------------------------------
// Snappy implementation

// Decompresses a raw snappy stream into a caller-provided buffer.
//
// input_len / input:  size in bytes and start of the compressed data.
// output_len:         capacity in bytes of `output_buffer`.
// output_buffer:      destination for the decompressed bytes.
//
// Throws ParquetException if the stream is corrupt or if the decompressed
// size recorded in the stream header does not fit in `output_len` bytes.
void SnappyCodec::Decompress(
    int64_t input_len, const uint8_t* input, int64_t output_len, uint8_t* output_buffer) {
  const char* compressed = reinterpret_cast<const char*>(input);
  const size_t compressed_len = static_cast<size_t>(input_len);

  // snappy::RawUncompress takes no output capacity, so it would write past
  // the end of `output_buffer` if the stream declares more bytes than the
  // caller allocated. Read the declared size first and reject such input.
  size_t decompressed_len = 0;
  if (!snappy::GetUncompressedLength(compressed, compressed_len, &decompressed_len)) {
    throw parquet::ParquetException("Corrupt snappy compressed data.");
  }
  if (output_len < 0 || static_cast<size_t>(output_len) < decompressed_len) {
    throw parquet::ParquetException(
        "Output buffer too small for snappy decompressed data.");
  }

  if (!snappy::RawUncompress(
          compressed, compressed_len, reinterpret_cast<char*>(output_buffer))) {
    throw parquet::ParquetException("Corrupt snappy compressed data.");
  }
}

// Upper bound, in bytes, on the snappy-compressed size of `input_len` bytes.
// The bound depends only on the length; `input` is not read.
int64_t SnappyCodec::MaxCompressedLen(int64_t input_len, const uint8_t* input) {
  const size_t source_bytes = static_cast<size_t>(input_len);
  const size_t worst_case = snappy::MaxCompressedLength(source_bytes);
  return static_cast<int64_t>(worst_case);
}

// Compresses `input_len` bytes from `input` into `output_buffer` using raw
// snappy and returns the number of compressed bytes written.
//
// `output_buffer_len` is not consulted here: snappy::RawCompress has no
// capacity parameter. NOTE(review): callers are presumably expected to size
// the buffer with MaxCompressedLen() -- confirm against call sites.
int64_t SnappyCodec::Compress(int64_t input_len, const uint8_t* input,
    int64_t output_buffer_len, uint8_t* output_buffer) {
  const char* source = reinterpret_cast<const char*>(input);
  char* destination = reinterpret_cast<char*>(output_buffer);

  size_t compressed_size;
  snappy::RawCompress(
      source, static_cast<size_t>(input_len), destination, &compressed_size);
  return static_cast<int64_t>(compressed_size);
}

// ----------------------------------------------------------------------
// Brotli implementation

// One-shot brotli decompression of `input_len` bytes from `input` into
// `output_buffer`, whose capacity `output_len` is passed to the decoder.
//
// Throws ParquetException unless the decoder reports success.
void BrotliCodec::Decompress(
    int64_t input_len, const uint8_t* input, int64_t output_len, uint8_t* output_buffer) {
  // In/out argument for the decoder: initialised to the buffer capacity.
  size_t decoded_size = output_len;
  const auto status =
      BrotliDecoderDecompress(input_len, input, &decoded_size, output_buffer);
  if (status == BROTLI_DECODER_RESULT_SUCCESS) { return; }
  throw parquet::ParquetException("Corrupt brotli compressed data.");
}

// Upper bound, in bytes, on the brotli-compressed size of `input_len` bytes,
// as reported by BrotliEncoderMaxCompressedSize. `input` is not read.
int64_t BrotliCodec::MaxCompressedLen(int64_t input_len, const uint8_t* input) {
  const size_t source_bytes = static_cast<size_t>(input_len);
  const size_t worst_case = BrotliEncoderMaxCompressedSize(source_bytes);
  return static_cast<int64_t>(worst_case);
}

// One-shot brotli compression of `input_len` bytes from `input` into
// `output_buffer` (capacity `output_buffer_len`). Returns the number of
// compressed bytes written.
//
// Throws ParquetException if the encoder reports failure.
int64_t BrotliCodec::Compress(int64_t input_len, const uint8_t* input,
    int64_t output_buffer_len, uint8_t* output_buffer) {
  // TODO: Make quality configurable. We use 8 as a default as it is the best
  // trade-off for Parquet workload
  const int kQuality = 8;

  // In/out argument for the encoder: initialised to the buffer capacity.
  size_t compressed_size = output_buffer_len;
  const auto succeeded = BrotliEncoderCompress(kQuality, BROTLI_DEFAULT_WINDOW,
      BROTLI_DEFAULT_MODE, input_len, input, &compressed_size, output_buffer);
  if (succeeded == BROTLI_FALSE) {
    throw parquet::ParquetException("Brotli compression failure.");
  }
  return compressed_size;
}

} // namespace parquet
File renamed without changes.
23 changes: 0 additions & 23 deletions cpp/src/parquet/compression/CMakeLists.txt

This file was deleted.

53 changes: 0 additions & 53 deletions cpp/src/parquet/compression/brotli-codec.cc

This file was deleted.

50 changes: 0 additions & 50 deletions cpp/src/parquet/compression/codec.cc

This file was deleted.

48 changes: 0 additions & 48 deletions cpp/src/parquet/compression/snappy-codec.cc

This file was deleted.

Loading

0 comments on commit c016b72

Please sign in to comment.