From 6c5a8274bd8ced1adf6632cb876eda34bef4c9ca Mon Sep 17 00:00:00 2001 From: Deepak Majeti Date: Sun, 13 Mar 2016 10:50:26 -0700 Subject: [PATCH] PARQUET-556: Extend RowGroupStatistics to include "min" "max" statistics Also includes a patch to extend the GroupReader API with num_rows() Author: Deepak Majeti Closes #76 from majetideepak/PARQUET-556 and squashes the following commits: 7f2b036 [Deepak Majeti] modified min max Statistics to pointers 4059821 [Deepak Majeti] added a test 7e02810 [Deepak Majeti] PARQUET:556 --- cpp/src/parquet/file/reader-internal.cc | 6 ++++++ cpp/src/parquet/file/reader-internal.h | 1 + cpp/src/parquet/file/reader.cc | 9 ++++++++- cpp/src/parquet/file/reader.h | 4 ++++ cpp/src/parquet/reader-test.cc | 5 +++++ 5 files changed, 24 insertions(+), 1 deletion(-) diff --git a/cpp/src/parquet/file/reader-internal.cc b/cpp/src/parquet/file/reader-internal.cc index c571c72e434a4..89e82983cbf69 100644 --- a/cpp/src/parquet/file/reader-internal.cc +++ b/cpp/src/parquet/file/reader-internal.cc @@ -154,6 +154,10 @@ std::shared_ptr<Page> SerializedPageReader::NextPage() { // ---------------------------------------------------------------------- // SerializedRowGroup +int64_t SerializedRowGroup::num_rows() const { + return metadata_->num_rows; +} + int SerializedRowGroup::num_columns() const { return metadata_->columns.size(); } @@ -187,6 +191,8 @@ RowGroupStatistics SerializedRowGroup::GetColumnStats(int i) { result.num_values = meta_data.num_values; result.null_count = meta_data.statistics.null_count; result.distinct_count = meta_data.statistics.distinct_count; + result.max = &meta_data.statistics.max; + result.min = &meta_data.statistics.min; return result; } diff --git a/cpp/src/parquet/file/reader-internal.h b/cpp/src/parquet/file/reader-internal.h index a398cb332997a..b62f2496aad1d 100644 --- a/cpp/src/parquet/file/reader-internal.h +++ b/cpp/src/parquet/file/reader-internal.h @@ -76,6 +76,7 @@ class SerializedRowGroup : public RowGroupReader::Contents 
{ metadata_(metadata) {} virtual int num_columns() const; + virtual int64_t num_rows() const; virtual std::unique_ptr<PageReader> GetColumnPageReader(int i); virtual RowGroupStatistics GetColumnStats(int i); diff --git a/cpp/src/parquet/file/reader.cc b/cpp/src/parquet/file/reader.cc index beace09dd93ed..2937f9e5f3b44 100644 --- a/cpp/src/parquet/file/reader.cc +++ b/cpp/src/parquet/file/reader.cc @@ -49,6 +49,10 @@ int RowGroupReader::num_columns() const { return contents_->num_columns(); } +int64_t RowGroupReader::num_rows() const { + return contents_->num_rows(); +} + std::shared_ptr<ColumnReader> RowGroupReader::Column(int i) { // TODO: boundschecking const ColumnDescriptor* descr = schema_->Column(i); @@ -153,9 +157,12 @@ void ParquetFileReader::DebugPrint(std::ostream& stream, bool print_values) { RowGroupStatistics stats = group_reader->GetColumnStats(i); stream << "Column " << i << ": " - << stats.num_values << " rows, " + << group_reader->num_rows() << " rows, " + << stats.num_values << " values, " << stats.null_count << " null values, " << stats.distinct_count << " distinct values, " + << *stats.max << " max, " + << *stats.min << " min, " << std::endl; } diff --git a/cpp/src/parquet/file/reader.h b/cpp/src/parquet/file/reader.h index 18b052adab3ca..436d1e83051dc 100644 --- a/cpp/src/parquet/file/reader.h +++ b/cpp/src/parquet/file/reader.h @@ -34,6 +34,8 @@ struct RowGroupStatistics { int64_t num_values; int64_t null_count; int64_t distinct_count; + const std::string* min; + const std::string* max; }; class RowGroupReader { @@ -41,6 +43,7 @@ class RowGroupReader { // Forward declare the PIMPL struct Contents { virtual int num_columns() const = 0; + virtual int64_t num_rows() const = 0; virtual RowGroupStatistics GetColumnStats(int i) = 0; virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0; }; @@ -51,6 +54,7 @@ class RowGroupReader { // column. Ownership is shared with the RowGroupReader. 
std::shared_ptr<ColumnReader> Column(int i); int num_columns() const; + int64_t num_rows() const; RowGroupStatistics GetColumnStats(int i) const; diff --git a/cpp/src/parquet/reader-test.cc b/cpp/src/parquet/reader-test.cc index 2c69ce12a1e43..c27348708066d 100644 --- a/cpp/src/parquet/reader-test.cc +++ b/cpp/src/parquet/reader-test.cc @@ -68,6 +68,11 @@ TEST_F(TestAllTypesPlain, TestBatchRead) { int32_t values[4]; // This file only has 8 rows + ASSERT_EQ(8, reader_->num_rows()); + // This file only has 1 row group + ASSERT_EQ(1, reader_->num_row_groups()); + // This row group must have 8 rows + ASSERT_EQ(8, group->num_rows()); ASSERT_TRUE(col->HasNext()); int64_t values_read;