Skip to content

Commit

Permalink
PARQUET-556: Extend RowGroupStatistics to include "min" and "max" statistics
Browse files Browse the repository at this point in the history
Also includes a patch to extend the GroupReader API with num_rows()

Author: Deepak Majeti <deepak.majeti@hpe.com>

Closes apache#76 from majetideepak/PARQUET-556 and squashes the following commits:

7f2b036 [Deepak Majeti] modified min max Statistics to pointers
4059821 [Deepak Majeti] added a test
7e02810 [Deepak Majeti] PARQUET:556
  • Loading branch information
Deepak Majeti authored and wesm committed Sep 2, 2018
1 parent 40926e5 commit 6c5a827
Show file tree
Hide file tree
Showing 5 changed files with 24 additions and 1 deletion.
6 changes: 6 additions & 0 deletions cpp/src/parquet/file/reader-internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,10 @@ std::shared_ptr<Page> SerializedPageReader::NextPage() {
// ----------------------------------------------------------------------
// SerializedRowGroup

// Returns the num_rows field of this row group's metadata.
int64_t SerializedRowGroup::num_rows() const {
  const int64_t row_count = metadata_->num_rows;
  return row_count;
}

// Returns the number of entries in this row group's metadata `columns`
// collection.
int SerializedRowGroup::num_columns() const {
  // size() is converted to the int return type; the cast makes that narrowing
  // explicit instead of relying on an implicit conversion.
  return static_cast<int>(metadata_->columns.size());
}
Expand Down Expand Up @@ -187,6 +191,8 @@ RowGroupStatistics SerializedRowGroup::GetColumnStats(int i) {
result.num_values = meta_data.num_values;
result.null_count = meta_data.statistics.null_count;
result.distinct_count = meta_data.statistics.distinct_count;
result.max = &meta_data.statistics.max;
result.min = &meta_data.statistics.min;

return result;
}
Expand Down
1 change: 1 addition & 0 deletions cpp/src/parquet/file/reader-internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ class SerializedRowGroup : public RowGroupReader::Contents {
metadata_(metadata) {}

virtual int num_columns() const;
virtual int64_t num_rows() const;
virtual std::unique_ptr<PageReader> GetColumnPageReader(int i);
virtual RowGroupStatistics GetColumnStats(int i);

Expand Down
9 changes: 8 additions & 1 deletion cpp/src/parquet/file/reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ int RowGroupReader::num_columns() const {
return contents_->num_columns();
}

// Returns the row count reported by contents_->num_rows().
int64_t RowGroupReader::num_rows() const {
  const int64_t row_count = contents_->num_rows();
  return row_count;
}

std::shared_ptr<ColumnReader> RowGroupReader::Column(int i) {
// TODO: boundschecking
const ColumnDescriptor* descr = schema_->Column(i);
Expand Down Expand Up @@ -153,9 +157,12 @@ void ParquetFileReader::DebugPrint(std::ostream& stream, bool print_values) {
RowGroupStatistics stats = group_reader->GetColumnStats(i);

stream << "Column " << i << ": "
<< stats.num_values << " rows, "
<< group_reader->num_rows() << " rows, "
<< stats.num_values << " values, "
<< stats.null_count << " null values, "
<< stats.distinct_count << " distinct values, "
<< *stats.max << " max, "
<< *stats.min << " min, "
<< std::endl;
}

Expand Down
4 changes: 4 additions & 0 deletions cpp/src/parquet/file/reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,16 @@ struct RowGroupStatistics {
int64_t num_values;
int64_t null_count;
int64_t distinct_count;
const std::string* min;
const std::string* max;
};

class RowGroupReader {
public:
// Forward declare the PIMPL
struct Contents {
virtual int num_columns() const = 0;
virtual int64_t num_rows() const = 0;
virtual RowGroupStatistics GetColumnStats(int i) = 0;
virtual std::unique_ptr<PageReader> GetColumnPageReader(int i) = 0;
};
Expand All @@ -51,6 +54,7 @@ class RowGroupReader {
// column. Ownership is shared with the RowGroupReader.
std::shared_ptr<ColumnReader> Column(int i);
int num_columns() const;
int64_t num_rows() const;

RowGroupStatistics GetColumnStats(int i) const;

Expand Down
5 changes: 5 additions & 0 deletions cpp/src/parquet/reader-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ TEST_F(TestAllTypesPlain, TestBatchRead) {
int32_t values[4];

// This file only has 8 rows
ASSERT_EQ(8, reader_->num_rows());
// This file only has 1 row group
ASSERT_EQ(1, reader_->num_row_groups());
// This row group must have 8 rows
ASSERT_EQ(8, group->num_rows());

ASSERT_TRUE(col->HasNext());
int64_t values_read;
Expand Down

0 comments on commit 6c5a827

Please sign in to comment.