From ab5ebba2822577bc33e19c201f8a1d465e97f30c Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sat, 5 Nov 2022 00:09:27 +0000 Subject: [PATCH 1/4] ARROW-18253: Add additional bounds safety checks --- cpp/src/parquet/arrow/path_internal.cc | 7 +++++++ cpp/src/parquet/arrow/reader.cc | 9 +++++++++ cpp/src/parquet/metadata.cc | 21 ++++++++++++++++----- cpp/src/parquet/schema.cc | 18 +++++++++++++----- 4 files changed, 45 insertions(+), 10 deletions(-) diff --git a/cpp/src/parquet/arrow/path_internal.cc b/cpp/src/parquet/arrow/path_internal.cc index b7e638ce724b5..ec2cc013f8224 100644 --- a/cpp/src/parquet/arrow/path_internal.cc +++ b/cpp/src/parquet/arrow/path_internal.cc @@ -861,6 +861,13 @@ class MultipathLevelBuilderImpl : public MultipathLevelBuilder { ::arrow::Status Write(int leaf_index, ArrowWriteContext* context, CallbackFunction write_leaf_callback) override { + if (ARROW_PREDICT_FALSE(leaf_index < 0 || leaf_index >= GetLeafCount())) { + return Status::Invalid("Column index out of bounds (got ", leaf_index, + ", should be " + "between 0 and ", + GetLeafCount(), ")"); + } + DCHECK_GE(leaf_index, 0); DCHECK_LT(leaf_index, GetLeafCount()); return WritePath(root_range_, &path_builder_->paths()[leaf_index], context, diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 31480133d7acd..813e32a412d56 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -203,6 +203,15 @@ class FileReaderImpl : public FileReader { const std::shared_ptr>& included_leaves, const std::vector& row_groups, std::unique_ptr* out) { + // Should be coverted by GetRecordBatchReader checks but + // manifest_.schema_fields, is a separate variable so be extra careful. + if (ARROW_PREDICT_FALSE(i < 0 || + static_cast(i) >= manifest_.schema_fields.size())) { + return Status::Invalid("Column index out of bounds (got ", i, + ", should be " + "between 0 and ", + manifest_.schema_fields.size(), ")"); + } auto ctx = std::make_shared(); ctx->reader = reader_.get(); ctx->pool = pool_; diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 20cb09517ab00..274009f6af337 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -436,7 +436,13 @@ class RowGroupMetaData::RowGroupMetaDataImpl { schema_(schema), properties_(properties), writer_version_(writer_version), - file_decryptor_(std::move(file_decryptor)) {} + file_decryptor_(std::move(file_decryptor)) { + if (ARROW_PREDICT_FALSE(row_group_->columns.size() > + std::numeric_limits::max())) { + throw ParquetException("Row group had too many columns: ", + row_group_->columns.size()); + } + } bool Equals(const RowGroupMetaDataImpl& other) const { return *row_group_ == *other.row_group_; @@ -457,10 +463,10 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline const SchemaDescriptor* schema() const { return schema_; } std::unique_ptr ColumnChunk(int i) { - if (i < num_columns()) { + if (i >= 0 && i < num_columns()) { return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), properties_, writer_version_, row_group_->ordinal, - static_cast(i), file_decryptor_); + i, file_decryptor_); } throw ParquetException("The file only has ", num_columns(), " columns, requested metadata for column: ", i); @@ -656,7 +662,7 @@ class FileMetaData::FileMetaDataImpl { } std::unique_ptr RowGroup(int i) { - if (!(i < num_row_groups())) { + if (!(i >= 0 && i < num_row_groups())) { std::stringstream ss; ss << "The file only has " << num_row_groups() << " row groups, requested metadata for row group: " << i; @@ -685,7 +691,12 @@ class FileMetaData::FileMetaDataImpl { } format::RowGroup& row_group(int i) { - DCHECK_LT(i, num_row_groups()); + if (!(i >= 0 && i < num_row_groups())) { + std::stringstream ss; + ss << "The file only has " << num_row_groups() + << " row groups, requested metadata for row group: " << i; + throw ParquetException(ss.str()); + } return metadata_->row_groups[i]; } diff --git a/cpp/src/parquet/schema.cc b/cpp/src/parquet/schema.cc index 9a7767e79f5ae..5437fa2208a53 100644 --- a/cpp/src/parquet/schema.cc +++ b/cpp/src/parquet/schema.cc @@ -32,9 +32,6 @@ using parquet::format::SchemaElement; namespace parquet { - -namespace schema { - namespace { void ThrowInvalidLogicalType(const LogicalType& logical_type) { @@ -43,8 +40,19 @@ void ThrowInvalidLogicalType(const LogicalType& logical_type) { throw ParquetException(ss.str()); } +void CheckColumnBounds(int column_index, size_t max_columns) { + if (ARROW_PREDICT_FALSE(column_index < 0 || + static_cast(column_index) >= max_columns)) { + std::stringstream ss; + ss << "Invalid Column Index: " << column_index << " Num columns: " << max_columns; + throw ParquetException(ss.str()); + } +} + } // namespace +namespace schema { + // ---------------------------------------------------------------------- // ColumnPath @@ -874,7 +882,7 @@ bool ColumnDescriptor::Equals(const ColumnDescriptor& other) const { } const ColumnDescriptor* SchemaDescriptor::Column(int i) const { - DCHECK(i >= 0 && i < static_cast(leaves_.size())); + CheckColumnBounds(i, leaves_.size()); return &leaves_[i]; } @@ -899,7 +907,7 @@ int SchemaDescriptor::ColumnIndex(const Node& node) const { } const schema::Node* SchemaDescriptor::GetColumnRoot(int i) const { - DCHECK(i >= 0 && i < static_cast(leaves_.size())); + CheckColumnBounds(i, leaves_.size()); return leaf_to_base_.find(i)->second.get(); } From 790585e81c56f3e59b45dc3a6b37e0b87ca6f6ca Mon Sep 17 00:00:00 2001 From: Micah Kornfield Date: Sat, 5 Nov 2022 00:11:31 +0000 Subject: [PATCH 2/4] remove checks --- cpp/src/parquet/arrow/path_internal.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/parquet/arrow/path_internal.cc b/cpp/src/parquet/arrow/path_internal.cc index ec2cc013f8224..f176f66e13122 100644 --- a/cpp/src/parquet/arrow/path_internal.cc +++ b/cpp/src/parquet/arrow/path_internal.cc @@ -868,8 +868,6 @@ class MultipathLevelBuilderImpl : public MultipathLevelBuilder { GetLeafCount(), ")"); } - DCHECK_GE(leaf_index, 0); - DCHECK_LT(leaf_index, GetLeafCount()); return WritePath(root_range_, &path_builder_->paths()[leaf_index], context, std::move(write_leaf_callback)); } From ebaeaed64ec72072ab0440b0900eecd705aa9d6a Mon Sep 17 00:00:00 2001 From: emkornfield Date: Fri, 4 Nov 2022 20:44:50 -0700 Subject: [PATCH 3/4] add cast --- cpp/src/parquet/metadata.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 274009f6af337..c97e1949552b7 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -438,7 +438,7 @@ class RowGroupMetaData::RowGroupMetaDataImpl { writer_version_(writer_version), file_decryptor_(std::move(file_decryptor)) { if (ARROW_PREDICT_FALSE(row_group_->columns.size() > - std::numeric_limits::max())) { + static_cast(std::numeric_limits::max()))) { throw ParquetException("Row group had too many columns: ", row_group_->columns.size()); } From 7c2b44cace9cce9cc32b776d02e641becd93ce44 Mon Sep 17 00:00:00 2001 From: emkornfield Date: Tue, 8 Nov 2022 12:02:02 -0800 Subject: [PATCH 4/4] Update cpp/src/parquet/arrow/reader.cc Co-authored-by: Antoine Pitrou --- cpp/src/parquet/arrow/reader.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 813e32a412d56..30c7d44438080 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -203,8 +203,8 @@ class FileReaderImpl : public FileReader { const std::shared_ptr>& included_leaves, const std::vector& row_groups, std::unique_ptr* out) { - // Should be coverted by GetRecordBatchReader checks but - // manifest_.schema_fields, is a separate variable so be extra careful. + // Should be covered by GetRecordBatchReader checks but + // manifest_.schema_fields is a separate variable so be extra careful. if (ARROW_PREDICT_FALSE(i < 0 || static_cast(i) >= manifest_.schema_fields.size())) { return Status::Invalid("Column index out of bounds (got ", i,