From 147b5c922efe19d34ef7e7cda635b7d8a07be2eb Mon Sep 17 00:00:00 2001 From: emkornfield Date: Wed, 9 Nov 2022 04:19:19 -0800 Subject: [PATCH] ARROW-18253: [C++][Parquet] Add additional bounds safety checks (#14592) Lead-authored-by: emkornfield Co-authored-by: Micah Kornfield Co-authored-by: Antoine Pitrou Signed-off-by: Antoine Pitrou --- cpp/src/parquet/arrow/path_internal.cc | 9 +++++++-- cpp/src/parquet/arrow/reader.cc | 9 +++++++++ cpp/src/parquet/metadata.cc | 21 ++++++++++++++++----- cpp/src/parquet/schema.cc | 18 +++++++++++++----- 4 files changed, 45 insertions(+), 12 deletions(-) diff --git a/cpp/src/parquet/arrow/path_internal.cc b/cpp/src/parquet/arrow/path_internal.cc index b7e638ce724b5..f176f66e13122 100644 --- a/cpp/src/parquet/arrow/path_internal.cc +++ b/cpp/src/parquet/arrow/path_internal.cc @@ -861,8 +861,13 @@ class MultipathLevelBuilderImpl : public MultipathLevelBuilder { ::arrow::Status Write(int leaf_index, ArrowWriteContext* context, CallbackFunction write_leaf_callback) override { - DCHECK_GE(leaf_index, 0); - DCHECK_LT(leaf_index, GetLeafCount()); + if (ARROW_PREDICT_FALSE(leaf_index < 0 || leaf_index >= GetLeafCount())) { + return Status::Invalid("Column index out of bounds (got ", leaf_index, + ", should be " + "between 0 and ", + GetLeafCount(), ")"); + } + return WritePath(root_range_, &path_builder_->paths()[leaf_index], context, std::move(write_leaf_callback)); } diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index d22746ab8c815..5c3732dd74e5b 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -203,6 +203,15 @@ class FileReaderImpl : public FileReader { const std::shared_ptr>& included_leaves, const std::vector& row_groups, std::unique_ptr* out) { + // Should be covered by GetRecordBatchReader checks but + // manifest_.schema_fields is a separate variable so be extra careful. + if (ARROW_PREDICT_FALSE(i < 0 || + static_cast(i) >= manifest_.schema_fields.size())) { + return Status::Invalid("Column index out of bounds (got ", i, + ", should be " + "between 0 and ", + manifest_.schema_fields.size(), ")"); + } auto ctx = std::make_shared(); ctx->reader = reader_.get(); ctx->pool = pool_; diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 20cb09517ab00..c97e1949552b7 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -436,7 +436,13 @@ class RowGroupMetaData::RowGroupMetaDataImpl { schema_(schema), properties_(properties), writer_version_(writer_version), - file_decryptor_(std::move(file_decryptor)) {} + file_decryptor_(std::move(file_decryptor)) { + if (ARROW_PREDICT_FALSE(row_group_->columns.size() > + static_cast(std::numeric_limits::max()))) { + throw ParquetException("Row group had too many columns: ", + row_group_->columns.size()); + } + } bool Equals(const RowGroupMetaDataImpl& other) const { return *row_group_ == *other.row_group_; @@ -457,10 +463,10 @@ class RowGroupMetaData::RowGroupMetaDataImpl { inline const SchemaDescriptor* schema() const { return schema_; } std::unique_ptr ColumnChunk(int i) { - if (i < num_columns()) { + if (i >= 0 && i < num_columns()) { return ColumnChunkMetaData::Make(&row_group_->columns[i], schema_->Column(i), properties_, writer_version_, row_group_->ordinal, - static_cast(i), file_decryptor_); + i, file_decryptor_); } throw ParquetException("The file only has ", num_columns(), " columns, requested metadata for column: ", i); @@ -656,7 +662,7 @@ class FileMetaData::FileMetaDataImpl { } std::unique_ptr RowGroup(int i) { - if (!(i < num_row_groups())) { + if (!(i >= 0 && i < num_row_groups())) { std::stringstream ss; ss << "The file only has " << num_row_groups() << " row groups, requested metadata for row group: " << i; @@ -685,7 +691,12 @@ class FileMetaData::FileMetaDataImpl { } format::RowGroup& row_group(int i) { - DCHECK_LT(i, num_row_groups()); + if (!(i >= 0 && i < num_row_groups())) { + std::stringstream ss; + ss << "The file only has " << num_row_groups() + << " row groups, requested metadata for row group: " << i; + throw ParquetException(ss.str()); + } return metadata_->row_groups[i]; } diff --git a/cpp/src/parquet/schema.cc b/cpp/src/parquet/schema.cc index 9a7767e79f5ae..5437fa2208a53 100644 --- a/cpp/src/parquet/schema.cc +++ b/cpp/src/parquet/schema.cc @@ -32,9 +32,6 @@ using parquet::format::SchemaElement; namespace parquet { - -namespace schema { - namespace { void ThrowInvalidLogicalType(const LogicalType& logical_type) { @@ -43,8 +40,19 @@ void ThrowInvalidLogicalType(const LogicalType& logical_type) { throw ParquetException(ss.str()); } +void CheckColumnBounds(int column_index, size_t max_columns) { + if (ARROW_PREDICT_FALSE(column_index < 0 || + static_cast(column_index) >= max_columns)) { + std::stringstream ss; + ss << "Invalid Column Index: " << column_index << " Num columns: " << max_columns; + throw ParquetException(ss.str()); + } +} + } // namespace +namespace schema { + // ---------------------------------------------------------------------- // ColumnPath @@ -874,7 +882,7 @@ bool ColumnDescriptor::Equals(const ColumnDescriptor& other) const { } const ColumnDescriptor* SchemaDescriptor::Column(int i) const { - DCHECK(i >= 0 && i < static_cast(leaves_.size())); + CheckColumnBounds(i, leaves_.size()); return &leaves_[i]; } @@ -899,7 +907,7 @@ int SchemaDescriptor::ColumnIndex(const Node& node) const { } const schema::Node* SchemaDescriptor::GetColumnRoot(int i) const { - DCHECK(i >= 0 && i < static_cast(leaves_.size())); + CheckColumnBounds(i, leaves_.size()); return leaf_to_base_.find(i)->second.get(); }