From d7457dfcfb4f5f1db8d7868e932ec6f6964de4b1 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sat, 30 Mar 2024 01:07:23 +0100 Subject: [PATCH 1/2] Parquet: avoid potential assertion/out-of-bounds access when a subset of row groups is selected --- ogr/ogrsf_frmts/arrow_common/ogr_arrow.h | 5 ++ .../arrow_common/ograrrowlayer.hpp | 18 +++--- ogr/ogrsf_frmts/parquet/ogr_parquet.h | 10 +++- ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp | 56 ++++++++++++++----- 4 files changed, 66 insertions(+), 23 deletions(-) diff --git a/ogr/ogrsf_frmts/arrow_common/ogr_arrow.h b/ogr/ogrsf_frmts/arrow_common/ogr_arrow.h index b0cecfb12fb1..a562887f818c 100644 --- a/ogr/ogrsf_frmts/arrow_common/ogr_arrow.h +++ b/ogr/ogrsf_frmts/arrow_common/ogr_arrow.h @@ -216,6 +216,11 @@ class OGRArrowLayer CPL_NON_FINAL int GetNextArrowArray(struct ArrowArrayStream *, struct ArrowArray *out) override; + virtual void IncrFeatureIdx() + { + ++m_nFeatureIdx; + } + public: virtual ~OGRArrowLayer() override; diff --git a/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp index b69201550619..a536a60bbef8 100644 --- a/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp +++ b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp @@ -3497,7 +3497,7 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() break; } - m_nFeatureIdx++; + IncrFeatureIdx(); m_nIdxInBatch++; if (m_nIdxInBatch == m_poBatch->num_rows()) { @@ -3585,7 +3585,7 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() break; } - m_nFeatureIdx++; + IncrFeatureIdx(); m_nIdxInBatch++; if (m_nIdxInBatch == m_poBatch->num_rows()) { @@ -3627,7 +3627,7 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() break; } - m_nFeatureIdx++; + IncrFeatureIdx(); m_nIdxInBatch++; if (m_nIdxInBatch == m_poBatch->num_rows()) { @@ -3649,7 +3649,7 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() break; } - m_nFeatureIdx++; + IncrFeatureIdx(); m_nIdxInBatch++; if (m_nIdxInBatch == m_poBatch->num_rows()) { @@ -3665,7 +3665,7 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() if (m_iFIDArrowColumn < 0) poFeature->SetFID(m_nFeatureIdx); - m_nFeatureIdx++; + IncrFeatureIdx(); m_nIdxInBatch++; return poFeature; @@ -3877,7 +3877,6 @@ inline OGRErr OGRArrowLayer::GetExtent(int iGeomField, OGREnvelope *psExtent, } } - m_nFeatureIdx++; m_nIdxInBatch++; if (m_nIdxInBatch == m_poBatch->num_rows()) { @@ -3977,7 +3976,6 @@ inline OGRErr OGRArrowLayer::GetExtent(int iGeomField, OGREnvelope *psExtent, } } - m_nFeatureIdx++; m_nIdxInBatch++; if (m_nIdxInBatch == m_poBatch->num_rows()) { @@ -4407,7 +4405,11 @@ inline int OGRArrowLayer::GetNextArrowArray(struct ArrowArrayStream *stream, OverrideArrowRelease(m_poArrowDS, out_array); const auto nFeatureIdxCur = m_nFeatureIdx; - m_nFeatureIdx += m_nIdxInBatch; + // TODO: We likely have an issue regarding FIDs based on m_nFeatureIdx + // when m_iFIDArrowColumn < 0, only a subset of row groups is + // selected, and this batch goes accross non consecutive row groups. + for (int64_t i = 0; i < m_nIdxInBatch; ++i) + IncrFeatureIdx(); if (m_poAttrQuery || m_poFilterGeom) { diff --git a/ogr/ogrsf_frmts/parquet/ogr_parquet.h b/ogr/ogrsf_frmts/parquet/ogr_parquet.h index b30228dd5450..b47bcd0a9c35 100644 --- a/ogr/ogrsf_frmts/parquet/ogr_parquet.h +++ b/ogr/ogrsf_frmts/parquet/ogr_parquet.h @@ -84,7 +84,13 @@ class OGRParquetLayer final : public OGRParquetLayerBase std::vector m_anMapGeomFieldIndexToParquetColumn{}; bool m_bHasMissingMappingToParquet = false; - std::vector m_anSelectedGroupsStartFeatureIdx{}; + //! Contains pairs of (selected feature idx, total feature idx) break points. + std::vector> m_asFeatureIdxRemapping{}; + //! Iterator over m_asFeatureIdxRemapping + std::vector>::iterator + m_oFeatureIdxRemappingIter{}; + //! Feature index among the potentially restricted set of selected row gropus + int64_t m_nFeatureIdxSelected = 0; std::vector m_anRequestedParquetColumns{}; // only valid when // m_bIgnoredFields is set #ifdef DEBUG @@ -120,6 +126,8 @@ class OGRParquetLayer final : public OGRParquetLayerBase bool FastGetExtent(int iGeomField, OGREnvelope *psExtent) const override; + void IncrFeatureIdx() override; + public: OGRParquetLayer(OGRParquetDataset *poDS, const char *pszLayerName, std::unique_ptr &&arrow_reader, diff --git a/ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp b/ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp index aaa74958b38d..bdfb2c4bfc26 100644 --- a/ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp +++ b/ogr/ogrsf_frmts/parquet/ogrparquetlayer.cpp @@ -435,6 +435,8 @@ OGRParquetLayer::OGRParquetLayer( EstablishFeatureDefn(); CPLAssert(static_cast(m_aeGeomEncoding.size()) == m_poFeatureDefn->GetGeomFieldCount()); + + m_oFeatureIdxRemappingIter = m_asFeatureIdxRemapping.begin(); } /************************************************************************/ @@ -922,6 +924,13 @@ void OGRParquetLayer::ResetReading() m_poRecordBatchReader.reset(); } OGRParquetLayerBase::ResetReading(); + m_oFeatureIdxRemappingIter = m_asFeatureIdxRemapping.begin(); + m_nFeatureIdxSelected = 0; + if (!m_asFeatureIdxRemapping.empty()) + { + m_nFeatureIdx = m_oFeatureIdxRemappingIter->second; + ++m_oFeatureIdxRemappingIter; + } } /************************************************************************/ @@ -1028,6 +1037,25 @@ static IsConstraintPossibleRes IsConstraintPossible(int nOperation, T v, T min, return IsConstraintPossibleRes::YES; } +/************************************************************************/ +/* IncrFeatureIdx() */ +/************************************************************************/ + +void OGRParquetLayer::IncrFeatureIdx() +{ + ++m_nFeatureIdxSelected; + ++m_nFeatureIdx; + if (m_iFIDArrowColumn < 0 && !m_asFeatureIdxRemapping.empty() && + m_oFeatureIdxRemappingIter != m_asFeatureIdxRemapping.end()) + { + if (m_nFeatureIdxSelected == m_oFeatureIdxRemappingIter->first) + { + m_nFeatureIdx = m_oFeatureIdxRemappingIter->second; + ++m_oFeatureIdxRemappingIter; + } + } +} + /************************************************************************/ /* ReadNextBatch() */ /************************************************************************/ @@ -1048,7 +1076,7 @@ bool OGRParquetLayer::ReadNextBatch() if (m_poRecordBatchReader == nullptr) { - m_anSelectedGroupsStartFeatureIdx.clear(); + m_asFeatureIdxRemapping.clear(); bool bIterateEverything = false; std::vector anSelectedGroups; @@ -1074,7 +1102,8 @@ bool OGRParquetLayer::ReadNextBatch() OGRFieldType eType = OFTMaxType; OGRFieldSubType eSubType = OFSTNone; std::string osMinTmp, osMaxTmp; - GIntBig nFeatureIdx = 0; + int64_t nFeatureIdxSelected = 0; + int64_t nFeatureIdxTotal = 0; for (int iRowGroup = 0; iRowGroup < nNumGroups && !bIterateEverything; ++iRowGroup) @@ -1152,9 +1181,9 @@ bool OGRParquetLayer::ReadNextBatch() if (iOGRField == OGR_FID_INDEX && m_iFIDParquetColumn < 0) { - sMin.Integer64 = nFeatureIdx; + sMin.Integer64 = nFeatureIdxTotal; sMax.Integer64 = - nFeatureIdx + + nFeatureIdxTotal + poRowGroup->metadata()->num_rows() - 1; eType = OFTInteger64; } @@ -1311,26 +1340,32 @@ bool OGRParquetLayer::ReadNextBatch() if (bSelectGroup) { // CPLDebug("PARQUET", "Selecting row group %d", iRowGroup); - m_anSelectedGroupsStartFeatureIdx.push_back(nFeatureIdx); + m_asFeatureIdxRemapping.emplace_back( + std::make_pair(nFeatureIdxSelected, nFeatureIdxTotal)); anSelectedGroups.push_back(iRowGroup); + nFeatureIdxSelected += poRowGroup->metadata()->num_rows(); } - nFeatureIdx += poRowGroup->metadata()->num_rows(); + nFeatureIdxTotal += poRowGroup->metadata()->num_rows(); } } if (bIterateEverything) { - m_anSelectedGroupsStartFeatureIdx.clear(); + m_asFeatureIdxRemapping.clear(); + m_oFeatureIdxRemappingIter = m_asFeatureIdxRemapping.begin(); if (!CreateRecordBatchReader(0)) return false; } else { + m_oFeatureIdxRemappingIter = m_asFeatureIdxRemapping.begin(); if (anSelectedGroups.empty()) { return false; } + m_nFeatureIdx = m_oFeatureIdxRemappingIter->second; + ++m_oFeatureIdxRemappingIter; if (!CreateRecordBatchReader(anSelectedGroups)) { return false; @@ -1362,13 +1397,6 @@ bool OGRParquetLayer::ReadNextBatch() m_poBatch.reset(); return false; } - if (!m_anSelectedGroupsStartFeatureIdx.empty()) - { - CPLAssert( - m_iRecordBatch < - static_cast(m_anSelectedGroupsStartFeatureIdx.size())); - m_nFeatureIdx = m_anSelectedGroupsStartFeatureIdx[m_iRecordBatch]; - } } while (poNextBatch->num_rows() == 0); SetBatch(poNextBatch); From a554f27d6a7519eea2f00ebe85ca58d3c48be894 Mon Sep 17 00:00:00 2001 From: Even Rouault Date: Sat, 30 Mar 2024 01:29:51 +0100 Subject: [PATCH 2/2] Arrow/Parquet: fix inverted logic regarding spatial filtering of multipolygon with GeoArrow interleaved encoding --- ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp index a536a60bbef8..1cef38c0f924 100644 --- a/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp +++ b/ogr/ogrsf_frmts/arrow_common/ograrrowlayer.hpp @@ -3574,7 +3574,7 @@ inline OGRFeature *OGRArrowLayer::GetNextRawFeature() } } - if (nParts != 0 && m_sFilterEnvelope.Intersects(sEnvelope)) + if (nParts != 0 && !m_sFilterEnvelope.Intersects(sEnvelope)) { break; }