From d923780c1319d1ca0203b2a8de52006c21102887 Mon Sep 17 00:00:00 2001 From: Kang Date: Wed, 22 Nov 2023 12:18:17 +0800 Subject: [PATCH 1/2] add BlockConditionsFilteredTime bf zonemap dict detail profile --- be/src/olap/olap_common.h | 3 + .../rowset/segment_v2/segment_iterator.cpp | 159 ++++++++++-------- be/src/vec/exec/scan/new_olap_scan_node.cpp | 3 + be/src/vec/exec/scan/new_olap_scan_node.h | 3 + be/src/vec/exec/scan/new_olap_scanner.cpp | 6 + 5 files changed, 100 insertions(+), 74 deletions(-) diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h index e1c5717fcd0684..5ba27a50e4a1f9 100644 --- a/be/src/olap/olap_common.h +++ b/be/src/olap/olap_common.h @@ -336,6 +336,9 @@ struct OlapReaderStatistics { // the number of rows filtered by various column indexes. int64_t rows_conditions_filtered = 0; int64_t block_conditions_filtered_ns = 0; + int64_t block_conditions_filtered_bf_ns = 0; + int64_t block_conditions_filtered_zonemap_ns = 0; + int64_t block_conditions_filtered_dict_ns = 0; int64_t index_load_ns = 0; diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 65ca6f7e61d4c8..a17942de8bf002 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -447,90 +447,101 @@ Status SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row cids.insert(entry.first); } - // first filter data by bloom filter index - // bloom filter index only use CondColumn - RowRanges bf_row_ranges = RowRanges::create_single(num_rows()); - for (auto& cid : cids) { - if (!_segment->is_same_file_col_type_with_expected( - cid, *_schema, _opts.io_ctx.reader_type != ReaderType::READER_QUERY)) { - continue; - } - // get row ranges by bf index of this column, - RowRanges column_bf_row_ranges = RowRanges::create_single(num_rows()); - DCHECK(_opts.col_id_to_predicates.count(cid) > 0); - RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_bloom_filter( - _opts.col_id_to_predicates.at(cid).get(), &column_bf_row_ranges)); - RowRanges::ranges_intersection(bf_row_ranges, column_bf_row_ranges, &bf_row_ranges); - } - - size_t pre_size = condition_row_ranges->count(); - RowRanges::ranges_intersection(*condition_row_ranges, bf_row_ranges, condition_row_ranges); - _opts.stats->rows_bf_filtered += (pre_size - condition_row_ranges->count()); + size_t pre_size = 0; - RowRanges zone_map_row_ranges = RowRanges::create_single(num_rows()); - // second filter data by zone map - for (auto& cid : cids) { - if (!_segment->is_same_file_col_type_with_expected( - cid, *_schema, _opts.io_ctx.reader_type != ReaderType::READER_QUERY)) { - continue; + { + SCOPED_RAW_TIMER(&_opts.stats->block_conditions_filtered_bf_ns); + // first filter data by bloom filter index + // bloom filter index only use CondColumn + RowRanges bf_row_ranges = RowRanges::create_single(num_rows()); + for (auto& cid : cids) { + if (!_segment->is_same_file_col_type_with_expected( + cid, *_schema, _opts.io_ctx.reader_type != ReaderType::READER_QUERY)) { + continue; + } + // get row ranges by bf index of this column, + RowRanges column_bf_row_ranges = RowRanges::create_single(num_rows()); + DCHECK(_opts.col_id_to_predicates.count(cid) > 0); + RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_bloom_filter( + _opts.col_id_to_predicates.at(cid).get(), &column_bf_row_ranges)); + RowRanges::ranges_intersection(bf_row_ranges, column_bf_row_ranges, &bf_row_ranges); } - // get row ranges by zone map of this column, - RowRanges column_row_ranges = RowRanges::create_single(num_rows()); - DCHECK(_opts.col_id_to_predicates.count(cid) > 0); - RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_zone_map( - _opts.col_id_to_predicates.at(cid).get(), - _opts.del_predicates_for_zone_map.count(cid) > 0 - ? &(_opts.del_predicates_for_zone_map.at(cid)) - : nullptr, - &column_row_ranges)); - // intersect different columns's row ranges to get final row ranges by zone map - RowRanges::ranges_intersection(zone_map_row_ranges, column_row_ranges, - &zone_map_row_ranges); - } - std::shared_ptr runtime_predicate = nullptr; - if (_opts.use_topn_opt) { - auto query_ctx = _opts.runtime_state->get_query_ctx(); - runtime_predicate = query_ctx->get_runtime_predicate().get_predictate(); - if (runtime_predicate && _segment->is_same_file_col_type_with_expected( - runtime_predicate->column_id(), *_schema, - _opts.io_ctx.reader_type != ReaderType::READER_QUERY)) { - AndBlockColumnPredicate and_predicate; - auto single_predicate = new SingleColumnBlockPredicate(runtime_predicate.get()); - and_predicate.add_column_predicate(single_predicate); - - RowRanges column_rp_row_ranges = RowRanges::create_single(num_rows()); - RETURN_IF_ERROR( - _column_iterators[runtime_predicate->column_id()]->get_row_ranges_by_zone_map( - &and_predicate, nullptr, &column_rp_row_ranges)); + pre_size = condition_row_ranges->count(); + RowRanges::ranges_intersection(*condition_row_ranges, bf_row_ranges, condition_row_ranges); + _opts.stats->rows_bf_filtered += (pre_size - condition_row_ranges->count()); + } + { + SCOPED_RAW_TIMER(&_opts.stats->block_conditions_filtered_zonemap_ns); + RowRanges zone_map_row_ranges = RowRanges::create_single(num_rows()); + // second filter data by zone map + for (auto& cid : cids) { + if (!_segment->is_same_file_col_type_with_expected( + cid, *_schema, _opts.io_ctx.reader_type != ReaderType::READER_QUERY)) { + continue; + } + // get row ranges by zone map of this column, + RowRanges column_row_ranges = RowRanges::create_single(num_rows()); + DCHECK(_opts.col_id_to_predicates.count(cid) > 0); + RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_zone_map( + _opts.col_id_to_predicates.at(cid).get(), + _opts.del_predicates_for_zone_map.count(cid) > 0 + ? &(_opts.del_predicates_for_zone_map.at(cid)) + : nullptr, + &column_row_ranges)); // intersect different columns's row ranges to get final row ranges by zone map - RowRanges::ranges_intersection(zone_map_row_ranges, column_rp_row_ranges, - &zone_map_row_ranges); + RowRanges::ranges_intersection(zone_map_row_ranges, column_row_ranges, + &zone_map_row_ranges); + } + + std::shared_ptr runtime_predicate = nullptr; + if (_opts.use_topn_opt) { + auto query_ctx = _opts.runtime_state->get_query_ctx(); + runtime_predicate = query_ctx->get_runtime_predicate().get_predictate(); + if (runtime_predicate && _segment->is_same_file_col_type_with_expected( + runtime_predicate->column_id(), *_schema, + _opts.io_ctx.reader_type != ReaderType::READER_QUERY)) { + AndBlockColumnPredicate and_predicate; + auto single_predicate = new SingleColumnBlockPredicate(runtime_predicate.get()); + and_predicate.add_column_predicate(single_predicate); + + RowRanges column_rp_row_ranges = RowRanges::create_single(num_rows()); + RETURN_IF_ERROR( + _column_iterators[runtime_predicate->column_id()]->get_row_ranges_by_zone_map( + &and_predicate, nullptr, &column_rp_row_ranges)); + + // intersect different columns's row ranges to get final row ranges by zone map + RowRanges::ranges_intersection(zone_map_row_ranges, column_rp_row_ranges, + &zone_map_row_ranges); + } } + + pre_size = condition_row_ranges->count(); + RowRanges::ranges_intersection(*condition_row_ranges, zone_map_row_ranges, + condition_row_ranges); + _opts.stats->rows_stats_filtered += (pre_size - condition_row_ranges->count()); } - pre_size = condition_row_ranges->count(); - RowRanges::ranges_intersection(*condition_row_ranges, zone_map_row_ranges, - condition_row_ranges); - _opts.stats->rows_stats_filtered += (pre_size - condition_row_ranges->count()); + { + SCOPED_RAW_TIMER(&_opts.stats->block_conditions_filtered_dict_ns); + /// Low cardinality optimization is currently not very stable, so to prevent data corruption, + /// we are temporarily disabling its use in data compaction. + if (_opts.io_ctx.reader_type == ReaderType::READER_QUERY) { + RowRanges dict_row_ranges = RowRanges::create_single(num_rows()); + for (auto cid : cids) { + RowRanges tmp_row_ranges = RowRanges::create_single(num_rows()); + DCHECK(_opts.col_id_to_predicates.count(cid) > 0); + RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_dict( + _opts.col_id_to_predicates.at(cid).get(), &tmp_row_ranges)); + RowRanges::ranges_intersection(dict_row_ranges, tmp_row_ranges, &dict_row_ranges); + } - /// Low cardinality optimization is currently not very stable, so to prevent data corruption, - /// we are temporarily disabling its use in data compaction. - if (_opts.io_ctx.reader_type == ReaderType::READER_QUERY) { - RowRanges dict_row_ranges = RowRanges::create_single(num_rows()); - for (auto cid : cids) { - RowRanges tmp_row_ranges = RowRanges::create_single(num_rows()); - DCHECK(_opts.col_id_to_predicates.count(cid) > 0); - RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_dict( - _opts.col_id_to_predicates.at(cid).get(), &tmp_row_ranges)); - RowRanges::ranges_intersection(dict_row_ranges, tmp_row_ranges, &dict_row_ranges); + pre_size = condition_row_ranges->count(); + RowRanges::ranges_intersection(*condition_row_ranges, dict_row_ranges, + condition_row_ranges); + _opts.stats->rows_dict_filtered += (pre_size - condition_row_ranges->count()); } - - pre_size = condition_row_ranges->count(); - RowRanges::ranges_intersection(*condition_row_ranges, dict_row_ranges, - condition_row_ranges); - _opts.stats->rows_dict_filtered += (pre_size - condition_row_ranges->count()); } return Status::OK(); diff --git a/be/src/vec/exec/scan/new_olap_scan_node.cpp b/be/src/vec/exec/scan/new_olap_scan_node.cpp index a41f651d1d0a6e..7346d345f4e517 100644 --- a/be/src/vec/exec/scan/new_olap_scan_node.cpp +++ b/be/src/vec/exec/scan/new_olap_scan_node.cpp @@ -125,6 +125,9 @@ Status NewOlapScanNode::_init_profile() { _block_init_seek_timer = ADD_TIMER(_segment_profile, "BlockInitSeekTime"); _block_init_seek_counter = ADD_COUNTER(_segment_profile, "BlockInitSeekCount", TUnit::UNIT); _block_conditions_filtered_timer = ADD_TIMER(_segment_profile, "BlockConditionsFilteredTime"); + _block_conditions_filtered_bf_timer = ADD_TIMER(_segment_profile, "BlockConditionsFilteredBloomFilterTime"); + _block_conditions_filtered_zonemap_timer = ADD_TIMER(_segment_profile, "BlockConditionsFilteredZonemapTime"); + _block_conditions_filtered_dict_timer = ADD_TIMER(_segment_profile, "BlockConditionsFilteredDictTime"); _rows_vec_cond_filtered_counter = ADD_COUNTER(_segment_profile, "RowsVectorPredFiltered", TUnit::UNIT); diff --git a/be/src/vec/exec/scan/new_olap_scan_node.h b/be/src/vec/exec/scan/new_olap_scan_node.h index af11a6c0551bfd..dfb0f9d928ae99 100644 --- a/be/src/vec/exec/scan/new_olap_scan_node.h +++ b/be/src/vec/exec/scan/new_olap_scan_node.h @@ -160,6 +160,9 @@ class NewOlapScanNode : public VScanNode { RuntimeProfile::Counter* _block_init_seek_timer = nullptr; RuntimeProfile::Counter* _block_init_seek_counter = nullptr; RuntimeProfile::Counter* _block_conditions_filtered_timer = nullptr; + RuntimeProfile::Counter* _block_conditions_filtered_bf_timer = nullptr; + RuntimeProfile::Counter* _block_conditions_filtered_zonemap_timer = nullptr; + RuntimeProfile::Counter* _block_conditions_filtered_dict_timer = nullptr; RuntimeProfile::Counter* _first_read_timer = nullptr; RuntimeProfile::Counter* _second_read_timer = nullptr; RuntimeProfile::Counter* _first_read_seek_timer = nullptr; diff --git a/be/src/vec/exec/scan/new_olap_scanner.cpp b/be/src/vec/exec/scan/new_olap_scanner.cpp index 87095fd6adc7f4..d6c84b193afd4c 100644 --- a/be/src/vec/exec/scan/new_olap_scanner.cpp +++ b/be/src/vec/exec/scan/new_olap_scanner.cpp @@ -588,6 +588,12 @@ void NewOlapScanner::_update_counters_before_close() { COUNTER_UPDATE(olap_parent->_block_init_seek_counter, stats.block_init_seek_num); COUNTER_UPDATE(olap_parent->_block_conditions_filtered_timer, stats.block_conditions_filtered_ns); + COUNTER_UPDATE(olap_parent->_block_conditions_filtered_bf_timer, + stats.block_conditions_filtered_bf_ns); + COUNTER_UPDATE(olap_parent->_block_conditions_filtered_zonemap_timer, + stats.block_conditions_filtered_zonemap_ns); + COUNTER_UPDATE(olap_parent->_block_conditions_filtered_dict_timer, + stats.block_conditions_filtered_dict_ns); COUNTER_UPDATE(olap_parent->_first_read_timer, stats.first_read_ns); COUNTER_UPDATE(olap_parent->_second_read_timer, stats.second_read_ns); COUNTER_UPDATE(olap_parent->_first_read_seek_timer, stats.block_first_read_seek_ns); From 7ad5e013af38da08b18079dbd2cda40b376c88b9 Mon Sep 17 00:00:00 2001 From: Kang Date: Thu, 23 Nov 2023 17:13:06 +0800 Subject: [PATCH 2/2] skip zonemap if predicate does not support_zonemap --- be/src/olap/block_column_predicate.h | 13 +++++++++++++ be/src/olap/column_predicate.h | 2 ++ be/src/olap/match_predicate.h | 2 ++ be/src/olap/rowset/segment_v2/segment_iterator.cpp | 7 ++++++- 4 files changed, 23 insertions(+), 1 deletion(-) diff --git a/be/src/olap/block_column_predicate.h b/be/src/olap/block_column_predicate.h index c91dc0c3678dd2..b29fad56278749 100644 --- a/be/src/olap/block_column_predicate.h +++ b/be/src/olap/block_column_predicate.h @@ -72,6 +72,8 @@ class BlockColumnPredicate { virtual void evaluate_vec(vectorized::MutableColumns& block, uint16_t size, bool* flags) const { } + virtual bool support_zonemap() const { return true; } + virtual bool evaluate_and(const std::pair& statistic) const { LOG(FATAL) << "should not reach here"; return true; @@ -113,6 +115,7 @@ class SingleColumnBlockPredicate : public BlockColumnPredicate { uint16_t selected_size) const override; void evaluate_and(vectorized::MutableColumns& block, uint16_t* sel, uint16_t selected_size, bool* flags) const override; + bool support_zonemap() const override { return _predicate->support_zonemap(); } bool evaluate_and(const std::pair& statistic) const override; bool evaluate_and(const segment_v2::BloomFilter* bf) const override; bool evaluate_and(const StringRef* dict_words, const size_t dict_num) const override; @@ -139,6 +142,16 @@ class MutilColumnBlockPredicate : public BlockColumnPredicate { } } + bool support_zonemap() const override { + for (const auto *child_block_predicate : _block_column_predicate_vec) { + if (!child_block_predicate->support_zonemap()) { + return false; + } + } + + return true; + } + void add_column_predicate(const BlockColumnPredicate* column_predicate) { _block_column_predicate_vec.push_back(column_predicate); } diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h index 05e84999a83102..6e0bf7a4157051 100644 --- a/be/src/olap/column_predicate.h +++ b/be/src/olap/column_predicate.h @@ -169,6 +169,8 @@ class ColumnPredicate { virtual void evaluate_or(const vectorized::IColumn& column, const uint16_t* sel, uint16_t size, bool* flags) const {} + virtual bool support_zonemap() const { return true; } + virtual bool evaluate_and(const std::pair& statistic) const { return true; } diff --git a/be/src/olap/match_predicate.h b/be/src/olap/match_predicate.h index 4232469f4a6ae6..11b8ac31a44fc5 100644 --- a/be/src/olap/match_predicate.h +++ b/be/src/olap/match_predicate.h @@ -50,6 +50,8 @@ class MatchPredicate : public ColumnPredicate { const std::string& get_value() const { return _value; } + bool support_zonemap() const override { return false; } + //evaluate predicate on Bitmap virtual Status evaluate(BitmapIndexIterator* iterator, uint32_t num_rows, roaring::Roaring* roaring) const override { diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index a17942de8bf002..9a906c994b2cc8 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -481,9 +481,14 @@ Status SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row cid, *_schema, _opts.io_ctx.reader_type != ReaderType::READER_QUERY)) { continue; } + DCHECK(_opts.col_id_to_predicates.count(cid) > 0); + // do not check zonemap if predicate does not support zonemap + if (!_opts.col_id_to_predicates.at(cid)->support_zonemap()) { + LOG(WARNING) << "skip zonemap for column " << cid; + continue; + } // get row ranges by zone map of this column, RowRanges column_row_ranges = RowRanges::create_single(num_rows()); - DCHECK(_opts.col_id_to_predicates.count(cid) > 0); RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_zone_map( _opts.col_id_to_predicates.at(cid).get(), _opts.del_predicates_for_zone_map.count(cid) > 0