From b883e3d441bbbf4264a52eecf2887ca0753ba361 Mon Sep 17 00:00:00 2001 From: zzzxl1993 <474696115@qq.com> Date: Sun, 16 Jun 2024 20:03:58 +0800 Subject: [PATCH] [feature](inverted index) add ordered functionality to match_phrase query --- .../inverted_index/query/phrase_query.cpp | 253 ++++++++++++------ .../inverted_index/query/phrase_query.h | 77 ++++-- .../segment_v2/inverted_index/query/query.h | 1 + .../segment_v2/inverted_index_reader.cpp | 3 +- .../test_index_match_phrase_ordered.out | 67 +++++ .../test_index_match_phrase_ordered.groovy | 87 ++++++ 6 files changed, 390 insertions(+), 98 deletions(-) create mode 100644 regression-test/data/inverted_index_p0/test_index_match_phrase_ordered.out create mode 100644 regression-test/suites/inverted_index_p0/test_index_match_phrase_ordered.groovy diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp index 9d242bce68a528..0ca2dce94e3dd2 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp @@ -21,9 +21,105 @@ namespace doris::segment_v2 { +template +bool PhraseMatcherBase::matches(int32_t doc) { + reset(doc); + return static_cast(this)->next_match(); +} + +template +void PhraseMatcherBase::reset(int32_t doc) { + for (PostingsAndPosition& posting : _postings) { + if (posting._postings.docID() != doc) { + posting._postings.advance(doc); + } + posting._freq = posting._postings.freq(); + posting._pos = -1; + posting._upTo = 0; + } +} + +template +bool PhraseMatcherBase::advance_position(PostingsAndPosition& posting, int32_t target) { + while (posting._pos < target) { + if (posting._upTo == posting._freq) { + return false; + } else { + posting._pos = posting._postings.nextPosition(); + posting._upTo += 1; + } + } + return true; +} + +bool ExactPhraseMatcher::next_match() { + PostingsAndPosition& lead = _postings[0]; + if 
(lead._upTo < lead._freq) { + lead._pos = lead._postings.nextPosition(); + lead._upTo += 1; + } else { + return false; + } + + while (true) { + int32_t phrasePos = lead._pos - lead._offset; + + bool advance_head = false; + for (size_t j = 1; j < _postings.size(); ++j) { + PostingsAndPosition& posting = _postings[j]; + int32_t expectedPos = phrasePos + posting._offset; + // advance up to the same position as the lead + if (!advance_position(posting, expectedPos)) { + return false; + } + + if (posting._pos != expectedPos) { // we advanced too far + if (advance_position(lead, posting._pos - posting._offset + lead._offset)) { + advance_head = true; + break; + } else { + return false; + } + } + } + if (advance_head) { + continue; + } + + return true; + } + + return false; +} + +bool OrderedSloppyPhraseMatcher::next_match() { + PostingsAndPosition* prev_posting = _postings.data(); + while (prev_posting->_upTo < prev_posting->_freq) { + prev_posting->_pos = prev_posting->_postings.nextPosition(); + prev_posting->_upTo += 1; + if (stretch_to_order(prev_posting) && _match_width <= _allowed_slop) { + return true; + } + } + return false; +} + +bool OrderedSloppyPhraseMatcher::stretch_to_order(PostingsAndPosition* prev_posting) { + _match_width = 0; + for (size_t i = 1; i < _postings.size(); i++) { + PostingsAndPosition& posting = _postings[i]; + if (!advance_position(posting, prev_posting->_pos + 1)) { + return false; + } + _match_width += (posting._pos - (prev_posting->_pos + 1)); + prev_posting = &posting; + } + return true; +} + PhraseQuery::PhraseQuery(const std::shared_ptr& searcher, const TQueryOptions& query_options) - : _searcher(searcher), _query(std::make_unique()) {} + : _searcher(searcher) {} PhraseQuery::~PhraseQuery() { for (auto& term_doc : _term_docs) { @@ -44,16 +140,20 @@ void PhraseQuery::add(const InvertedIndexQueryInfo& query_info) { } _slop = query_info.slop; - if (_slop <= 0) { + if (_slop == 0 || query_info.ordered) { + // Logic for no slop query and 
ordered phrase query add(query_info.field_name, query_info.terms); } else { + // Simple slop query follows the default phrase query algorithm + auto query = std::make_unique(); for (const auto& term : query_info.terms) { std::wstring ws_term = StringUtil::string_to_wstring(term); auto* t = _CLNEW lucene::index::Term(query_info.field_name.c_str(), ws_term.c_str()); - _query->add(t); + query->add(t); _CLDECDELETE(t); } - _query->setSlop(_slop); + query->setSlop(_slop); + _matcher = std::move(query); } } @@ -73,14 +173,33 @@ void PhraseQuery::add(const std::wstring& field_name, const std::vector iterators; - for (size_t i = 0; i < terms.size(); i++) { - std::wstring ws_term = StringUtil::string_to_wstring(terms[i]); + auto ensureTermPosition = [this, &iterators, &field_name](const std::string& term) { + std::wstring ws_term = StringUtil::string_to_wstring(term); Term* t = _CLNEW Term(field_name.c_str(), ws_term.c_str()); _terms.push_back(t); TermPositions* term_pos = _searcher->getReader()->termPositions(t); _term_docs.push_back(term_pos); iterators.emplace_back(term_pos); - _postings.emplace_back(term_pos, i); + return term_pos; + }; + + if (_slop == 0) { + ExactPhraseMatcher matcher; + for (size_t i = 0; i < terms.size(); i++) { + const auto& term = terms[i]; + auto* term_pos = ensureTermPosition(term); + matcher._postings.emplace_back(term_pos, i); + } + _matcher = matcher; + } else { + OrderedSloppyPhraseMatcher matcher; + for (size_t i = 0; i < terms.size(); i++) { + const auto& term = terms[i]; + auto* term_pos = ensureTermPosition(term); + matcher._postings.emplace_back(term_pos, i); + } + matcher._allowed_slop = _slop; + _matcher = matcher; } std::sort(iterators.begin(), iterators.end(), [](const TermIterator& a, const TermIterator& b) { @@ -89,13 +208,17 @@ void PhraseQuery::add(const std::wstring& field_name, const std::vector(_matcher)) { + _searcher->_search( + std::get(_matcher).get(), + [&roaring](const int32_t docid, const float_t /*score*/) { 
roaring.add(docid); }); + } else { if (_lead1.isEmpty()) { return; } @@ -104,10 +227,6 @@ void PhraseQuery::search(roaring::Roaring& roaring) { return; } search_by_skiplist(roaring); - } else { - _searcher->_search(_query.get(), [&roaring](const int32_t docid, const float_t /*score*/) { - roaring.add(docid); - }); } } @@ -125,8 +244,7 @@ void PhraseQuery::search_by_bitmap(roaring::Roaring& roaring) { void PhraseQuery::search_by_skiplist(roaring::Roaring& roaring) { int32_t doc = 0; while ((doc = do_next(_lead1.nextDoc())) != INT32_MAX) { - reset(); - if (next_match()) { + if (matches(doc)) { roaring.add(doc); } } @@ -169,67 +287,21 @@ int32_t PhraseQuery::do_next(int32_t doc) { } } -bool PhraseQuery::next_match() { - PostingsAndPosition& lead = _postings[0]; - if (lead._upTo < lead._freq) { - lead._pos = lead._postings.nextPosition(); - lead._upTo += 1; - } else { - return false; - } - - while (true) { - int32_t phrasePos = lead._pos - lead._offset; - - bool advance_head = false; - for (size_t j = 1; j < _postings.size(); ++j) { - PostingsAndPosition& posting = _postings[j]; - int32_t expectedPos = phrasePos + posting._offset; - // advance up to the same position as the lead - if (!advance_position(posting, expectedPos)) { - return false; - } - - if (posting._pos != expectedPos) { // we advanced too far - if (advance_position(lead, posting._pos - posting._offset + lead._offset)) { - advance_head = true; - break; +bool PhraseQuery::matches(int32_t doc) { + return std::visit( + [&doc](auto&& m) -> bool { + using T = std::decay_t; + if constexpr (std::is_same_v) { + _CLTHROWA(CL_ERR_IllegalArgument, + "PhraseQueryPtr does not support matches function"); } else { - return false; + return m.matches(doc); } - } - } - if (advance_head) { - continue; - } - - return true; - } - - return false; -} - -bool PhraseQuery::advance_position(PostingsAndPosition& posting, int32_t target) { - while (posting._pos < target) { - if (posting._upTo == posting._freq) { - return false; - } 
else { - posting._pos = posting._postings.nextPosition(); - posting._upTo += 1; - } - } - return true; -} - -void PhraseQuery::reset() { - for (PostingsAndPosition& posting : _postings) { - posting._freq = posting._postings.freq(); - posting._pos = -1; - posting._upTo = 0; - } + }, + _matcher); } -Status PhraseQuery::parser_slop(std::string& query, InvertedIndexQueryInfo& query_info) { +void PhraseQuery::parser_slop(std::string& query, InvertedIndexQueryInfo& query_info) { auto is_digits = [](const std::string_view& str) { return std::all_of(str.begin(), str.end(), [](unsigned char c) { return std::isdigit(c); }); }; @@ -240,17 +312,38 @@ Status PhraseQuery::parser_slop(std::string& query, InvertedIndexQueryInfo& quer if (tilde_pos < query.size() - 1 && query[tilde_pos] == '~') { size_t slop_pos = tilde_pos + 1; std::string_view slop_str(query.data() + slop_pos, query.size() - slop_pos); - if (is_digits(slop_str)) { - auto result = std::from_chars(slop_str.begin(), slop_str.end(), query_info.slop); - if (result.ec != std::errc()) { - return Status::Error( - "PhraseQuery parser failed: {}", query); + do { + if (slop_str.empty()) { + break; } - query = query.substr(0, last_space_pos); - } + + bool ordered = false; + if (slop_str.size() == 1) { + if (!std::isdigit(slop_str[0])) { + break; + } + } else { + if (slop_str.back() == '+') { + ordered = true; + slop_str.remove_suffix(1); + } + } + + if (is_digits(slop_str)) { + auto result = + std::from_chars(slop_str.begin(), slop_str.end(), query_info.slop); + if (result.ec != std::errc()) { + break; + } + query_info.ordered = ordered; + query = query.substr(0, last_space_pos); + } + } while (false); } } - return Status::OK(); } +template class PhraseMatcherBase; +template class PhraseMatcherBase; + } // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h index 
41b5f2d2e9706f..253ba782b78181 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h @@ -22,11 +22,66 @@ #include "CLucene/search/PhraseQuery.h" // clang-format on +#include + CL_NS_USE(index) CL_NS_USE(search) namespace doris::segment_v2 { +class PostingsAndPosition { +public: + PostingsAndPosition(const TermPositionIterator& postings, int32_t offset) + : _postings(postings), _offset(offset) {} + + TermPositionIterator _postings; + int32_t _offset = 0; + int32_t _freq = 0; + int32_t _upTo = 0; + int32_t _pos = 0; +}; + +template +class PhraseMatcherBase { +public: + // Handle position information for different types of phrase queries + bool matches(int32_t doc); + +private: + void reset(int32_t doc); + +protected: + bool advance_position(PostingsAndPosition& posting, int32_t target); + +public: + std::vector _postings; +}; + +class ExactPhraseMatcher : public PhraseMatcherBase { +public: + bool next_match(); +}; + +class OrderedSloppyPhraseMatcher : public PhraseMatcherBase { +public: + bool next_match(); + +private: + bool stretch_to_order(PostingsAndPosition* prev_posting); + +public: + int32_t _allowed_slop = 0; + +private: + int32_t _match_width = -1; +}; + +using PhraseQueryPtr = std::unique_ptr; +// ExactPhraseMatcher: x match_phrase 'aaa bbb' +// PhraseQueryPtr: x match_phrase 'aaa bbb ~2', support slop +// OrderedSloppyPhraseMatcher: x match_phrase 'aaa bbb ~2+', ensuring that the words appear in the specified order. 
+using Matcher = std::variant; + class PhraseQuery : public Query { public: PhraseQuery(const std::shared_ptr& searcher, @@ -38,28 +93,16 @@ class PhraseQuery : public Query { void search(roaring::Roaring& roaring) override; private: - class PostingsAndPosition { - public: - PostingsAndPosition(const TermPositionIterator& postings, int32_t offset) - : _postings(postings), _offset(offset) {} - - TermPositionIterator _postings; - int32_t _offset = 0; - int32_t _freq = 0; - int32_t _upTo = 0; - int32_t _pos = 0; - }; - + // Use bitmap for merging inverted lists void search_by_bitmap(roaring::Roaring& roaring); + // Use skiplist for merging inverted lists void search_by_skiplist(roaring::Roaring& roaring); int32_t do_next(int32_t doc); - bool next_match(); - bool advance_position(PostingsAndPosition& posting, int32_t target); - void reset(); + bool matches(int32_t doc); public: - static Status parser_slop(std::string& query, InvertedIndexQueryInfo& query_info); + static void parser_slop(std::string& query, InvertedIndexQueryInfo& query_info); private: std::shared_ptr _searcher; @@ -73,8 +116,8 @@ class PhraseQuery : public Query { std::vector _terms; std::vector _term_docs; - std::unique_ptr _query; int32_t _slop = 0; + Matcher _matcher; }; } // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/query.h index 011229aa667615..cef7fd51f72b58 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/query.h @@ -39,6 +39,7 @@ struct InvertedIndexQueryInfo { std::wstring field_name; std::vector terms; int32_t slop = 0; + bool ordered = false; }; class Query { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 3639bff05c4ce4..95fad8f4ac794a 100644 --- 
a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -323,7 +323,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run query_info.terms.emplace_back(search_str); } else { if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) { - RETURN_IF_ERROR(PhraseQuery::parser_slop(search_str, query_info)); + PhraseQuery::parser_slop(search_str, query_info); } InvertedIndexCtxSPtr inverted_index_ctx = std::make_shared( @@ -364,6 +364,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run std::string str_tokens = join(query_info.terms, " "); if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) { str_tokens += " " + std::to_string(query_info.slop); + str_tokens += " " + std::to_string(query_info.ordered); } cache_key = {index_file_key, column_name, query_type, str_tokens}; } diff --git a/regression-test/data/inverted_index_p0/test_index_match_phrase_ordered.out b/regression-test/data/inverted_index_p0/test_index_match_phrase_ordered.out new file mode 100644 index 00000000000000..d1e04ececd5ea7 --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_index_match_phrase_ordered.out @@ -0,0 +1,67 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !sql -- +11 + +-- !sql -- +11 + +-- !sql -- +11 + +-- !sql -- +0 + +-- !sql -- +11 + +-- !sql -- +2 + +-- !sql -- +2 + +-- !sql -- +2 + +-- !sql -- +7 + +-- !sql -- +7 + +-- !sql -- +7 + +-- !sql -- +7 + +-- !sql -- +7 + +-- !sql -- +7 + +-- !sql -- +11 + +-- !sql -- +7 + +-- !sql -- +11 + +-- !sql -- +7 + +-- !sql -- +11 + +-- !sql -- +7 + +-- !sql -- +11 + +-- !sql -- +7 + diff --git a/regression-test/suites/inverted_index_p0/test_index_match_phrase_ordered.groovy b/regression-test/suites/inverted_index_p0/test_index_match_phrase_ordered.groovy new file mode 100644 index 00000000000000..137bab70f051de --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_index_match_phrase_ordered.groovy @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ + +suite("test_index_match_phrase_ordered", "p0"){ + def indexTbName1 = "test_index_match_phrase_ordered" + + sql "DROP TABLE IF EXISTS ${indexTbName1}" + + sql """ + CREATE TABLE ${indexTbName1} ( + `a` int(11) NULL COMMENT "", + `b` string NULL COMMENT "", + INDEX b_idx (`b`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`a`) + COMMENT "OLAP" + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + + sql """ INSERT INTO ${indexTbName1} VALUES (1, "the quick brown fox jumped over the lazy dog"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (2, "the quick brown fox jumped over the lazy dog over"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (3, "the quick brown fox jumped over the lazy dog jumped"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (4, "the quick brown fox jumped over the lazy dog fox"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (5, "the quick brown fox jumped over the lazy dog brown"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (6, "the quick brown fox jumped over the lazy dog quick"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (7, "quick brown fox jumped over the lazy dog over"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (8, "quick brown fox jumped over the lazy dog jumped"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (9, "quick brown fox jumped over the lazy dog fox"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (10, "quick brown fox jumped over the lazy dog brown"); """ + sql """ INSERT INTO ${indexTbName1} VALUES (11, "quick brown fox jumped over the lazy dog quick"); """ + + try { + sql "sync" + + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the lazy'; """ + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the lazy ~1'; """ + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the lazy ~1+'; """ + qt_sql 
""" select count() from ${indexTbName1} where b match_phrase 'the lazy ~1+ '; """ + + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the over ~2'; """ + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the over ~2+'; """ + + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the jumped ~2'; """ + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the jumped ~2+'; """ + + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the fox ~2'; """ + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the fox ~2+'; """ + + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the brown ~2'; """ + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the brown ~2+'; """ + + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the quick ~2'; """ + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the quick ~2+'; """ + + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the jumped ~3'; """ + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the jumped ~3+'; """ + + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the fox ~4'; """ + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the fox ~4+'; """ + + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the brown ~5'; """ + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the brown ~5+'; """ + + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the quick ~6'; """ + qt_sql """ select count() from ${indexTbName1} where b match_phrase 'the quick ~6+'; """ + } finally { + //try_sql("DROP TABLE IF EXISTS ${testTable}") + } +} \ No newline at end of file