Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FTS speedup #7259

Merged
merged 3 commits into from
Jan 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 45 additions & 12 deletions src/tribler/core/components/metadata_store/db/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,7 +579,7 @@ def torrent_exists_in_personal_channel(self, infohash):
)

# pylint: disable=unused-argument
def search_keyword(self, query):
def search_keyword(self, query, origin_id=None):
# Requires FTS5 table "FtsIndex" to be generated and populated.
# FTS table is maintained automatically by SQL triggers.
# BM25 ranking is embedded in FTS5.
Expand All @@ -588,16 +588,44 @@ def search_keyword(self, query):
if not query or query == "*":
return []

fts_ids = raw_sql("""
SELECT fts.rowid
FROM (
SELECT rowid FROM FtsIndex WHERE FtsIndex MATCH $query ORDER BY rowid DESC LIMIT 10000
) fts
LEFT JOIN ChannelNode cn on fts.rowid = cn.rowid
LEFT JOIN main.TorrentState ts on cn.health = ts.rowid
ORDER BY coalesce(ts.seeders, 0) DESC, fts.rowid DESC
LIMIT 1000
""")
if origin_id is not None:
# When filtering a specific channel folder, we want to return all matching results
fts_ids = raw_sql("""
SELECT rowid FROM ChannelNode
WHERE origin_id = $origin_id
AND rowid IN (SELECT rowid FROM FtsIndex WHERE FtsIndex MATCH $query)
""")
else:
# When searching through the entire database, some text queries can match hundreds of thousands of
# torrents. Ranking that many torrents may be very expensive: we need to retrieve the info and
# the state of each matching torrent from the database for proper ordering.
# They are scattered randomly through the entire database file, so fetching all these torrents is slow.
# Also, the torrent_rank function used inside the final ORDER BY section is written in Python. It is about
# 30 times slower than a possible similar function written in C due to SQLite-Python communication cost.
#
# To speed up the query, we limit and filter search results in several iterations, and each time apply
# a more expensive ranking algorithm:
# * First, we quickly fetch at most 10000 of the most recent torrents that match the search criteria
# and ignore older torrents. This way, we avoid sorting all hundreds of thousands of matching torrents
# in degenerate cases. In typical cases, when the text query is specific enough, the number of
# matching torrents is not that big.
# * Then, we sort these 10000 torrents to prioritize torrents with seeders and restrict the number
# of torrents to just 1000.
# * Finally, in the main query, we apply a slow ranking function to these 1000 torrents to show the most
# relevant torrents at the top of the search result list.
#
# This multistep sort+limit sequence can speed up queries by up to two orders of magnitude. To further
# speed up full-text search queries, we could rewrite the torrent_rank function in C one day.
fts_ids = raw_sql("""
SELECT fts.rowid
FROM (
SELECT rowid FROM FtsIndex WHERE FtsIndex MATCH $query ORDER BY rowid DESC LIMIT 10000
) fts
LEFT JOIN ChannelNode cn on fts.rowid = cn.rowid
LEFT JOIN main.TorrentState ts on cn.health = ts.rowid
ORDER BY coalesce(ts.seeders, 0) DESC, fts.rowid DESC
LIMIT 1000
""")
return left_join(g for g in self.MetadataNode if g.rowid in fts_ids) # pylint: disable=E1135

@db_session
Expand Down Expand Up @@ -633,7 +661,12 @@ def get_entries_query(

if cls is None:
cls = self.ChannelNode
pony_query = self.search_keyword(txt_filter) if txt_filter else left_join(g for g in cls)

if txt_filter:
pony_query = self.search_keyword(txt_filter, origin_id=origin_id)
else:
pony_query = left_join(g for g in cls)

infohash_set = infohash_set or ({infohash} if infohash else None)
if popular:
if metadata_type != REGULAR_TORRENT:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,21 +147,21 @@ async def test_search_with_space(rest_api, metadata_store):
metadata_store.TorrentMetadata(title='abc defxyz', infohash=random_infohash())

s1 = to_fts_query("abc")
assert s1 == '"abc"*'
assert s1 == '"abc"'

s2 = to_fts_query("abc def")
assert s2 == '"abc" "def"*'
assert s2 == '"abc" "def"'

ss2 = to_fts_query(s2)
assert ss2 == s2

parsed = await do_request(rest_api, f'search?txt_filter={s1}', expected_code=200)
results = {item["name"] for item in parsed["results"]}
assert results == {'abc', 'abc.def', 'abc def', 'abc defxyz', 'abcxyz def'}
assert results == {'abc', 'abc.def', 'abc def', 'abc defxyz'}

parsed = await do_request(rest_api, f'search?txt_filter={s2}', expected_code=200)
results = {item["name"] for item in parsed["results"]}
assert results == {'abc.def', 'abc def', 'abc defxyz'} # but not 'abcxyz def'
assert results == {'abc.def', 'abc def'} # but not 'abcxyz def'


async def test_single_snippet_in_search(rest_api, metadata_store, knowledge_db):
Expand Down
16 changes: 9 additions & 7 deletions src/tribler/core/components/metadata_store/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,16 +86,16 @@ def _add_operation(_obj, _op, _key, _predicate=ResourceType.TAG):


@db_session
def generate_torrent(metadata_store, tags_db, parent):
def generate_torrent(metadata_store, tags_db, parent, title=None):
infohash = random_infohash()

# Give each torrent some health information. For now, we assume all torrents are healthy.
now = int(time.time())
last_check = now - random.randint(3600, 24 * 3600)
category = random.choice(["Video", "Audio", "Documents", "Compressed", "Books", "Science"])
torrent_state = metadata_store.TorrentState(infohash=infohash, seeders=10, last_check=last_check)
metadata_store.TorrentMetadata(title=generate_title(words_count=4), infohash=infohash, origin_id=parent.id_,
health=torrent_state, tags=category)
metadata_store.TorrentMetadata(title=title or generate_title(words_count=4), infohash=infohash,
origin_id=parent.id_, health=torrent_state, tags=category)

tag_torrent(infohash, tags_db)

Expand All @@ -114,7 +114,7 @@ def generate_channel(metadata_store: MetadataStore, tags_db: KnowledgeDatabase,

metadata_store.ChannelNode._my_key = default_eccrypto.generate_key('low')
chan = metadata_store.ChannelMetadata(
title=generate_title(words_count=5), subscribed=subscribed, infohash=random_infohash()
title=title or generate_title(words_count=5), subscribed=subscribed, infohash=random_infohash()
)

# add some collections to the channel
Expand All @@ -131,13 +131,15 @@ def generate_test_channels(metadata_store, tags_db) -> None:
generate_channel(metadata_store, tags_db, subscribed=ind % 2 == 0)

# This one is necessary to test filters, etc
generate_channel(metadata_store, tags_db, title="non-random channel name")
generate_channel(metadata_store, tags_db, title="nonrandom unsubscribed channel name")

# The same, but subscribed
generate_channel(metadata_store, tags_db, title="non-random subscribed channel name", subscribed=True)
generate_channel(metadata_store, tags_db, title="nonrandom subscribed channel name", subscribed=True)

# Now generate a couple of personal channels
chan1 = metadata_store.ChannelMetadata.create_channel(title="personal channel with non-random name")
chan1 = metadata_store.ChannelMetadata.create_channel(title="personal channel with nonrandom name")
generate_torrent(metadata_store, tags_db, chan1, title='Some torrent with nonrandom name')
generate_torrent(metadata_store, tags_db, chan1, title='Another torrent with nonrandom name')

with open(PNG_FILE, "rb") as f:
pic_bytes = f.read()
Expand Down
6 changes: 3 additions & 3 deletions src/tribler/core/utilities/tests/test_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,9 +135,9 @@ def test_to_fts_query():
assert to_fts_query(None) is None
assert to_fts_query('') is None
assert to_fts_query(' ') is None
assert to_fts_query(' abc') == '"abc"*'
assert to_fts_query('abc def') == '"abc" "def"*'
assert to_fts_query('[abc, def]: xyz?!') == '"abc" "def" "xyz"*'
assert to_fts_query(' abc') == '"abc"'
assert to_fts_query('abc def') == '"abc" "def"'
assert to_fts_query('[abc, def]: xyz?!') == '"abc" "def" "xyz"'


def test_extract_tags():
Expand Down
2 changes: 1 addition & 1 deletion src/tribler/core/utilities/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ def to_fts_query(text):
if not words:
return None

return ' '.join(words) + '*'
return ' '.join(words)


def show_system_popup(title, text):
Expand Down
16 changes: 10 additions & 6 deletions src/tribler/gui/tests/test_gui.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,17 +217,21 @@ def tst_channels_widget(window, widget, widget_name, sort_column=1, test_filter=
widget.content_table.sortByColumn(sort_column, 1)
wait_for_list_populated(widget.content_table)
screenshot(window, name=f"{widget_name}-sorted")
max_items = min(widget.content_table.model().channel_info["total"], 50)
assert widget.content_table.verticalHeader().count() <= max_items
total = widget.content_table.model().channel_info.get("total")
if total is not None:
max_items = min(total, 50)
assert widget.content_table.verticalHeader().count() <= max_items

# Filter
if test_filter:
old_num_items = widget.content_table.verticalHeader().count()
QTest.keyClick(widget.channel_torrents_filter_input, 'r')
widget.channel_torrents_filter_input.setText("nonrandom")
widget.controller.on_filter_input_return_pressed()
wait_for_list_populated(widget.content_table)
screenshot(window, name=f"{widget_name}-filtered")
assert widget.content_table.verticalHeader().count() <= old_num_items
QTest.keyPress(widget.channel_torrents_filter_input, Qt.Key_Backspace)
widget.channel_torrents_filter_input.setText("")
widget.controller.on_filter_input_return_pressed()
wait_for_list_populated(widget.content_table)

if test_subscribe:
Expand Down Expand Up @@ -380,14 +384,14 @@ def test_download_details(window):
@pytest.mark.guitest
def test_search_suggestions(window):
QTest.keyClick(window.top_search_bar, 't')
QTest.keyClick(window.top_search_bar, 'r')
QTest.keyClick(window.top_search_bar, 'o')
wait_for_signal(window.received_search_completions)
screenshot(window, name="search_suggestions")


@pytest.mark.guitest
def test_search(window):
window.top_search_bar.setText("a") # This is likely to trigger some search results
window.top_search_bar.setText("torrent") # This is likely to trigger some search results
QTest.keyClick(window.top_search_bar, Qt.Key_Enter)
QTest.qWait(100)
screenshot(window, name="search_loading_page")
Expand Down
8 changes: 8 additions & 0 deletions src/tribler/gui/widgets/tablecontentmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,14 @@ def perform_query(self, **kwargs):
txt_filter = to_fts_query(self.text_filter)
if txt_filter:
kwargs.update({"txt_filter": txt_filter})
# Global full-text search queries should not request the total number of rows for several reasons:
# * The total number of rows is useful for paginated queries, and FTS queries in Tribler are not paginated.
# * Our goal is to display the most relevant results for the user at the top of the search result list.
# The user doesn't need to see that the database has exactly 300001 results for the "MP3" search.
# In other words, we should search like Google, not Altavista.
# * The result list also integrates the results from remote peers that are not from the local database.
if 'origin_id' not in kwargs:
kwargs.pop("include_total", None)

if self.max_rowid is not None:
kwargs["max_rowid"] = self.max_rowid
Expand Down
4 changes: 2 additions & 2 deletions src/tribler/gui/widgets/triblertablecontrollers.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def __init__(self, table_view, *args, filter_input=None, **kwargs):

self.filter_input = filter_input
if self.filter_input:
connect(self.filter_input.textChanged, self._on_filter_input_change)
connect(self.filter_input.returnPressed, self.on_filter_input_return_pressed)

def set_model(self, model):
self.model = model
Expand Down Expand Up @@ -71,7 +71,7 @@ def _get_sort_parameters(self):
sort_asc = self.table_view.horizontalHeader().sortIndicatorOrder()
return sort_by, sort_asc

def _on_filter_input_change(self, _):
def on_filter_input_return_pressed(self):
self.model.text_filter = self.filter_input.text().lower()
self.model.reset()

Expand Down