Merge pull request #7259 from kozlovsky/fts_speedup
FTS speedup
kozlovsky authored Jan 18, 2023
2 parents 63de413 + 6abf27f commit 5304864
Showing 8 changed files with 82 additions and 35 deletions.
57 changes: 45 additions & 12 deletions src/tribler/core/components/metadata_store/db/store.py
@@ -579,7 +579,7 @@ def torrent_exists_in_personal_channel(self, infohash):
)

# pylint: disable=unused-argument
def search_keyword(self, query):
def search_keyword(self, query, origin_id=None):
# Requires FTS5 table "FtsIndex" to be generated and populated.
# FTS table is maintained automatically by SQL triggers.
# BM25 ranking is embedded in FTS5.
@@ -588,16 +588,44 @@ def search_keyword(self, query):
if not query or query == "*":
return []

fts_ids = raw_sql("""
SELECT fts.rowid
FROM (
SELECT rowid FROM FtsIndex WHERE FtsIndex MATCH $query ORDER BY rowid DESC LIMIT 10000
) fts
LEFT JOIN ChannelNode cn on fts.rowid = cn.rowid
LEFT JOIN main.TorrentState ts on cn.health = ts.rowid
ORDER BY coalesce(ts.seeders, 0) DESC, fts.rowid DESC
LIMIT 1000
""")
if origin_id is not None:
# When filtering a specific channel folder, we want to return all matching results
fts_ids = raw_sql("""
SELECT rowid FROM ChannelNode
WHERE origin_id = $origin_id
AND rowid IN (SELECT rowid FROM FtsIndex WHERE FtsIndex MATCH $query)
""")
else:
# When searching the whole database for some text queries, there can be hundreds of thousands of
# matching torrents. Ranking that many torrents may be very expensive: proper ordering requires
# retrieving each matching torrent's info and torrent state from the database.
# These records are scattered randomly through the entire database file, so fetching them all is slow.
# Also, the torrent_rank function used inside the final ORDER BY section is written in Python. It is
# about 30 times slower than a similar function written in C would be, due to the SQLite-Python
# communication cost.
#
# To speed up the query, we limit and filter search results in several iterations, each time applying
# a more expensive ranking algorithm:
# * First, we quickly fetch at most 10000 of the most recent torrents that match the search criteria
# and ignore older torrents. This way, we avoid sorting hundreds of thousands of matching torrents
# in degenerate cases. In typical cases, when the text query is specific enough, the number of
# matching torrents is not that big.
# * Then, we sort these 10000 torrents to prioritize torrents with seeders and restrict the result
# to just 1000 torrents.
# * Finally, in the main query, we apply the slow ranking function to these 1000 torrents to show the
# most relevant torrents at the top of the search result list.
#
# This multistep sort+limit sequence speeds up queries by up to two orders of magnitude. To further
# speed up full-text search queries, we can one day rewrite the torrent_rank function in C.
fts_ids = raw_sql("""
SELECT fts.rowid
FROM (
SELECT rowid FROM FtsIndex WHERE FtsIndex MATCH $query ORDER BY rowid DESC LIMIT 10000
) fts
LEFT JOIN ChannelNode cn on fts.rowid = cn.rowid
LEFT JOIN main.TorrentState ts on cn.health = ts.rowid
ORDER BY coalesce(ts.seeders, 0) DESC, fts.rowid DESC
LIMIT 1000
""")
return left_join(g for g in self.MetadataNode if g.rowid in fts_ids) # pylint: disable=E1135

@db_session
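To make the staged approach above concrete, here is a minimal, self-contained sketch of the same "prune cheaply in SQL first, rank expensively in Python last" pattern. It is not Tribler code: it assumes an SQLite build with FTS5, the table layout only loosely mirrors FtsIndex/ChannelNode/TorrentState, and toy_rank stands in for the real torrent_rank function.

import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript("""
    CREATE VIRTUAL TABLE FtsIndex USING fts5(title);
    CREATE TABLE ChannelNode (title TEXT, seeders INTEGER);
""")

# Populate ~20000 rows; rowids of the two tables are kept in sync, as in the real schema.
rows = [(i, f"ubuntu iso {i}" if i % 3 else f"debian iso {i}", i % 7) for i in range(1, 20001)]
conn.executemany("INSERT INTO ChannelNode(rowid, title, seeders) VALUES (?, ?, ?)", rows)
conn.executemany("INSERT INTO FtsIndex(rowid, title) VALUES (?, ?)", [(i, t) for i, t, _ in rows])

# Stages 1 and 2, entirely in SQL: newest 10000 FTS matches, then the best-seeded 1000 of those.
candidates = conn.execute("""
    SELECT cn.rowid, cn.title, coalesce(cn.seeders, 0)
    FROM (SELECT rowid FROM FtsIndex WHERE FtsIndex MATCH ? ORDER BY rowid DESC LIMIT 10000) fts
    LEFT JOIN ChannelNode cn ON cn.rowid = fts.rowid
    ORDER BY coalesce(cn.seeders, 0) DESC, fts.rowid DESC
    LIMIT 1000
""", ('"ubuntu"',)).fetchall()

# Stage 3: the expensive Python-side ranking now touches at most 1000 rows instead of ~13000 matches.
def toy_rank(title, seeders):
    return seeders + (1.0 if "iso" in title else 0.0)

best = sorted(candidates, key=lambda row: toy_rank(row[1], row[2]), reverse=True)
print(best[:5])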
@@ -633,7 +661,12 @@ def get_entries_query(

if cls is None:
cls = self.ChannelNode
pony_query = self.search_keyword(txt_filter) if txt_filter else left_join(g for g in cls)

if txt_filter:
pony_query = self.search_keyword(txt_filter, origin_id=origin_id)
else:
pony_query = left_join(g for g in cls)

infohash_set = infohash_set or ({infohash} if infohash else None)
if popular:
if metadata_type != REGULAR_TORRENT:
@@ -147,21 +147,21 @@ async def test_search_with_space(rest_api, metadata_store):
metadata_store.TorrentMetadata(title='abc defxyz', infohash=random_infohash())

s1 = to_fts_query("abc")
assert s1 == '"abc"*'
assert s1 == '"abc"'

s2 = to_fts_query("abc def")
assert s2 == '"abc" "def"*'
assert s2 == '"abc" "def"'

ss2 = to_fts_query(s2)
assert ss2 == s2

parsed = await do_request(rest_api, f'search?txt_filter={s1}', expected_code=200)
results = {item["name"] for item in parsed["results"]}
assert results == {'abc', 'abc.def', 'abc def', 'abc defxyz', 'abcxyz def'}
assert results == {'abc', 'abc.def', 'abc def', 'abc defxyz'}

parsed = await do_request(rest_api, f'search?txt_filter={s2}', expected_code=200)
results = {item["name"] for item in parsed["results"]}
assert results == {'abc.def', 'abc def', 'abc defxyz'} # but not 'abcxyz def'
assert results == {'abc.def', 'abc def'} # but not 'abcxyz def'


async def test_single_snippet_in_search(rest_api, metadata_store, knowledge_db):
16 changes: 9 additions & 7 deletions src/tribler/core/components/metadata_store/utils.py
@@ -86,16 +86,16 @@ def _add_operation(_obj, _op, _key, _predicate=ResourceType.TAG):


@db_session
def generate_torrent(metadata_store, tags_db, parent):
def generate_torrent(metadata_store, tags_db, parent, title=None):
infohash = random_infohash()

# Give each torrent some health information. For now, we assume all torrents are healthy.
now = int(time.time())
last_check = now - random.randint(3600, 24 * 3600)
category = random.choice(["Video", "Audio", "Documents", "Compressed", "Books", "Science"])
torrent_state = metadata_store.TorrentState(infohash=infohash, seeders=10, last_check=last_check)
metadata_store.TorrentMetadata(title=generate_title(words_count=4), infohash=infohash, origin_id=parent.id_,
health=torrent_state, tags=category)
metadata_store.TorrentMetadata(title=title or generate_title(words_count=4), infohash=infohash,
origin_id=parent.id_, health=torrent_state, tags=category)

tag_torrent(infohash, tags_db)

@@ -114,7 +114,7 @@ def generate_channel(metadata_store: MetadataStore, tags_db: KnowledgeDatabase,

metadata_store.ChannelNode._my_key = default_eccrypto.generate_key('low')
chan = metadata_store.ChannelMetadata(
title=generate_title(words_count=5), subscribed=subscribed, infohash=random_infohash()
title=title or generate_title(words_count=5), subscribed=subscribed, infohash=random_infohash()
)

# add some collections to the channel
@@ -131,13 +131,15 @@ def generate_test_channels(metadata_store, tags_db) -> None:
generate_channel(metadata_store, tags_db, subscribed=ind % 2 == 0)

# This one is necessary to test filters, etc
generate_channel(metadata_store, tags_db, title="non-random channel name")
generate_channel(metadata_store, tags_db, title="nonrandom unsubscribed channel name")

# The same, but subscribed
generate_channel(metadata_store, tags_db, title="non-random subscribed channel name", subscribed=True)
generate_channel(metadata_store, tags_db, title="nonrandom subscribed channel name", subscribed=True)

# Now generate a couple of personal channels
chan1 = metadata_store.ChannelMetadata.create_channel(title="personal channel with non-random name")
chan1 = metadata_store.ChannelMetadata.create_channel(title="personal channel with nonrandom name")
generate_torrent(metadata_store, tags_db, chan1, title='Some torrent with nonrandom name')
generate_torrent(metadata_store, tags_db, chan1, title='Another torrent with nonrandom name')

with open(PNG_FILE, "rb") as f:
pic_bytes = f.read()
6 changes: 3 additions & 3 deletions src/tribler/core/utilities/tests/test_utilities.py
@@ -135,9 +135,9 @@ def test_to_fts_query():
assert to_fts_query(None) is None
assert to_fts_query('') is None
assert to_fts_query(' ') is None
assert to_fts_query(' abc') == '"abc"*'
assert to_fts_query('abc def') == '"abc" "def"*'
assert to_fts_query('[abc, def]: xyz?!') == '"abc" "def" "xyz"*'
assert to_fts_query(' abc') == '"abc"'
assert to_fts_query('abc def') == '"abc" "def"'
assert to_fts_query('[abc, def]: xyz?!') == '"abc" "def" "xyz"'


def test_extract_tags():
2 changes: 1 addition & 1 deletion src/tribler/core/utilities/utilities.py
@@ -253,7 +253,7 @@ def to_fts_query(text):
if not words:
return None

return ' '.join(words) + '*'
return ' '.join(words)


def show_system_popup(title, text):
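The dropped trailing '*' is what the updated tests above reflect: in SQLite FTS5, '"abc"*' is a prefix query that also matches tokens like "abcxyz", while '"abc"' matches the exact token only. A tiny standalone demo (assuming an SQLite build with FTS5; the table name is made up for the illustration):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE VIRTUAL TABLE demo USING fts5(title)")
conn.executemany("INSERT INTO demo(title) VALUES (?)",
                 [("abc",), ("abc.def",), ("abc def",), ("abc defxyz",), ("abcxyz def",)])

def titles(fts_query):
    return {row[0] for row in conn.execute("SELECT title FROM demo WHERE demo MATCH ?", (fts_query,))}

print(titles('"abc"*'))  # prefix match: includes 'abcxyz def'
print(titles('"abc"'))   # exact token match: excludes 'abcxyz def'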
16 changes: 10 additions & 6 deletions src/tribler/gui/tests/test_gui.py
@@ -217,17 +217,21 @@ def tst_channels_widget(window, widget, widget_name, sort_column=1, test_filter=
widget.content_table.sortByColumn(sort_column, 1)
wait_for_list_populated(widget.content_table)
screenshot(window, name=f"{widget_name}-sorted")
max_items = min(widget.content_table.model().channel_info["total"], 50)
assert widget.content_table.verticalHeader().count() <= max_items
total = widget.content_table.model().channel_info.get("total")
if total is not None:
max_items = min(total, 50)
assert widget.content_table.verticalHeader().count() <= max_items

# Filter
if test_filter:
old_num_items = widget.content_table.verticalHeader().count()
QTest.keyClick(widget.channel_torrents_filter_input, 'r')
widget.channel_torrents_filter_input.setText("nonrandom")
widget.controller.on_filter_input_return_pressed()
wait_for_list_populated(widget.content_table)
screenshot(window, name=f"{widget_name}-filtered")
assert widget.content_table.verticalHeader().count() <= old_num_items
QTest.keyPress(widget.channel_torrents_filter_input, Qt.Key_Backspace)
widget.channel_torrents_filter_input.setText("")
widget.controller.on_filter_input_return_pressed()
wait_for_list_populated(widget.content_table)

if test_subscribe:
@@ -380,14 +384,14 @@ def test_download_details(window):
@pytest.mark.guitest
def test_search_suggestions(window):
QTest.keyClick(window.top_search_bar, 't')
QTest.keyClick(window.top_search_bar, 'r')
QTest.keyClick(window.top_search_bar, 'o')
wait_for_signal(window.received_search_completions)
screenshot(window, name="search_suggestions")


@pytest.mark.guitest
def test_search(window):
window.top_search_bar.setText("a") # This is likely to trigger some search results
window.top_search_bar.setText("torrent") # This is likely to trigger some search results
QTest.keyClick(window.top_search_bar, Qt.Key_Enter)
QTest.qWait(100)
screenshot(window, name="search_loading_page")
8 changes: 8 additions & 0 deletions src/tribler/gui/widgets/tablecontentmodel.py
@@ -343,6 +343,14 @@ def perform_query(self, **kwargs):
txt_filter = to_fts_query(self.text_filter)
if txt_filter:
kwargs.update({"txt_filter": txt_filter})
# Global full-text search queries should not request the total number of rows for several reasons:
# * The total number of rows is useful for paginated queries, and FTS queries in Tribler are not paginated.
# * Our goal is to display the most relevant results for the user at the top of the search result list.
# The user doesn't need to see that the database has exactly 300001 results for the "MP3" search.
# In other words, we should search like Google, not Altavista.
# * The result list also integrates the results from remote peers that are not from the local database.
if 'origin_id' not in kwargs:
kwargs.pop("include_total", None)

if self.max_rowid is not None:
kwargs["max_rowid"] = self.max_rowid
4 changes: 2 additions & 2 deletions src/tribler/gui/widgets/triblertablecontrollers.py
@@ -40,7 +40,7 @@ def __init__(self, table_view, *args, filter_input=None, **kwargs):

self.filter_input = filter_input
if self.filter_input:
connect(self.filter_input.textChanged, self._on_filter_input_change)
connect(self.filter_input.returnPressed, self.on_filter_input_return_pressed)

def set_model(self, model):
self.model = model
@@ -71,7 +71,7 @@ def _get_sort_parameters(self):
sort_asc = self.table_view.horizontalHeader().sortIndicatorOrder()
return sort_by, sort_asc

def _on_filter_input_change(self, _):
def on_filter_input_return_pressed(self):
self.model.text_filter = self.filter_input.text().lower()
self.model.reset()
