Skip to content

Commit

Permalink
Search: use multi-fields for Wildcard queries (#7613)
Browse files Browse the repository at this point in the history
Wildcard queries are slow (on .org it returns 502, on .com since the db
is small it works just fine).

https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl.html#query-dsl-allow-expensive-queries

These type of queries can be optimized by using the Wildcard field
https://www.elastic.co/guide/en/elasticsearch/reference/current/keyword.html#wildcard-field-type.

Currently that field isn't implemented in the ES dependencies,
but is just a subclass of the Keyword field (I'll see if I can send a PR
upstream).

As we still want to make use of the SimpleString queries,
I'm using the multi-fields feature.

This change is editing the index,
so we need to rebuild the index and re-index.

To test it locally:

- `inv docker.manage 'search_index --rebuild'`.
- `inv docker.manage reindex_elasticsearch`.
- Enable the `DEFAULT_TO_FUZZY_SEARCH` feature flag on a project.

I'm not sure how to write tests for this one,
and we can't test if this is really fast in production...
But testing it locally works and gives better results for both, sections and
domains!
  • Loading branch information
stsewd authored Jun 9, 2021
1 parent 75955e1 commit b523a78
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 38 deletions.
79 changes: 70 additions & 9 deletions readthedocs/search/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from django.conf import settings
from django_elasticsearch_dsl import Document, Index, fields
from elasticsearch import Elasticsearch
from elasticsearch_dsl.field import Keyword

from readthedocs.projects.models import HTMLFile, Project

Expand All @@ -17,6 +18,12 @@
log = logging.getLogger(__name__)


# TODO: send this upstream (elasticsearch_dsl and django_elasticsearch_dsl).
class WildcardField(Keyword, fields.DEDField):

name = 'wildcard'


class RTDDocTypeMixin:

def update(self, *args, **kwargs):
Expand All @@ -31,6 +38,13 @@ def update(self, *args, **kwargs):
@project_index.document
class ProjectDocument(RTDDocTypeMixin, Document):

"""
Document representation of a Project.
We use multi-fields to be able to perform other kind of queries over the same field.
``raw`` fields are used for Wildcard queries.
"""

# Metadata
url = fields.TextField(attr='get_absolute_url')
users = fields.NestedField(
Expand All @@ -41,11 +55,30 @@ class ProjectDocument(RTDDocTypeMixin, Document):
)
language = fields.KeywordField()

name = fields.TextField(
attr='name',
fields={
'raw': WildcardField(),
},
)
slug = fields.TextField(
attr='slug',
fields={
'raw': WildcardField(),
},
)
description = fields.TextField(
attr='description',
fields={
'raw': WildcardField(),
},
)

modified_model_field = 'modified_date'

class Django:
model = Project
fields = ('name', 'slug', 'description')
fields = []
ignore_signals = True


Expand All @@ -61,6 +94,11 @@ class PageDocument(RTDDocTypeMixin, Document):
instead of [python.submodule].
See more at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-analyzers.html # noqa
We use multi-fields to be able to perform other kind of queries over the same field.
``raw`` fields are used for Wildcard queries.
https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-analyzers.html
Some text fields use the ``with_positions_offsets`` term vector,
this is to have faster highlighting on big documents.
See more at https://www.elastic.co/guide/en/elasticsearch/reference/7.9/term-vector.html
Expand All @@ -75,13 +113,27 @@ class PageDocument(RTDDocTypeMixin, Document):
rank = fields.IntegerField()

# Searchable content
title = fields.TextField(attr='processed_json.title')
title = fields.TextField(
attr='processed_json.title',
fields={
'raw': WildcardField(),
},
)
sections = fields.NestedField(
attr='processed_json.sections',
properties={
'id': fields.KeywordField(),
'title': fields.TextField(),
'content': fields.TextField(term_vector='with_positions_offsets'),
'title': fields.TextField(
fields={
'raw': WildcardField(),
},
),
'content': fields.TextField(
term_vector='with_positions_offsets',
fields={
'raw': WildcardField(),
},
),
}
)
domains = fields.NestedField(
Expand All @@ -93,11 +145,20 @@ class PageDocument(RTDDocTypeMixin, Document):

# For showing in the search result
'type_display': fields.TextField(),
'docstrings': fields.TextField(term_vector='with_positions_offsets'),

# Simple analyzer breaks on `.`,
# otherwise search results are too strict for this use case
'name': fields.TextField(analyzer='simple'),
'docstrings': fields.TextField(
term_vector='with_positions_offsets',
fields={
'raw': WildcardField(),
},
),
'name': fields.TextField(
# Simple analyzer breaks on `.`,
# otherwise search results are too strict for this use case
analyzer='simple',
fields={
'raw': WildcardField(),
},
),
}
)

Expand Down
48 changes: 32 additions & 16 deletions readthedocs/search/faceted_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,21 +87,13 @@ def _get_queries(self, *, query, fields):
"""
Get a list of query objects according to the query.
If the query is a *single term* (a single word)
we try to match partial words and substrings
(available only with the DEFAULT_TO_FUZZY_SEARCH feature flag).
If the query is a phrase or contains the syntax from a simple query string,
we use the SimpleQueryString query.
If the query is a single term we try to match partial words and substrings
(available only with the DEFAULT_TO_FUZZY_SEARCH feature flag),
otherwise we use the SimpleQueryString query.
"""
is_single_term = (
not self.use_advanced_query and
query and len(query.split()) <= 1 and
not self._is_advanced_query(query)
)
get_queries_function = (
self._get_single_term_queries
if is_single_term
if self._is_single_term(query)
else self._get_text_queries
)

Expand Down Expand Up @@ -150,6 +142,7 @@ def _get_single_term_queries(self, query, fields):
The score of "and" should be higher as it satisfies both "or" and "and".
We use the Wildcard query with the query surrounded by ``*`` to match substrings.
We use the raw fields (Wildcard fields) instead of the normal field for performance.
For valid options, see:
Expand All @@ -164,8 +157,9 @@ def _get_single_term_queries(self, query, fields):
)
queries.append(query_string)
for field in fields:
# Remove boosting from the field
field = re.sub(r'\^.*$', '', field)
# Remove boosting from the field,
# and query from the raw field.
field = re.sub(r'\^.*$', '.raw', field)
kwargs = {
field: {'value': f'*{query}*'},
}
Expand All @@ -188,6 +182,21 @@ def _get_fuzzy_query(self, *, query, fields, operator):
prefix_length=1,
)

def _is_single_term(self, query):
"""
Check if the query is a single term.
A query is a single term if it is a single word,
if it doesn't contain the syntax from a simple query string,
and if `self.use_advanced_query` is False.
"""
is_single_term = (
not self.use_advanced_query and
query and len(query.split()) <= 1 and
not self._is_advanced_query(query)
)
return is_single_term

def _is_advanced_query(self, query):
"""
Check if query looks like to be using the syntax from a simple query string.
Expand Down Expand Up @@ -333,11 +342,18 @@ def _get_nested_query(self, *, query, path, fields):
fields=fields,
)

raw_fields = (
raw_fields = [
# Remove boosting from the field
re.sub(r'\^.*$', '', field)
for field in fields
)
]

# Highlight from the raw fields too, if it is a single term.
if self._is_single_term(query):
raw_fields.extend([
re.sub(r'\^.*$', '.raw', field)
for field in fields
])

highlight = dict(
self._highlight_options,
Expand Down
40 changes: 28 additions & 12 deletions readthedocs/search/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,28 @@
VersionData = namedtuple('VersionData', ['slug', 'docs_url'])


def get_raw_field(obj, field, default=None):
"""Get the ``raw`` version of this field or fallback to the original field."""
return (
getattr(obj, f'{field}.raw', default)
or getattr(obj, field, default)
)


class ProjectHighlightSerializer(serializers.Serializer):

name = serializers.ListField(child=serializers.CharField(), default=list)
slug = serializers.ListField(child=serializers.CharField(), default=list)
description = serializers.ListField(child=serializers.CharField(), default=list)
name = serializers.SerializerMethodField()
slug = serializers.SerializerMethodField()
description = serializers.SerializerMethodField()

def get_name(self, obj):
return list(get_raw_field(obj, 'name', []))

def get_slug(self, obj):
return list(get_raw_field(obj, 'slug', []))

def get_description(self, obj):
return list(get_raw_field(obj, 'description', []))


class ProjectSearchSerializer(serializers.Serializer):
Expand All @@ -44,7 +61,10 @@ class ProjectSearchSerializer(serializers.Serializer):

class PageHighlightSerializer(serializers.Serializer):

title = serializers.ListField(child=serializers.CharField(), default=list)
title = serializers.SerializerMethodField()

def get_title(self, obj):
return list(get_raw_field(obj, 'title', []))


class PageSearchSerializer(serializers.Serializer):
Expand Down Expand Up @@ -166,12 +186,10 @@ class DomainHighlightSerializer(serializers.Serializer):
content = serializers.SerializerMethodField()

def get_name(self, obj):
name = getattr(obj, 'domains.name', [])
return list(name)
return list(get_raw_field(obj, 'domains.name', []))

def get_content(self, obj):
docstring = getattr(obj, 'domains.docstrings', [])
return list(docstring)
return list(get_raw_field(obj, 'domains.docstrings', []))


class DomainSearchSerializer(serializers.Serializer):
Expand All @@ -190,12 +208,10 @@ class SectionHighlightSerializer(serializers.Serializer):
content = serializers.SerializerMethodField()

def get_title(self, obj):
title = getattr(obj, 'sections.title', [])
return list(title)
return list(get_raw_field(obj, 'sections.title', []))

def get_content(self, obj):
content = getattr(obj, 'sections.content', [])
return list(content)
return list(get_raw_field(obj, 'sections.content', []))


class SectionSearchSerializer(serializers.Serializer):
Expand Down
7 changes: 6 additions & 1 deletion readthedocs/search/tests/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -605,7 +605,12 @@ def test_search_single_query(self, api_client):

results = resp.data['results']
assert len(results) > 0
assert 'Index' in results[0]['title']
assert 'Support' in results[0]['title']
# find is more closer than index, so is listed first.
highlights = results[0]['blocks'][0]['highlights']
assert '<span>find</span>' in highlights['content'][0]

assert 'Index' in results[1]['title']

# Query with a partial word, but we want to match that
search_params = {
Expand Down

0 comments on commit b523a78

Please sign in to comment.