diff --git a/readthedocs/search/documents.py b/readthedocs/search/documents.py index b8c05ec9972..f09ee3c7ea4 100644 --- a/readthedocs/search/documents.py +++ b/readthedocs/search/documents.py @@ -3,6 +3,7 @@ from django.conf import settings from django_elasticsearch_dsl import Document, Index, fields from elasticsearch import Elasticsearch +from elasticsearch_dsl.field import Keyword from readthedocs.projects.models import HTMLFile, Project @@ -17,6 +18,12 @@ log = logging.getLogger(__name__) +# TODO: send this upstream (elasticsearch_dsl and django_elasticsearch_dsl). +class WildcardField(Keyword, fields.DEDField): + + name = 'wildcard' + + class RTDDocTypeMixin: def update(self, *args, **kwargs): @@ -31,6 +38,13 @@ def update(self, *args, **kwargs): @project_index.document class ProjectDocument(RTDDocTypeMixin, Document): + """ + Document representation of a Project. + + We use multi-fields to be able to perform other kinds of queries over the same field. + ``raw`` fields are used for Wildcard queries. + """ + # Metadata url = fields.TextField(attr='get_absolute_url') users = fields.NestedField( @@ -41,11 +55,30 @@ class ProjectDocument(RTDDocTypeMixin, Document): ) language = fields.KeywordField() + name = fields.TextField( + attr='name', + fields={ + 'raw': WildcardField(), + }, + ) + slug = fields.TextField( + attr='slug', + fields={ + 'raw': WildcardField(), + }, + ) + description = fields.TextField( + attr='description', + fields={ + 'raw': WildcardField(), + }, + ) + modified_model_field = 'modified_date' class Django: model = Project - fields = ('name', 'slug', 'description') + fields = [] ignore_signals = True @@ -61,6 +94,11 @@ class PageDocument(RTDDocTypeMixin, Document): instead of [python.submodule]. See more at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-analyzers.html # noqa + We use multi-fields to be able to perform other kinds of queries over the same field. + ``raw`` fields are used for Wildcard queries.
+ + https://www.elastic.co/guide/en/elasticsearch/reference/current/multi-fields.html + Some text fields use the ``with_positions_offsets`` term vector, this is to have faster highlighting on big documents. See more at https://www.elastic.co/guide/en/elasticsearch/reference/7.9/term-vector.html @@ -75,13 +113,27 @@ class PageDocument(RTDDocTypeMixin, Document): rank = fields.IntegerField() # Searchable content - title = fields.TextField(attr='processed_json.title') + title = fields.TextField( + attr='processed_json.title', + fields={ + 'raw': WildcardField(), + }, + ) sections = fields.NestedField( attr='processed_json.sections', properties={ 'id': fields.KeywordField(), - 'title': fields.TextField(), - 'content': fields.TextField(term_vector='with_positions_offsets'), + 'title': fields.TextField( + fields={ + 'raw': WildcardField(), + }, + ), + 'content': fields.TextField( + term_vector='with_positions_offsets', + fields={ + 'raw': WildcardField(), + }, + ), } ) domains = fields.NestedField( @@ -93,11 +145,20 @@ class PageDocument(RTDDocTypeMixin, Document): # For showing in the search result 'type_display': fields.TextField(), - 'docstrings': fields.TextField(term_vector='with_positions_offsets'), - - # Simple analyzer breaks on `.`, - # otherwise search results are too strict for this use case - 'name': fields.TextField(analyzer='simple'), + 'docstrings': fields.TextField( + term_vector='with_positions_offsets', + fields={ + 'raw': WildcardField(), + }, + ), + 'name': fields.TextField( + # Simple analyzer breaks on `.`, + # otherwise search results are too strict for this use case + analyzer='simple', + fields={ + 'raw': WildcardField(), + }, + ), } ) diff --git a/readthedocs/search/faceted_search.py b/readthedocs/search/faceted_search.py index 33e0064bb11..91b6fffdb13 100644 --- a/readthedocs/search/faceted_search.py +++ b/readthedocs/search/faceted_search.py @@ -87,21 +87,13 @@ def _get_queries(self, *, query, fields): """ Get a list of query objects
according to the query. - If the query is a *single term* (a single word) - we try to match partial words and substrings - (available only with the DEFAULT_TO_FUZZY_SEARCH feature flag). - - If the query is a phrase or contains the syntax from a simple query string, - we use the SimpleQueryString query. + If the query is a single term we try to match partial words and substrings + (available only with the DEFAULT_TO_FUZZY_SEARCH feature flag), + otherwise we use the SimpleQueryString query. """ - is_single_term = ( - not self.use_advanced_query and - query and len(query.split()) <= 1 and - not self._is_advanced_query(query) - ) get_queries_function = ( self._get_single_term_queries - if is_single_term + if self._is_single_term(query) else self._get_text_queries ) @@ -150,6 +142,7 @@ def _get_single_term_queries(self, query, fields): The score of "and" should be higher as it satisfies both "or" and "and". We use the Wildcard query with the query surrounded by ``*`` to match substrings. + We use the raw fields (Wildcard fields) instead of the normal field for performance. For valid options, see: @@ -164,8 +157,9 @@ def _get_single_term_queries(self, query, fields): ) queries.append(query_string) for field in fields: - # Remove boosting from the field - field = re.sub(r'\^.*$', '', field) + # Remove any boosting from the field and always query its raw + # sub-field, even when the field has no ``^boost`` suffix. + field = re.sub(r'\^.*$', '', field) + '.raw' kwargs = { field: {'value': f'*{query}*'}, } @@ -188,6 +182,21 @@ def _get_fuzzy_query(self, *, query, fields, operator): prefix_length=1, ) + def _is_single_term(self, query): + """ + Check if the query is a single term. + + A query is a single term if it is a single word, + if it doesn't contain the syntax from a simple query string, + and if `self.use_advanced_query` is False.
+ """ + is_single_term = ( + not self.use_advanced_query and + query and len(query.split()) <= 1 and + not self._is_advanced_query(query) + ) + return bool(is_single_term) + def _is_advanced_query(self, query): """ Check if query looks like to be using the syntax from a simple query string. @@ -333,11 +342,18 @@ def _get_nested_query(self, *, query, path, fields): fields=fields, ) - raw_fields = ( + raw_fields = [ # Remove boosting from the field re.sub(r'\^.*$', '', field) for field in fields - ) + ] + + # Highlight from the raw fields too, if it is a single term. + if self._is_single_term(query): + raw_fields.extend([ + re.sub(r'\^.*$', '', field) + '.raw' + for field in fields + ]) highlight = dict( self._highlight_options, diff --git a/readthedocs/search/serializers.py b/readthedocs/search/serializers.py index 08e52ecdce9..495ea6c1736 100644 --- a/readthedocs/search/serializers.py +++ b/readthedocs/search/serializers.py @@ -25,11 +25,28 @@ VersionData = namedtuple('VersionData', ['slug', 'docs_url']) +def get_raw_field(obj, field, default=None): + """Get the ``raw`` version of this field or fall back to the original field.""" + return ( + getattr(obj, f'{field}.raw', default) + or getattr(obj, field, default) + ) + + class ProjectHighlightSerializer(serializers.Serializer): - name = serializers.ListField(child=serializers.CharField(), default=list) - slug = serializers.ListField(child=serializers.CharField(), default=list) - description = serializers.ListField(child=serializers.CharField(), default=list) + name = serializers.SerializerMethodField() + slug = serializers.SerializerMethodField() + description = serializers.SerializerMethodField() + + def get_name(self, obj): + return list(get_raw_field(obj, 'name', [])) + + def get_slug(self, obj): + return list(get_raw_field(obj, 'slug', [])) + + def get_description(self, obj): + return list(get_raw_field(obj, 'description', [])) class ProjectSearchSerializer(serializers.Serializer): @@ -44,7 +61,10 @@ class
ProjectSearchSerializer(serializers.Serializer): class PageHighlightSerializer(serializers.Serializer): - title = serializers.ListField(child=serializers.CharField(), default=list) + title = serializers.SerializerMethodField() + + def get_title(self, obj): + return list(get_raw_field(obj, 'title', [])) class PageSearchSerializer(serializers.Serializer): @@ -166,12 +186,10 @@ class DomainHighlightSerializer(serializers.Serializer): content = serializers.SerializerMethodField() def get_name(self, obj): - name = getattr(obj, 'domains.name', []) - return list(name) + return list(get_raw_field(obj, 'domains.name', [])) def get_content(self, obj): - docstring = getattr(obj, 'domains.docstrings', []) - return list(docstring) + return list(get_raw_field(obj, 'domains.docstrings', [])) class DomainSearchSerializer(serializers.Serializer): @@ -190,12 +208,10 @@ class SectionHighlightSerializer(serializers.Serializer): content = serializers.SerializerMethodField() def get_title(self, obj): - title = getattr(obj, 'sections.title', []) - return list(title) + return list(get_raw_field(obj, 'sections.title', [])) def get_content(self, obj): - content = getattr(obj, 'sections.content', []) - return list(content) + return list(get_raw_field(obj, 'sections.content', [])) class SectionSearchSerializer(serializers.Serializer): diff --git a/readthedocs/search/tests/test_api.py b/readthedocs/search/tests/test_api.py index 1efa212295d..ba9fd753fc5 100644 --- a/readthedocs/search/tests/test_api.py +++ b/readthedocs/search/tests/test_api.py @@ -605,7 +605,12 @@ def test_search_single_query(self, api_client): results = resp.data['results'] assert len(results) > 0 - assert 'Index' in results[0]['title'] + assert 'Support' in results[0]['title'] + # `find` is a closer match than `index`, so it is listed first.
+ highlights = results[0]['blocks'][0]['highlights'] + assert 'find' in highlights['content'][0] + + assert 'Index' in results[1]['title'] # Query with a partial word, but we want to match that search_params = {