-
Notifications
You must be signed in to change notification settings - Fork 375
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Enable search data from `SearchEngine` component #3037
Merged
frascuchon
merged 14 commits into
develop
from
3017-api-enable-search-data-from-searchengine-component
Jun 1, 2023
Merged
Changes from 11 commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
dfeac41
WIP: Support text search from SearchEngine
frascuchon 19f3f7e
Support text search from SearchEngine
frascuchon e04ac44
Merge branch '3017-api-enable-search-data-from-searchengine-component…
frascuchon 179fc01
Using match query and single-query-string
frascuchon 54e4920
Revert import elasticsearch
frascuchon beda487
Merge branch 'develop' into 3017-api-enable-search-data-from-searchen…
frascuchon dfb3da2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] 46566f2
Merge branch 'develop' into 3017-api-enable-search-data-from-searchen…
frascuchon 6e4c855
Simplify search text queries
frascuchon da9d842
Adapt tests
frascuchon 791d036
Remove refs to pagination
frascuchon 8caa5ac
Fix tests
frascuchon 391f217
Refactor search engine
frascuchon 1cbd04a
Text search compatible with both elastic and opensearch
frascuchon File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,7 +13,8 @@ | |
# limitations under the License. | ||
|
||
import dataclasses | ||
from typing import Any, Dict, Iterable, Optional | ||
from typing import Any, Dict, Iterable, List, Optional, Union | ||
from uuid import UUID | ||
|
||
from opensearchpy import AsyncOpenSearch, helpers | ||
from pydantic import BaseModel | ||
|
@@ -27,7 +28,9 @@ | |
QuestionType, | ||
Record, | ||
ResponseStatus, | ||
User, | ||
) | ||
from argilla.server.schemas.v1.datasets import ResponseStatusFilter | ||
from argilla.server.settings import settings | ||
|
||
|
||
|
@@ -50,6 +53,7 @@ class UserResponse(BaseModel): | |
|
||
|
||
class SearchDocument(BaseModel): | ||
id: UUID | ||
fields: Dict[str, Any] | ||
|
||
responses: Optional[Dict[str, UserResponse]] | ||
|
@@ -59,6 +63,34 @@ class Config: | |
getter_dict = SearchDocumentGetter | ||
|
||
|
||
@dataclasses.dataclass
class TextFieldQuery:
    """Free-text query, optionally restricted to a single record field."""

    # Text to search for.
    q: str
    # Name of the record field to search in. When left unset, the search
    # engine queries every field of the dataset.
    field: Optional[str] = None
|
||
|
||
@dataclasses.dataclass
class Query:
    """Search query accepted by the search engine."""

    # Text part of the query (the only kind of query defined here).
    text: TextFieldQuery
|
||
|
||
@dataclasses.dataclass
class UserResponseStatusFilter:
    """Restricts a search to records by the status of one user's response."""

    # User whose response status is checked.
    user: User
    # Response statuses that make a record eligible.
    statuses: List[ResponseStatusFilter]
|
||
|
||
@dataclasses.dataclass
class SearchResponseItem:
    """A single hit returned by a search."""

    # Identifier of the matching record.
    record_id: UUID
    # Score reported by the search engine for this hit, if any.
    score: Optional[float]
|
||
|
||
@dataclasses.dataclass
class SearchResponses:
    """Result of a search: the list of hits."""

    # Hits in the order they were returned by the search engine.
    items: List[SearchResponseItem]
|
||
|
||
@dataclasses.dataclass | ||
class SearchEngine: | ||
config: Dict[str, Any] | ||
|
@@ -68,6 +100,7 @@ def __post_init__(self): | |
|
||
async def create_index(self, dataset: Dataset): | ||
fields = { | ||
"id": {"type": "keyword"}, | ||
"responses": {"dynamic": True, "type": "object"}, | ||
} | ||
|
||
|
@@ -96,27 +129,6 @@ async def create_index(self, dataset: Dataset): | |
index_name = self._index_name_for_dataset(dataset) | ||
await self.client.indices.create(index=index_name, body=dict(mappings=mappings)) | ||
|
||
def _field_mapping_for_question(self, question: Question): | ||
settings = question.parsed_settings | ||
|
||
if settings.type == QuestionType.rating: | ||
# See https://www.elastic.co/guide/en/elasticsearch/reference/current/number.html | ||
return {"type": "integer"} | ||
elif settings.type in [QuestionType.text, QuestionType.label_selection, QuestionType.multi_label_selection]: | ||
# TODO: Review mapping for label selection. Could make sense to use `keyword` mapping instead. See https://www.elastic.co/guide/en/elasticsearch/reference/current/keyword.html | ||
# See https://www.elastic.co/guide/en/elasticsearch/reference/current/text.html | ||
return {"type": "text", "index": False} | ||
else: | ||
raise ValueError(f"ElasticSearch mappings for Question of type {settings.type} cannot be generated") | ||
|
||
def _es_mapping_for_field(self, field: Field): | ||
field_type = field.settings["type"] | ||
|
||
if field_type == FieldType.text: | ||
return {"type": "text"} | ||
else: | ||
raise ValueError(f"ElasticSearch mappings for Field of type {field_type} cannot be generated") | ||
|
||
async def add_records(self, dataset: Dataset, records: Iterable[Record]): | ||
index_name = self._index_name_for_dataset(dataset) | ||
|
||
|
@@ -139,6 +151,90 @@ async def add_records(self, dataset: Dataset, records: Iterable[Record]): | |
if errors: | ||
raise RuntimeError(errors) | ||
|
||
async def search(
    self,
    dataset: Dataset,
    query: Union[Query, str],
    user_response_status_filter: Optional[UserResponseStatusFilter] = None,
    limit: int = 100,
) -> SearchResponses:
    """Search the index of a dataset for records matching a text query.

    Args:
        dataset: Dataset whose index is queried.
        query: A `Query`, or a plain string, which is searched across all
            the fields of the dataset.
        user_response_status_filter: When given, only records whose response
            from that user has one of the listed statuses are returned.
        limit: Maximum number of hits requested from the search engine.

    Returns:
        A `SearchResponses` holding one item (record id and score) per hit,
        in the order returned by the search engine.
    """
    # See https://www.elastic.co/guide/en/elasticsearch/reference/current/search-search.html

    if isinstance(query, str):
        query = Query(text=TextFieldQuery(q=query))

    bool_query: Dict[str, Any] = {"must": [self._build_text_query(dataset, query.text)]}

    if user_response_status_filter:
        # See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-terms-query.html
        user_response_status_field = f"responses.{user_response_status_filter.user.username}.status"
        bool_query["filter"] = [{"terms": {user_response_status_field: user_response_status_filter.statuses}}]

    body = {
        # Only ids and scores are needed, so skip fetching document sources.
        "_source": False,
        "query": {"bool": bool_query},
        # The `id` tiebreaker keeps the order stable for hits with equal score.
        "sort": ["_score", {"id": "asc"}],
    }
    # TODO: Work on search pagination after endpoint integration
    # See https://www.elastic.co/guide/en/elasticsearch/reference/current/paginate-search-results.html

    response = await self.client.search(index=self._index_name_for_dataset(dataset), size=limit, body=body)

    items = [SearchResponseItem(record_id=hit["_id"], score=hit["_score"]) for hit in response["hits"]["hits"]]

    return SearchResponses(items=items)


def _build_text_query(self, dataset: Dataset, text: TextFieldQuery) -> Dict[str, Any]:
    """Build the full-text clause of a search request.

    Kept separate from `search` so the generated query can be unit tested
    without a running search engine.
    """
    if not text.field:
        # `combined_fields` is only available in Elasticsearch, so use a
        # `cross_fields` multi-match, which OpenSearch supports as well.
        # See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-multi-match-query.html
        return {
            "multi_match": {
                "query": text.q,
                "type": "cross_fields",
                "fields": [f"fields.{field.name}" for field in dataset.fields],
                "operator": "and",
            }
        }

    # `match_phrase` does not accept the `operator` parameter; `match` does.
    # See https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-match-query.html
    return {"match": {f"fields.{text.field}": {"query": text.q, "operator": "and"}}}
|
||
def _field_mapping_for_question(self, question: Question):
    """Return the index mapping used for the values of a question.

    Raises:
        ValueError: If no mapping is defined for the type of the question.
    """
    question_type = question.parsed_settings.type

    if question_type == QuestionType.rating:
        # See https://www.elastic.co/guide/en/elasticsearch/reference/current/number.html
        return {"type": "integer"}

    # TODO: Review mapping for label selection. Could make sense to use `keyword` mapping instead.
    #  See https://www.elastic.co/guide/en/elasticsearch/reference/current/keyword.html
    text_mapped_types = (QuestionType.text, QuestionType.label_selection, QuestionType.multi_label_selection)
    if question_type in text_mapped_types:
        # See https://www.elastic.co/guide/en/elasticsearch/reference/current/text.html
        return {"type": "text", "index": False}

    raise ValueError(f"ElasticSearch mappings for Question of type {question_type} cannot be generated")
|
||
def _es_mapping_for_field(self, field: Field):
    """Return the index mapping used for a record field.

    Raises:
        ValueError: If the type of the field is anything other than text.
    """
    declared_type = field.settings["type"]

    if declared_type != FieldType.text:
        raise ValueError(f"ElasticSearch mappings for Field of type {declared_type} cannot be generated")

    return {"type": "text"}
|
||
@staticmethod
def _index_name_for_dataset(dataset: Dataset):
    """Return the name of the index for a dataset: `rg.` followed by the dataset id."""
    dataset_id = dataset.id
    return f"rg.{dataset_id}"
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe we could move this to a function returning `text_query`, as it would be easier to test with unit tests.