diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b512307..75c40842 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog ## Unreleased ### Features +- You can now specify a `hybrid_index` when creating an index for the document index to use hybrid (semantic and keyword) search. - `min_score` and `max_results` are now optional parameters in `DocumentIndexClient.SearchQuery`. - `k` is now an optional parameter in `DocumentIndexRetriever`. diff --git a/src/documentation/document_index.ipynb b/src/documentation/document_index.ipynb index f4963a18..a2b3f5c1 100644 --- a/src/documentation/document_index.ipynb +++ b/src/documentation/document_index.ipynb @@ -290,6 +290,65 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Hybrid Search\n", + "\n", + "The Document Index supports hybrid search, which combines results of semantic search and keyword search.\n", + "\n", + "In order to use hybrid search, we need to create a hybrid index and assign it to the collection:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# change this value if you want to use an index of a different name\n", + "HYBRID_INDEX = \"intelligence-layer-sdk-demo-hybrid-index\"\n", + "\n", + "index_path = IndexPath(namespace=NAMESPACE, index=HYBRID_INDEX)\n", + "\n", + "# customise the parameters of the index here\n", + "index_configuration = IndexConfiguration(\n", + " chunk_size=64, chunk_overlap=0, embedding_type=\"asymmetric\", hybrid_index=\"bm25\"\n", + ")\n", + "\n", + "# create the namespace-wide index resource\n", + "document_index.create_index(index_path, index_configuration)\n", + "\n", + "# assign the index to the collection\n", + "document_index.assign_index_to_collection(collection_path, HYBRID_INDEX)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we now search on the hybrid index, we will not only get chunks with a semantic similarity but also chunks that match the keywords in the query:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "document_index_retriever = DocumentIndexRetriever(\n", + " document_index=document_index,\n", + " index_name=HYBRID_INDEX,\n", + " namespace=NAMESPACE,\n", + " collection=COLLECTION,\n", + " k=5,\n", + " threshold=0.5,\n", + ")\n", + "\n", + "document_index_retriever.get_relevant_documents_with_scores(query=\"25 April\")" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/src/intelligence_layer/connectors/document_index/document_index.py b/src/intelligence_layer/connectors/document_index/document_index.py index 2a3821fe..736be336 100644 --- a/src/intelligence_layer/connectors/document_index/document_index.py +++ b/src/intelligence_layer/connectors/document_index/document_index.py @@ -36,11 +36,13 @@ class IndexConfiguration(BaseModel): chunk_overlap: The maximum number of tokens of overlap between consecutive chunks. Must be less than `chunk_size`. chunk_size: The maximum size of the chunks in tokens to be used for the index. + hybrid_index: If set to "bm25", combine vector search and keyword search (bm25) results. """ embedding_type: Literal["symmetric", "asymmetric"] chunk_overlap: int = Field(default=0, ge=0) chunk_size: int = Field(..., gt=0, le=2046) + hybrid_index: Literal["bm25"] | None = None @model_validator(mode="after") def validate_chunk_overlap(self) -> Self: @@ -502,6 +504,7 @@ def create_index( data = { "chunk_size": index_configuration.chunk_size, "embedding_type": index_configuration.embedding_type, + "hybrid_index": index_configuration.hybrid_index, } response = requests.put(url, data=dumps(data), headers=self.headers) self._raise_for_status(response) diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py index 51bbb805..5e930ab1 100644 --- a/tests/connectors/document_index/test_document_index.py +++ b/tests/connectors/document_index/test_document_index.py @@ -57,7 +57,8 @@ def filter_index_config() -> dict[str, dict[str, str]]: @fixture def collection_path(aleph_alpha_namespace: str) -> CollectionPath: return CollectionPath( - namespace=aleph_alpha_namespace, collection="intelligence-layer-sdk-ci" + namespace=aleph_alpha_namespace, + collection="intelligence-layer-sdk-ci-2024-09-26", ) @@ -200,6 +201,27 @@ def test_document_index_searches_asymmetrically( assert "Mark" in search_result[0].section +def test_document_index_hybrid_search_combines_semantic_and_keyword_search( + document_index: DocumentIndexClient, collection_path: CollectionPath +) -> None: + document_path = DocumentPath( + collection_path=collection_path, + document_name="test_document_index_hybrid_search_combines_semantic_and_keyword_search", # is always there + ) + search_query = SearchQuery( + query="Which food do humans like?", + max_results=10, + min_score=0.0, + ) + search_results = document_index.search( + document_path.collection_path, "asymmetric-hybrid", search_query + ) + + assert "Poison in food is bad" in search_results[0].section + assert "All people like pasta" in search_results[1].section + assert "Chairs are not food" in search_results[2].section + + @pytest.mark.internal @pytest.mark.parametrize( "document_name", @@ -273,7 +295,7 @@ def test_document_list_all_documents( ) -> None: filter_result = document_index.documents(collection_path) - assert len(filter_result) == 6 + assert len(filter_result) == 7 def test_document_list_max_n_documents( @@ -333,6 +355,7 @@ def test_document_indexes_are_returned( assert index_configuration.embedding_type == "asymmetric" assert index_configuration.chunk_overlap == 0 assert index_configuration.chunk_size == 512 + assert index_configuration.hybrid_index is None def test_create_filter_indexes_in_namespace(