feat(document-index): add hybrid_index to index_configuration (#1052)

Aleph-Alpha · Sep 27, 2024 · c537072 · c537072
1 parent 4facd7e
commit c537072
Show file tree

Hide file tree

Showing 4 changed files with 88 additions and 2 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,7 @@
 # Changelog
 ## Unreleased
 ### Features
+- You can now specify a `hybrid_index` when creating an index in the Document Index to enable hybrid (semantic and keyword) search.
 - `min_score` and `max_results` are now optional parameters in `DocumentIndexClient.SearchQuery`.
 - `k` is now an optional parameter in `DocumentIndexRetriever`.
 

diff --git a/src/documentation/document_index.ipynb b/src/documentation/document_index.ipynb
@@ -290,6 +290,65 @@
     ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Hybrid Search\n",
+    "\n",
+    "The Document Index supports hybrid search, which combines the results of semantic search and keyword search.\n",
+    "\n",
+    "In order to use hybrid search, we need to create a hybrid index and assign it to the collection:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# change this value if you want to use an index of a different name\n",
+    "HYBRID_INDEX = \"intelligence-layer-sdk-demo-hybrid-index\"\n",
+    "\n",
+    "index_path = IndexPath(namespace=NAMESPACE, index=HYBRID_INDEX)\n",
+    "\n",
+    "# customise the parameters of the index here\n",
+    "index_configuration = IndexConfiguration(\n",
+    "    chunk_size=64, chunk_overlap=0, embedding_type=\"asymmetric\", hybrid_index=\"bm25\"\n",
+    ")\n",
+    "\n",
+    "# create the namespace-wide index resource\n",
+    "document_index.create_index(index_path, index_configuration)\n",
+    "\n",
+    "# assign the index to the collection\n",
+    "document_index.assign_index_to_collection(collection_path, HYBRID_INDEX)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If we now search on the hybrid index, we will get not only chunks that are semantically similar to the query but also chunks that match its keywords:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "document_index_retriever = DocumentIndexRetriever(\n",
+    "    document_index=document_index,\n",
+    "    index_name=HYBRID_INDEX,\n",
+    "    namespace=NAMESPACE,\n",
+    "    collection=COLLECTION,\n",
+    "    k=5,\n",
+    "    threshold=0.5,\n",
+    ")\n",
+    "\n",
+    "document_index_retriever.get_relevant_documents_with_scores(query=\"25 April\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

diff --git a/src/intelligence_layer/connectors/document_index/document_index.py b/src/intelligence_layer/connectors/document_index/document_index.py
@@ -36,11 +36,13 @@ class IndexConfiguration(BaseModel):
         chunk_overlap: The maximum number of tokens of overlap between consecutive chunks. Must be
             less than `chunk_size`.
         chunk_size: The maximum size of the chunks in tokens to be used for the index.
+        hybrid_index: If set to "bm25", combine vector search and keyword search (bm25) results.
     """
 
     embedding_type: Literal["symmetric", "asymmetric"]
     chunk_overlap: int = Field(default=0, ge=0)
     chunk_size: int = Field(..., gt=0, le=2046)
+    hybrid_index: Literal["bm25"] | None = None
 
     @model_validator(mode="after")
     def validate_chunk_overlap(self) -> Self:
@@ -502,6 +504,7 @@ def create_index(
         data = {
             "chunk_size": index_configuration.chunk_size,
             "embedding_type": index_configuration.embedding_type,
+            "hybrid_index": index_configuration.hybrid_index,
         }
         response = requests.put(url, data=dumps(data), headers=self.headers)
         self._raise_for_status(response)

diff --git a/tests/connectors/document_index/test_document_index.py b/tests/connectors/document_index/test_document_index.py
@@ -57,7 +57,8 @@ def filter_index_config() -> dict[str, dict[str, str]]:
 @fixture
 def collection_path(aleph_alpha_namespace: str) -> CollectionPath:
     return CollectionPath(
-        namespace=aleph_alpha_namespace, collection="intelligence-layer-sdk-ci"
+        namespace=aleph_alpha_namespace,
+        collection="intelligence-layer-sdk-ci-2024-09-26",
     )
 
 
@@ -200,6 +201,27 @@ def test_document_index_searches_asymmetrically(
     assert "Mark" in search_result[0].section
 
 
+def test_document_index_hybrid_search_combines_semantic_and_keyword_search(
+    document_index: DocumentIndexClient, collection_path: CollectionPath
+) -> None:
+    """Search the "asymmetric-hybrid" index and check the order of the top three hits."""
+    # Only `document_path.collection_path` is used for the search below.
+    # NOTE(review): the named document is presumably pre-uploaded to the
+    # collection (see "is always there") -- confirm against the CI setup.
+    document_path = DocumentPath(
+        collection_path=collection_path,
+        document_name="test_document_index_hybrid_search_combines_semantic_and_keyword_search",  # is always there
+    )
+    # min_score=0.0 together with max_results=10 keeps low-scoring hits in the result.
+    search_query = SearchQuery(
+        query="Which food do humans like?",
+        max_results=10,
+        min_score=0.0,
+    )
+    search_results = document_index.search(
+        document_path.collection_path, "asymmetric-hybrid", search_query
+    )
+
+    # The assertions pin the ranking order of the first three results.
+    # NOTE(review): the expected order presumably reflects the combination of
+    # keyword ("food") and semantic matches -- verify if the fixture data changes.
+    assert "Poison in food is bad" in search_results[0].section
+    assert "All people like pasta" in search_results[1].section
+    assert "Chairs are not food" in search_results[2].section
+
+
 @pytest.mark.internal
 @pytest.mark.parametrize(
     "document_name",
@@ -273,7 +295,7 @@ def test_document_list_all_documents(
 ) -> None:
     filter_result = document_index.documents(collection_path)
 
-    assert len(filter_result) == 6
+    assert len(filter_result) == 7
 
 
 def test_document_list_max_n_documents(
@@ -333,6 +355,7 @@ def test_document_indexes_are_returned(
     assert index_configuration.embedding_type == "asymmetric"
     assert index_configuration.chunk_overlap == 0
     assert index_configuration.chunk_size == 512
+    assert index_configuration.hybrid_index is None
 
 
 def test_create_filter_indexes_in_namespace(