Skip to content

Commit

Permalink
feat(document-index): add hybrid_index to index_configuration (#1052)
Browse files Browse the repository at this point in the history
  • Loading branch information
TilTheunissenAA authored Sep 27, 2024
1 parent 4facd7e commit c537072
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 2 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Changelog
## Unreleased
### Features
- You can now specify a `hybrid_index` when creating an index for the document index to use hybrid (semantic and keyword) search.
- `min_score` and `max_results` are now optional parameters in `DocumentIndexClient.SearchQuery`.
- `k` is now an optional parameter in `DocumentIndexRetriever`.

Expand Down
59 changes: 59 additions & 0 deletions src/documentation/document_index.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,65 @@
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Hybrid Search\n",
"\n",
"The Document Index supports hybrid search, which combines the results of semantic search and keyword search.\n",
"\n",
"In order to use hybrid search, we need to create a hybrid index and assign it to the collection:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# change this value if you want to use an index of a different name\n",
"HYBRID_INDEX = \"intelligence-layer-sdk-demo-hybrid-index\"\n",
"\n",
"index_path = IndexPath(namespace=NAMESPACE, index=HYBRID_INDEX)\n",
"\n",
"# customise the parameters of the index here\n",
"index_configuration = IndexConfiguration(\n",
" chunk_size=64, chunk_overlap=0, embedding_type=\"asymmetric\", hybrid_index=\"bm25\"\n",
")\n",
"\n",
"# create the namespace-wide index resource\n",
"document_index.create_index(index_path, index_configuration)\n",
"\n",
"# assign the index to the collection\n",
"document_index.assign_index_to_collection(collection_path, HYBRID_INDEX)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If we now search on the hybrid index, we get not only chunks that are semantically similar to the query but also chunks that match its keywords:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"document_index_retriever = DocumentIndexRetriever(\n",
" document_index=document_index,\n",
" index_name=HYBRID_INDEX,\n",
" namespace=NAMESPACE,\n",
" collection=COLLECTION,\n",
" k=5,\n",
" threshold=0.5,\n",
")\n",
"\n",
"document_index_retriever.get_relevant_documents_with_scores(query=\"25 April\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,13 @@ class IndexConfiguration(BaseModel):
chunk_overlap: The maximum number of tokens of overlap between consecutive chunks. Must be
less than `chunk_size`.
chunk_size: The maximum size of the chunks in tokens to be used for the index.
hybrid_index: If set to "bm25", combine vector search and keyword search (bm25) results.
"""

embedding_type: Literal["symmetric", "asymmetric"]
chunk_overlap: int = Field(default=0, ge=0)
chunk_size: int = Field(..., gt=0, le=2046)
hybrid_index: Literal["bm25"] | None = None

@model_validator(mode="after")
def validate_chunk_overlap(self) -> Self:
Expand Down Expand Up @@ -502,6 +504,7 @@ def create_index(
data = {
"chunk_size": index_configuration.chunk_size,
"embedding_type": index_configuration.embedding_type,
"hybrid_index": index_configuration.hybrid_index,
}
response = requests.put(url, data=dumps(data), headers=self.headers)
self._raise_for_status(response)
Expand Down
27 changes: 25 additions & 2 deletions tests/connectors/document_index/test_document_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ def filter_index_config() -> dict[str, dict[str, str]]:
@fixture
def collection_path(aleph_alpha_namespace: str) -> CollectionPath:
return CollectionPath(
namespace=aleph_alpha_namespace, collection="intelligence-layer-sdk-ci"
namespace=aleph_alpha_namespace,
collection="intelligence-layer-sdk-ci-2024-09-26",
)


Expand Down Expand Up @@ -200,6 +201,27 @@ def test_document_index_searches_asymmetrically(
assert "Mark" in search_result[0].section


def test_document_index_hybrid_search_combines_semantic_and_keyword_search(
    document_index: DocumentIndexClient, collection_path: CollectionPath
) -> None:
    """Check the ranking returned by the "asymmetric-hybrid" index.

    Runs a food-related query against the test collection and expects the
    first three results to contain fixed section texts in a fixed order.
    """
    # NOTE(review): the document named here is described as "always there";
    # presumably it is pre-populated in the CI collection -- confirm.
    hybrid_document = DocumentPath(
        document_name="test_document_index_hybrid_search_combines_semantic_and_keyword_search",
        collection_path=collection_path,
    )
    query = SearchQuery(
        query="Which food do humans like?",
        min_score=0.0,
        max_results=10,
    )

    hits = document_index.search(
        hybrid_document.collection_path, "asymmetric-hybrid", query
    )

    # Expected section snippets, listed in the rank order they must appear.
    expected_sections = (
        "Poison in food is bad",
        "All people like pasta",
        "Chairs are not food",
    )
    for rank, expected in enumerate(expected_sections):
        assert expected in hits[rank].section


@pytest.mark.internal
@pytest.mark.parametrize(
"document_name",
Expand Down Expand Up @@ -273,7 +295,7 @@ def test_document_list_all_documents(
) -> None:
filter_result = document_index.documents(collection_path)

assert len(filter_result) == 6
assert len(filter_result) == 7


def test_document_list_max_n_documents(
Expand Down Expand Up @@ -333,6 +355,7 @@ def test_document_indexes_are_returned(
assert index_configuration.embedding_type == "asymmetric"
assert index_configuration.chunk_overlap == 0
assert index_configuration.chunk_size == 512
assert index_configuration.hybrid_index is None


def test_create_filter_indexes_in_namespace(
Expand Down

0 comments on commit c537072

Please sign in to comment.