From 4facd7e539b756616bde71dc07724dfe77d1940e Mon Sep 17 00:00:00 2001 From: Michael Barlow <25936840+Michael-JB@users.noreply.github.com> Date: Fri, 27 Sep 2024 16:23:18 +0200 Subject: [PATCH] [DI-206] feat: make Document Index search parameters optional (#1053) * [DI-206] feat: make Document Index search parameters optional This commit aligns optional parameters in the IL with the Document Index API. * [DI-206] docs: move DocumentIndexRetriever constructor args - move init docstring to constructor - move changes to correct place in changelog --- CHANGELOG.md | 8 ++++-- src/documentation/document_index.ipynb | 2 -- .../document_index/document_index.py | 4 +-- .../retrievers/document_index_retriever.py | 27 ++++++++++--------- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9649b938c..6b512307f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,13 +1,17 @@ # Changelog ## Unreleased ### Features -... +- `min_score` and `max_results` are now optional parameters in `DocumentIndexClient.SearchQuery`. +- `k` is now an optional parameter in `DocumentIndexRetriever`. + ### Fixes ... ### Deprecations ... ### Breaking Changes - - The default model for `Llama3InstructModel` is now `llama-3.1-8b-instruct` instead of `llama-3-8b-instruct`. We also removed the llama3.0 models from the recommended models of the `Llama3InstructModel`. +- The default model for `Llama3InstructModel` is now `llama-3.1-8b-instruct` instead of `llama-3-8b-instruct`. We also removed the llama3.0 models from the recommended models of the `Llama3InstructModel`. +- The default value of `threshold` in the `DocumentIndexRetriever` has changed from `0.5` to `0.0`. This accommodates fusion scoring for searches over hybrid indexes. + ## 6.0.0 diff --git a/src/documentation/document_index.ipynb b/src/documentation/document_index.ipynb index 6961e3a31..f4963a18e 100644 --- a/src/documentation/document_index.ipynb +++ b/src/documentation/document_index.ipynb @@ -283,7 +283,6 @@ " namespace=NAMESPACE,\n", " collection=COLLECTION,\n", " k=5,\n", - " threshold=0.5,\n", ")\n", "\n", "document_index_retriever.get_relevant_documents_with_scores(\n", @@ -402,7 +401,6 @@ " namespace=NAMESPACE,\n", " collection=COLLECTION,\n", " k=5,\n", - " threshold=0.2,\n", ")" ] }, diff --git a/src/intelligence_layer/connectors/document_index/document_index.py b/src/intelligence_layer/connectors/document_index/document_index.py index 5b8ec9973..2a3821fed 100644 --- a/src/intelligence_layer/connectors/document_index/document_index.py +++ b/src/intelligence_layer/connectors/document_index/document_index.py @@ -255,8 +255,8 @@ class SearchQuery(BaseModel): """ query: str - max_results: int = Field(..., ge=0) - min_score: float = Field(..., ge=0.0, le=1.0) + max_results: int = Field(ge=0, default=1) + min_score: float = Field(ge=0.0, le=1.0, default=0.0) filters: Optional[list[Filters]] = None diff --git a/src/intelligence_layer/connectors/retrievers/document_index_retriever.py b/src/intelligence_layer/connectors/retrievers/document_index_retriever.py index 271fe53df..2dbf3e675 100644 --- a/src/intelligence_layer/connectors/retrievers/document_index_retriever.py +++ b/src/intelligence_layer/connectors/retrievers/document_index_retriever.py @@ -20,16 +20,7 @@ class DocumentIndexRetriever(BaseRetriever[DocumentPath]): """Search through documents within collections in the `DocumentIndexClient`. - We initialize this Retriever with a collection & namespace names, and we can find the documents in the collection - most semanticly similar to our query. - - Args: - document_index: Client offering functionality for search. - index_name: The name of the index to be used. - namespace: The namespace within the `DocumentIndexClient` where all collections are stored. - collection: The collection within the namespace that holds the desired documents. - k: The (top) number of documents to be returned by search. - threshold: The mimumum value of cosine similarity between the query vector and the document vector. + This retriever lets you search for relevant documents in the given Document Index collection. Example: >>> import os @@ -45,9 +36,21 @@ def __init__( index_name: str, namespace: str, collection: str, - k: int, - threshold: float = 0.5, + k: int = 1, + threshold: float = 0.0, ) -> None: + """Initialize the DocumentIndexRetriever. + + Args: + document_index: The Document Index client. + index_name: The name of the Document Index index to use. + namespace: The Document Index namespace. + collection: The Document Index collection to use. This is the search context for the retriever. + k: The number of most-relevant documents to return when searching. Defaults to 1. + threshold: The minimum score for search results. For semantic indexes, this is the cosine + similarity between the query and the document chunk. For hybrid indexes, this corresponds + to fusion rank. Defaults to 0.0. + """ self._document_index = document_index self._index_name = index_name self._collection_path = CollectionPath(