From b842c261e79ef6795eada5b63d6c32c5d84f419e Mon Sep 17 00:00:00 2001 From: Karen Shaw Date: Mon, 11 Mar 2024 17:13:46 +0000 Subject: [PATCH] Implement hybrid search --- README.md | 2 +- chat/src/event_config.py | 2 +- chat/src/handlers/opensearch_neural_search.py | 134 +++++++++++------- chat/src/helpers/response.py | 2 +- chat/src/setup.py | 24 ++-- chat/template.yaml | 22 +-- chat/test/test_event_config.py | 2 +- dev/env.json | 2 +- node/src/api/opensearch.js | 10 +- node/src/environment.js | 6 +- node/test/test-helpers/index.js | 2 +- node/test/unit/aws/environment.test.js | 4 +- template.yaml | 18 +-- 13 files changed, 116 insertions(+), 114 deletions(-) diff --git a/README.md b/README.md index 71cec8b9..a9b0b6b8 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ The `env.json` file contains environment variable values for the lambda function Some of the values can be found as follows: - `API_TOKEN_SECRET` - already defined; value has to exist but doesn't matter in dev mode -- `ELASTICSEARCH_ENDPOINT` - run the following command: +- `OPENSEARCH_ENDPOINT` - run the following command: ``` aws secretsmanager get-secret-value \ --secret-id dev-environment/config/meadow --query SecretString \ diff --git a/chat/src/event_config.py b/chat/src/event_config.py index b9da4881..7794c8e3 100644 --- a/chat/src/event_config.py +++ b/chat/src/event_config.py @@ -20,7 +20,7 @@ K_VALUE = 5 MAX_K = 100 TEMPERATURE = 0.2 -TEXT_KEY = "title" +TEXT_KEY = "id" VERSION = "2023-07-01-preview" @dataclass diff --git a/chat/src/handlers/opensearch_neural_search.py b/chat/src/handlers/opensearch_neural_search.py index 56055637..29e65ee3 100644 --- a/chat/src/handlers/opensearch_neural_search.py +++ b/chat/src/handlers/opensearch_neural_search.py @@ -1,63 +1,87 @@ from langchain_core.documents import Document from langchain_core.vectorstores import VectorStore from opensearchpy import OpenSearch -from typing import Any, List - -class OpensearchNeuralSearch(VectorStore): - """Read-only OpenSearch vectorstore with neural search.""" - - def __init__( - self, - endpoint: str, - index: str, - model_id: str, - vector_field: str = "embedding", - search_pipeline: str = None, - **kwargs: Any - ): - self.client = OpenSearch(hosts=[{"host": endpoint, "port": "443", "use_ssl": True}], **kwargs) - self.index = index - self.model_id = model_id - self.vector_field = vector_field - self.search_pipeline = search_pipeline - - # Allow for hybrid searching - # Allow for different types of searches - # Allow for _source override - - def similarity_search( - self, - query: str, - k: int = 10, - subquery: Any = None, - **kwargs: Any - ) -> List[Document]: - """Return docs most similar to query.""" - dsl = { - 'size': k, - 'query': { - 'hybrid': { - 'queries': [ - { - 'neural': { - self.vector_field: { - 'query_text': query, - 'model_id': self.model_id, - 'k': k +from typing import Any, List, Tuple + + +class OpenSearchNeuralSearch(VectorStore): + """Read-only OpenSearch vectorstore with neural search.""" + + def __init__( + self, + endpoint: str, + index: str, + model_id: str, + vector_field: str = "embedding", + search_pipeline: str = None, + text_field: str = "id", + **kwargs: Any, + ): + self.client = OpenSearch( + hosts=[{"host": endpoint, "port": "443", "use_ssl": True}], **kwargs + ) + self.index = index + self.model_id = model_id + self.vector_field = vector_field + self.search_pipeline = search_pipeline + self.text_field = text_field + + def similarity_search( + self, query: str, k: int = 10, subquery: Any = None, **kwargs: Any + ) -> List[Document]: + """Return docs most similar to the embedding vector.""" + docs_with_scores = self.similarity_search_with_score( + query, k, subquery, **kwargs + ) + return [doc[0] for doc in docs_with_scores] + + def similarity_search_with_score( + self, query: str, k: int = 10, subquery: Any = None, **kwargs: Any + ) -> List[Tuple[Document, float]]: + """Return docs most similar to query.""" + dsl = { + "size": k, + "query": { + "hybrid": { + "queries": [ + { + "neural": { + self.vector_field: { + "query_text": query, + "model_id": self.model_id, + "k": k, + } + } + } + ] } - } - } - ] + }, } - } - } - if (subquery): - dsl['query']['hybrid']['queries'].append(subquery) - - for key, value in kwargs.items(): - dsl[key] = value + if subquery: + dsl["query"]["hybrid"]["queries"].append(subquery) + + for key, value in kwargs.items(): + dsl[key] = value + + response = self.client.search(index=self.index, body=dsl) - response = self.client.search(index=self.index, body=dsl) + documents_with_scores = [ + ( + Document( + page_content=hit["_source"][self.text_field], + metadata=(hit["_source"]), + ), + hit["_score"], + ) + for hit in response["hits"]["hits"] + ] - return response # replace this \ No newline at end of file + return documents_with_scores + + def add_texts(self, texts: List[str], metadatas: List[dict], **kwargs: Any) -> None: + pass + + @classmethod + def from_texts(cls, texts: List[str], metadatas: List[dict], **kwargs: Any) -> None: + pass \ No newline at end of file diff --git a/chat/src/helpers/response.py b/chat/src/helpers/response.py index a3b946d4..0e6c8a02 100644 --- a/chat/src/helpers/response.py +++ b/chat/src/helpers/response.py @@ -49,7 +49,7 @@ def extract_prompt_value(v): def prepare_response(config): try: docs = config.opensearch.similarity_search( - config.question, k=config.k, vector_field="embedding", text_field="id" + query=config.question, k=config.k ) original_question = get_and_send_original_question(config, docs) response = config.chain({"question": config.question, "input_documents": docs}) diff --git a/chat/src/setup.py b/chat/src/setup.py index 39a99338..ba3cb72c 100644 --- a/chat/src/setup.py +++ b/chat/src/setup.py @@ -1,7 +1,5 @@ -from content_handler import ContentHandler from langchain_community.chat_models import AzureChatOpenAI -from langchain_community.embeddings import SagemakerEndpointEmbeddings -from langchain_community.vectorstores import OpenSearchVectorSearch +from handlers.opensearch_neural_search import OpenSearchNeuralSearch from opensearchpy import OpenSearch, RequestsHttpConnection from requests_aws4auth import AWS4Auth import os @@ -22,7 +20,7 @@ def opensearch_client(region_name=os.getenv("AWS_REGION")): print(region_name) session = boto3.Session(region_name=region_name) awsauth = AWS4Auth(region=region_name, service="es", refreshable_credentials=session.get_credentials()) - endpoint = os.getenv("ELASTICSEARCH_ENDPOINT") + endpoint = os.getenv("OPENSEARCH_ENDPOINT") return OpenSearch( hosts=[{'host': endpoint, 'port': 443}], @@ -35,20 +33,14 @@ def opensearch_vector_store(region_name=os.getenv("AWS_REGION")): session = boto3.Session(region_name=region_name) awsauth = AWS4Auth(region=region_name, service="es", refreshable_credentials=session.get_credentials()) - sagemaker_client = session.client(service_name="sagemaker-runtime", region_name=session.region_name) - embeddings = SagemakerEndpointEmbeddings( - client=sagemaker_client, - region_name=session.region_name, - endpoint_name=os.getenv("EMBEDDING_ENDPOINT"), - content_handler=ContentHandler() - ) - - docsearch = OpenSearchVectorSearch( - index_name=prefix("dc-v2-work"), - embedding_function=embeddings, - opensearch_url="https://" + os.getenv("ELASTICSEARCH_ENDPOINT"), + docsearch = OpenSearchNeuralSearch( + index=prefix("dc-v2-work"), + model_id=os.getenv("OPENSEARCH_MODEL_ID"), + endpoint=os.getenv("OPENSEARCH_ENDPOINT"), connection_class=RequestsHttpConnection, http_auth=awsauth, + search_pipeline=prefix("dc-v2-work-pipeline"), + text_field= "id" ) return docsearch diff --git a/chat/template.yaml b/chat/template.yaml index d7696246..1d4985eb 100644 --- a/chat/template.yaml +++ b/chat/template.yaml @@ -8,21 +8,18 @@ Parameters: AzureOpenaiApiKey: Type: String Description: Azure OpenAI API Key - AzureOpenaiEmbeddingDeploymentId: - Type: String - Description: Azure OpenAI Embedding Deployment ID AzureOpenaiLlmDeploymentId: Type: String Description: Azure OpenAI LLM Deployment ID AzureOpenaiResourceName: Type: String Description: Azure OpenAI Resource Name - ElasticsearchEndpoint: + OpenSearchEndpoint: Type: String - Description: Elasticsearch URL - EmbeddingEndpoint: + Description: OpenSearch Endpoint + OpenSearchModelId: Type: String - Description: Sagemaker Inference Endpoint + Description: OpenSearch Model ID Resources: ApiGwAccountConfig: Type: "AWS::ApiGateway::Account" @@ -199,11 +196,10 @@ Resources: Variables: API_TOKEN_SECRET: !Ref ApiTokenSecret AZURE_OPENAI_API_KEY: !Ref AzureOpenaiApiKey - AZURE_OPENAI_EMBEDDING_DEPLOYMENT_ID: !Ref AzureOpenaiEmbeddingDeploymentId AZURE_OPENAI_LLM_DEPLOYMENT_ID: !Ref AzureOpenaiLlmDeploymentId AZURE_OPENAI_RESOURCE_NAME: !Ref AzureOpenaiResourceName - ELASTICSEARCH_ENDPOINT: !Ref ElasticsearchEndpoint - EMBEDDING_ENDPOINT: !Ref EmbeddingEndpoint + OPENSEARCH_ENDPOINT: !Ref OpenSearchEndpoint + OPENSEARCH_MODEL_ID: !Ref OpenSearchModelId Policies: - Statement: - Effect: Allow @@ -217,12 +213,6 @@ Resources: - 'es:ESHttpGet' - 'es:ESHttpPost' Resource: '*' - - Statement: - - Effect: Allow - Action: - - 'sagemaker:InvokeEndpoint' - - 'sagemaker:InvokeEndpointAsync' - Resource: !Sub 'arn:aws:sagemaker:${AWS::Region}:${AWS::AccountId}:endpoint/${EmbeddingEndpoint}' Metadata: BuildMethod: nodejs18.x Deployment: diff --git a/chat/test/test_event_config.py b/chat/test/test_event_config.py index 55f8381d..1be422d5 100644 --- a/chat/test/test_event_config.py +++ b/chat/test/test_event_config.py @@ -57,7 +57,7 @@ def test_attempt_override_without_superuser_status(self): "question": "test question", "ref": "test ref", "temperature": 0.2, - "text_key": "title", + "text_key": "id", } self.assertEqual(actual.azure_endpoint, expected_output["azure_endpoint"]) self.assertEqual(actual.attributes, expected_output["attributes"]) diff --git a/dev/env.json b/dev/env.json index ffcaadf5..8daeb73f 100644 --- a/dev/env.json +++ b/dev/env.json @@ -1,7 +1,7 @@ { "Parameters": { "API_TOKEN_SECRET": "DEVELOPMENT_SECRET", - "ELASTICSEARCH_ENDPOINT": "", + "OPENSEARCH_ENDPOINT": "", "ENV_PREFIX": "", "DC_URL": "" } diff --git a/node/src/api/opensearch.js b/node/src/api/opensearch.js index 82926223..9c6d98bb 100644 --- a/node/src/api/opensearch.js +++ b/node/src/api/opensearch.js @@ -1,6 +1,6 @@ const { HttpRequest } = require("@aws-sdk/protocol-http"); const { awsFetch } = require("../aws/fetch"); -const { elasticsearchEndpoint, prefix } = require("../environment"); +const { openSearchEndpoint, prefix } = require("../environment"); const Honeybadger = require("../honeybadger-setup"); async function getCollection(id, opts) { @@ -65,7 +65,7 @@ function isVisible(doc, { allowPrivate, allowUnpublished }) { } function initRequest(path) { - const endpoint = elasticsearchEndpoint(); + const endpoint = openSearchEndpoint(); return new HttpRequest({ method: "GET", @@ -80,7 +80,7 @@ function initRequest(path) { async function search(targets, body, optionsQuery = {}) { Honeybadger.addBreadcrumb("Searching", { metadata: { targets, body } }); - const endpoint = elasticsearchEndpoint(); + const endpoint = openSearchEndpoint(); const request = new HttpRequest({ method: "POST", @@ -98,7 +98,7 @@ async function search(targets, body, optionsQuery = {}) { } async function scroll(scrollId) { - const endpoint = elasticsearchEndpoint(); + const endpoint = openSearchEndpoint(); const request = new HttpRequest({ method: "POST", @@ -114,7 +114,7 @@ async function scroll(scrollId) { } async function deleteScroll(scrollId) { - const endpoint = elasticsearchEndpoint(); + const endpoint = openSearchEndpoint(); const request = new HttpRequest({ method: "DELETE", diff --git a/node/src/environment.js b/node/src/environment.js index 3443075c..44958569 100644 --- a/node/src/environment.js +++ b/node/src/environment.js @@ -40,8 +40,8 @@ function dcUrl() { return process.env.DC_URL; } -function elasticsearchEndpoint() { - return process.env.ELASTICSEARCH_ENDPOINT; +function openSearchEndpoint() { + return process.env.OPENSEARCH_ENDPOINT; } function prefix(value) { @@ -61,7 +61,7 @@ module.exports = { appInfo, dcApiEndpoint, dcUrl, - elasticsearchEndpoint, + openSearchEndpoint, prefix, region, }; diff --git a/node/test/test-helpers/index.js b/node/test/test-helpers/index.js index 8045b1ed..84e4afa3 100644 --- a/node/test/test-helpers/index.js +++ b/node/test/test-helpers/index.js @@ -46,7 +46,7 @@ function mockIndex() { const mock = nock("https://index.test.library.northwestern.edu"); beforeEach(function () { - process.env.ELASTICSEARCH_ENDPOINT = "index.test.library.northwestern.edu"; + process.env.OPENSEARCH_ENDPOINT = "index.test.library.northwestern.edu"; }); afterEach(function () { diff --git a/node/test/unit/aws/environment.test.js b/node/test/unit/aws/environment.test.js index 304aaf57..b3391e1f 100644 --- a/node/test/unit/aws/environment.test.js +++ b/node/test/unit/aws/environment.test.js @@ -9,8 +9,8 @@ describe("environment", function () { helpers.saveEnvironment(); it("returns the index endpoint", function () { - process.env.ELASTICSEARCH_ENDPOINT = "index.test.library.northwestern.edu"; - expect(environment.elasticsearchEndpoint()).to.eq( + process.env.OPENSEARCH_ENDPOINT = "index.test.library.northwestern.edu"; + expect(environment.openSearchEndpoint()).to.eq( "index.test.library.northwestern.edu" ); }); diff --git a/template.yaml b/template.yaml index 9e4184d5..1180ca1a 100644 --- a/template.yaml +++ b/template.yaml @@ -19,7 +19,7 @@ Globals: API_TOKEN_SECRET: !Ref ApiTokenSecret DC_API_ENDPOINT: !Ref DcApiEndpoint DC_URL: !Ref DcUrl - ELASTICSEARCH_ENDPOINT: !Ref ElasticsearchEndpoint + OPENSEARCH_ENDPOINT: !Ref OpenSearchEndpoint ENV_PREFIX: !Ref EnvironmentPrefix HONEYBADGER_API_KEY: !Ref HoneybadgerApiKey HONEYBADGER_ENV: !Ref HoneybadgerEnv @@ -35,9 +35,6 @@ Parameters: AzureOpenaiApiKey: Type: String Description: Azure OpenAI API Key - AzureOpenaiEmbeddingDeploymentId: - Type: String - Description: Azure OpenAI Embedding Deployment ID AzureOpenaiLlmDeploymentId: Type: String Description: Azure OpenAI LLM Deployment ID @@ -59,12 +56,12 @@ Parameters: DcUrl: Type: String Description: URL of Digital Collections website - ElasticsearchEndpoint: + OpenSearchModelId: Type: String - Description: Elasticsearch url - EmbeddingEndpoint: + Description: OpenSearch Model ID + OpenSearchEndpoint: Type: String - Description: Sagemaker Inference Endpoint + Description: OpenSearch endpoint EnvironmentPrefix: Type: String Description: Index Prefix @@ -653,11 +650,10 @@ Resources: Parameters: ApiTokenSecret: !Ref ApiTokenSecret AzureOpenaiApiKey: !Ref AzureOpenaiApiKey - AzureOpenaiEmbeddingDeploymentId: !Ref AzureOpenaiEmbeddingDeploymentId AzureOpenaiLlmDeploymentId: !Ref AzureOpenaiLlmDeploymentId AzureOpenaiResourceName: !Ref AzureOpenaiResourceName - ElasticsearchEndpoint: !Ref ElasticsearchEndpoint - EmbeddingEndpoint: !Ref EmbeddingEndpoint + OpenSearchEndpoint: !Ref OpenSearchEndpoint + OpenSearchModelId: !Ref OpenSearchModelId chatWebsocketEndpoint: Type: AWS::Serverless::Function Properties: