Opensearch neural search (#189)
* Implement neural search vector store
  * Create OpenSearchNeuralSearch vector store
  * Implement infrastructure for hybrid search (not yet live)
  * Remove duplicate azure_endpoint assignment


---------

Co-authored-by: Michael B. Klein <mbklein@gmail.com>
Co-authored-by: Brendan Quinn <brendan-quinn@northwestern.edu>
3 people committed Mar 11, 2024
1 parent e29ddc1 commit 7f9b6ac
Showing 13 changed files with 124 additions and 60 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -42,7 +42,7 @@ The `env.json` file contains environment variable values for the lambda function
Some of the values can be found as follows:

- `API_TOKEN_SECRET` - already defined; value has to exist but doesn't matter in dev mode
- `ELASTICSEARCH_ENDPOINT` - run the following command:
- `OPENSEARCH_ENDPOINT` - run the following command:
```
aws secretsmanager get-secret-value \
--secret-id dev-environment/config/meadow --query SecretString \
3 changes: 1 addition & 2 deletions chat/src/event_config.py
@@ -20,7 +20,7 @@
K_VALUE = 5
MAX_K = 100
TEMPERATURE = 0.2
TEXT_KEY = "title"
TEXT_KEY = "id"
VERSION = "2023-07-01-preview"

@dataclass
@@ -63,7 +63,6 @@ def __post_init__(self):
self.attributes = self._get_attributes()
self.azure_endpoint = self._get_azure_endpoint()
self.azure_resource_name = self._get_azure_resource_name()
self.azure_endpoint = self._get_azure_endpoint()
self.debug_mode = self._is_debug_mode_enabled()
self.deployment_name = self._get_deployment_name()
self.is_logged_in = self.api_token.is_logged_in()
87 changes: 87 additions & 0 deletions chat/src/handlers/opensearch_neural_search.py
@@ -0,0 +1,87 @@
from langchain_core.documents import Document
from langchain_core.vectorstores import VectorStore
from opensearchpy import OpenSearch
from typing import Any, List, Tuple


class OpenSearchNeuralSearch(VectorStore):
"""Read-only OpenSearch vectorstore with neural search."""

def __init__(
self,
endpoint: str,
index: str,
model_id: str,
vector_field: str = "embedding",
search_pipeline: str = None,
text_field: str = "id",
**kwargs: Any,
):
self.client = OpenSearch(
hosts=[{"host": endpoint, "port": "443", "use_ssl": True}], **kwargs
)
self.index = index
self.model_id = model_id
self.vector_field = vector_field
self.search_pipeline = search_pipeline
self.text_field = text_field

def similarity_search(
self, query: str, k: int = 10, subquery: Any = None, **kwargs: Any
) -> List[Document]:
"""Return docs most similar to the embedding vector."""
docs_with_scores = self.similarity_search_with_score(
query, k, subquery, **kwargs
)
return [doc[0] for doc in docs_with_scores]

def similarity_search_with_score(
self, query: str, k: int = 10, subquery: Any = None, **kwargs: Any
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query."""
dsl = {
"size": k,
"query": {
"hybrid": {
"queries": [
{
"neural": {
self.vector_field: {
"query_text": query,
"model_id": self.model_id,
"k": k,
}
}
}
]
}
},
}

if subquery:
dsl["query"]["hybrid"]["queries"].append(subquery)

for key, value in kwargs.items():
dsl[key] = value

response = self.client.search(index=self.index, body=dsl)

documents_with_scores = [
(
Document(
page_content=hit["_source"][self.text_field],
metadata=(hit["_source"]),
),
hit["_score"],
)
for hit in response["hits"]["hits"]
]

return documents_with_scores

def add_texts(self, texts: List[str], metadatas: List[dict], **kwargs: Any) -> None:
pass

@classmethod
def from_texts(cls, texts: List[str], metadatas: List[dict], **kwargs: Any) -> None:
pass
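
For orientation, here is a minimal usage sketch of the new vector store (not part of the commit). The endpoint, model ID, and query text are placeholders; in the deployed stack they come from `OPENSEARCH_ENDPOINT` and `OPENSEARCH_MODEL_ID` via `setup.opensearch_vector_store()`. The hybrid call only produces sensible scores once a normalization search pipeline exists on the cluster, which this commit does not yet create.

```python
# Minimal usage sketch (not part of this commit); endpoint, model ID, and
# query text below are placeholders.
from handlers.opensearch_neural_search import OpenSearchNeuralSearch

store = OpenSearchNeuralSearch(
    endpoint="search-example.us-east-1.es.amazonaws.com",  # placeholder host
    index="dc-v2-work",
    model_id="abc123",                      # ID of the OpenSearch-hosted embedding model
    search_pipeline="dc-v2-work-pipeline",  # stored for hybrid search; not yet used by the class
    text_field="id",
)

# Neural-only search: each Document's page_content is the hit's "id" field
# and its metadata is the full _source of the hit.
docs = store.similarity_search(query="medieval manuscripts", k=5)

# Hybrid search (infrastructure only in this commit): append a lexical
# subquery; combining its score with the neural score requires a
# normalization search pipeline configured on the cluster.
docs_and_scores = store.similarity_search_with_score(
    query="medieval manuscripts",
    k=5,
    subquery={"match": {"title": "medieval manuscripts"}},
)
```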
2 changes: 1 addition & 1 deletion chat/src/helpers/response.py
@@ -49,7 +49,7 @@ def extract_prompt_value(v):
def prepare_response(config):
try:
docs = config.opensearch.similarity_search(
config.question, k=config.k, vector_field="embedding", text_field="id"
query=config.question, k=config.k
)
original_question = get_and_send_original_question(config, docs)
response = config.chain({"question": config.question, "input_documents": docs})
24 changes: 8 additions & 16 deletions chat/src/setup.py
@@ -1,7 +1,5 @@
from content_handler import ContentHandler
from langchain_community.chat_models import AzureChatOpenAI
from langchain_community.embeddings import SagemakerEndpointEmbeddings
from langchain_community.vectorstores import OpenSearchVectorSearch
from handlers.opensearch_neural_search import OpenSearchNeuralSearch
from opensearchpy import OpenSearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
import os
@@ -22,7 +20,7 @@ def opensearch_client(region_name=os.getenv("AWS_REGION")):
print(region_name)
session = boto3.Session(region_name=region_name)
awsauth = AWS4Auth(region=region_name, service="es", refreshable_credentials=session.get_credentials())
endpoint = os.getenv("ELASTICSEARCH_ENDPOINT")
endpoint = os.getenv("OPENSEARCH_ENDPOINT")

return OpenSearch(
hosts=[{'host': endpoint, 'port': 443}],
@@ -35,20 +33,14 @@ def opensearch_vector_store(region_name=os.getenv("AWS_REGION")):
session = boto3.Session(region_name=region_name)
awsauth = AWS4Auth(region=region_name, service="es", refreshable_credentials=session.get_credentials())

sagemaker_client = session.client(service_name="sagemaker-runtime", region_name=session.region_name)
embeddings = SagemakerEndpointEmbeddings(
client=sagemaker_client,
region_name=session.region_name,
endpoint_name=os.getenv("EMBEDDING_ENDPOINT"),
content_handler=ContentHandler()
)

docsearch = OpenSearchVectorSearch(
index_name=prefix("dc-v2-work"),
embedding_function=embeddings,
opensearch_url="https://" + os.getenv("ELASTICSEARCH_ENDPOINT"),
docsearch = OpenSearchNeuralSearch(
index=prefix("dc-v2-work"),
model_id=os.getenv("OPENSEARCH_MODEL_ID"),
endpoint=os.getenv("OPENSEARCH_ENDPOINT"),
connection_class=RequestsHttpConnection,
http_auth=awsauth,
search_pipeline=prefix("dc-v2-work-pipeline"),
text_field= "id"
)
return docsearch
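
The vector store above is handed a search pipeline named with `prefix("dc-v2-work-pipeline")`, but the pipeline itself is not created anywhere in this commit. As an illustration only (the processor settings below are assumptions, not values from this repository), a hybrid-search pipeline of the kind OpenSearch expects for combining neural and lexical scores could be registered roughly like this:

```python
# Hypothetical sketch of registering the hybrid-search pipeline; the
# normalization/combination settings are illustrative defaults, and the
# deployed pipeline name carries the environment prefix.
from opensearchpy import OpenSearch

client = OpenSearch(
    hosts=[{"host": "search-example.us-east-1.es.amazonaws.com", "port": 443, "use_ssl": True}]
)

pipeline_body = {
    "description": "Normalize and combine neural + lexical scores for hybrid queries",
    "phase_results_processors": [
        {
            "normalization-processor": {
                "normalization": {"technique": "min_max"},
                "combination": {"technique": "arithmetic_mean"},
            }
        }
    ],
}

# Register (or overwrite) the pipeline via the generic transport call.
client.transport.perform_request(
    "PUT", "/_search/pipeline/dc-v2-work-pipeline", body=pipeline_body
)
```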

22 changes: 6 additions & 16 deletions chat/template.yaml
@@ -8,21 +8,18 @@ Parameters:
AzureOpenaiApiKey:
Type: String
Description: Azure OpenAI API Key
AzureOpenaiEmbeddingDeploymentId:
Type: String
Description: Azure OpenAI Embedding Deployment ID
AzureOpenaiLlmDeploymentId:
Type: String
Description: Azure OpenAI LLM Deployment ID
AzureOpenaiResourceName:
Type: String
Description: Azure OpenAI Resource Name
ElasticsearchEndpoint:
OpenSearchEndpoint:
Type: String
Description: Elasticsearch URL
EmbeddingEndpoint:
Description: OpenSearch Endpoint
OpenSearchModelId:
Type: String
Description: Sagemaker Inference Endpoint
Description: OpenSearch Model ID
Resources:
ApiGwAccountConfig:
Type: "AWS::ApiGateway::Account"
@@ -199,11 +196,10 @@ Resources:
Variables:
API_TOKEN_SECRET: !Ref ApiTokenSecret
AZURE_OPENAI_API_KEY: !Ref AzureOpenaiApiKey
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_ID: !Ref AzureOpenaiEmbeddingDeploymentId
AZURE_OPENAI_LLM_DEPLOYMENT_ID: !Ref AzureOpenaiLlmDeploymentId
AZURE_OPENAI_RESOURCE_NAME: !Ref AzureOpenaiResourceName
ELASTICSEARCH_ENDPOINT: !Ref ElasticsearchEndpoint
EMBEDDING_ENDPOINT: !Ref EmbeddingEndpoint
OPENSEARCH_ENDPOINT: !Ref OpenSearchEndpoint
OPENSEARCH_MODEL_ID: !Ref OpenSearchModelId
Policies:
- Statement:
- Effect: Allow
@@ -217,12 +213,6 @@
- 'es:ESHttpGet'
- 'es:ESHttpPost'
Resource: '*'
- Statement:
- Effect: Allow
Action:
- 'sagemaker:InvokeEndpoint'
- 'sagemaker:InvokeEndpointAsync'
Resource: !Sub 'arn:aws:sagemaker:${AWS::Region}:${AWS::AccountId}:endpoint/${EmbeddingEndpoint}'
Metadata:
BuildMethod: nodejs18.x
Deployment:
2 changes: 1 addition & 1 deletion chat/test/test_event_config.py
@@ -57,7 +57,7 @@ def test_attempt_override_without_superuser_status(self):
"question": "test question",
"ref": "test ref",
"temperature": 0.2,
"text_key": "title",
"text_key": "id",
}
self.assertEqual(actual.azure_endpoint, expected_output["azure_endpoint"])
self.assertEqual(actual.attributes, expected_output["attributes"])
2 changes: 1 addition & 1 deletion dev/env.json
@@ -1,7 +1,7 @@
{
"Parameters": {
"API_TOKEN_SECRET": "DEVELOPMENT_SECRET",
"ELASTICSEARCH_ENDPOINT": "",
"OPENSEARCH_ENDPOINT": "",
"ENV_PREFIX": "",
"DC_URL": ""
}
10 changes: 5 additions & 5 deletions node/src/api/opensearch.js
@@ -1,6 +1,6 @@
const { HttpRequest } = require("@aws-sdk/protocol-http");
const { awsFetch } = require("../aws/fetch");
const { elasticsearchEndpoint, prefix } = require("../environment");
const { openSearchEndpoint, prefix } = require("../environment");
const Honeybadger = require("../honeybadger-setup");

async function getCollection(id, opts) {
@@ -65,7 +65,7 @@ function isVisible(doc, { allowPrivate, allowUnpublished }) {
}

function initRequest(path) {
const endpoint = elasticsearchEndpoint();
const endpoint = openSearchEndpoint();

return new HttpRequest({
method: "GET",
@@ -80,7 +80,7 @@

async function search(targets, body, optionsQuery = {}) {
Honeybadger.addBreadcrumb("Searching", { metadata: { targets, body } });
const endpoint = elasticsearchEndpoint();
const endpoint = openSearchEndpoint();

const request = new HttpRequest({
method: "POST",
@@ -98,7 +98,7 @@ async function search(targets, body, optionsQuery = {}) {
}

async function scroll(scrollId) {
const endpoint = elasticsearchEndpoint();
const endpoint = openSearchEndpoint();

const request = new HttpRequest({
method: "POST",
@@ -114,7 +114,7 @@
}

async function deleteScroll(scrollId) {
const endpoint = elasticsearchEndpoint();
const endpoint = openSearchEndpoint();

const request = new HttpRequest({
method: "DELETE",
6 changes: 3 additions & 3 deletions node/src/environment.js
@@ -40,8 +40,8 @@ function dcUrl() {
return process.env.DC_URL;
}

function elasticsearchEndpoint() {
return process.env.ELASTICSEARCH_ENDPOINT;
function openSearchEndpoint() {
return process.env.OPENSEARCH_ENDPOINT;
}

function prefix(value) {
@@ -61,7 +61,7 @@ module.exports = {
appInfo,
dcApiEndpoint,
dcUrl,
elasticsearchEndpoint,
openSearchEndpoint,
prefix,
region,
};
2 changes: 1 addition & 1 deletion node/test/test-helpers/index.js
@@ -46,7 +46,7 @@ function mockIndex() {
const mock = nock("https://index.test.library.northwestern.edu");

beforeEach(function () {
process.env.ELASTICSEARCH_ENDPOINT = "index.test.library.northwestern.edu";
process.env.OPENSEARCH_ENDPOINT = "index.test.library.northwestern.edu";
});

afterEach(function () {
4 changes: 2 additions & 2 deletions node/test/unit/aws/environment.test.js
@@ -9,8 +9,8 @@ describe("environment", function () {
helpers.saveEnvironment();

it("returns the index endpoint", function () {
process.env.ELASTICSEARCH_ENDPOINT = "index.test.library.northwestern.edu";
expect(environment.elasticsearchEndpoint()).to.eq(
process.env.OPENSEARCH_ENDPOINT = "index.test.library.northwestern.edu";
expect(environment.openSearchEndpoint()).to.eq(
"index.test.library.northwestern.edu"
);
});
18 changes: 7 additions & 11 deletions template.yaml
@@ -19,7 +19,7 @@ Globals:
API_TOKEN_SECRET: !Ref ApiTokenSecret
DC_API_ENDPOINT: !Ref DcApiEndpoint
DC_URL: !Ref DcUrl
ELASTICSEARCH_ENDPOINT: !Ref ElasticsearchEndpoint
OPENSEARCH_ENDPOINT: !Ref OpenSearchEndpoint
ENV_PREFIX: !Ref EnvironmentPrefix
HONEYBADGER_API_KEY: !Ref HoneybadgerApiKey
HONEYBADGER_ENV: !Ref HoneybadgerEnv
@@ -35,9 +35,6 @@
AzureOpenaiApiKey:
Type: String
Description: Azure OpenAI API Key
AzureOpenaiEmbeddingDeploymentId:
Type: String
Description: Azure OpenAI Embedding Deployment ID
AzureOpenaiLlmDeploymentId:
Type: String
Description: Azure OpenAI LLM Deployment ID
@@ -59,12 +56,12 @@
DcUrl:
Type: String
Description: URL of Digital Collections website
ElasticsearchEndpoint:
OpenSearchModelId:
Type: String
Description: Elasticsearch url
EmbeddingEndpoint:
Description: OpenSearch Model ID
OpenSearchEndpoint:
Type: String
Description: Sagemaker Inference Endpoint
Description: OpenSearch endpoint
EnvironmentPrefix:
Type: String
Description: Index Prefix
@@ -653,11 +650,10 @@ Resources:
Parameters:
ApiTokenSecret: !Ref ApiTokenSecret
AzureOpenaiApiKey: !Ref AzureOpenaiApiKey
AzureOpenaiEmbeddingDeploymentId: !Ref AzureOpenaiEmbeddingDeploymentId
AzureOpenaiLlmDeploymentId: !Ref AzureOpenaiLlmDeploymentId
AzureOpenaiResourceName: !Ref AzureOpenaiResourceName
ElasticsearchEndpoint: !Ref ElasticsearchEndpoint
EmbeddingEndpoint: !Ref EmbeddingEndpoint
OpenSearchEndpoint: !Ref OpenSearchEndpoint
OpenSearchModelId: !Ref OpenSearchModelId
chatWebsocketEndpoint:
Type: AWS::Serverless::Function
Properties:
