langchain-ai · hwchase17 · Jun 20, 2023 · Jun 16, 2023 · Jun 16, 2023 · Jun 19, 2023
diff --git a/docs/extras/modules/data_connection/vectorstores/integrations/singlestoredb.ipynb b/docs/extras/modules/data_connection/vectorstores/integrations/singlestoredb.ipynb
@@ -5,9 +5,8 @@
    "id": "2b9582dc",
    "metadata": {},
    "source": [
-    "# SingleStoreDB vector search\n",
-    "[SingleStore DB](https://singlestore.com) is a high-performance distributed database that supports deployment both in the [cloud](https://www.singlestore.com/cloud/) and on-premises. For a significant duration, it has provided support for vector functions such as [dot_product](https://docs.singlestore.com/managed-service/en/reference/sql-reference/vector-functions/dot_product.html), thereby positioning itself as an ideal solution for AI applications that require text similarity matching. \n",
-    "This tutorial illustrates how to utilize the features of the SingleStore DB Vector Store."
+    "# SingleStoreDB\n",
+    "[SingleStoreDB](https://singlestore.com/) is a high-performance distributed SQL database that supports deployment both in the [cloud](https://www.singlestore.com/cloud/) and on-premises. It provides vector storage and vector functions, including [dot_product](https://docs.singlestore.com/managed-service/en/reference/sql-reference/vector-functions/dot_product.html) and [euclidean_distance](https://docs.singlestore.com/managed-service/en/reference/sql-reference/vector-functions/euclidean_distance.html), thereby supporting AI applications that require text similarity matching. This tutorial illustrates how to [work with vector data in SingleStoreDB](https://docs.singlestore.com/managed-service/en/developer-resources/functional-extensions/working-with-vector-data.html)."
    ]
   },
   {
@@ -58,10 +57,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Load text samples\n",
-    "from langchain.document_loaders import TextLoader\n",
-    "\n",
-    "loader = TextLoader(\"../../../state_of_the_union.txt\")\n",
+    "# Load text samples \n",
+    "loader = TextLoader('../../../state_of_the_union.txt')\n",
     "documents = loader.load()\n",
     "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
     "docs = text_splitter.split_documents(documents)\n",
@@ -91,7 +88,7 @@
     "docsearch = SingleStoreDB.from_documents(\n",
     "    docs,\n",
     "    embeddings,\n",
-    "    table_name=\"noteook\",  # use table with a custom name\n",
+    "    table_name = \"notebook\", # use table with a custom name \n",
     ")"
    ]
   },

diff --git a/langchain/vectorstores/singlestoredb.py b/langchain/vectorstores/singlestoredb.py
@@ -1,6 +1,7 @@
 """Wrapper around SingleStore DB."""
 from __future__ import annotations
 
+import enum
 import json
 from typing import (
     Any,
@@ -20,6 +21,19 @@
 from langchain.vectorstores.base import VectorStore, VectorStoreRetriever
 
 
+class DistanceStrategy(str, enum.Enum):
+    EUCLIDEAN_DISTANCE = "EUCLIDEAN_DISTANCE"
+    DOT_PRODUCT = "DOT_PRODUCT"
+
+
+DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.DOT_PRODUCT
+
+ORDERING_DIRECTIVE: dict = {
+    DistanceStrategy.EUCLIDEAN_DISTANCE: "",
+    DistanceStrategy.DOT_PRODUCT: "DESC",
+}
+
+
 class SingleStoreDB(VectorStore):
     """
     This class serves as a Pythonic interface to the SingleStore DB database.
@@ -45,6 +59,7 @@ def __init__(
         self,
         embedding: Embeddings,
         *,
+        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
         table_name: str = "embeddings",
         content_field: str = "content",
         metadata_field: str = "metadata",
@@ -59,6 +74,18 @@ def __init__(
         Args:
             embedding (Embeddings): A text embedding model.
 
+            distance_strategy (DistanceStrategy, optional):
+                Determines the strategy employed for calculating
+                the distance between vectors in the embedding space.
+                Defaults to DOT_PRODUCT.
+                Available options are:
+                - DOT_PRODUCT: Computes the scalar product of two vectors.
+                    This is the default behavior.
+                - EUCLIDEAN_DISTANCE: Computes the Euclidean distance between
+                    two vectors. This metric considers the geometric distance in
+                    the vector space, and might be more suitable for embeddings
+                    that rely on spatial relationships.
+
             table_name (str, optional): Specifies the name of the table in use.
                 Defaults to "embeddings".
             content_field (str, optional): Specifies the field to store the content.
@@ -137,6 +164,7 @@ def __init__(
 
                 vectorstore = SingleStoreDB(
                     OpenAIEmbeddings(),
+                    distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
                     host="127.0.0.1",
                     port=3306,
                     user="user",
@@ -159,6 +187,7 @@ def __init__(
         """
 
         self.embedding = embedding
+        self.distance_strategy = distance_strategy
         self.table_name = table_name
         self.content_field = content_field
         self.metadata_field = metadata_field
@@ -167,6 +196,17 @@ def __init__(
         """Pass the rest of the kwargs to the connection."""
         self.connection_kwargs = kwargs
 
+        """Add program name and version to connection attributes."""
+        if "conn_attrs" not in self.connection_kwargs:
+            self.connection_kwargs["conn_attrs"] = dict()
+        if "program_name" not in self.connection_kwargs["conn_attrs"]:
+            self.connection_kwargs["conn_attrs"][
+                "program_name"
+            ] = "langchain python sdk"
+            self.connection_kwargs["conn_attrs"][
+                "program_version"
+            ] = "0.0.205"  # the version of SingleStoreDB VectorStore implementation
+
         """Create connection pool."""
         self.connection_pool = QueuePool(
             self._get_connection,
@@ -246,7 +286,7 @@ def add_texts(
         return []
 
     def similarity_search(
-        self, query: str, k: int = 4, **kwargs: Any
+        self, query: str, k: int = 4, filter: Optional[dict] = None, **kwargs: Any
     ) -> List[Document]:
         """Returns the most similar indexed documents to the query text.
 
@@ -255,21 +295,38 @@ def similarity_search(
         Args:
             query (str): The query text for which to find similar documents.
             k (int): The number of documents to return. Default is 4.
+            filter (dict): A dictionary of metadata fields and values to filter by.
 
         Returns:
             List[Document]: A list of documents that are most similar to the query text.
+
+        Examples:
+            .. code-block:: python
+                from langchain.vectorstores import SingleStoreDB
+                from langchain.embeddings import OpenAIEmbeddings
+                s2 = SingleStoreDB.from_documents(
+                    docs,
+                    OpenAIEmbeddings(),
+                    host="username:password@localhost:3306/database"
+                )
+                s2.similarity_search("query text", 1,
+                    {"metadata_field": "metadata_value"})
         """
-        docs_and_scores = self.similarity_search_with_score(query, k=k)
+        docs_and_scores = self.similarity_search_with_score(
+            query=query, k=k, filter=filter
+        )
         return [doc for doc, _ in docs_and_scores]
 
     def similarity_search_with_score(
-        self, query: str, k: int = 4
+        self, query: str, k: int = 4, filter: Optional[dict] = None
     ) -> List[Tuple[Document, float]]:
-        """Return docs most similar to query. Uses cosine similarity.
+        """Return docs most similar to query. Uses the configured distance strategy.
 
         Args:
             query: Text to look up documents similar to.
             k: Number of Documents to return. Defaults to 4.
+            filter: A dictionary of metadata fields and values to filter by.
+                    Defaults to None.
 
         Returns:
             List of Documents most similar to the query and score for each
@@ -278,21 +335,52 @@ def similarity_search_with_score(
         embedding = self.embedding.embed_query(query)
         conn = self.connection_pool.connect()
         result = []
+        where_clause: str = ""
+        where_clause_values: List[Any] = []
+        if filter:
+            where_clause = "WHERE "
+            arguments = []
+
+            def build_where_clause(
+                where_clause_values: List[Any],
+                sub_filter: dict,
+                prefix_args: Tuple[str, ...] = (),
+            ) -> None:
+                for key in sub_filter.keys():
+                    if isinstance(sub_filter[key], dict):
+                        build_where_clause(
+                            where_clause_values, sub_filter[key], prefix_args + (key,)
+                        )
+                    else:
+                        arguments.append(
+                            "JSON_EXTRACT_JSON({}, {}) = %s".format(
+                                self.metadata_field,
+                                ", ".join(["%s"] * (len(prefix_args) + 1)),
+                            )
+                        )
+                        where_clause_values += prefix_args + (key,)
+                        where_clause_values.append(json.dumps(sub_filter[key]))
+
+            build_where_clause(where_clause_values, filter)
+            where_clause += " AND ".join(arguments)
+
         try:
             cur = conn.cursor()
             try:
                 cur.execute(
-                    """SELECT {}, {}, DOT_PRODUCT({}, JSON_ARRAY_PACK(%s)) as __score 
-                    FROM {} ORDER BY __score DESC LIMIT %s""".format(
+                    """SELECT {}, {}, {}({}, JSON_ARRAY_PACK(%s)) as __score
+                    FROM {} {} ORDER BY __score {} LIMIT %s""".format(
                         self.content_field,
                         self.metadata_field,
+                        DistanceStrategy(self.distance_strategy).value,  # raw SQL name
                         self.vector_field,
                         self.table_name,
+                        where_clause,
+                        ORDERING_DIRECTIVE[self.distance_strategy],
                     ),
-                    (
-                        "[{}]".format(",".join(map(str, embedding))),
-                        k,
-                    ),
+                    ("[{}]".format(",".join(map(str, embedding))),)
+                    + tuple(where_clause_values)
+                    + (k,),
                 )

                 for row in cur.fetchall():
@@ -310,6 +398,7 @@ def from_texts(
         texts: List[str],
         embedding: Embeddings,
         metadatas: Optional[List[dict]] = None,
+        distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
         table_name: str = "embeddings",
         content_field: str = "content",
         metadata_field: str = "metadata",
@@ -338,6 +427,7 @@ def from_texts(
 
         instance = cls(
             embedding,
+            distance_strategy=distance_strategy,
             table_name=table_name,
             content_field=content_field,
             metadata_field=metadata_field,