Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update SingleStoreDB vectorstore #6423

Merged
merged 8 commits into from
Jun 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
"id": "2b9582dc",
"metadata": {},
"source": [
"# SingleStoreDB vector search\n",
"[SingleStore DB](https://singlestore.com) is a high-performance distributed database that supports deployment both in the [cloud](https://www.singlestore.com/cloud/) and on-premises. For a significant duration, it has provided support for vector functions such as [dot_product](https://docs.singlestore.com/managed-service/en/reference/sql-reference/vector-functions/dot_product.html), thereby positioning itself as an ideal solution for AI applications that require text similarity matching. \n",
"This tutorial illustrates how to utilize the features of the SingleStore DB Vector Store."
"# SingleStoreDB\n",
"[SingleStoreDB](https://singlestore.com/) is a high-performance distributed SQL database that supports deployment both in the [cloud](https://www.singlestore.com/cloud/) and on-premises. It provides vector storage, and vector functions including [dot_product](https://docs.singlestore.com/managed-service/en/reference/sql-reference/vector-functions/dot_product.html) and [euclidean_distance](https://docs.singlestore.com/managed-service/en/reference/sql-reference/vector-functions/euclidean_distance.html), thereby supporting AI applications that require text similarity matching. This tutorial illustrates how to [work with vector data in SingleStoreDB](https://docs.singlestore.com/managed-service/en/developer-resources/functional-extensions/working-with-vector-data.html)."
]
},
{
Expand Down Expand Up @@ -58,10 +57,8 @@
"metadata": {},
"outputs": [],
"source": [
"# Load text samples\n",
"from langchain.document_loaders import TextLoader\n",
"\n",
"loader = TextLoader(\"../../../state_of_the_union.txt\")\n",
"# Load text samples \n",
"loader = TextLoader('../../../state_of_the_union.txt')\n",
"documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"docs = text_splitter.split_documents(documents)\n",
Expand Down Expand Up @@ -91,7 +88,7 @@
"docsearch = SingleStoreDB.from_documents(\n",
" docs,\n",
" embeddings,\n",
" table_name=\"noteook\", # use table with a custom name\n",
" table_name = \"notebook\", # use table with a custom name \n",
")"
]
},
Expand Down
108 changes: 99 additions & 9 deletions langchain/vectorstores/singlestoredb.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Wrapper around SingleStore DB."""
from __future__ import annotations

import enum
import json
from typing import (
Any,
Expand All @@ -20,6 +21,19 @@
from langchain.vectorstores.base import VectorStore, VectorStoreRetriever


class DistanceStrategy(str, enum.Enum):
    """Vector functions that can be used to score similarity between embeddings.

    Each member's value is the name of the SQL function that gets interpolated
    into the similarity query, so the text form of a member must be exactly
    that value.
    """

    EUCLIDEAN_DISTANCE = "EUCLIDEAN_DISTANCE"
    DOT_PRODUCT = "DOT_PRODUCT"

    def __str__(self) -> str:
        """Return the raw member value, e.g. ``"DOT_PRODUCT"``.

        ``str.format`` and f-strings follow ``__str__`` on newer Python
        versions (3.11+), where the default text for a ``(str, Enum)`` member
        is ``"DistanceStrategy.DOT_PRODUCT"``. Formatting that into a SQL
        statement would produce an invalid function name, so the value is
        returned explicitly to keep the output identical on every version.
        """
        return self.value


# Strategy applied when a caller does not pass ``distance_strategy`` explicitly.
DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.DOT_PRODUCT

# Maps each strategy to the directive placed after ``ORDER BY __score`` in the
# similarity query. The empty string adds no directive, leaving the database's
# default sort order (ascending in standard SQL, i.e. smallest distance first);
# "DESC" sorts the largest dot product first.
ORDERING_DIRECTIVE: dict = {
DistanceStrategy.EUCLIDEAN_DISTANCE: "",
DistanceStrategy.DOT_PRODUCT: "DESC",
}


class SingleStoreDB(VectorStore):
"""
This class serves as a Pythonic interface to the SingleStore DB database.
Expand All @@ -45,6 +59,7 @@ def __init__(
self,
embedding: Embeddings,
*,
distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
table_name: str = "embeddings",
content_field: str = "content",
metadata_field: str = "metadata",
Expand All @@ -59,6 +74,18 @@ def __init__(
Args:
embedding (Embeddings): A text embedding model.

distance_strategy (DistanceStrategy, optional):
Determines the strategy employed for calculating
the distance between vectors in the embedding space.
Defaults to DOT_PRODUCT.
Available options are:
- DOT_PRODUCT: Computes the scalar product of two vectors.
This is the default behavior
- EUCLIDEAN_DISTANCE: Computes the Euclidean distance between
two vectors. This metric considers the geometric distance in
the vector space, and might be more suitable for embeddings
that rely on spatial relationships.

table_name (str, optional): Specifies the name of the table in use.
Defaults to "embeddings".
content_field (str, optional): Specifies the field to store the content.
Expand Down Expand Up @@ -137,6 +164,7 @@ def __init__(

vectorstore = SingleStoreDB(
OpenAIEmbeddings(),
distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
host="127.0.0.1",
port=3306,
user="user",
Expand All @@ -159,6 +187,7 @@ def __init__(
"""

self.embedding = embedding
self.distance_strategy = distance_strategy
self.table_name = table_name
self.content_field = content_field
self.metadata_field = metadata_field
Expand All @@ -167,6 +196,17 @@ def __init__(
"""Pass the rest of the kwargs to the connection."""
self.connection_kwargs = kwargs

"""Add program name and version to connection attributes."""
if "conn_attrs" not in self.connection_kwargs:
self.connection_kwargs["conn_attrs"] = dict()
if "program_name" not in self.connection_kwargs["conn_attrs"]:
self.connection_kwargs["conn_attrs"][
"program_name"
] = "langchain python sdk"
self.connection_kwargs["conn_attrs"][
"program_version"
] = "0.0.205" # the version of SingleStoreDB VectorStore implementation

"""Create connection pool."""
self.connection_pool = QueuePool(
self._get_connection,
Expand Down Expand Up @@ -246,7 +286,7 @@ def add_texts(
return []

def similarity_search(
self, query: str, k: int = 4, **kwargs: Any
self, query: str, k: int = 4, filter: Optional[dict] = None, **kwargs: Any
) -> List[Document]:
"""Returns the most similar indexed documents to the query text.

Expand All @@ -255,21 +295,38 @@ def similarity_search(
Args:
query (str): The query text for which to find similar documents.
k (int): The number of documents to return. Default is 4.
filter (dict): A dictionary of metadata fields and values to filter by.

Returns:
List[Document]: A list of documents that are most similar to the query text.

Examples:
.. code-block:: python
from langchain.vectorstores import SingleStoreDB
from langchain.embeddings import OpenAIEmbeddings
s2 = SingleStoreDB.from_documents(
docs,
OpenAIEmbeddings(),
host="username:password@localhost:3306/database"
)
s2.similarity_search("query text", 1,
{"metadata_field": "metadata_value"})
"""
docs_and_scores = self.similarity_search_with_score(query, k=k)
docs_and_scores = self.similarity_search_with_score(
query=query, k=k, filter=filter
)
return [doc for doc, _ in docs_and_scores]

def similarity_search_with_score(
self, query: str, k: int = 4
self, query: str, k: int = 4, filter: Optional[dict] = None
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query. Uses the configured distance strategy.

Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: A dictionary of metadata fields and values to filter by.
Defaults to None.

Returns:
List of Documents most similar to the query and score for each
Expand All @@ -278,21 +335,52 @@ def similarity_search_with_score(
embedding = self.embedding.embed_query(query)
conn = self.connection_pool.connect()
result = []
where_clause: str = ""
where_clause_values: List[Any] = []
if filter:
where_clause = "WHERE "
arguments = []

def build_where_clause(
    where_clause_values: List[Any],
    sub_filter: dict,
    prefix_args: Optional[List[str]] = None,
) -> None:
    """Flatten a (possibly nested) metadata filter into SQL conditions.

    For every non-dict leaf of ``sub_filter`` one
    ``JSON_EXTRACT_JSON(<metadata_field>, %s, ...) = %s`` condition is
    appended to the enclosing ``arguments`` list, and the matching
    parameters (the key path followed by the JSON-encoded expected value)
    are appended to ``where_clause_values`` in the same order.

    Args:
        where_clause_values: Parameter list that is extended in place.
        sub_filter: The filter dictionary (or nested sub-dictionary) to walk.
        prefix_args: Keys leading from the top-level filter down to
            ``sub_filter``. ``None`` (the default) means the top level.
    """
    # Use None instead of a mutable [] default so that no list object is
    # shared between calls.
    path = [] if prefix_args is None else prefix_args
    for key, value in sub_filter.items():
        if isinstance(value, dict):
            # Nested dict: descend with the current key appended to the path.
            build_where_clause(where_clause_values, value, path + [key])
        else:
            # One placeholder per path component, plus one for this key.
            arguments.append(
                "JSON_EXTRACT_JSON({}, {}) = %s".format(
                    self.metadata_field,
                    ", ".join(["%s"] * (len(path) + 1)),
                )
            )
            where_clause_values.extend(path + [key])
            # The comparison is against JSON, so encode the expected value.
            where_clause_values.append(json.dumps(value))

build_where_clause(where_clause_values, filter)
where_clause += " AND ".join(arguments)

try:
cur = conn.cursor()
try:
cur.execute(
"""SELECT {}, {}, DOT_PRODUCT({}, JSON_ARRAY_PACK(%s)) as __score
FROM {} ORDER BY __score DESC LIMIT %s""".format(
"""SELECT {}, {}, {}({}, JSON_ARRAY_PACK(%s)) as __score
FROM {} {} ORDER BY __score {} LIMIT %s""".format(
self.content_field,
self.metadata_field,
self.distance_strategy,
self.vector_field,
self.table_name,
where_clause,
ORDERING_DIRECTIVE[self.distance_strategy],
),
(
"[{}]".format(",".join(map(str, embedding))),
k,
),
("[{}]".format(",".join(map(str, embedding))),)
+ tuple(where_clause_values)
+ (k,),
)

for row in cur.fetchall():
Expand All @@ -310,6 +398,7 @@ def from_texts(
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY,
table_name: str = "embeddings",
content_field: str = "content",
metadata_field: str = "metadata",
Expand Down Expand Up @@ -338,6 +427,7 @@ def from_texts(

instance = cls(
embedding,
distance_strategy=distance_strategy,
table_name=table_name,
content_field=content_field,
metadata_field=metadata_field,
Expand Down
Loading