From 24be3dffcd286a0740afae2a63611a83bab57e8a Mon Sep 17 00:00:00 2001
From: Volodymyr Tkachuk
Date: Fri, 16 Jun 2023 15:06:12 +0300
Subject: [PATCH 1/6] update notebook example

---
 .../vectorstores/integrations/singlestoredb.ipynb | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/docs/extras/modules/data_connection/vectorstores/integrations/singlestoredb.ipynb b/docs/extras/modules/data_connection/vectorstores/integrations/singlestoredb.ipynb
index 2e8e8c4568858..c011e95077839 100644
--- a/docs/extras/modules/data_connection/vectorstores/integrations/singlestoredb.ipynb
+++ b/docs/extras/modules/data_connection/vectorstores/integrations/singlestoredb.ipynb
@@ -5,9 +5,8 @@
    "id": "2b9582dc",
    "metadata": {},
    "source": [
-    "# SingleStoreDB vector search\n",
-    "[SingleStore DB](https://singlestore.com) is a high-performance distributed database that supports deployment both in the [cloud](https://www.singlestore.com/cloud/) and on-premises. For a significant duration, it has provided support for vector functions such as [dot_product](https://docs.singlestore.com/managed-service/en/reference/sql-reference/vector-functions/dot_product.html), thereby positioning itself as an ideal solution for AI applications that require text similarity matching. \n",
-    "This tutorial illustrates how to utilize the features of the SingleStore DB Vector Store."
+    "# SingleStoreDB\n",
+    "[SingleStoreDB](https://singlestore.com/) is a high-performance distributed SQL database that supports deployment both in the [cloud](https://www.singlestore.com/cloud/) and on-premises. It provides vector storage and vector functions, including [dot_product](https://docs.singlestore.com/managed-service/en/reference/sql-reference/vector-functions/dot_product.html) and [euclidean_distance](https://docs.singlestore.com/managed-service/en/reference/sql-reference/vector-functions/euclidean_distance.html), thereby supporting AI applications that require text similarity matching. This tutorial illustrates how to [work with vector data in SingleStoreDB](https://docs.singlestore.com/managed-service/en/developer-resources/functional-extensions/working-with-vector-data.html)."
] }, { @@ -58,10 +57,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Load text samples\n", - "from langchain.document_loaders import TextLoader\n", - "\n", - "loader = TextLoader(\"../../../state_of_the_union.txt\")\n", + "# Load text samples \n", + "loader = TextLoader('../../../state_of_the_union.txt')\n", "documents = loader.load()\n", "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", "docs = text_splitter.split_documents(documents)\n", @@ -91,7 +88,7 @@ "docsearch = SingleStoreDB.from_documents(\n", " docs,\n", " embeddings,\n", - " table_name=\"noteook\", # use table with a custom name\n", + " table_name = \"notebook\", # use table with a custom name \n", ")" ] }, From 2cf3254e2e89281980761e7769eed960ef4dd850 Mon Sep 17 00:00:00 2001 From: Volodymyr Tkachuk Date: Fri, 16 Jun 2023 17:02:13 +0300 Subject: [PATCH 2/6] add distance_strategy property to the SingleStoreDB vector store --- langchain/vectorstores/singlestoredb.py | 32 +++++++++++++++++-- .../vectorstores/test_singlestoredb.py | 20 +++++++++++- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/langchain/vectorstores/singlestoredb.py b/langchain/vectorstores/singlestoredb.py index b5a6ad7734072..208b8bfb4c616 100644 --- a/langchain/vectorstores/singlestoredb.py +++ b/langchain/vectorstores/singlestoredb.py @@ -1,7 +1,9 @@ """Wrapper around SingleStore DB.""" from __future__ import annotations +import enum import json + from typing import ( Any, ClassVar, @@ -19,6 +21,16 @@ from langchain.embeddings.base import Embeddings from langchain.vectorstores.base import VectorStore, VectorStoreRetriever +class DistanceStrategy(str, enum.Enum): + EUCLIDEAN_DISTANCE = "EUCLIDEAN_DISTANCE" + DOT_PRODUCT = "DOT_PRODUCT" + +DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.DOT_PRODUCT + +ORDERING_DIRECTIVE: dict = { + DistanceStrategy.EUCLIDEAN_DISTANCE: "", + DistanceStrategy.DOT_PRODUCT: "DESC" +} class SingleStoreDB(VectorStore): """ @@ -45,6 +57,7 @@ def __init__( self, embedding: Embeddings, *, + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, table_name: str = "embeddings", content_field: str = "content", metadata_field: str = "metadata", @@ -59,6 +72,15 @@ def __init__( Args: embedding (Embeddings): A text embedding model. + distance_strategy (DistanceStrategy, optional): Determines the strategy employed for calculating + the distance between vectors in the embedding space. + Defaults to DOT_PRODUCT. + Available options are: + - DOT_PRODUCT: Computes the scalar product of two vectors. This is the default behavior + - EUCLIDEAN_DISTANCE: Computes the Euclidean distance between two vectors. This metric + considers the geometric distance in the vector space, and might be more suitable for + embeddings that rely on spatial relationships. + table_name (str, optional): Specifies the name of the table in use. Defaults to "embeddings". content_field (str, optional): Specifies the field to store the content. 
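
A minimal sketch of how the strategy choice documented above shapes the generated query, assuming the class's default field and table names (content, metadata, vector, embeddings); the actual template lives in similarity_search_with_score, changed in a later hunk. DOT_PRODUCT is a similarity measure, so higher scores rank first (DESC); EUCLIDEAN_DISTANCE is a distance, so the ascending default puts the closest vectors first:

    # Sketch only: mirrors the query template and ORDERING_DIRECTIVE in this patch.
    ORDERING_DIRECTIVE = {"EUCLIDEAN_DISTANCE": "", "DOT_PRODUCT": "DESC"}

    def render_query(strategy: str) -> str:
        # Default field/table names assumed: content, metadata, vector, embeddings.
        return (
            "SELECT content, metadata, {}(vector, JSON_ARRAY_PACK(%s)) as __score "
            "FROM embeddings ORDER BY __score {} LIMIT %s"
        ).format(strategy, ORDERING_DIRECTIVE[strategy])

    print(render_query("DOT_PRODUCT"))         # ... ORDER BY __score DESC LIMIT %s
    print(render_query("EUCLIDEAN_DISTANCE"))  # ... ORDER BY __score  LIMIT %s
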
@@ -137,6 +159,7 @@ def __init__( vectorstore = SingleStoreDB( OpenAIEmbeddings(), + distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE, host="127.0.0.1", port=3306, user="user", @@ -159,6 +182,7 @@ def __init__( """ self.embedding = embedding + self.distance_strategy = distance_strategy self.table_name = table_name self.content_field = content_field self.metadata_field = metadata_field @@ -282,12 +306,14 @@ def similarity_search_with_score( cur = conn.cursor() try: cur.execute( - """SELECT {}, {}, DOT_PRODUCT({}, JSON_ARRAY_PACK(%s)) as __score - FROM {} ORDER BY __score DESC LIMIT %s""".format( + """SELECT {}, {}, {}({}, JSON_ARRAY_PACK(%s)) as __score + FROM {} ORDER BY __score {} LIMIT %s""".format( self.content_field, self.metadata_field, + self.distance_strategy, self.vector_field, self.table_name, + ORDERING_DIRECTIVE[self.distance_strategy] ), ( "[{}]".format(",".join(map(str, embedding))), @@ -310,6 +336,7 @@ def from_texts( texts: List[str], embedding: Embeddings, metadatas: Optional[List[dict]] = None, + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, table_name: str = "embeddings", content_field: str = "content", metadata_field: str = "metadata", @@ -338,6 +365,7 @@ def from_texts( instance = cls( embedding, + distance_strategy=distance_strategy, table_name=table_name, content_field=content_field, metadata_field=metadata_field, diff --git a/tests/integration_tests/vectorstores/test_singlestoredb.py b/tests/integration_tests/vectorstores/test_singlestoredb.py index 87bfce8282ab0..a3c6f3dfc66b1 100644 --- a/tests/integration_tests/vectorstores/test_singlestoredb.py +++ b/tests/integration_tests/vectorstores/test_singlestoredb.py @@ -5,7 +5,7 @@ import pytest from langchain.docstore.document import Document -from langchain.vectorstores.singlestoredb import SingleStoreDB +from langchain.vectorstores.singlestoredb import DistanceStrategy, SingleStoreDB from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings TEST_SINGLESTOREDB_URL = "root:pass@localhost:3306/db" @@ -80,6 +80,24 @@ def test_singlestoredb_new_vector(texts: List[str]) -> None: drop(table_name) +@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") +def test_singlestoredb_euclidean_distance(texts: List[str]) -> None: + """Test adding a new document""" + table_name = "test_singlestoredb_euclidean_distance" + drop(table_name) + docsearch = SingleStoreDB.from_texts( + texts, + FakeEmbeddings(), + distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE, + table_name=table_name, + host=TEST_SINGLESTOREDB_URL, + ) + docsearch.add_texts(["foo"]) + output = docsearch.similarity_search("foo", k=2) + assert output == TEST_RESULT + drop(table_name) + + @pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") def test_singlestoredb_from_existing(texts: List[str]) -> None: """Test adding a new document""" From bac9193e7e042f9312fc9ff633ba490ffc8475a5 Mon Sep 17 00:00:00 2001 From: Volodymyr Tkachuk Date: Mon, 19 Jun 2023 16:45:04 +0300 Subject: [PATCH 3/6] Add EUCLIDEAN_DISTANCE and metadata filtering to the SingleStoreDB VectorStore --- langchain/vectorstores/singlestoredb.py | 42 +++++- poetry.lock | 54 ++++---- pyproject.toml | 2 +- .../vectorstores/test_singlestoredb.py | 128 ++++++++++++++++++ 4 files changed, 192 insertions(+), 34 deletions(-) diff --git a/langchain/vectorstores/singlestoredb.py b/langchain/vectorstores/singlestoredb.py index 208b8bfb4c616..6f3f21f7eb207 100644 --- a/langchain/vectorstores/singlestoredb.py 
+++ b/langchain/vectorstores/singlestoredb.py @@ -191,6 +191,13 @@ def __init__( """Pass the rest of the kwargs to the connection.""" self.connection_kwargs = kwargs + """Add program name and version to connection attributes.""" + if "conn_attrs" not in self.connection_kwargs: + self.connection_kwargs["conn_attrs"] = dict() + if "program_name" not in self.connection_kwargs["conn_attrs"]: + self.connection_kwargs["conn_attrs"]["program_name"] = "langchain python sdk" + self.connection_kwargs["conn_attrs"]["program_version"] = "0.0.205" # the version of SingleStoreDB VectorStore implementation + """Create connection pool.""" self.connection_pool = QueuePool( self._get_connection, @@ -270,7 +277,7 @@ def add_texts( return [] def similarity_search( - self, query: str, k: int = 4, **kwargs: Any + self, query: str, k: int = 4, filter: Optional[dict] = None, **kwargs: Any ) -> List[Document]: """Returns the most similar indexed documents to the query text. @@ -279,21 +286,23 @@ def similarity_search( Args: query (str): The query text for which to find similar documents. k (int): The number of documents to return. Default is 4. + filter (dict): A dictionary of metadata fields and values to filter by. Returns: List[Document]: A list of documents that are most similar to the query text. """ - docs_and_scores = self.similarity_search_with_score(query, k=k) + docs_and_scores = self.similarity_search_with_score(query=query, k=k, filter=filter) return [doc for doc, _ in docs_and_scores] def similarity_search_with_score( - self, query: str, k: int = 4 + self, query: str, k: int = 4, filter: Optional[dict] = None ) -> List[Tuple[Document, float]]: """Return docs most similar to query. Uses cosine similarity. Args: query: Text to look up documents similar to. k: Number of Documents to return. Defaults to 4. + filter: A dictionary of metadata fields and values to filter by. Defaults to None. Returns: List of Documents most similar to the query and score for each @@ -302,23 +311,44 @@ def similarity_search_with_score( embedding = self.embedding.embed_query(query) conn = self.connection_pool.connect() result = [] + where_clause: str = "" + where_clause_values = [] + if filter: + where_clause = "WHERE " + arguments = [] + + def build_where_clause(where_clause_values:List[Any], sub_filter: dict, prefix_args: List[str] = []): + for key in sub_filter.keys(): + if isinstance(sub_filter[key], dict): + build_where_clause(where_clause_values, sub_filter[key], prefix_args + [key]) + else: + arguments.append("JSON_EXTRACT_JSON({}, {}) = %s".format( + self.metadata_field, ", ".join(["%s"] * (len(prefix_args) + 1)))) + where_clause_values += prefix_args + [key] + where_clause_values.append(json.dumps(sub_filter[key])) + + build_where_clause(where_clause_values, filter) + where_clause += " AND ".join(arguments) + try: cur = conn.cursor() try: cur.execute( """SELECT {}, {}, {}({}, JSON_ARRAY_PACK(%s)) as __score - FROM {} ORDER BY __score {} LIMIT %s""".format( + FROM {} {} ORDER BY __score {} LIMIT %s""".format( self.content_field, self.metadata_field, self.distance_strategy, self.vector_field, self.table_name, + where_clause, ORDERING_DIRECTIVE[self.distance_strategy] ), ( "[{}]".format(",".join(map(str, embedding))), - k, - ), + ) + + tuple(where_clause_values) + + (k,) ) for row in cur.fetchall(): diff --git a/poetry.lock b/poetry.lock index 826cb88d911b6..b0ede3ac889e4 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. 
+# This file is automatically @generated by Poetry 1.5.0 and should not be changed by hand. [[package]] name = "absl-py" @@ -3845,13 +3845,13 @@ tests = ["doctest", "pytest", "pytest-mock"] [[package]] name = "langchainplus-sdk" -version = "0.0.10" +version = "0.0.11" description = "Client library to connect to the LangChainPlus LLM Tracing and Evaluation Platform." optional = false python-versions = ">=3.8.1,<4.0" files = [ - {file = "langchainplus_sdk-0.0.10-py3-none-any.whl", hash = "sha256:6ea4013a92a4c33a61d22deb49620577c592a79ee44038b2c751032a71cbc7b6"}, - {file = "langchainplus_sdk-0.0.10.tar.gz", hash = "sha256:4f810b38df74a99d01e5723e653da02f05df3ee922971cccabc365d00c33dbf6"}, + {file = "langchainplus_sdk-0.0.11-py3-none-any.whl", hash = "sha256:fbe3482ffe253e439ec8386a2904594a875b590e29e4adcbd938452a69a6c7c6"}, + {file = "langchainplus_sdk-0.0.11.tar.gz", hash = "sha256:e50679309a31d9526f467aa13d4dbcfba0dc00a295cea72ffcc9972865ecac1b"}, ] [package.dependencies] @@ -6034,7 +6034,7 @@ ptyprocess = ">=0.5" name = "pgvector" version = "0.1.8" description = "pgvector support for Python" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "pgvector-0.1.8-py2.py3-none-any.whl", hash = "sha256:99dce3a6580ef73863edb9b8441937671f4e1a09383826e6b0838176cd441a96"}, @@ -7092,13 +7092,13 @@ requests = [ [[package]] name = "pyparsing" -version = "3.0.9" +version = "3.1.0" description = "pyparsing module - Classes and methods to define and execute parsing grammars" optional = true python-versions = ">=3.6.8" files = [ - {file = "pyparsing-3.0.9-py3-none-any.whl", hash = "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc"}, - {file = "pyparsing-3.0.9.tar.gz", hash = "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb"}, + {file = "pyparsing-3.1.0-py3-none-any.whl", hash = "sha256:d554a96d1a7d3ddaf7183104485bc19fd80543ad6ac5bdb6426719d766fb06c1"}, + {file = "pyparsing-3.1.0.tar.gz", hash = "sha256:edb662d6fe322d6e990b1594b5feaeadf806803359e3d4d42f11e295e588f0ea"}, ] [package.extras] @@ -7106,13 +7106,13 @@ diagrams = ["jinja2", "railroad-diagrams"] [[package]] name = "pypdf" -version = "3.9.1" +version = "3.10.0" description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" optional = true python-versions = ">=3.6" files = [ - {file = "pypdf-3.9.1-py3-none-any.whl", hash = "sha256:5f4abdb4691a8d7631e7f2db09f66cfe3a388a072882d8375c6b1bdc28027c0a"}, - {file = "pypdf-3.9.1.tar.gz", hash = "sha256:c2b7fcfe25fbd04e8da600cb2700267ecee7e8781dc798cce3a4f567143a4df1"}, + {file = "pypdf-3.10.0-py3-none-any.whl", hash = "sha256:af28f36eeb5bcde26b4f9db9cc9df00610e8e5904d997e3141132e7768ff9247"}, + {file = "pypdf-3.10.0.tar.gz", hash = "sha256:bc15457f1f9767532d51546300a9226f745fee8d9acf626fcfcf42af77ad342c"}, ] [package.dependencies] @@ -7120,7 +7120,7 @@ typing_extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\ [package.extras] crypto = ["PyCryptodome"] -dev = ["black", "flit", "pip-tools", "pre-commit (<2.18.0)", "pytest-cov", "wheel"] +dev = ["black", "flit", "pip-tools", "pre-commit (<2.18.0)", "pytest-cov", "pytest-socket", "wheel"] docs = ["myst_parser", "sphinx", "sphinx_rtd_theme"] full = ["Pillow", "PyCryptodome"] image = ["Pillow"] @@ -8379,17 +8379,17 @@ files = [ [[package]] name = "singlestoredb" -version = "0.6.1" +version = "0.7.1" description = "Interface to the SingleStore database and cluster management APIs" optional = true python-versions = ">=3.6" 
files = [ - {file = "singlestoredb-0.6.1-cp36-abi3-macosx_10_9_universal2.whl", hash = "sha256:bf1769e53993981420650a02c59ba367913d9f0256948cc98f6f9d464f74852a"}, - {file = "singlestoredb-0.6.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4e90fa1dfde1e31f7abe011f75d9dc8cccbc35b968ed8381bd44c0b7dd4026b"}, - {file = "singlestoredb-0.6.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:44d361c3fa4de6228b525d0b1d22db75790d8e6fb84c3d0b2213bf41774d4323"}, - {file = "singlestoredb-0.6.1-cp36-abi3-win32.whl", hash = "sha256:ad9543c41286a2095718ad7e133cc8b3b5de938f731157fbb2d4d2b0d1623aff"}, - {file = "singlestoredb-0.6.1-cp36-abi3-win_amd64.whl", hash = "sha256:f9f9feda947b9fe9182863758118c8961ebb74281098b42894c99b58d30b2526"}, - {file = "singlestoredb-0.6.1.tar.gz", hash = "sha256:2e00f4cd869dc1ecf33df853c521ebd6ce913af2bf3b2f98675ffa3dc6911636"}, + {file = "singlestoredb-0.7.1-cp36-abi3-macosx_10_9_universal2.whl", hash = "sha256:a997e9ffabef76009b92ca2c172d312a63718a34f48ea0bb275242e5232b3fd8"}, + {file = "singlestoredb-0.7.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f863ddbd0a13a5aa2b3374d1476db230d48b08d42590f2cda330df1ea7a84f4"}, + {file = "singlestoredb-0.7.1-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9158188807ac820ce08af169c44a27fe72172ee35b5e66bb98638215913c20f"}, + {file = "singlestoredb-0.7.1-cp36-abi3-win32.whl", hash = "sha256:9aec253c5db73d4ddd8d86eb91cac74c34b2d2bea5d95162feda04834b27f01c"}, + {file = "singlestoredb-0.7.1-cp36-abi3-win_amd64.whl", hash = "sha256:593f34fd5c131d2a0b8907b1c043343a3b880ac40b10770db2172ec4e448afe0"}, + {file = "singlestoredb-0.7.1.tar.gz", hash = "sha256:e103ad07b594fb0eb7134f1cbdefc08842a7462a8fc801ece8f96c155f7d9fd0"}, ] [package.dependencies] @@ -9056,13 +9056,13 @@ mpmath = ">=0.19" [[package]] name = "syrupy" -version = "4.0.2" +version = "4.0.4" description = "Pytest Snapshot Test Utility" optional = false python-versions = ">=3.8.1,<4" files = [ - {file = "syrupy-4.0.2-py3-none-any.whl", hash = "sha256:dfd1f0fad298eee753de4f2471d4346412c4435885c4b7beea648d4934c6620a"}, - {file = "syrupy-4.0.2.tar.gz", hash = "sha256:3c75ab6866580679b2cb9abe78e74c3e2011fffc6333651c6beb2a78a716ab80"}, + {file = "syrupy-4.0.4-py3-none-any.whl", hash = "sha256:9ea222e6c882ee34e35b418660c52bf3f62e5fe249331932eeceb2597eeab02d"}, + {file = "syrupy-4.0.4.tar.gz", hash = "sha256:806195794ed4fc17bec2836dd94f52b037fdb136e7c11949633e3970096ffec6"}, ] [package.dependencies] @@ -10238,13 +10238,13 @@ files = [ [[package]] name = "weaviate-client" -version = "3.20.1" +version = "3.21.0" description = "A python native Weaviate client" optional = false python-versions = ">=3.8" files = [ - {file = "weaviate-client-3.20.1.tar.gz", hash = "sha256:752912423f6334575c3feffcc98e5604e2d4b3bd3baddbff57d38db23bec3e9f"}, - {file = "weaviate_client-3.20.1-py3-none-any.whl", hash = "sha256:175f1665d9f1e580dcfa48cfd4dab1e49925d1655b0b4720d0f2ee7709c956f6"}, + {file = "weaviate-client-3.21.0.tar.gz", hash = "sha256:ec94ac554883c765e94da8b2947c4f0fa4a0378ed3bbe9f3653df3a5b1745a6d"}, + {file = "weaviate_client-3.21.0-py3-none-any.whl", hash = "sha256:420444ded7106fb000f4f8b2321b5f5fa2387825aa7a303d702accf61026f9d2"}, ] [package.dependencies] @@ -10855,7 +10855,7 @@ azure = ["azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices- cohere = ["cohere"] docarray = ["docarray"] embeddings = ["sentence-transformers"] -extended-testing = ["atlassian-python-api", 
"beautifulsoup4", "beautifulsoup4", "bibtexparser", "chardet", "gql", "html2text", "jq", "lxml", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "requests-toolbelt", "scikit-learn", "telethon", "tqdm", "zep-python"] +extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "bibtexparser", "chardet", "gql", "html2text", "jq", "lxml", "openai", "pandas", "pdfminer-six", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "requests-toolbelt", "scikit-learn", "telethon", "tqdm", "zep-python"] llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"] openai = ["openai", "tiktoken"] qdrant = ["qdrant-client"] @@ -10864,4 +10864,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "abfd5265cf134d614666453b6f4ec958bcf8de6447b4bdad091c333528162d04" +content-hash = "36cd5d49d9fe836427b308283af07540cb139c63884c10c3149d8a7242ed295e" diff --git a/pyproject.toml b/pyproject.toml index 79c7a56d2a640..19ad6fc803abe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,7 +101,7 @@ azure-cognitiveservices-speech = {version = "^1.28.0", optional = true} py-trello = {version = "^0.19.0", optional = true} momento = {version = "^1.5.0", optional = true} bibtexparser = {version = "^1.4.0", optional = true} -singlestoredb = {version = "^0.6.1", optional = true} +singlestoredb = {version = "^0.7.1", optional = true} pyspark = {version = "^3.4.0", optional = true} tigrisdb = {version = "^1.0.0b6", optional = true} nebula3-python = {version = "^3.4.0", optional = true} diff --git a/tests/integration_tests/vectorstores/test_singlestoredb.py b/tests/integration_tests/vectorstores/test_singlestoredb.py index a3c6f3dfc66b1..efd87b05d0eb8 100644 --- a/tests/integration_tests/vectorstores/test_singlestoredb.py +++ b/tests/integration_tests/vectorstores/test_singlestoredb.py @@ -158,3 +158,131 @@ def test_singlestoredb_add_texts_to_existing(texts: List[str]) -> None: output = docsearch.similarity_search("foo", k=2) assert output == TEST_RESULT drop(table_name) + + +@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") +def test_singlestoredb_filter_metadata(texts: List[str]) -> None: + """Test filtering by metadata""" + table_name = "test_singlestoredb_filter_metadata" + drop(table_name) + docs = [Document(page_content=t, metadata={"index": i}) for i, t in enumerate(texts)] + docsearch = SingleStoreDB.from_documents( + docs, + FakeEmbeddings(), + distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE, + table_name=table_name, + host=TEST_SINGLESTOREDB_URL, + ) + output = docsearch.similarity_search("foo", k=1, filter={"index": 2}) + assert output == [Document(page_content="baz", metadata={"index": 2})] + drop(table_name) + + +@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") +def test_singlestoredb_filter_metadata_2(texts: List[str]) -> None: + """Test filtering by metadata field that is similar for each document""" + table_name = "test_singlestoredb_filter_metadata_2" + drop(table_name) + docs = [Document(page_content=t, metadata={"index": i, "category": "budget"}) for i, t in enumerate(texts)] + docsearch = SingleStoreDB.from_documents( + docs, + FakeEmbeddings(), + distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE, + table_name=table_name, + host=TEST_SINGLESTOREDB_URL, + ) + output = 
docsearch.similarity_search("foo", k=1, filter={"category": "budget"}) + assert output == [Document(page_content="foo", metadata={"index": 0, "category": "budget"})] + drop(table_name) + + +@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") +def test_singlestoredb_filter_metadata_3(texts: List[str]) -> None: + """Test filtering by two metadata fields""" + table_name = "test_singlestoredb_filter_metadata_3" + drop(table_name) + docs = [Document(page_content=t, metadata={"index": i, "category": "budget"}) for i, t in enumerate(texts)] + docsearch = SingleStoreDB.from_documents( + docs, + FakeEmbeddings(), + distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE, + table_name=table_name, + host=TEST_SINGLESTOREDB_URL, + ) + output = docsearch.similarity_search("foo", k=1, filter={"category": "budget", "index": 1}) + assert output == [Document(page_content="bar", metadata={"index": 1, "category": "budget"})] + drop(table_name) + + +@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") +def test_singlestoredb_filter_metadata_4(texts: List[str]) -> None: + """Test no matches""" + table_name = "test_singlestoredb_filter_metadata_4" + drop(table_name) + docs = [Document(page_content=t, metadata={"index": i, "category": "budget"}) for i, t in enumerate(texts)] + docsearch = SingleStoreDB.from_documents( + docs, + FakeEmbeddings(), + distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE, + table_name=table_name, + host=TEST_SINGLESTOREDB_URL, + ) + output = docsearch.similarity_search("foo", k=1, filter={"category": "vacation"}) + assert output == [] + drop(table_name) + + +@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") +def test_singlestoredb_filter_metadata_5(texts: List[str]) -> None: + """Test complex metadata path""" + table_name = "test_singlestoredb_filter_metadata_5" + drop(table_name) + docs = [Document(page_content=t, metadata={"index": i, "category": "budget", "subfield" : { + "subfield" : { "idx": i, "other_idx": i + 1}}}) for i, t in enumerate(texts)] + docsearch = SingleStoreDB.from_documents( + docs, + FakeEmbeddings(), + distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE, + table_name=table_name, + host=TEST_SINGLESTOREDB_URL, + ) + output = docsearch.similarity_search("foo", k=1, filter={"category": "budget", "subfield" : { "subfield" : { "idx": 2}}}) + assert output == [Document(page_content="baz", metadata={"index": 2, "category": "budget", + "subfield" : { "subfield" : { "idx": 2, "other_idx": 3}}})] + drop(table_name) + + +@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") +def test_singlestoredb_filter_metadata_6(texts: List[str]) -> None: + """Test filtering by other bool""" + table_name = "test_singlestoredb_filter_metadata_6" + drop(table_name) + docs = [Document(page_content=t, metadata={"index": i, "category": "budget", "is_good" : i == 1}) for i, t in enumerate(texts)] + docsearch = SingleStoreDB.from_documents( + docs, + FakeEmbeddings(), + distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE, + table_name=table_name, + host=TEST_SINGLESTOREDB_URL, + ) + output = docsearch.similarity_search("foo", k=1, filter={"category": "budget", "is_good" : True}) + assert output == [Document(page_content="bar", metadata={"index": 1, "category": "budget", "is_good" : True})] + drop(table_name) + + +@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") +def test_singlestoredb_filter_metadata_7(texts: 
List[str]) -> None: + """Test filtering by float""" + table_name = "test_singlestoredb_filter_metadata_7" + drop(table_name) + docs = [Document(page_content=t, metadata={"index": i, "category": "budget", "score" : i + 0.5}) for i, t in enumerate(texts)] + docsearch = SingleStoreDB.from_documents( + docs, + FakeEmbeddings(), + distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE, + table_name=table_name, + host=TEST_SINGLESTOREDB_URL, + ) + output = docsearch.similarity_search("bar", k=1, filter={"category": "budget", "score" : 2.5}) + assert output == [Document(page_content="baz", metadata={"index": 2, "category": "budget", "score" : 2.5})] + drop(table_name) From 4a52bdd251481c6da02520b778940154ddb92534 Mon Sep 17 00:00:00 2001 From: Volodymyr Tkachuk Date: Mon, 19 Jun 2023 16:57:41 +0300 Subject: [PATCH 4/6] add example --- langchain/vectorstores/singlestoredb.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/langchain/vectorstores/singlestoredb.py b/langchain/vectorstores/singlestoredb.py index 6f3f21f7eb207..926b9ba7d6471 100644 --- a/langchain/vectorstores/singlestoredb.py +++ b/langchain/vectorstores/singlestoredb.py @@ -290,6 +290,17 @@ def similarity_search( Returns: List[Document]: A list of documents that are most similar to the query text. + + Examples: + .. code-block:: python + from langchain.vectorstores import SingleStoreDB + from langchain.embeddings import OpenAIEmbeddings + s2 = SingleStoreDB.from_documents( + docs, + OpenAIEmbeddings(), + host="username:password@localhost:3306/database" + ) + s2.similarity_search("query text", 1, {"metadata_field": "metadata_value"}) """ docs_and_scores = self.similarity_search_with_score(query=query, k=k, filter=filter) return [doc for doc, _ in docs_and_scores] From b02ba8a3f3ee8d3a8b7f93bcf4fc6bb96d4e994a Mon Sep 17 00:00:00 2001 From: Volodymyr Tkachuk Date: Mon, 19 Jun 2023 17:56:15 +0300 Subject: [PATCH 5/6] resolve lock file conflict --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index b0ede3ac889e4..b654efc55b671 100644 --- a/poetry.lock +++ b/poetry.lock @@ -6034,7 +6034,7 @@ ptyprocess = ">=0.5" name = "pgvector" version = "0.1.8" description = "pgvector support for Python" -optional = false +optional = true python-versions = ">=3.6" files = [ {file = "pgvector-0.1.8-py2.py3-none-any.whl", hash = "sha256:99dce3a6580ef73863edb9b8441937671f4e1a09383826e6b0838176cd441a96"}, @@ -10855,7 +10855,7 @@ azure = ["azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices- cohere = ["cohere"] docarray = ["docarray"] embeddings = ["sentence-transformers"] -extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "bibtexparser", "chardet", "gql", "html2text", "jq", "lxml", "openai", "pandas", "pdfminer-six", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "requests-toolbelt", "scikit-learn", "telethon", "tqdm", "zep-python"] +extended-testing = ["atlassian-python-api", "beautifulsoup4", "beautifulsoup4", "bibtexparser", "chardet", "gql", "html2text", "jq", "lxml", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "requests-toolbelt", "scikit-learn", "telethon", "tqdm", "zep-python"] llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"] openai = ["openai", "tiktoken"] qdrant = ["qdrant-client"] @@ -10864,4 +10864,4 @@ text-helpers = ["chardet"] [metadata] lock-version 
= "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "36cd5d49d9fe836427b308283af07540cb139c63884c10c3149d8a7242ed295e" +content-hash = "1d1a851fba50d040e0a6cc9f06d54de98b6f011464eb54dbdaffae82b5b71e84" From 8c0d45fb85d0bae88a7a3d7ba28d11c83f20610e Mon Sep 17 00:00:00 2001 From: Volodymyr Tkachuk Date: Mon, 19 Jun 2023 18:09:45 +0300 Subject: [PATCH 6/6] fix lint and format --- langchain/vectorstores/singlestoredb.py | 65 +++++++----- .../vectorstores/test_singlestoredb.py | 98 +++++++++++++++---- 2 files changed, 123 insertions(+), 40 deletions(-) diff --git a/langchain/vectorstores/singlestoredb.py b/langchain/vectorstores/singlestoredb.py index 926b9ba7d6471..65fa526732a91 100644 --- a/langchain/vectorstores/singlestoredb.py +++ b/langchain/vectorstores/singlestoredb.py @@ -3,7 +3,6 @@ import enum import json - from typing import ( Any, ClassVar, @@ -21,17 +20,20 @@ from langchain.embeddings.base import Embeddings from langchain.vectorstores.base import VectorStore, VectorStoreRetriever + class DistanceStrategy(str, enum.Enum): EUCLIDEAN_DISTANCE = "EUCLIDEAN_DISTANCE" DOT_PRODUCT = "DOT_PRODUCT" + DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.DOT_PRODUCT ORDERING_DIRECTIVE: dict = { DistanceStrategy.EUCLIDEAN_DISTANCE: "", - DistanceStrategy.DOT_PRODUCT: "DESC" + DistanceStrategy.DOT_PRODUCT: "DESC", } + class SingleStoreDB(VectorStore): """ This class serves as a Pythonic interface to the SingleStore DB database. @@ -72,14 +74,17 @@ def __init__( Args: embedding (Embeddings): A text embedding model. - distance_strategy (DistanceStrategy, optional): Determines the strategy employed for calculating + distance_strategy (DistanceStrategy, optional): + Determines the strategy employed for calculating the distance between vectors in the embedding space. Defaults to DOT_PRODUCT. Available options are: - - DOT_PRODUCT: Computes the scalar product of two vectors. This is the default behavior - - EUCLIDEAN_DISTANCE: Computes the Euclidean distance between two vectors. This metric - considers the geometric distance in the vector space, and might be more suitable for - embeddings that rely on spatial relationships. + - DOT_PRODUCT: Computes the scalar product of two vectors. + This is the default behavior + - EUCLIDEAN_DISTANCE: Computes the Euclidean distance between + two vectors. This metric considers the geometric distance in + the vector space, and might be more suitable for embeddings + that rely on spatial relationships. table_name (str, optional): Specifies the name of the table in use. Defaults to "embeddings". 
@@ -195,8 +200,12 @@ def __init__( if "conn_attrs" not in self.connection_kwargs: self.connection_kwargs["conn_attrs"] = dict() if "program_name" not in self.connection_kwargs["conn_attrs"]: - self.connection_kwargs["conn_attrs"]["program_name"] = "langchain python sdk" - self.connection_kwargs["conn_attrs"]["program_version"] = "0.0.205" # the version of SingleStoreDB VectorStore implementation + self.connection_kwargs["conn_attrs"][ + "program_name" + ] = "langchain python sdk" + self.connection_kwargs["conn_attrs"][ + "program_version" + ] = "0.0.205" # the version of SingleStoreDB VectorStore implementation """Create connection pool.""" self.connection_pool = QueuePool( @@ -300,9 +309,12 @@ def similarity_search( OpenAIEmbeddings(), host="username:password@localhost:3306/database" ) - s2.similarity_search("query text", 1, {"metadata_field": "metadata_value"}) + s2.similarity_search("query text", 1, + {"metadata_field": "metadata_value"}) """ - docs_and_scores = self.similarity_search_with_score(query=query, k=k, filter=filter) + docs_and_scores = self.similarity_search_with_score( + query=query, k=k, filter=filter + ) return [doc for doc, _ in docs_and_scores] def similarity_search_with_score( @@ -313,7 +325,8 @@ def similarity_search_with_score( Args: query: Text to look up documents similar to. k: Number of Documents to return. Defaults to 4. - filter: A dictionary of metadata fields and values to filter by. Defaults to None. + filter: A dictionary of metadata fields and values to filter by. + Defaults to None. Returns: List of Documents most similar to the query and score for each @@ -323,18 +336,28 @@ def similarity_search_with_score( conn = self.connection_pool.connect() result = [] where_clause: str = "" - where_clause_values = [] + where_clause_values: List[Any] = [] if filter: where_clause = "WHERE " arguments = [] - def build_where_clause(where_clause_values:List[Any], sub_filter: dict, prefix_args: List[str] = []): + def build_where_clause( + where_clause_values: List[Any], + sub_filter: dict, + prefix_args: List[str] = [], + ) -> None: for key in sub_filter.keys(): if isinstance(sub_filter[key], dict): - build_where_clause(where_clause_values, sub_filter[key], prefix_args + [key]) + build_where_clause( + where_clause_values, sub_filter[key], prefix_args + [key] + ) else: - arguments.append("JSON_EXTRACT_JSON({}, {}) = %s".format( - self.metadata_field, ", ".join(["%s"] * (len(prefix_args) + 1)))) + arguments.append( + "JSON_EXTRACT_JSON({}, {}) = %s".format( + self.metadata_field, + ", ".join(["%s"] * (len(prefix_args) + 1)), + ) + ) where_clause_values += prefix_args + [key] where_clause_values.append(json.dumps(sub_filter[key])) @@ -353,13 +376,11 @@ def build_where_clause(where_clause_values:List[Any], sub_filter: dict, prefix_a self.vector_field, self.table_name, where_clause, - ORDERING_DIRECTIVE[self.distance_strategy] + ORDERING_DIRECTIVE[self.distance_strategy], ), - ( - "[{}]".format(",".join(map(str, embedding))), - ) + ("[{}]".format(",".join(map(str, embedding))),) + tuple(where_clause_values) - + (k,) + + (k,), ) for row in cur.fetchall(): diff --git a/tests/integration_tests/vectorstores/test_singlestoredb.py b/tests/integration_tests/vectorstores/test_singlestoredb.py index efd87b05d0eb8..f01ab1d7a49b1 100644 --- a/tests/integration_tests/vectorstores/test_singlestoredb.py +++ b/tests/integration_tests/vectorstores/test_singlestoredb.py @@ -165,7 +165,9 @@ def test_singlestoredb_filter_metadata(texts: List[str]) -> None: """Test filtering by metadata""" 
table_name = "test_singlestoredb_filter_metadata" drop(table_name) - docs = [Document(page_content=t, metadata={"index": i}) for i, t in enumerate(texts)] + docs = [ + Document(page_content=t, metadata={"index": i}) for i, t in enumerate(texts) + ] docsearch = SingleStoreDB.from_documents( docs, FakeEmbeddings(), @@ -183,7 +185,10 @@ def test_singlestoredb_filter_metadata_2(texts: List[str]) -> None: """Test filtering by metadata field that is similar for each document""" table_name = "test_singlestoredb_filter_metadata_2" drop(table_name) - docs = [Document(page_content=t, metadata={"index": i, "category": "budget"}) for i, t in enumerate(texts)] + docs = [ + Document(page_content=t, metadata={"index": i, "category": "budget"}) + for i, t in enumerate(texts) + ] docsearch = SingleStoreDB.from_documents( docs, FakeEmbeddings(), @@ -192,7 +197,9 @@ def test_singlestoredb_filter_metadata_2(texts: List[str]) -> None: host=TEST_SINGLESTOREDB_URL, ) output = docsearch.similarity_search("foo", k=1, filter={"category": "budget"}) - assert output == [Document(page_content="foo", metadata={"index": 0, "category": "budget"})] + assert output == [ + Document(page_content="foo", metadata={"index": 0, "category": "budget"}) + ] drop(table_name) @@ -201,7 +208,10 @@ def test_singlestoredb_filter_metadata_3(texts: List[str]) -> None: """Test filtering by two metadata fields""" table_name = "test_singlestoredb_filter_metadata_3" drop(table_name) - docs = [Document(page_content=t, metadata={"index": i, "category": "budget"}) for i, t in enumerate(texts)] + docs = [ + Document(page_content=t, metadata={"index": i, "category": "budget"}) + for i, t in enumerate(texts) + ] docsearch = SingleStoreDB.from_documents( docs, FakeEmbeddings(), @@ -209,8 +219,12 @@ def test_singlestoredb_filter_metadata_3(texts: List[str]) -> None: table_name=table_name, host=TEST_SINGLESTOREDB_URL, ) - output = docsearch.similarity_search("foo", k=1, filter={"category": "budget", "index": 1}) - assert output == [Document(page_content="bar", metadata={"index": 1, "category": "budget"})] + output = docsearch.similarity_search( + "foo", k=1, filter={"category": "budget", "index": 1} + ) + assert output == [ + Document(page_content="bar", metadata={"index": 1, "category": "budget"}) + ] drop(table_name) @@ -219,7 +233,10 @@ def test_singlestoredb_filter_metadata_4(texts: List[str]) -> None: """Test no matches""" table_name = "test_singlestoredb_filter_metadata_4" drop(table_name) - docs = [Document(page_content=t, metadata={"index": i, "category": "budget"}) for i, t in enumerate(texts)] + docs = [ + Document(page_content=t, metadata={"index": i, "category": "budget"}) + for i, t in enumerate(texts) + ] docsearch = SingleStoreDB.from_documents( docs, FakeEmbeddings(), @@ -237,8 +254,17 @@ def test_singlestoredb_filter_metadata_5(texts: List[str]) -> None: """Test complex metadata path""" table_name = "test_singlestoredb_filter_metadata_5" drop(table_name) - docs = [Document(page_content=t, metadata={"index": i, "category": "budget", "subfield" : { - "subfield" : { "idx": i, "other_idx": i + 1}}}) for i, t in enumerate(texts)] + docs = [ + Document( + page_content=t, + metadata={ + "index": i, + "category": "budget", + "subfield": {"subfield": {"idx": i, "other_idx": i + 1}}, + }, + ) + for i, t in enumerate(texts) + ] docsearch = SingleStoreDB.from_documents( docs, FakeEmbeddings(), @@ -246,9 +272,19 @@ def test_singlestoredb_filter_metadata_5(texts: List[str]) -> None: table_name=table_name, host=TEST_SINGLESTOREDB_URL, ) - output = 
docsearch.similarity_search("foo", k=1, filter={"category": "budget", "subfield" : { "subfield" : { "idx": 2}}}) - assert output == [Document(page_content="baz", metadata={"index": 2, "category": "budget", - "subfield" : { "subfield" : { "idx": 2, "other_idx": 3}}})] + output = docsearch.similarity_search( + "foo", k=1, filter={"category": "budget", "subfield": {"subfield": {"idx": 2}}} + ) + assert output == [ + Document( + page_content="baz", + metadata={ + "index": 2, + "category": "budget", + "subfield": {"subfield": {"idx": 2, "other_idx": 3}}, + }, + ) + ] drop(table_name) @@ -257,7 +293,13 @@ def test_singlestoredb_filter_metadata_6(texts: List[str]) -> None: """Test filtering by other bool""" table_name = "test_singlestoredb_filter_metadata_6" drop(table_name) - docs = [Document(page_content=t, metadata={"index": i, "category": "budget", "is_good" : i == 1}) for i, t in enumerate(texts)] + docs = [ + Document( + page_content=t, + metadata={"index": i, "category": "budget", "is_good": i == 1}, + ) + for i, t in enumerate(texts) + ] docsearch = SingleStoreDB.from_documents( docs, FakeEmbeddings(), @@ -265,8 +307,15 @@ def test_singlestoredb_filter_metadata_6(texts: List[str]) -> None: table_name=table_name, host=TEST_SINGLESTOREDB_URL, ) - output = docsearch.similarity_search("foo", k=1, filter={"category": "budget", "is_good" : True}) - assert output == [Document(page_content="bar", metadata={"index": 1, "category": "budget", "is_good" : True})] + output = docsearch.similarity_search( + "foo", k=1, filter={"category": "budget", "is_good": True} + ) + assert output == [ + Document( + page_content="bar", + metadata={"index": 1, "category": "budget", "is_good": True}, + ) + ] drop(table_name) @@ -275,7 +324,13 @@ def test_singlestoredb_filter_metadata_7(texts: List[str]) -> None: """Test filtering by float""" table_name = "test_singlestoredb_filter_metadata_7" drop(table_name) - docs = [Document(page_content=t, metadata={"index": i, "category": "budget", "score" : i + 0.5}) for i, t in enumerate(texts)] + docs = [ + Document( + page_content=t, + metadata={"index": i, "category": "budget", "score": i + 0.5}, + ) + for i, t in enumerate(texts) + ] docsearch = SingleStoreDB.from_documents( docs, FakeEmbeddings(), @@ -283,6 +338,13 @@ def test_singlestoredb_filter_metadata_7(texts: List[str]) -> None: table_name=table_name, host=TEST_SINGLESTOREDB_URL, ) - output = docsearch.similarity_search("bar", k=1, filter={"category": "budget", "score" : 2.5}) - assert output == [Document(page_content="baz", metadata={"index": 2, "category": "budget", "score" : 2.5})] + output = docsearch.similarity_search( + "bar", k=1, filter={"category": "budget", "score": 2.5} + ) + assert output == [ + Document( + page_content="baz", + metadata={"index": 2, "category": "budget", "score": 2.5}, + ) + ] drop(table_name)
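
Taken together, the series supports usage like the following sketch, which mirrors the integration tests above; the connection URL, embeddings model, and documents are placeholders:

    from langchain.docstore.document import Document
    from langchain.embeddings import OpenAIEmbeddings
    from langchain.vectorstores.singlestoredb import DistanceStrategy, SingleStoreDB

    docs = [
        Document(page_content="foo", metadata={"index": 0, "category": "budget"}),
        Document(page_content="bar", metadata={"index": 1, "category": "budget"}),
    ]
    docsearch = SingleStoreDB.from_documents(
        docs,
        OpenAIEmbeddings(),
        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
        table_name="notebook",
        host="user:password@localhost:3306/db",  # placeholder connection URL
    )
    # Metadata filters narrow the candidate rows before distance ranking.
    output = docsearch.similarity_search(
        "foo", k=1, filter={"category": "budget", "index": 1}
    )
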