From 50e77edb80421066d445c753972fb5680cccd601 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Fri, 2 Jun 2023 20:35:32 -0700 Subject: [PATCH 01/10] Use open source ClickHouse as vector store --- langchain/vectorstores/__init__.py | 3 + langchain/vectorstores/clickhouse.py | 448 ++++++++++++++++++ .../vectorstores/test_clickhouse.py | 105 ++++ 3 files changed, 556 insertions(+) create mode 100644 langchain/vectorstores/clickhouse.py create mode 100644 tests/integration_tests/vectorstores/test_clickhouse.py diff --git a/langchain/vectorstores/__init__.py b/langchain/vectorstores/__init__.py index cef7d9fba05eb..fd986b30c62f5 100644 --- a/langchain/vectorstores/__init__.py +++ b/langchain/vectorstores/__init__.py @@ -23,6 +23,7 @@ from langchain.vectorstores.vectara import Vectara from langchain.vectorstores.weaviate import Weaviate from langchain.vectorstores.zilliz import Zilliz +from langchain.vectorstores.clickhouse import Clickhouse, ClickhouseSettings __all__ = [ "Redis", @@ -51,4 +52,6 @@ "DocArrayHnswSearch", "DocArrayInMemorySearch", "Typesense", + "Clickhouse", + "ClickhouseSettings" ] diff --git a/langchain/vectorstores/clickhouse.py b/langchain/vectorstores/clickhouse.py new file mode 100644 index 0000000000000..8092323465e9c --- /dev/null +++ b/langchain/vectorstores/clickhouse.py @@ -0,0 +1,448 @@ +"""Wrapper around open source ClickHouse VectorSearch capability.""" + +from __future__ import annotations + +import json +import logging +from hashlib import sha1 +from threading import Thread +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import six +from pydantic import BaseSettings + +from langchain.docstore.document import Document +from langchain.embeddings.base import Embeddings +from langchain.vectorstores.base import VectorStore + +logger = logging.getLogger() + +def has_mul_sub_str(s: str, *args: Any) -> bool: + for a in args: + if a not in s: + return False + return True + +class ClickhouseSettings(BaseSettings): + 
"""ClickHouse Client Configuration + + Attributes: + host (str) : A URL to connect to the ClickHouse backend. + Defaults to 'localhost'. + port (int) : URL port to connect with HTTP. Defaults to 8123. + username (str) : Username to login. Defaults to None. + password (str) : Password to login. Defaults to None. + index_type (str): index type string. + index_param (list): index build parameter. + index_query_params(dict): index query parameters. + database (str) : Database name to find the table. Defaults to 'default'. + table (str) : Table name to operate on. + Defaults to 'langchain'. + metric (str) : Metric to compute distance, + supported are ('angular', 'euclidean', 'manhattan', 'hamming', 'dot'). Defaults to 'angular'. + https://github.com/spotify/annoy/blob/main/src/annoymodule.cc#L149-L169 + column_map (Dict) : Column type map to project column name onto langchain + semantics. Must have keys: `text`, `id`, `vector`, `metadata`, + must be same size to number of columns. For example: + .. code-block:: python + + { + 'id': 'text_id', + 'vector': 'text_embedding', + 'text': 'text_plain', + 'metadata': 'metadata_dictionary_in_json', + } + + Defaults to identity map. + """ + + host: str = "localhost" + port: int = 8123 + + username: Optional[str] = None + password: Optional[str] = None + + index_type: str = "annoy" + # Annoy supports L2Distance and cosineDistance. 
+ index_param: Optional[List[str]] = [100, "'L2Distance'"] + index_query_params: Dict[str, str] = {} + + column_map: Dict[str, str] = { + "id": "id", + "text": "text", + "vector": "vector", + "metadata": "metadata", + } + + database: str = "default" + table: str = "langchain" + metric: str = "angular" + + def __getitem__(self, item: str) -> Any: + return getattr(self, item) + + class Config: + env_file = ".env" + env_prefix = "clickhouse_" + env_file_encoding = "utf-8" + + +class Clickhouse(VectorStore): + """Wrapper around ClickHouse vector database + + You need a `clickhouse-connect` python package, and a valid account + to connect to ClickHouse. + + ClickHouse can not only search with simple vector indexes, + it also supports complex query with multiple conditions, + constraints and even sub-queries. + + For more information, please visit + [ClickHouse official site](https://docs.ClickHouse.com/en/overview/) + """ + + def __init__( + self, + embedding: Embeddings, + config: Optional[ClickhouseSettings] = None, + **kwargs: Any, + ) -> None: + """ClickHouse Wrapper to LangChain + + embedding_function (Embeddings): + config (ClickHouseSettings): Configuration to ClickHouse Client + Other keyword arguments will pass into + [clickhouse-connect](https://docs.clickhouse.com/) + """ + try: + from clickhouse_connect import get_client + except ImportError: + raise ValueError( + "Could not import clickhouse connect python package. " + "Please install it with `pip install clickhouse-connect`." 
+ ) + try: + from tqdm import tqdm + + self.pgbar = tqdm + except ImportError: + # Just in case if tqdm is not installed + self.pgbar = lambda x, **kwargs: x + super().__init__() + if config is not None: + self.config = config + else: + self.config = ClickhouseSettings() + assert self.config + assert self.config.host and self.config.port + assert ( + self.config.column_map + and self.config.database + and self.config.table + and self.config.metric + ) + for k in ["id", "vector", "text", "metadata"]: + assert k in self.config.column_map + assert self.config.metric in ["angular", "euclidean", "manhattan", "hamming", "dot"] + + # initialize the schema + dim = len(embedding.embed_query("test")) + + index_params = ( + ",".join([f"'{k}={v}'" for k, v in self.config.index_param.items()]) + if self.config.index_param + else "" + ) if isinstance(self.config.index_param, Dict) else \ + ",".join(self.config.index_param) if isinstance(self.config.index_param, List) \ + else self.config.index_param + + schema_ = f""" + CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}( + {self.config.column_map['id']} String, + {self.config.column_map['text']} String, + {self.config.column_map['vector']} Array(Float32), + {self.config.column_map['metadata']} JSON, + CONSTRAINT cons_vec_len CHECK length({self.config.column_map['vector']}) = {dim}, + INDEX vec_idx {self.config.column_map['vector']} TYPE {self.config.index_type}({index_params}) GRANULARITY 1000 + ) ENGINE = MergeTree ORDER BY {self.config.column_map['id']} + SETTINGS index_granularity = 8192 + """ + print(schema_) + self.dim = dim + self.BS = "\\" + self.must_escape = ("\\", "'") + self.embedding_function = embedding + self.dist_order = "ASC" # Only support ConsingDistance and L2Distance + + # Create a connection to clickhouse + self.client = get_client( + host=self.config.host, + port=self.config.port, + username=self.config.username, + password=self.config.password, + **kwargs, + ) + # Enable JSON type + 
self.client.command("SET allow_experimental_object_type=1") + # Enable Annoy index + self.client.command("SET allow_experimental_annoy_index=1") + self.client.command(schema_) + + def escape_str(self, value: str) -> str: + return "".join(f"{self.BS}{c}" if c in self.must_escape else c for c in value) + + def _build_insert_sql(self, transac: Iterable, column_names: Iterable[str]) -> str: + ks = ",".join(column_names) + _data = [] + for n in transac: + n = ",".join([f"'{self.escape_str(str(_n))}'" for _n in n]) + _data.append(f"({n})") + i_str = f""" + INSERT INTO TABLE + {self.config.database}.{self.config.table}({ks}) + VALUES + {','.join(_data)} + """ + return i_str + + def _insert(self, transac: Iterable, column_names: Iterable[str]) -> None: + _insert_query = self._build_insert_sql(transac, column_names) + print(_insert_query) + self.client.command(_insert_query) + + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + batch_size: int = 32, + ids: Optional[Iterable[str]] = None, + **kwargs: Any, + ) -> List[str]: + """Insert more texts through the embeddings and add to the VectorStore. + + Args: + texts: Iterable of strings to add to the VectorStore. + ids: Optional list of ids to associate with the texts. + batch_size: Batch size of insertion + metadata: Optional column data to be inserted + + Returns: + List of ids from adding the texts into the VectorStore. 
+ + """ + # Embed and create the documents + ids = ids or [sha1(t.encode("utf-8")).hexdigest() for t in texts] + colmap_ = self.config.column_map + transac = [] + column_names = { + colmap_["id"]: ids, + colmap_["text"]: texts, + colmap_["vector"]: self.embedding_function.embed_documents(list(texts)), + } + metadatas = metadatas or [{} for _ in texts] + column_names[colmap_["metadata"]] = map(json.dumps, metadatas) + assert len(set(colmap_) - set(column_names)) >= 0 + keys, values = zip(*column_names.items()) + try: + t = None + for v in self.pgbar( + zip(*values), desc="Inserting data...", total=len(metadatas) + ): + assert len(v[keys.index(self.config.column_map["vector"])]) == self.dim + transac.append(v) + if len(transac) == batch_size: + if t: + t.join() + t = Thread(target=self._insert, args=[transac, keys]) + t.start() + transac = [] + if len(transac) > 0: + if t: + t.join() + self._insert(transac, keys) + return [i for i in ids] + except Exception as e: + logger.error(f"\033[91m\033[1m{type(e)}\033[0m \033[95m{str(e)}\033[0m") + return [] + + @classmethod + def from_texts( + cls, + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[Dict[Any, Any]]] = None, + config: Optional[ClickhouseSettings] = None, + text_ids: Optional[Iterable[str]] = None, + batch_size: int = 32, + **kwargs: Any, + ) -> Clickhouse: + """Create ClickHouse wrapper with existing texts + + Args: + embedding_function (Embeddings): Function to extract text embedding + texts (Iterable[str]): List or tuple of strings to be added + config (ClickHouseSettings, Optional): ClickHouse configuration + text_ids (Optional[Iterable], optional): IDs for the texts. + Defaults to None. + batch_size (int, optional): Batchsize when transmitting data to ClickHouse. + Defaults to 32. + metadata (List[dict], optional): metadata to texts. Defaults to None. 
+ Other keyword arguments will pass into + [clickhouse-connect](https://clickhouse.com/docs/en/integrations/python#clickhouse-connect-driver-api) + Returns: + ClickHouse Index + """ + ctx = cls(embedding, config, **kwargs) + ctx.add_texts(texts, ids=text_ids, batch_size=batch_size, metadatas=metadatas) + return ctx + + def __repr__(self) -> str: + """Text representation for ClickHouse Vector Store, prints backends, username and schemas. + Easy to use with `str(ClickHouse())` + + Returns: + repr: string to show connection info and data schema + """ + _repr = f"\033[92m\033[1m{self.config.database}.{self.config.table} @ " + _repr += f"{self.config.host}:{self.config.port}\033[0m\n\n" + _repr += f"\033[1musername: {self.config.username}\033[0m\n\nTable Schema:\n" + _repr += "-" * 51 + "\n" + for r in self.client.query( + f"DESC {self.config.database}.{self.config.table}" + ).named_results(): + _repr += ( + f"|\033[94m{r['name']:24s}\033[0m|\033[96m{r['type']:24s}\033[0m|\n" + ) + _repr += "-" * 51 + "\n" + return _repr + + def _build_query_sql( + self, q_emb: List[float], topk: int, where_str: Optional[str] = None + ) -> str: + q_emb_str = ",".join(map(str, q_emb)) + if where_str: + where_str = f"PREWHERE {where_str}" + else: + where_str = "" + + settings_strs = [] + if self.config.index_query_params: + for k,v in six.iteritems(self.config.index_query_params): + settings_strs.append(f"SETTING {k}={v}") + q_str = f""" + SELECT {self.config.column_map['text']}, + {self.config.column_map['metadata']}, dist + FROM {self.config.database}.{self.config.table} + {where_str} + ORDER BY L2Distance({self.config.column_map['vector']}, [{q_emb_str}]) + AS dist {self.dist_order} + LIMIT {topk} {' '.join(settings_strs)} + """ + print(q_str) + return q_str + + def similarity_search( + self, query: str, k: int = 4, where_str: Optional[str] = None, **kwargs: Any + ) -> List[Document]: + """Perform a similarity search with ClickHouse + + Args: + query (str): query string + k (int, 
optional): Top K neighbors to retrieve. Defaults to 4. + where_str (Optional[str], optional): where condition string. + Defaults to None. + + NOTE: Please do not let end-user to fill this and always be aware + of SQL injection. When dealing with metadatas, remember to + use `{self.metadata_column}.attribute` instead of `attribute` + alone. The default name for it is `metadata`. + + Returns: + List[Document]: List of Documents + """ + return self.similarity_search_by_vector( + self.embedding_function.embed_query(query), k, where_str, **kwargs + ) + + def similarity_search_by_vector( + self, + embedding: List[float], + k: int = 4, + where_str: Optional[str] = None, + **kwargs: Any, + ) -> List[Document]: + """Perform a similarity search with ClickHouse by vectors + + Args: + query (str): query string + k (int, optional): Top K neighbors to retrieve. Defaults to 4. + where_str (Optional[str], optional): where condition string. + Defaults to None. + + NOTE: Please do not let end-user to fill this and always be aware + of SQL injection. When dealing with metadatas, remember to + use `{self.metadata_column}.attribute` instead of `attribute` + alone. The default name for it is `metadata`. + + Returns: + List[Document]: List of (Document, similarity) + """ + q_str = self._build_query_sql(embedding, k, where_str) + try: + return [ + Document( + page_content=r[self.config.column_map["text"]], + metadata=r[self.config.column_map["metadata"]], + ) + for r in self.client.query(q_str).named_results() + ] + except Exception as e: + logger.error(f"\033[91m\033[1m{type(e)}\033[0m \033[95m{str(e)}\033[0m") + return [] + + def similarity_search_with_relevance_scores( + self, query: str, k: int = 4, where_str: Optional[str] = None, **kwargs: Any + ) -> List[Tuple[Document, float]]: + """Perform a similarity search with ClickHouse + + Args: + query (str): query string + k (int, optional): Top K neighbors to retrieve. Defaults to 4. 
+ where_str (Optional[str], optional): where condition string. + Defaults to None. + + NOTE: Please do not let end-user to fill this and always be aware + of SQL injection. When dealing with metadatas, remember to + use `{self.metadata_column}.attribute` instead of `attribute` + alone. The default name for it is `metadata`. + + Returns: + List[Document]: List of documents + """ + q_str = self._build_query_sql(self.embedding_function.embed_query(query), k, where_str) + try: + return [ + ( + Document( + page_content=r[self.config.column_map["text"]], + metadata=r[self.config.column_map["metadata"]], + ), + r["dist"], + ) + for r in self.client.query(q_str).named_results() + ] + except Exception as e: + logger.error(f"\033[91m\033[1m{type(e)}\033[0m \033[95m{str(e)}\033[0m") + return [] + + def drop(self) -> None: + """ + Helper function: Drop data + """ + self.client.command( + f"DROP TABLE IF EXISTS {self.config.database}.{self.config.table}" + ) + + @property + def metadata_column(self) -> str: + return self.config.column_map["metadata"] diff --git a/tests/integration_tests/vectorstores/test_clickhouse.py b/tests/integration_tests/vectorstores/test_clickhouse.py new file mode 100644 index 0000000000000..6bc0370b29e26 --- /dev/null +++ b/tests/integration_tests/vectorstores/test_clickhouse.py @@ -0,0 +1,105 @@ +"""Test ClickHouse functionality.""" +import pytest + +from langchain.docstore.document import Document +from langchain.vectorstores import Clickhouse, ClickhouseSettings +from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings + +def test_clickhouse() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + config = ClickhouseSettings() + config.table = "test_clickhouse" + docsearch = Clickhouse.from_texts(texts, FakeEmbeddings(), config=config) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"_dummy": 0})] + docsearch.drop() + 
+@pytest.mark.asyncio +async def test_clickhouse_async() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + config = ClickhouseSettings() + config.table = "test_clickhouse_async" + docsearch = Clickhouse.from_texts( + texts=texts, embedding=FakeEmbeddings(), config=config + ) + output = await docsearch.asimilarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"_dummy": 0})] + docsearch.drop() + +def test_clickhouse_with_metadatas() -> None: + """Test end to end construction and search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + config = ClickhouseSettings() + config.table = "test_clickhouse_with_metadatas" + docsearch = Clickhouse.from_texts( + texts=texts, + embedding=FakeEmbeddings(), + config=config, + metadatas=metadatas, + ) + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"page": "0"})] + docsearch.drop() + + +def test_clickhouse_with_metadatas_with_relevance_scores() -> None: + """Test end to end construction and scored search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": str(i)} for i in range(len(texts))] + config = ClickhouseSettings() + config.table = "test_clickhouse_with_metadatas_with_relevance_scores" + docsearch = Clickhouse.from_texts( + texts=texts, embedding=FakeEmbeddings(), metadatas=metadatas, config=config + ) + output = docsearch.similarity_search_with_relevance_scores("foo", k=1) + assert output[0][0] == Document(page_content="foo", metadata={"page": "0"}) + docsearch.drop() + + +def test_clickhouse_search_filter() -> None: + """Test end to end construction and search with metadata filtering.""" + texts = ["far", "bar", "baz"] + metadatas = [{"first_letter": "{}".format(text[0])} for text in texts] + config = ClickhouseSettings() + config.table = "test_clickhouse_search_filter" + docsearch = Clickhouse.from_texts( + texts=texts, 
embedding=FakeEmbeddings(), metadatas=metadatas, config=config + ) + output = docsearch.similarity_search( + "far", k=1, where_str=f"{docsearch.metadata_column}.first_letter='f'" + ) + assert output == [Document(page_content="far", metadata={"first_letter": "f"})] + output = docsearch.similarity_search( + "bar", k=1, where_str=f"{docsearch.metadata_column}.first_letter='b'" + ) + assert output == [Document(page_content="bar", metadata={"first_letter": "b"})] + docsearch.drop() + + +def test_clickhouse_with_persistence() -> None: + """Test end to end construction and search, with persistence.""" + config = ClickhouseSettings() + config.table = "test_clickhouse_with_persistence" + texts = [ + "foo", + "bar", + "baz", + ] + docsearch = Clickhouse.from_texts( + texts=texts, embedding=FakeEmbeddings(), config=config + ) + + output = docsearch.similarity_search("foo", k=1) + assert output == [Document(page_content="foo", metadata={"_dummy": 0})] + + # Get a new VectorStore with same config + # it will reuse the table spontaneously + # unless you drop it + docsearch = Clickhouse(embedding=FakeEmbeddings(), config=config) + output = docsearch.similarity_search("foo", k=1) + + # Clean up + docsearch.drop() From 6bf26463dc1250ca48760474589fdbdf5c03d4f9 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Sat, 3 Jun 2023 01:55:17 -0700 Subject: [PATCH 02/10] Add clickhouse vector store notebook --- .../vectorstores/examples/clickhouse.ipynb | 291 ++++++++++++++++++ langchain/vectorstores/clickhouse.py | 3 - 2 files changed, 291 insertions(+), 3 deletions(-) create mode 100644 docs/modules/indexes/vectorstores/examples/clickhouse.ipynb diff --git a/docs/modules/indexes/vectorstores/examples/clickhouse.ipynb b/docs/modules/indexes/vectorstores/examples/clickhouse.ipynb new file mode 100644 index 0000000000000..fa4e0d0430a98 --- /dev/null +++ b/docs/modules/indexes/vectorstores/examples/clickhouse.ipynb @@ -0,0 +1,291 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": 
"683953b3", + "metadata": {}, + "source": [ + "# ClickHouse Vector Search\n", + "\n", + "> [ClickHouse](https://clickhouse.com/) is the fastest and most resource efficient open-source database for real-time apps and analytics with full SQL support and a wide range of functions to assist users in writing analytical queries. Lately added data structures and distance search functions (like `L2Distance`) as well as [approximate nearest neighbor search indexes](https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/annindexes) enable ClickHouse to be used as a high performance and scalable vector database to store and search vectors with SQL.\n", + "\n", + "This notebook shows how to use functionality related to the `ClickHouse` vector search." + ] + }, + { + "cell_type": "markdown", + "id": "43ead5d5-2c1f-4dce-a69a-cb00e4f9d6f0", + "metadata": {}, + "source": [ + "## Setting up environments" + ] + }, + { + "cell_type": "markdown", + "id": "b2c434bc", + "metadata": {}, + "source": [ + "Setting up a local ClickHouse server with Docker (optional)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "249a7751", + "metadata": { + "ExecuteTime": { + "end_time": "2023-06-03T08:43:43.035606Z", + "start_time": "2023-06-03T08:43:42.618531Z" + } + }, + "outputs": [], + "source": [ + "! docker run -d -p 8123:8123 -p9000:9000 --name langchain-clickhouse-server --ulimit nofile=262144:262144 clickhouse/clickhouse-server:23.4.2.11" + ] + }, + { + "cell_type": "markdown", + "id": "7bd3c1c0", + "metadata": {}, + "source": [ + "Set up the ClickHouse client driver" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d614bf8", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install clickhouse-connect" + ] + }, + { + "cell_type": "markdown", + "id": "15a1d477-9cdb-4d82-b019-96951ecb2b72", + "metadata": {}, + "source": [ + "We want to use OpenAIEmbeddings so we have to get the OpenAI API Key." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "91003ea5-0c8c-436c-a5de-aaeaeef2f458", + "metadata": { + "ExecuteTime": { + "end_time": "2023-06-03T08:49:35.383673Z", + "start_time": "2023-06-03T08:49:33.984547Z" + } + }, + "outputs": [], + "source": [ + "import os\n", + "import getpass\n", + "\n", + "if not os.environ['OPENAI_API_KEY']:\n", + " os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "aac9563e", + "metadata": { + "ExecuteTime": { + "end_time": "2023-06-03T08:33:31.554934Z", + "start_time": "2023-06-03T08:33:31.549590Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import Clickhouse, ClickhouseSettings" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "a3c3999a", + "metadata": { + "ExecuteTime": { + "end_time": "2023-06-03T08:33:32.527387Z", + "start_time": "2023-06-03T08:33:32.501312Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.document_loaders import TextLoader\n", + "loader = TextLoader('../../../state_of_the_union.txt')\n", + "documents = loader.load()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)\n", + "\n", + "embeddings = OpenAIEmbeddings()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e104aee", + "metadata": { + "ExecuteTime": { + "end_time": "2023-06-03T08:33:35.503823Z", + "start_time": "2023-06-03T08:33:33.745832Z" + } + }, + "outputs": [], + "source": [ + "for d in docs:\n", + " d.metadata = {'some': 'metadata'}\n", + "settings = ClickhouseSettings(table=\"clickhouse_vector_search_example\")\n", + "docsearch = Clickhouse.from_documents(docs, embeddings, config=settings)\n", + "\n", + "query = \"What did the 
president say about Ketanji Brown Jackson\"\n", + "docs = docsearch.similarity_search(query)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c608226", + "metadata": {}, + "outputs": [], + "source": [ + "print(docs[0].page_content)" + ] + }, + { + "cell_type": "markdown", + "id": "e3a8b105", + "metadata": {}, + "source": [ + "## Get connection info and data schema" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69996818", + "metadata": { + "ExecuteTime": { + "end_time": "2023-06-03T08:28:58.252991Z", + "start_time": "2023-06-03T08:28:58.197560Z" + } + }, + "outputs": [], + "source": [ + "print(str(docsearch))" + ] + }, + { + "cell_type": "markdown", + "id": "f59360c0", + "metadata": {}, + "source": [ + "## Filtering\n", + "\n", + "You have direct access to the ClickHouse SQL `WHERE` clause, so you can write filter conditions using standard SQL.\n", + "\n", + "**NOTE**: Please be aware of SQL injection; this interface must not be called directly by end users.\n", + "\n", + "If you customized `column_map` in your settings, you can search with a filter like this:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "232055f6", + "metadata": { + "ExecuteTime": { + "end_time": "2023-06-03T08:29:36.680805Z", + "start_time": "2023-06-03T08:29:34.963676Z" + } + }, + "outputs": [], + "source": [ + "from langchain.vectorstores import Clickhouse, ClickhouseSettings\n", + "from langchain.document_loaders import TextLoader\n", + "\n", + "loader = TextLoader('../../../state_of_the_union.txt')\n", + "documents = loader.load()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)\n", + "\n", + "embeddings = OpenAIEmbeddings()\n", + "\n", + "for i, d in enumerate(docs):\n", + " d.metadata = {'doc_id': i}\n", + "\n", + "docsearch = Clickhouse.from_documents(docs, embeddings)" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
+ "id": "ddbcee77", + "metadata": { + "ExecuteTime": { + "end_time": "2023-06-03T08:29:43.487436Z", + "start_time": "2023-06-03T08:29:43.040831Z" + } + }, + "outputs": [], + "source": [ + "meta = docsearch.metadata_column\n", + "output = docsearch.similarity_search_with_relevance_scores('What did the president say about Ketanji Brown Jackson?', \n", + " k=4, where_str=f\"{meta}.doc_id<10\")\n", + "for d, dist in output:\n", + " print(dist, d.metadata, d.page_content[:20] + '...')" + ] + }, + { + "cell_type": "markdown", + "id": "a359ed74", + "metadata": {}, + "source": [ + "## Deleting your data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb6a9d36", + "metadata": { + "ExecuteTime": { + "end_time": "2023-06-03T08:30:24.822384Z", + "start_time": "2023-06-03T08:30:24.798571Z" + } + }, + "outputs": [], + "source": [ + "docsearch.drop()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/vectorstores/clickhouse.py b/langchain/vectorstores/clickhouse.py index 8092323465e9c..ac7176fba64cc 100644 --- a/langchain/vectorstores/clickhouse.py +++ b/langchain/vectorstores/clickhouse.py @@ -167,7 +167,6 @@ def __init__( ) ENGINE = MergeTree ORDER BY {self.config.column_map['id']} SETTINGS index_granularity = 8192 """ - print(schema_) self.dim = dim self.BS = "\\" self.must_escape = ("\\", "'") @@ -207,7 +206,6 @@ def _build_insert_sql(self, transac: Iterable, column_names: Iterable[str]) -> s def _insert(self, transac: Iterable, column_names: Iterable[str]) -> None: _insert_query = self._build_insert_sql(transac, column_names) - 
print(_insert_query) self.client.command(_insert_query) def add_texts( @@ -338,7 +336,6 @@ def _build_query_sql( AS dist {self.dist_order} LIMIT {topk} {' '.join(settings_strs)} """ - print(q_str) return q_str def similarity_search( From 24db3052125d16d46f28fe8ba868ff129f30fdfc Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Sat, 3 Jun 2023 02:05:03 -0700 Subject: [PATCH 03/10] Add clickhouse document --- docs/integrations/clickhouse.md | 52 +++++++++++ .../vectorstores/examples/clickhouse.ipynb | 88 ++++++++++++++++--- 2 files changed, 126 insertions(+), 14 deletions(-) create mode 100644 docs/integrations/clickhouse.md diff --git a/docs/integrations/clickhouse.md b/docs/integrations/clickhouse.md new file mode 100644 index 0000000000000..97d8732adeceb --- /dev/null +++ b/docs/integrations/clickhouse.md @@ -0,0 +1,52 @@ +# ClickHouse + +This page covers how to use ClickHouse Vector Search within LangChain. + +[ClickHouse](https://clickhouse.com) is an open-source real-time OLAP database with full SQL support and a wide range of functions to assist users in writing analytical queries. Some of these functions and data structures perform distance operations between vectors, enabling ClickHouse to be used as a vector database. + +Due to the fully parallelized query pipeline, ClickHouse can process vector search operations very quickly, especially when performing exact matching through a linear scan over all rows, delivering processing speed comparable to dedicated vector databases. + +High compression levels, tunable through custom compression codecs, enable very large datasets to be stored and queried. ClickHouse is not memory-bound, allowing multi-TB datasets containing embeddings to be queried. + +The capabilities for computing the distance between two vectors are just another SQL function and can be effectively combined with more traditional SQL filtering and aggregation capabilities. 
This allows vectors to be stored and queried alongside metadata, and even rich text, enabling a broad array of use cases and applications. + +Finally, experimental ClickHouse capabilities like [Approximate Nearest Neighbour (ANN) indices](https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/annindexes) support faster approximate matching of vectors and provide a promising development aimed at further enhancing the vector matching capabilities of ClickHouse. + +## Installation +- Install the ClickHouse server from a [binary](https://clickhouse.com/docs/en/install) or [docker image](https://hub.docker.com/r/clickhouse/clickhouse-server/) +- Install the Python SDK with `pip install clickhouse-connect` + +### Configure the ClickHouse vector index + +Customize the `ClickhouseSettings` object with parameters: + + ```python + from langchain.vectorstores import Clickhouse, ClickhouseSettings + config = ClickhouseSettings(host="", port=8123, ...) + index = Clickhouse(embedding_function, config) + index.add_documents(...) + ``` + +## Wrappers +Supported functions: +- `add_texts` +- `add_documents` +- `from_texts` +- `from_documents` +- `similarity_search` +- `asimilarity_search` +- `similarity_search_by_vector` +- `asimilarity_search_by_vector` +- `similarity_search_with_relevance_scores` + +### VectorStore + +There is a wrapper around the open-source ClickHouse database, allowing you to use it as a vectorstore, +whether for semantic search or similar example retrieval. 
+ +To import this vectorstore: +```python +from langchain.vectorstores import Clickhouse +``` + +For a more detailed walkthrough of the ClickHouse wrapper, see [this notebook](../modules/indexes/vectorstores/examples/clickhouse.ipynb) diff --git a/docs/modules/indexes/vectorstores/examples/clickhouse.ipynb b/docs/modules/indexes/vectorstores/examples/clickhouse.ipynb index fa4e0d0430a98..272fe0143c737 100644 --- a/docs/modules/indexes/vectorstores/examples/clickhouse.ipynb +++ b/docs/modules/indexes/vectorstores/examples/clickhouse.ipynb @@ -71,7 +71,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 1, "id": "91003ea5-0c8c-436c-a5de-aaeaeef2f458", "metadata": { "ExecuteTime": { @@ -90,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 2, "id": "aac9563e", "metadata": { "ExecuteTime": { @@ -108,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 3, "id": "a3c3999a", "metadata": { "ExecuteTime": { @@ -130,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "6e104aee", "metadata": { "ExecuteTime": { @@ -138,7 +138,15 @@ "start_time": "2023-06-03T08:33:33.745832Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Inserting data...: 100%|██████████| 42/42 [00:00<00:00, 5516.23it/s]\n" + ] + } + ], "source": [ "for d in docs:\n", " d.metadata = {'some': 'metadata'}\n", @@ -151,10 +159,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "9c608226", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 
\n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n" + ] + } + ], "source": [ "print(docs[0].page_content)" ] @@ -169,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "69996818", "metadata": { "ExecuteTime": { @@ -177,7 +199,26 @@ "start_time": "2023-06-03T08:28:58.197560Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[92m\u001B[1mdefault.clickhouse_vector_search_example @ localhost:8123\u001B[0m\n", + "\n", + "\u001B[1musername: None\u001B[0m\n", + "\n", + "Table Schema:\n", + "---------------------------------------------------\n", + "|\u001B[94mid \u001B[0m|\u001B[96mString \u001B[0m|\n", + "|\u001B[94mtext \u001B[0m|\u001B[96mString \u001B[0m|\n", + "|\u001B[94mvector \u001B[0m|\u001B[96mArray(Float32) \u001B[0m|\n", + "|\u001B[94mmetadata \u001B[0m|\u001B[96mObject('json') \u001B[0m|\n", + "---------------------------------------------------\n", + "\n" + ] + } + ], "source": [ "print(str(docsearch))" ] @@ -198,7 +239,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "232055f6", "metadata": { "ExecuteTime": { @@ -206,7 +247,15 @@ "start_time": "2023-06-03T08:29:34.963676Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Inserting data...: 100%|██████████| 42/42 [00:00<00:00, 6234.90it/s]\n" + ] 
+ } + ], "source": [ "from langchain.vectorstores import Clickhouse, ClickhouseSettings\n", "from langchain.document_loaders import TextLoader\n", @@ -226,7 +275,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "ddbcee77", "metadata": { "ExecuteTime": { @@ -234,7 +283,18 @@ "start_time": "2023-06-03T08:29:43.040831Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6007716611401479 {'doc_id': 0, 'some': 'metadata'} Tonight. I call on t...\n", + "0.6007716611401479 {'doc_id': 0, 'some': 'metadata'} Tonight. I call on t...\n", + "0.6497183251536313 {'doc_id': 0, 'some': 'metadata'} A former top litigat...\n", + "0.6497183251536313 {'doc_id': 0, 'some': 'metadata'} A former top litigat...\n" + ] + } + ], "source": [ "meta = docsearch.metadata_column\n", "output = docsearch.similarity_search_with_relevance_scores('What did the president say about Ketanji Brown Jackson?', \n", @@ -253,7 +313,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "fb6a9d36", "metadata": { "ExecuteTime": { From 9151667c8e8ea1602b5205d65a1c0b987336564e Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Sat, 3 Jun 2023 02:26:11 -0700 Subject: [PATCH 04/10] Fix clickhouse doc link --- langchain/vectorstores/clickhouse.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/langchain/vectorstores/clickhouse.py b/langchain/vectorstores/clickhouse.py index ac7176fba64cc..f22ede26612e8 100644 --- a/langchain/vectorstores/clickhouse.py +++ b/langchain/vectorstores/clickhouse.py @@ -41,6 +41,7 @@ class ClickhouseSettings(BaseSettings): metric (str) : Metric to compute distance, supported are ('angular', 'euclidean', 'manhattan', 'hamming', 'dot'). Defaults to 'angular'. https://github.com/spotify/annoy/blob/main/src/annoymodule.cc#L149-L169 + column_map (Dict) : Column type map to project column name onto langchain semantics. 
Must have keys: `text`, `id`, `vector`, must be same size to number of columns. For example: @@ -98,7 +99,7 @@ class Clickhouse(VectorStore): constraints and even sub-queries. For more information, please visit - [ClickHouse official site](https://docs.ClickHouse.com/en/overview/) + [ClickHouse official site](https://clickhouse.com/clickhouse) """ def __init__( From cb8d44742932ff771b67d1cbf62d79f1cabad786 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Sat, 3 Jun 2023 21:52:47 -0700 Subject: [PATCH 05/10] Fix pylint errors --- langchain/vectorstores/clickhouse.py | 12 +++++------- .../vectorstores/test_clickhouse.py | 3 +++ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/langchain/vectorstores/clickhouse.py b/langchain/vectorstores/clickhouse.py index f22ede26612e8..7ca236bf56c91 100644 --- a/langchain/vectorstores/clickhouse.py +++ b/langchain/vectorstores/clickhouse.py @@ -8,7 +8,6 @@ from threading import Thread from typing import Any, Dict, Iterable, List, Optional, Tuple, Union -import six from pydantic import BaseSettings from langchain.docstore.document import Document @@ -65,7 +64,7 @@ class ClickhouseSettings(BaseSettings): index_type: str = "annoy" # Annoy supports L2Distance and cosineDistance. 
- index_param: Optional[List[str]] = [100, "'L2Distance'"] + index_param: Optional[Union[List, Dict]] = [100, "'L2Distance'"] index_query_params: Dict[str, str] = {} column_map: Dict[str, str] = { @@ -151,10 +150,9 @@ def __init__( index_params = ( ",".join([f"'{k}={v}'" for k, v in self.config.index_param.items()]) - if self.config.index_param - else "" + if self.config.index_param else "" ) if isinstance(self.config.index_param, Dict) else \ - ",".join(self.config.index_param) if isinstance(self.config.index_param, List) \ + ",".join([str(p) for p in self.config.index_param]) if isinstance(self.config.index_param, List) \ else self.config.index_param schema_ = f""" @@ -326,8 +324,8 @@ def _build_query_sql( settings_strs = [] if self.config.index_query_params: - for k,v in six.iteritems(self.config.index_query_params): - settings_strs.append(f"SETTING {k}={v}") + for k in self.config.index_query_params: + settings_strs.append(f"SETTING {k}={self.config.index_query_params[k]}") q_str = f""" SELECT {self.config.column_map['text']}, {self.config.column_map['metadata']}, dist diff --git a/tests/integration_tests/vectorstores/test_clickhouse.py b/tests/integration_tests/vectorstores/test_clickhouse.py index 6bc0370b29e26..c8ccc7e5a8cc9 100644 --- a/tests/integration_tests/vectorstores/test_clickhouse.py +++ b/tests/integration_tests/vectorstores/test_clickhouse.py @@ -5,6 +5,7 @@ from langchain.vectorstores import Clickhouse, ClickhouseSettings from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings + def test_clickhouse() -> None: """Test end to end construction and search.""" texts = ["foo", "bar", "baz"] @@ -15,6 +16,7 @@ def test_clickhouse() -> None: assert output == [Document(page_content="foo", metadata={"_dummy": 0})] docsearch.drop() + @pytest.mark.asyncio async def test_clickhouse_async() -> None: """Test end to end construction and search.""" @@ -28,6 +30,7 @@ async def test_clickhouse_async() -> None: assert output == 
[Document(page_content="foo", metadata={"_dummy": 0})] docsearch.drop() + def test_clickhouse_with_metadatas() -> None: """Test end to end construction and search.""" texts = ["foo", "bar", "baz"] From c621b23e8f3cf7c4b5bbc66577de4e4bf0ad8471 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Sun, 4 Jun 2023 00:57:51 -0700 Subject: [PATCH 06/10] Optimize clickhouse table schema --- langchain/vectorstores/clickhouse.py | 39 +++++++++++++++------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/langchain/vectorstores/clickhouse.py b/langchain/vectorstores/clickhouse.py index 7ca236bf56c91..ee90ee3c90632 100644 --- a/langchain/vectorstores/clickhouse.py +++ b/langchain/vectorstores/clickhouse.py @@ -48,8 +48,9 @@ class ClickhouseSettings(BaseSettings): { 'id': 'text_id', - 'vector': 'text_embedding', - 'text': 'text_plain', + 'uuid': 'global_unique_id' + 'embedding': 'text_embedding', + 'document': 'text_plain', 'metadata': 'metadata_dictionary_in_json', } @@ -69,8 +70,9 @@ class ClickhouseSettings(BaseSettings): column_map: Dict[str, str] = { "id": "id", - "text": "text", - "vector": "vector", + "uuid": "uuid", + "document": "document", + "embedding": "embedding", "metadata": "metadata", } @@ -141,7 +143,7 @@ def __init__( and self.config.table and self.config.metric ) - for k in ["id", "vector", "text", "metadata"]: + for k in ["id", "embedding", "document", "metadata", "uuid"]: assert k in self.config.column_map assert self.config.metric in ["angular", "euclidean", "manhattan", "hamming", "dot"] @@ -157,13 +159,14 @@ def __init__( schema_ = f""" CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}( - {self.config.column_map['id']} String, - {self.config.column_map['text']} String, - {self.config.column_map['vector']} Array(Float32), + {self.config.column_map['id']} Nullable(String), + {self.config.column_map['document']} Nullable(String), + {self.config.column_map['embedding']} Array(Float32), {self.config.column_map['metadata']} 
JSON, - CONSTRAINT cons_vec_len CHECK length({self.config.column_map['vector']}) = {dim}, - INDEX vec_idx {self.config.column_map['vector']} TYPE {self.config.index_type}({index_params}) GRANULARITY 1000 - ) ENGINE = MergeTree ORDER BY {self.config.column_map['id']} + {self.config.column_map['uuid']} UUID DEFAULT generateUUIDv4(), + CONSTRAINT cons_vec_len CHECK length({self.config.column_map['embedding']}) = {dim}, + INDEX vec_idx {self.config.column_map['embedding']} TYPE {self.config.index_type}({index_params}) GRANULARITY 1000 + ) ENGINE = MergeTree ORDER BY uuid SETTINGS index_granularity = 8192 """ self.dim = dim @@ -233,8 +236,8 @@ def add_texts( transac = [] column_names = { colmap_["id"]: ids, - colmap_["text"]: texts, - colmap_["vector"]: self.embedding_function.embed_documents(list(texts)), + colmap_["document"]: texts, + colmap_["embedding"]: self.embedding_function.embed_documents(list(texts)), } metadatas = metadatas or [{} for _ in texts] column_names[colmap_["metadata"]] = map(json.dumps, metadatas) @@ -245,7 +248,7 @@ def add_texts( for v in self.pgbar( zip(*values), desc="Inserting data...", total=len(metadatas) ): - assert len(v[keys.index(self.config.column_map["vector"])]) == self.dim + assert len(v[keys.index(self.config.column_map["embedding"])]) == self.dim transac.append(v) if len(transac) == batch_size: if t: @@ -327,11 +330,11 @@ def _build_query_sql( for k in self.config.index_query_params: settings_strs.append(f"SETTING {k}={self.config.index_query_params[k]}") q_str = f""" - SELECT {self.config.column_map['text']}, + SELECT {self.config.column_map['document']}, {self.config.column_map['metadata']}, dist FROM {self.config.database}.{self.config.table} {where_str} - ORDER BY L2Distance({self.config.column_map['vector']}, [{q_emb_str}]) + ORDER BY L2Distance({self.config.column_map['embedding']}, [{q_emb_str}]) AS dist {self.dist_order} LIMIT {topk} {' '.join(settings_strs)} """ @@ -387,7 +390,7 @@ def similarity_search_by_vector( try: 
return [ Document( - page_content=r[self.config.column_map["text"]], + page_content=r[self.config.column_map["document"]], metadata=r[self.config.column_map["metadata"]], ) for r in self.client.query(q_str).named_results() @@ -420,7 +423,7 @@ def similarity_search_with_relevance_scores( return [ ( Document( - page_content=r[self.config.column_map["text"]], + page_content=r[self.config.column_map["document"]], metadata=r[self.config.column_map["metadata"]], ), r["dist"], From 6075525f3248bbd43e5edc008004a7a02fbe1a43 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Sun, 4 Jun 2023 01:14:18 -0700 Subject: [PATCH 07/10] Improve clickhouse notebook --- .../vectorstores/examples/clickhouse.ipynb | 65 ++++++++++++++----- langchain/vectorstores/clickhouse.py | 25 ++++--- 2 files changed, 61 insertions(+), 29 deletions(-) diff --git a/docs/modules/indexes/vectorstores/examples/clickhouse.ipynb b/docs/modules/indexes/vectorstores/examples/clickhouse.ipynb index 272fe0143c737..cdd246c72c442 100644 --- a/docs/modules/indexes/vectorstores/examples/clickhouse.ipynb +++ b/docs/modules/indexes/vectorstores/examples/clickhouse.ipynb @@ -143,7 +143,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Inserting data...: 100%|██████████| 42/42 [00:00<00:00, 5516.23it/s]\n" + "Inserting data...: 100%|██████████| 42/42 [00:00<00:00, 3192.65it/s]\n" ] } ], @@ -197,23 +197,25 @@ "ExecuteTime": { "end_time": "2023-06-03T08:28:58.252991Z", "start_time": "2023-06-03T08:28:58.197560Z" - } + }, + "scrolled": true }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001B[92m\u001B[1mdefault.clickhouse_vector_search_example @ localhost:8123\u001B[0m\n", + "\u001b[92m\u001b[1mdefault.clickhouse_vector_search_example @ localhost:8123\u001b[0m\n", "\n", - "\u001B[1musername: None\u001B[0m\n", + "\u001b[1musername: None\u001b[0m\n", "\n", "Table Schema:\n", "---------------------------------------------------\n", - "|\u001B[94mid \u001B[0m|\u001B[96mString \u001B[0m|\n", 
- "|\u001B[94mtext \u001B[0m|\u001B[96mString \u001B[0m|\n", - "|\u001B[94mvector \u001B[0m|\u001B[96mArray(Float32) \u001B[0m|\n", - "|\u001B[94mmetadata \u001B[0m|\u001B[96mObject('json') \u001B[0m|\n", + "|\u001b[94mid \u001b[0m|\u001b[96mNullable(String) \u001b[0m|\n", + "|\u001b[94mdocument \u001b[0m|\u001b[96mNullable(String) \u001b[0m|\n", + "|\u001b[94membedding \u001b[0m|\u001b[96mArray(Float32) \u001b[0m|\n", + "|\u001b[94mmetadata \u001b[0m|\u001b[96mObject('json') \u001b[0m|\n", + "|\u001b[94muuid \u001b[0m|\u001b[96mUUID \u001b[0m|\n", "---------------------------------------------------\n", "\n" ] @@ -223,6 +225,37 @@ "print(str(docsearch))" ] }, + { + "cell_type": "code", + "execution_count": 16, + "id": "54f4f561", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Clickhouse Table DDL:\n", + "\n", + "CREATE TABLE IF NOT EXISTS default.langchain(\n", + " id Nullable(String),\n", + " document Nullable(String),\n", + " embedding Array(Float32),\n", + " metadata JSON,\n", + " uuid UUID DEFAULT generateUUIDv4(),\n", + " CONSTRAINT cons_vec_len CHECK length(embedding) = 1536,\n", + " INDEX vec_idx embedding TYPE annoy(100,'L2Distance') GRANULARITY 1000\n", + ") ENGINE = MergeTree ORDER BY uuid\n", + "SETTINGS index_granularity = 8192\n" + ] + } + ], + "source": [ + "print(f\"Clickhouse Table DDL:\\n\\n{docsearch.schema}\")" + ] + }, { "cell_type": "markdown", "id": "f59360c0", @@ -239,7 +272,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 11, "id": "232055f6", "metadata": { "ExecuteTime": { @@ -252,7 +285,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Inserting data...: 100%|██████████| 42/42 [00:00<00:00, 6234.90it/s]\n" + "Inserting data...: 100%|██████████| 42/42 [00:00<00:00, 3815.40it/s]\n" ] } ], @@ -275,7 +308,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 12, "id": "ddbcee77", "metadata": { "ExecuteTime": { 
@@ -288,10 +321,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "0.6007716611401479 {'doc_id': 0, 'some': 'metadata'} Tonight. I call on t...\n", - "0.6007716611401479 {'doc_id': 0, 'some': 'metadata'} Tonight. I call on t...\n", - "0.6497183251536313 {'doc_id': 0, 'some': 'metadata'} A former top litigat...\n", - "0.6497183251536313 {'doc_id': 0, 'some': 'metadata'} A former top litigat...\n" + "0.6778053245115545 {'doc_id': 0} Madam Speaker, Madam...\n", + "0.699764931319791 {'doc_id': 8} And so many families...\n", + "0.7041348358644341 {'doc_id': 1} Groups of citizens b...\n", + "0.7053751184302747 {'doc_id': 6} And I’m taking robus...\n" ] } ], @@ -313,7 +346,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 13, "id": "fb6a9d36", "metadata": { "ExecuteTime": { diff --git a/langchain/vectorstores/clickhouse.py b/langchain/vectorstores/clickhouse.py index ee90ee3c90632..5ee85fd31e1b5 100644 --- a/langchain/vectorstores/clickhouse.py +++ b/langchain/vectorstores/clickhouse.py @@ -157,18 +157,17 @@ def __init__( ",".join([str(p) for p in self.config.index_param]) if isinstance(self.config.index_param, List) \ else self.config.index_param - schema_ = f""" - CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}( - {self.config.column_map['id']} Nullable(String), - {self.config.column_map['document']} Nullable(String), - {self.config.column_map['embedding']} Array(Float32), - {self.config.column_map['metadata']} JSON, - {self.config.column_map['uuid']} UUID DEFAULT generateUUIDv4(), - CONSTRAINT cons_vec_len CHECK length({self.config.column_map['embedding']}) = {dim}, - INDEX vec_idx {self.config.column_map['embedding']} TYPE {self.config.index_type}({index_params}) GRANULARITY 1000 - ) ENGINE = MergeTree ORDER BY uuid - SETTINGS index_granularity = 8192 - """ + self.schema = f"""\ +CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}( + {self.config.column_map['id']} Nullable(String), + 
{self.config.column_map['document']} Nullable(String), + {self.config.column_map['embedding']} Array(Float32), + {self.config.column_map['metadata']} JSON, + {self.config.column_map['uuid']} UUID DEFAULT generateUUIDv4(), + CONSTRAINT cons_vec_len CHECK length({self.config.column_map['embedding']}) = {dim}, + INDEX vec_idx {self.config.column_map['embedding']} TYPE {self.config.index_type}({index_params}) GRANULARITY 1000 +) ENGINE = MergeTree ORDER BY uuid SETTINGS index_granularity = 8192\ +""" self.dim = dim self.BS = "\\" self.must_escape = ("\\", "'") @@ -187,7 +186,7 @@ def __init__( self.client.command("SET allow_experimental_object_type=1") # Enable Annoy index self.client.command("SET allow_experimental_annoy_index=1") - self.client.command(schema_) + self.client.command(self.schema) def escape_str(self, value: str) -> str: return "".join(f"{self.BS}{c}" if c in self.must_escape else c for c in value) From acc7b5bcbc4ecfc3c654e711357d59ca47f8bebb Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Sun, 4 Jun 2023 01:44:02 -0700 Subject: [PATCH 08/10] Add clickhouse table guide for distributed cluster with sharding --- .../vectorstores/examples/clickhouse.ipynb | 45 ++++++++++++------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/docs/modules/indexes/vectorstores/examples/clickhouse.ipynb b/docs/modules/indexes/vectorstores/examples/clickhouse.ipynb index cdd246c72c442..a256e74ac81df 100644 --- a/docs/modules/indexes/vectorstores/examples/clickhouse.ipynb +++ b/docs/modules/indexes/vectorstores/examples/clickhouse.ipynb @@ -143,7 +143,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Inserting data...: 100%|██████████| 42/42 [00:00<00:00, 3192.65it/s]\n" + "Inserting data...: 100%|██████████| 42/42 [00:00<00:00, 2801.49it/s]\n" ] } ], @@ -198,7 +198,7 @@ "end_time": "2023-06-03T08:28:58.252991Z", "start_time": "2023-06-03T08:28:58.197560Z" }, - "scrolled": true + "scrolled": false }, "outputs": [ { @@ -225,12 +225,28 @@ 
"print(str(docsearch))" ] }, + { + "cell_type": "markdown", + "id": "324ac147", + "metadata": {}, + "source": [ + "### Clickhouse table schema" + ] + }, + { + "cell_type": "markdown", + "id": "b5bd7c5b", + "metadata": {}, + "source": [ + "> Clickhouse table will be automatically created if not exist by default. Advanced users could pre-create the table with optimized settings. For distributed Clickhouse cluster with sharding, table engine should be configured as `Distributed`." + ] + }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 8, "id": "54f4f561", "metadata": { - "scrolled": true + "scrolled": false }, "outputs": [ { @@ -239,7 +255,7 @@ "text": [ "Clickhouse Table DDL:\n", "\n", - "CREATE TABLE IF NOT EXISTS default.langchain(\n", + "CREATE TABLE IF NOT EXISTS default.clickhouse_vector_search_example(\n", " id Nullable(String),\n", " document Nullable(String),\n", " embedding Array(Float32),\n", @@ -247,8 +263,7 @@ " uuid UUID DEFAULT generateUUIDv4(),\n", " CONSTRAINT cons_vec_len CHECK length(embedding) = 1536,\n", " INDEX vec_idx embedding TYPE annoy(100,'L2Distance') GRANULARITY 1000\n", - ") ENGINE = MergeTree ORDER BY uuid\n", - "SETTINGS index_granularity = 8192\n" + ") ENGINE = MergeTree ORDER BY uuid SETTINGS index_granularity = 8192\n" ] } ], @@ -272,7 +287,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "id": "232055f6", "metadata": { "ExecuteTime": { @@ -285,7 +300,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Inserting data...: 100%|██████████| 42/42 [00:00<00:00, 3815.40it/s]\n" + "Inserting data...: 100%|██████████| 42/42 [00:00<00:00, 6939.56it/s]\n" ] } ], @@ -308,7 +323,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "id": "ddbcee77", "metadata": { "ExecuteTime": { @@ -321,10 +336,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "0.6778053245115545 {'doc_id': 0} Madam Speaker, Madam...\n", - "0.699764931319791 {'doc_id': 8} And 
so many families...\n", - "0.7041348358644341 {'doc_id': 1} Groups of citizens b...\n", - "0.7053751184302747 {'doc_id': 6} And I’m taking robus...\n" + "0.6779101415357189 {'doc_id': 0} Madam Speaker, Madam...\n", + "0.6997970363474885 {'doc_id': 8} And so many families...\n", + "0.7044504914336727 {'doc_id': 1} Groups of citizens b...\n", + "0.7053558702165094 {'doc_id': 6} And I’m taking robus...\n" ] } ], @@ -346,7 +361,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "id": "fb6a9d36", "metadata": { "ExecuteTime": { From c1088e99dc1ea93846f4f7f548b4d2eb55b17ada Mon Sep 17 00:00:00 2001 From: Dev 2049 Date: Sun, 4 Jun 2023 16:18:20 -0700 Subject: [PATCH 09/10] fmt --- langchain/vectorstores/__init__.py | 4 ++-- langchain/vectorstores/clickhouse.py | 33 +++++++++++++++++++++------- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/langchain/vectorstores/__init__.py b/langchain/vectorstores/__init__.py index fd986b30c62f5..c1e48b2d79369 100644 --- a/langchain/vectorstores/__init__.py +++ b/langchain/vectorstores/__init__.py @@ -4,6 +4,7 @@ from langchain.vectorstores.atlas import AtlasDB from langchain.vectorstores.base import VectorStore from langchain.vectorstores.chroma import Chroma +from langchain.vectorstores.clickhouse import Clickhouse, ClickhouseSettings from langchain.vectorstores.deeplake import DeepLake from langchain.vectorstores.docarray import DocArrayHnswSearch, DocArrayInMemorySearch from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch @@ -23,7 +24,6 @@ from langchain.vectorstores.vectara import Vectara from langchain.vectorstores.weaviate import Weaviate from langchain.vectorstores.zilliz import Zilliz -from langchain.vectorstores.clickhouse import Clickhouse, ClickhouseSettings __all__ = [ "Redis", @@ -53,5 +53,5 @@ "DocArrayInMemorySearch", "Typesense", "Clickhouse", - "ClickhouseSettings" + "ClickhouseSettings", ] diff --git a/langchain/vectorstores/clickhouse.py 
b/langchain/vectorstores/clickhouse.py index 5ee85fd31e1b5..3880a403b83c5 100644 --- a/langchain/vectorstores/clickhouse.py +++ b/langchain/vectorstores/clickhouse.py @@ -16,12 +16,14 @@ logger = logging.getLogger() + def has_mul_sub_str(s: str, *args: Any) -> bool: for a in args: if a not in s: return False return True + class ClickhouseSettings(BaseSettings): """ClickHouse Client Configuration @@ -145,17 +147,28 @@ def __init__( ) for k in ["id", "embedding", "document", "metadata", "uuid"]: assert k in self.config.column_map - assert self.config.metric in ["angular", "euclidean", "manhattan", "hamming", "dot"] + assert self.config.metric in [ + "angular", + "euclidean", + "manhattan", + "hamming", + "dot", + ] # initialize the schema dim = len(embedding.embed_query("test")) index_params = ( - ",".join([f"'{k}={v}'" for k, v in self.config.index_param.items()]) - if self.config.index_param else "" - ) if isinstance(self.config.index_param, Dict) else \ - ",".join([str(p) for p in self.config.index_param]) if isinstance(self.config.index_param, List) \ + ( + ",".join([f"'{k}={v}'" for k, v in self.config.index_param.items()]) + if self.config.index_param + else "" + ) + if isinstance(self.config.index_param, Dict) + else ",".join([str(p) for p in self.config.index_param]) + if isinstance(self.config.index_param, List) else self.config.index_param + ) self.schema = f"""\ CREATE TABLE IF NOT EXISTS {self.config.database}.{self.config.table}( @@ -172,7 +185,7 @@ def __init__( self.BS = "\\" self.must_escape = ("\\", "'") self.embedding_function = embedding - self.dist_order = "ASC" # Only support ConsingDistance and L2Distance + self.dist_order = "ASC" # Only support ConsingDistance and L2Distance # Create a connection to clickhouse self.client = get_client( @@ -247,7 +260,9 @@ def add_texts( for v in self.pgbar( zip(*values), desc="Inserting data...", total=len(metadatas) ): - assert len(v[keys.index(self.config.column_map["embedding"])]) == self.dim + assert ( + 
len(v[keys.index(self.config.column_map["embedding"])]) == self.dim + ) transac.append(v) if len(transac) == batch_size: if t: @@ -417,7 +432,9 @@ def similarity_search_with_relevance_scores( Returns: List[Document]: List of documents """ - q_str = self._build_query_sql(self.embedding_function.embed_query(query), k, where_str) + q_str = self._build_query_sql( + self.embedding_function.embed_query(query), k, where_str + ) try: return [ ( From 52837dd5096267553a6a460378ba8d51797a4a7e Mon Sep 17 00:00:00 2001 From: Dev 2049 Date: Sun, 4 Jun 2023 16:24:21 -0700 Subject: [PATCH 10/10] lint --- langchain/vectorstores/clickhouse.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/langchain/vectorstores/clickhouse.py b/langchain/vectorstores/clickhouse.py index 3880a403b83c5..ed179c8b00897 100644 --- a/langchain/vectorstores/clickhouse.py +++ b/langchain/vectorstores/clickhouse.py @@ -40,7 +40,8 @@ class ClickhouseSettings(BaseSettings): table (str) : Table name to operate on. Defaults to 'vector_table'. metric (str) : Metric to compute distance, - supported are ('angular', 'euclidean', 'manhattan', 'hamming', 'dot'). Defaults to 'angular'. + supported are ('angular', 'euclidean', 'manhattan', 'hamming', + 'dot'). Defaults to 'angular'. 
https://github.com/spotify/annoy/blob/main/src/annoymodule.cc#L149-L169 column_map (Dict) : Column type map to project column name onto langchain @@ -178,7 +179,8 @@ def __init__( {self.config.column_map['metadata']} JSON, {self.config.column_map['uuid']} UUID DEFAULT generateUUIDv4(), CONSTRAINT cons_vec_len CHECK length({self.config.column_map['embedding']}) = {dim}, - INDEX vec_idx {self.config.column_map['embedding']} TYPE {self.config.index_type}({index_params}) GRANULARITY 1000 + INDEX vec_idx {self.config.column_map['embedding']} TYPE \ +{self.config.index_type}({index_params}) GRANULARITY 1000 ) ENGINE = MergeTree ORDER BY uuid SETTINGS index_granularity = 8192\ """ self.dim = dim @@ -311,8 +313,8 @@ def from_texts( return ctx def __repr__(self) -> str: - """Text representation for ClickHouse Vector Store, prints backends, username and schemas. - Easy to use with `str(ClickHouse())` + """Text representation for ClickHouse Vector Store, prints backends, username + and schemas. Easy to use with `str(ClickHouse())` Returns: repr: string to show connection info and data schema