From 47aee5b5f95100fdc7b98d484bd96048fa62b6e1 Mon Sep 17 00:00:00 2001 From: wd0517 Date: Wed, 16 Oct 2024 13:38:26 +0000 Subject: [PATCH 1/6] update README --- README.md | 72 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index 4d26f5b..d7e447b 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ This is a Python client for TiDB Vector. -> Now only TiDB Cloud Serverless cluster support vector data type, see this [docs](https://docs.pingcap.com/tidbcloud/vector-search-overview?utm_source=github&utm_medium=tidb-vector-python) for more information. +Both TiDB Cloud Serverless ([doc](https://docs.pingcap.com/tidbcloud/vector-search-overview?utm_source=github&utm_medium=tidb-vector-python)) and TiDB Open Source Version (>= 8.4 DMR) support vector data type. ## Installation @@ -42,44 +42,52 @@ from tidb_vector.sqlalchemy import VectorType engine = create_engine('mysql://****.root:******@gateway01.xxxxxx.shared.aws.tidbcloud.com:4000/test') Base = declarative_base() -class Test(Base): - __tablename__ = 'test' +class Document(Base): + __tablename__ = 'sqlalchemy_documents' id = Column(Integer, primary_key=True) embedding = Column(VectorType(3)) -# or add hnsw index when creating table -class TestWithIndex(Base): - __tablename__ = 'test_with_index' - id = Column(Integer, primary_key=True) - embedding = Column(VectorType(3), comment="hnsw(distance=l2)") - Base.metadata.create_all(engine) ``` Insert vector data ```python -test = Test(embedding=[1, 2, 3]) -session.add(test) +doc = Document(embedding=[1, 2, 3]) +session.add(doc) session.commit() ``` Get the nearest neighbors ```python -session.scalars(select(Test).order_by(Test.embedding.l2_distance([1, 2, 3.1])).limit(5)) +session.scalars(select(Document).order_by(Document.embedding.l2_distance([1, 2, 3.1])).limit(5)) ``` Get the distance ```python -session.scalars(select(Test.embedding.l2_distance([1, 2, 3.1]))) 
+session.scalars(select(Document.embedding.l2_distance([1, 2, 3.1]))) ``` Get within a certain distance ```python -session.scalars(select(Test).filter(Test.embedding.l2_distance([1, 2, 3.1]) < 0.2)) +session.scalars(select(Document).filter(Document.embedding.l2_distance([1, 2, 3.1]) < 0.2)) +``` + +Add hnsw index + +```python +# vector index currently depends on tiflash +session.execute(text('ALTER TABLE sqlalchemy_documents SET TIFLASH REPLICA 1')) +index = Index( + 'idx_embedding', + func.vec_cosine_distance(Document.embedding), + mysql_prefix="vector", + mysql_using="hnsw" +) +index.create(engine) ``` ### Django @@ -119,48 +127,50 @@ db = MySQLDatabase( **connect_kwargs, ) -class TestModel(Model): - class Meta: - database = db - table_name = 'test' - +class DocumentModel(Model): embedding = VectorField(3) - -# or add hnsw index when creating table -class TestModelWithIndex(Model): class Meta: database = db - table_name = 'test_with_index' - - embedding = VectorField(3, constraints=[SQL("COMMENT 'hnsw(distance=l2)'")]) - + table_name = 'peewee_documents' db.connect() -db.create_tables([TestModel, TestModelWithIndex]) +db.create_tables([DocumentModel]) ``` Insert vector data ```python -TestModel.create(embedding=[1, 2, 3]) +DocumentModel.create(embedding=[1, 2, 3]) ``` Get the nearest neighbors ```python -TestModel.select().order_by(TestModel.embedding.l2_distance([1, 2, 3.1])).limit(5) +DocumentModel.select().order_by(DocumentModel.embedding.l2_distance([1, 2, 3.1])).limit(5) ``` Get the distance ```python -TestModel.select(TestModel.embedding.cosine_distance([1, 2, 3.1]).alias('distance')) +DocumentModel.select(DocumentModel.embedding.cosine_distance([1, 2, 3.1]).alias('distance')) ``` Get within a certain distance ```python -TestModel.select().where(TestModel.embedding.l2_distance([1, 2, 3.1]) < 0.5) +DocumentModel.select().where(DocumentModel.embedding.l2_distance([1, 2, 3.1]) < 0.5) +``` + +Add hnsw index + +```python +# vector index currently depends on 
tiflash +db.execute_sql(SQL( + "ALTER TABLE peewee_documents SET TIFLASH REPLICA 1;" +)) +TestModel.add_index(SQL( + "CREATE VECTOR INDEX idx_embedding ON peewee_documents ((vec_cosine_distance(embedding))) USING HNSW" +)) ``` ### TiDB Vector Client From 282fdd80b0916e63130fbf83c181e0ff431b30df Mon Sep 17 00:00:00 2001 From: wd0517 Date: Wed, 16 Oct 2024 14:03:08 +0000 Subject: [PATCH 2/6] fix lint --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9633d72..c6817c9 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -18,6 +18,7 @@ jobs: - name: Install dependencies run: | + export PIP_BREAK_SYSTEM_PACKAGES=1 python -m pip install --upgrade pip python -m pip install tox From 2beacd22cf3ef9edd6cc5f8754d82f92605b510f Mon Sep 17 00:00:00 2001 From: wd0517 Date: Wed, 16 Oct 2024 14:13:25 +0000 Subject: [PATCH 3/6] fix lint --- .github/workflows/ci.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c6817c9..8d87154 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,9 +16,13 @@ jobs: - name: Checkout uses: actions/checkout@v3 + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: 3.12 + - name: Install dependencies run: | - export PIP_BREAK_SYSTEM_PACKAGES=1 python -m pip install --upgrade pip python -m pip install tox From 5fa716dc864d2816f9a0404a6321ad6eba8a20d8 Mon Sep 17 00:00:00 2001 From: wd0517 Date: Wed, 16 Oct 2024 14:16:52 +0000 Subject: [PATCH 4/6] upgrade flake8 --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 8d85f5f..e2d1c62 100644 --- a/tox.ini +++ b/tox.ini @@ -25,7 +25,7 @@ setenv = skip_install = True allowlist_externals = bash deps = - flake8==6.0.0 + flake8==7.1.1 black==23.7.0 commands = bash -c "flake8 --max-line-length 130 tidb_vector tests" From 
8745e4c6e1f5a43ac6a470e643905906280fba22 Mon Sep 17 00:00:00 2001 From: WD Date: Thu, 17 Oct 2024 09:09:16 +0800 Subject: [PATCH 5/6] Update README.md Co-authored-by: Mini256 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d7e447b..b0d7f2c 100644 --- a/README.md +++ b/README.md @@ -168,7 +168,7 @@ Add hnsw index db.execute_sql(SQL( "ALTER TABLE peewee_documents SET TIFLASH REPLICA 1;" )) -TestModel.add_index(SQL( +DocumentModel.add_index(SQL( "CREATE VECTOR INDEX idx_embedding ON peewee_documents ((vec_cosine_distance(embedding))) USING HNSW" )) ``` From 3a3c9814aefc7c010e94baf1794a09b67bd54185 Mon Sep 17 00:00:00 2001 From: WD Date: Thu, 17 Oct 2024 13:32:45 +0000 Subject: [PATCH 6/6] avoid explicitly using HNSW --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b0d7f2c..1e27d7a 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ Get within a certain distance session.scalars(select(Document).filter(Document.embedding.l2_distance([1, 2, 3.1]) < 0.2)) ``` -Add hnsw index +Add vector index to speed up query ```python # vector index currently depends on tiflash @@ -85,7 +85,6 @@ index = Index( 'idx_embedding', func.vec_cosine_distance(Document.embedding), mysql_prefix="vector", - mysql_using="hnsw" ) index.create(engine) ``` @@ -161,7 +160,7 @@ Get within a certain distance DocumentModel.select().where(DocumentModel.embedding.l2_distance([1, 2, 3.1]) < 0.5) ``` -Add hnsw index +Add vector index to speed up query ```python # vector index currently depends on tiflash @@ -169,7 +168,7 @@ db.execute_sql(SQL( "ALTER TABLE peewee_documents SET TIFLASH REPLICA 1;" )) DocumentModel.add_index(SQL( - "CREATE VECTOR INDEX idx_embedding ON peewee_documents ((vec_cosine_distance(embedding))) USING HNSW" + "CREATE VECTOR INDEX idx_embedding ON peewee_documents ((vec_cosine_distance(embedding)))" )) ```