Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add rag stack definition, tembo-py #560

Merged
merged 3 commits into from
Feb 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 162 additions & 0 deletions tembo-operator/src/stacks/templates/rag.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# Stack identity: name/description shown to users, plus the registry the
# per-Postgres-major images are pulled from.
name: rag
description: A Tembo Postgres Stack configured for retrieval augmented generation (RAG).
repository: "quay.io/tembo"
organization: tembo
images:
  # Keys are Postgres major versions; all three tags pin the same
  # standard-cnpg build (a0a5ab5).
  # NOTE(review): keys are unquoted ints — the operator evidently accepts
  # that; do not quote without confirming the deserialized key type.
  14: "standard-cnpg:14-a0a5ab5"
  15: "standard-cnpg:15-a0a5ab5"
  16: "standard-cnpg:16-a0a5ab5"
stack_version: 0.1.0
appServices:
  # Sidecar service that serves sentence-transformer embeddings over HTTP.
  - image: quay.io/tembo/vector-serve:32ce013
    name: embeddings
    env:
      # Model downloads and caches live on the ephemeral /models volume.
      - name: TMPDIR
        value: /models
      - name: BATCH_SIZE
        value: "1000"
      - name: XDG_CACHE_HOME
        value: /models/.cache
    routing:
      # Expose the service under /vectordb on the instance's ingress.
      - port: 3000
        ingressPath: /vectordb
        middlewares:
          - map-embeddings
    middlewares:
      # Rewrite the public /vectordb path to the container's /v1/embeddings
      # endpoint. Regex quoted so the backslashes stay an unambiguous plain
      # string (single quotes: no escape processing, value unchanged).
      - !replacePathRegex
        name: map-embeddings
        config:
          regex: '^\/vectordb\/?'
          replacement: /v1/embeddings
    resources:
      requests:
        cpu: 500m
        memory: 1500Mi
      limits:
        cpu: 4000m
        memory: 1500Mi
    storage:
      volumeMounts:
        - mountPath: /models
          name: hf-data-vol
      volumes:
        # Ephemeral volume for the Hugging Face model cache; recreated on
        # pod restart (models are re-downloaded).
        - ephemeral:
            volumeClaimTemplate:
              spec:
                accessModes:
                  - ReadWriteOnce
                resources:
                  requests:
                    storage: 2Gi
          name: hf-data-vol
# Extensions installed from Trunk at instance creation. Versions here must
# stay in sync with the `extensions` list below.
trunk_installs:
  - name: pgmq
    version: 1.1.1
  - name: vectorize
    version: 0.10.0
  - name: pgvector
    version: 0.6.0
  - name: pg_stat_statements
    version: 1.10.0
# Extensions enabled in the `postgres` database. `vector` (pgvector) and
# `pg_cron` back the `vectorize` extension's embeddings and scheduling.
extensions:
  - name: vector
    locations:
      - database: postgres
        enabled: true
        version: 0.6.0
  - name: pg_cron
    locations:
      - database: postgres
        enabled: true
        version: 1.5.2
  - name: pgmq
    locations:
      - database: postgres
        enabled: true
        version: 1.1.1
  - name: vectorize
    locations:
      - database: postgres
        enabled: true
        version: 0.10.0
  - name: pg_stat_statements
    locations:
      - database: postgres
        enabled: true
        version: 1.10.0
# Allowed CPU/memory combinations per instance class, keeping fixed
# memory-per-CPU ratios: GeneralPurpose 4Gi/cpu, ComputeOptimized 2Gi/cpu,
# MemoryOptimized 8Gi/cpu.
compute_templates:
  - cpu: 0.25
    memory: 1Gi
    instance_class: GeneralPurpose
  - cpu: 0.5
    memory: 2Gi
    instance_class: GeneralPurpose
  - cpu: 1
    memory: 4Gi
    instance_class: GeneralPurpose
  - cpu: 2
    memory: 8Gi
    instance_class: GeneralPurpose
  - cpu: 4
    memory: 16Gi
    instance_class: GeneralPurpose
  - cpu: 8
    memory: 32Gi
    instance_class: GeneralPurpose
  - cpu: 0.5
    memory: 1Gi
    instance_class: ComputeOptimized
  - cpu: 1
    memory: 2Gi
    instance_class: ComputeOptimized
  - cpu: 2
    memory: 4Gi
    instance_class: ComputeOptimized
  - cpu: 4
    memory: 8Gi
    instance_class: ComputeOptimized
  - cpu: 8
    memory: 16Gi
    instance_class: ComputeOptimized
  - cpu: 0.5
    memory: 4Gi
    instance_class: MemoryOptimized
  - cpu: 1
    memory: 8Gi
    instance_class: MemoryOptimized
  - cpu: 2
    memory: 16Gi
    instance_class: MemoryOptimized
  - cpu: 4
    memory: 32Gi
    instance_class: MemoryOptimized
postgres_config_engine: standard
# Postgres settings applied to the instance. `cron.host` / `vectorize.host`
# point at the in-pod Unix socket directory used by the operator.
postgres_config:
  - name: cron.host
    value: /controller/run
  - name: vectorize.host
    value: postgresql:///postgres?host=/controller/run
  # Aggressive autovacuum tuning for write-heavy embedding workloads.
  # NOTE(review): -1 and 0.05 are unquoted and parse as numbers — confirm the
  # operator's config schema accepts non-string values here.
  - name: autovacuum_vacuum_cost_limit
    value: -1
  - name: autovacuum_vacuum_scale_factor
    value: 0.05
  - name: autovacuum_vacuum_insert_scale_factor
    value: 0.05
  - name: autovacuum_analyze_scale_factor
    value: 0.05
  - name: checkpoint_timeout
    value: 10min
  - name: track_activity_query_size
    value: 2048
  # 'on' is quoted deliberately — unquoted it parses as YAML boolean true.
  - name: wal_compression
    value: 'on'
  - name: track_io_timing
    value: 'on'
  - name: log_min_duration_statement # https://www.postgresql.org/docs/15/runtime-config-logging.html
    value: 1000
  - name: pg_stat_statements.track
    value: all
  - name: shared_preload_libraries
    value: vectorize,pg_stat_statements,pg_cron
  # ${NAMESPACE} is substituted by the operator at deploy time; routes
  # embedding requests to the `embeddings` appService defined above.
  - name: vectorize.embedding_service_url
    value: http://${NAMESPACE}-embeddings.${NAMESPACE}.svc.cluster.local:3000/v1/embeddings
26 changes: 26 additions & 0 deletions tembo-py/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# local data directories
data/

# Python bytecode caches
**/*/__pycache__/
# test, coverage, and type-check artifacts
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
coverage-junit-unit-tests.xml
coverage-report-unit-tests.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cov.xml
# editor settings
.vscode
*.json

# py files on root of project
*.py

# files and dirs beginning with an underscore
_*
23 changes: 23 additions & 0 deletions tembo-py/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Developer tasks for the tembo-py package (poetry-managed).
SOURCE_OBJECTS=tembo_py tests

# None of these targets produce a file of the same name; declare them phony
# so make always runs them (and never skips because such a file exists).
.PHONY: format check test run.postgres

# Auto-fix lint findings, then reformat the source tree.
format:
	poetry run ruff check --silent --fix --exit-zero ${SOURCE_OBJECTS}
	poetry run black ${SOURCE_OBJECTS}


# Lint, format-check, and type-check without modifying files (CI gate).
check:
	poetry run ruff check ${SOURCE_OBJECTS}
	poetry run black --check ${SOURCE_OBJECTS}
	poetry run mypy ${SOURCE_OBJECTS}

# Run unit tests with coverage; integration tests are excluded.
test:
	poetry run pytest -s \
		--ignore=tests/integration_tests \
		--cov=./ \
		--cov-report=xml:coverage-report-unit-tests.xml \
		--junitxml=coverage-junit-unit-tests.xml \
		--cov-report term


# Start a local Postgres container with the vectorize extension preinstalled.
run.postgres:
	docker run -p 5432:5432 --name tembo-postgres -e POSTGRES_PASSWORD=postgres -d quay.io/tembo/vectorize-pg:latest
12 changes: 12 additions & 0 deletions tembo-py/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# tembo-py

The official Python client for Tembo.io


## Installation

```bash
pip install tembo-py
```


23 changes: 23 additions & 0 deletions tembo-py/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
[tool.poetry]
name = "tembo-py"
version = "0.1.0"
description = "The official Python client for Tembo.io"
authors = ["Adam Hendel <adam@tembo.io>"]
# Valid SPDX identifier (was "Apache-2", which is not a recognized SPDX id).
license = "Apache-2.0"
readme = "README.md"

[tool.poetry.dependencies]
# NOTE(review): tembo_py/rag.py uses builtin generics (list[tuple[...]])
# which require Python >= 3.9 at runtime — consider raising this floor or
# adding `from __future__ import annotations` there.
python = ">=3.8.1,<3.12"
llama-index = "^0.10.9"
llama-index-vector-stores-postgres = "^0.1.1"
tiktoken = "^0.6.0"
psycopg = "^3.1.18"

[tool.poetry.group.dev.dependencies]
ruff = "^0.2.2"
black = "^24.2.0"
mypy = "^1.8.0"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Empty file added tembo-py/tembo_py/__init__.py
Empty file.
144 changes: 144 additions & 0 deletions tembo-py/tembo_py/rag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
from dataclasses import dataclass, field
import json
import logging
from typing import Optional

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
import psycopg


@dataclass
class TemboRAGcontroller:
    """Prepares documents and loads them into a Tembo Postgres instance for
    retrieval augmented generation (RAG) via the `vectorize` extension.

    Typical flow::

        rag = TemboRAGcontroller("my_project", connection_string=conn_str)
        chunks = rag.prepare_from_directory("./docs")
        rag.load_documents(chunks)
        rag.init_rag()

    Attributes:
        project_name: Name of the RAG project; also used to derive the
            backing table name (``vectorize._data_<project_name>``).
        chunk_size: Maximum chunk size for the sentence splitter; defaults
            to the chat model's context window (see ``get_context_size``).
        chat_model: Chat model name used only to pick a default chunk size.
        sentence_transformer: Embedding model passed to ``vectorize.init_rag``.
        connection_string: Default Postgres connection string; methods accept
            a per-call override.
    """

    project_name: str
    chunk_size: Optional[int] = None
    chat_model: str = "gpt-3.5-turbo"
    sentence_transformer: str = "sentence-transformers/all-MiniLM-L12-v2"
    connection_string: Optional[str] = None
    # Template for the backing table; formatted with project_name before use.
    _table_name: str = "vectorize._data_{project_name}"

    # Constructed in __post_init__ once the effective chunk size is known.
    sentence_splitter: SentenceSplitter = field(
        default_factory=SentenceSplitter, init=False
    )

    def __post_init__(self):
        # Fall back to the chat model's context window when no explicit
        # chunk size was provided.
        chunk_size = self.chunk_size or get_context_size(self.chat_model)
        self.sentence_splitter = SentenceSplitter(chunk_size=chunk_size)
        self.chunk_size = chunk_size

    def prepare_from_directory(
        self, document_dir: str, **kwargs
    ) -> list[tuple[str, str, str, str]]:
        """Read all documents in `document_dir` and split them into chunks.

        Extra keyword arguments are forwarded to
        ``SentenceSplitter.get_nodes_from_documents``.

        Returns:
            Rows of ``(document_name, chunk_id, metadata_json, content)``
            ready for ``load_documents``.
        """
        documents = SimpleDirectoryReader(document_dir).load_data()
        chunks = self.sentence_splitter.get_nodes_from_documents(documents, **kwargs)
        # json.dumps returns a str, so each row is (str, str, str, str) —
        # matching the declared return type (the metadata element was
        # previously mis-annotated as dict).
        chunks_for_copy: list[tuple[str, str, str, str]] = []
        for chunk in chunks:
            chunks_for_copy.append(
                (
                    chunk.metadata["file_name"],
                    chunk.id_,
                    json.dumps(chunk.metadata),
                    chunk.get_content(),
                )
            )
        logging.info("Prepared %s chunks", len(chunks_for_copy))
        return chunks_for_copy

    def load_documents(
        self,
        documents: list[tuple[str, str, str, str]],
        connection_string: Optional[str] = None,
    ):
        """Create the project table if needed, then COPY the rows into it.

        Args:
            documents: Rows produced by ``prepare_from_directory``.
            connection_string: Overrides the instance default when given.

        Raises:
            ValueError: If no connection string is available.
        """
        connection_string = connection_string or self.connection_string
        if not connection_string:
            raise ValueError("No connection string provided")
        self._init_table(self.project_name, connection_string)
        table = self._table_name.format(project_name=self.project_name)
        self._load_docs(table, documents, connection_string)

    def init_rag(
        self, connection_string: Optional[str] = None, transformer: Optional[str] = None
    ):
        """Register the project with ``vectorize.init_rag`` in Postgres.

        Args:
            connection_string: Overrides the instance default when given.
            transformer: Overrides ``self.sentence_transformer`` when given.

        Raises:
            ValueError: If no connection string is available.
        """
        connection_string = connection_string or self.connection_string
        if not connection_string:
            raise ValueError("No connection string provided")

        xformer = transformer or self.sentence_transformer
        q = """
        SELECT vectorize.init_rag(
            agent_name => %s,
            table_name => %s,
            schema => %s,
            unique_record_id => 'record_id',
            "column" => 'content',
            transformer => %s
        );
        """
        schema, table = self._table_name.format(project_name=self.project_name).split(
            "."
        )
        with psycopg.connect(connection_string, autocommit=True) as conn:
            cur = conn.cursor()
            cur.execute(q, (self.project_name, table, schema, xformer))

    def _load_docs(
        self,
        table: str,
        documents: list[tuple[str, str, str, str]],
        connection_string: str,
    ):
        """COPY the prepared rows into `table`, logging progress ~every 10%."""
        with psycopg.connect(connection_string, autocommit=True) as conn:
            cur = conn.cursor()
            sql = f"COPY {table} (document_name, chunk_id, meta, content) FROM STDIN"
            num_chunks = len(documents)
            # Log every ~10% of rows. max(1, ...) guards against fewer than
            # 10 documents, where num_chunks // 10 == 0 and `i % deca` would
            # raise ZeroDivisionError.
            deca = max(1, num_chunks // 10)
            with cur.copy(sql) as copy:
                for i, row in enumerate(documents):
                    if i % deca == 0:
                        logging.info("writing row %s / %s", i, num_chunks)
                    copy.write_row(row)

    def _init_table(self, project_name: str, connection_string: str):
        """Create the project's chunk table if it does not already exist."""
        table = self._table_name.format(project_name=project_name)
        # NOTE(review): the table name is interpolated into DDL (identifiers
        # cannot be parameterized); project_name must come from trusted
        # callers, not end-user input.
        q = f"""
        CREATE TABLE IF NOT EXISTS {table} (
            record_id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
            document_name TEXT NOT NULL,
            chunk_id TEXT NOT NULL,
            meta JSONB,
            content TEXT NOT NULL
        )
        """
        with psycopg.connect(connection_string, autocommit=True) as conn:
            cur = conn.cursor()
            cur.execute(q)


def get_context_size(model):
    """Return the context window size (in tokens) for a known model name.

    Prefix rules handle dated/variant names (e.g. "gpt-4-0613"); legacy
    models are matched exactly. Unknown models fall back to 4096.
    """
    # Ordered prefix table: more specific prefixes must precede their
    # shorter counterparts (e.g. "gpt-4-32k" before "gpt-4").
    prefix_sizes = (
        ("gpt-4-1106", 128000),
        ("gpt-4-32k", 32768),
        ("gpt-4", 8192),
        ("gpt-3.5-turbo-16k", 16384),
        ("gpt-3.5-turbo", 4096),
    )
    for prefix, size in prefix_sizes:
        if model.startswith(prefix):
            return size

    # Exact matches for legacy completion/embedding models.
    exact_sizes = {
        "text-davinci-002": 4097,
        "text-davinci-003": 4097,
        "ada": 2049,
        "babbage": 2049,
        "curie": 2049,
        "code-cushman-001": 2048,
        "code-davinci-002": 8001,
        "davinci": 2049,
        "text-ada-001": 2049,
        "text-babbage-001": 2049,
        "text-curie-001": 2049,
        "text-embedding-ada-002": 8192,
    }
    return exact_sizes.get(model, 4096)
Loading