From 5046aa1ac387ec404f4c91b7a9ed91de53751962 Mon Sep 17 00:00:00 2001 From: Ben Chambers <35960+bjchambers@users.noreply.github.com> Date: Tue, 23 Jan 2024 14:34:56 -0800 Subject: [PATCH 1/2] feat: initial pgvector indices Also applied `ruff check --fix` and `ruff format`. --- app/collections/models.py | 3 ++- app/main.py | 1 + migrations/0001_schema.sql | 6 +++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/app/collections/models.py b/app/collections/models.py index fb93679..0194150 100644 --- a/app/collections/models.py +++ b/app/collections/models.py @@ -1,3 +1,4 @@ +<<<<<<< HEAD from enum import Enum from pydantic import BaseModel, ConfigDict, Field, TypeAdapter @@ -58,4 +59,4 @@ class CollectionCreate(BaseModel): text_distance_metric: DistanceMetric = DistanceMetric.cosine """The distance metric to use on the text embedding. - NOTE: Changing distance metrics is not currently supported.""" \ No newline at end of file + NOTE: Changing distance metrics is not currently supported.""" diff --git a/app/main.py b/app/main.py index 2a1633f..e53f185 100644 --- a/app/main.py +++ b/app/main.py @@ -4,6 +4,7 @@ import asyncpg from fastapi import FastAPI +from app.collections.models import EmbeddingModel from app.common import db from app.config import app_configs, settings from app.ingest.store import Store diff --git a/migrations/0001_schema.sql b/migrations/0001_schema.sql index 0e69d43..0346626 100644 --- a/migrations/0001_schema.sql +++ b/migrations/0001_schema.sql @@ -1,4 +1,8 @@ -- Apply the base schema. 
+CREATE TYPE embedding_model as ENUM ( + 'openai_text_embedding_ada_002', + 'hf_baai_bge_small_en' +); CREATE TYPE distance_metric AS ENUM ('cosine', 'l2', 'ip'); @@ -95,4 +99,4 @@ CREATE TABLE embedding( PRIMARY KEY (id), FOREIGN KEY(chunk_id) REFERENCES chunk (id) -); \ No newline at end of file +); From bea6d8696b9ceaa46d0bec59951519ea0ee064e3 Mon Sep 17 00:00:00 2001 From: Ryan Michael Date: Wed, 24 Jan 2024 10:46:38 -0500 Subject: [PATCH 2/2] Flatten documents --- app/collections/models.py | 3 +-- app/documents/models.py | 6 ++++++ app/documents/router.py | 43 +++++++++++++++++++++----------------- app/main.py | 1 - migrations/0001_schema.sql | 6 +----- 5 files changed, 32 insertions(+), 27 deletions(-) diff --git a/app/collections/models.py b/app/collections/models.py index 0194150..fb93679 100644 --- a/app/collections/models.py +++ b/app/collections/models.py @@ -1,4 +1,3 @@ -<<<<<<< HEAD from enum import Enum from pydantic import BaseModel, ConfigDict, Field, TypeAdapter @@ -59,4 +58,4 @@ class CollectionCreate(BaseModel): text_distance_metric: DistanceMetric = DistanceMetric.cosine """The distance metric to use on the text embedding. 
- NOTE: Changing distance metrics is not currently supported.""" + NOTE: Changing distance metrics is not currently supported.""" \ No newline at end of file diff --git a/app/documents/models.py b/app/documents/models.py index f570179..34732c5 100644 --- a/app/documents/models.py +++ b/app/documents/models.py @@ -3,6 +3,12 @@ from pydantic import BaseModel +class CreateRequest(BaseModel): + collection_id: int + """The ID of the collection the document should be added to.""" + + url: str + """The URL of the document to add.""" class IngestState(Enum): PENDING = "pending" diff --git a/app/documents/router.py b/app/documents/router.py index c1307f9..ebd289b 100644 --- a/app/documents/router.py +++ b/app/documents/router.py @@ -1,20 +1,18 @@ from typing import Annotated, List import asyncpg -from fastapi import APIRouter, BackgroundTasks, Body, HTTPException, Path, status +from fastapi import APIRouter, BackgroundTasks, Body, HTTPException, Path, status, Query from loguru import logger -from app.collections.router import PathCollectionId from app.common.db import PgConnectionDep, PgPoolDep from app.documents.models import Document from app.ingest.extract import extract from app.ingest.extract.source import ExtractSource from app.ingest.store import Store, StoreDep -# TODO: Move this to `/documents`. Will require figuring out -# how to specify the collection for create, list, etc. -router = APIRouter(prefix="/collections/{collection_id}/documents") +from .models import CreateRequest +router = APIRouter(prefix="/documents") # We can't use the session from the request because it ends as soon # as the request completes. 
So we need to pass the engine and start @@ -53,11 +51,10 @@ async def ingest_document(id: int, store: Store, pg_pool: asyncpg.Pool): @router.put("/") async def add_document( - collection_id: PathCollectionId, store: StoreDep, pg_pool: PgPoolDep, background: BackgroundTasks, - url: Annotated[str, Body(..., description="The URL of the document to add.")], + req: CreateRequest, ) -> Document: """Add a document.""" @@ -69,8 +66,8 @@ async def add_document( VALUES ($1, $2, 'pending') RETURNING id, collection_id, url, ingest_state, ingest_error """, - collection_id, - url, + req.collection_id, + req.url, ) document = Document.model_validate(dict(row)) @@ -83,31 +80,39 @@ async def add_document( @router.get("/") async def list_documents( - collection_id: PathCollectionId, conn: PgConnectionDep + conn: PgConnectionDep, + collection_id: Annotated[int | None, Query(description="Limit to documents associated with this collection")] = None, ) -> List[Document]: """List documents.""" # TODO: Test - results = await conn.fetch( + if collection_id is None: + results = await conn.fetch( + """ + SELECT id, collection_id, url, ingest_state, ingest_error + FROM document """ - SELECT id, collection_id, url, ingest_state, ingest_error - FROM document WHERE collection_id = $1 - """, - collection_id, - ) + ) + else: + results = await conn.fetch( + """ + SELECT id, collection_id, url, ingest_state, ingest_error + FROM document WHERE collection_id = $1 + """, + collection_id, + ) return [Document.model_validate(dict(result)) for result in results] @router.get("/{id}") async def get_document( - conn: PgConnectionDep, collection_id: PathCollectionId, id: PathDocumentId + conn: PgConnectionDep, id: PathDocumentId ) -> Document: # TODO: Test / return not found? 
result = await conn.fetchrow( """ SELECT id, collection_id, url, ingest_state, ingest_error - FROM document WHERE id = $1 AND collection_id = $2 + FROM document WHERE id = $1 """, id, - collection_id, ) return Document.model_validate(dict(result)) diff --git a/app/main.py b/app/main.py index e53f185..2a1633f 100644 --- a/app/main.py +++ b/app/main.py @@ -4,7 +4,6 @@ import asyncpg from fastapi import FastAPI -from app.collections.models import EmbeddingModel from app.common import db from app.config import app_configs, settings from app.ingest.store import Store diff --git a/migrations/0001_schema.sql b/migrations/0001_schema.sql index 0346626..0e69d43 100644 --- a/migrations/0001_schema.sql +++ b/migrations/0001_schema.sql @@ -1,8 +1,4 @@ -- Apply the base schema. -CREATE TYPE embedding_model as ENUM ( - 'openai_text_embedding_ada_002', - 'hf_baai_bge_small_en' -); CREATE TYPE distance_metric AS ENUM ('cosine', 'l2', 'ip'); @@ -99,4 +95,4 @@ CREATE TABLE embedding( PRIMARY KEY (id), FOREIGN KEY(chunk_id) REFERENCES chunk (id) -); +); \ No newline at end of file