diff --git a/.dockerignore b/.dockerignore index 9e4d9de..d00b206 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,5 +1,14 @@ +# Ignore everything ** + +# Include (don't ignore) the application code !app !./pyproject.toml !./poetry.lock -**/__pycache__ \ No newline at end of file + +# Re-ignore pycache within `app`. +**/__pycache__ + +# Include (don't ignore) the migrations. +!migrations/*.sql +!yoyo.ini \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 439dcb0..7b4b940 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,4 +20,6 @@ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt # Finally, copy in the application code. COPY ./app /code/app +COPY ./migrations/0001_schema.sql /code/migrations/0001_schema.sql + CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "80"] \ No newline at end of file diff --git a/app/statements/__init__.py b/app/chunks/__init__.py similarity index 100% rename from app/statements/__init__.py rename to app/chunks/__init__.py diff --git a/app/statements/models.py b/app/chunks/models.py similarity index 80% rename from app/statements/models.py rename to app/chunks/models.py index 680a1e3..8a6eeb7 100644 --- a/app/statements/models.py +++ b/app/chunks/models.py @@ -27,26 +27,26 @@ class RetrieveRequest(BaseModel): """Whether to include a generated summary.""" -class BaseStatement(BaseModel): +class BaseChunk(BaseModel): kind: Literal["text", "raw_text", "image"] score: Optional[float] = None - """The similarity score of this statement.""" + """The similarity score of this chunk.""" -class TextStatement(BaseStatement): +class TextChunk(BaseChunk): kind: Literal["text"] = "text" raw: bool - text: str = Field(default="", description="Text content of the node.") + text: str = Field(default="", description="Text content of the chunk.") start_char_idx: Optional[int] = Field( - default=None, description="Start char index of the node." + default=None, description="Start char index of the chunk." ) end_char_idx: Optional[int] = Field( - default=None, description="End char index of the node." + default=None, description="End char index of the chunk." 
) -class ImageStatement(BaseStatement): +class ImageChunk(BaseChunk): kind: Literal["image"] = "image" text: Optional[str] = Field(..., description="Textual description of the image.") image: Optional[str] = Field(..., description="Image of the chunk.") @@ -59,7 +59,7 @@ class RetrieveResponse(BaseModel): """The response from a chunk retrieval request.""" summary: Optional[str] - """Summary of the retrieved statements.""" + """Summary of the retrieved chunks.""" - statements: Sequence[Union[TextStatement, ImageStatement]] - """Retrieved statements.""" + chunks: Sequence[Union[TextChunk, ImageChunk]] + """Retrieved chunks.""" diff --git a/app/statements/router.py b/app/chunks/router.py similarity index 74% rename from app/statements/router.py rename to app/chunks/router.py index fc4e59b..f76d279 100644 --- a/app/statements/router.py +++ b/app/chunks/router.py @@ -6,14 +6,14 @@ from app.ingest.store import StoreDep -from .models import ImageStatement, RetrieveRequest, RetrieveResponse, TextStatement +from .models import ImageChunk, RetrieveRequest, RetrieveResponse, TextChunk -router = APIRouter(tags=["statements"], prefix="/statements") +router = APIRouter(prefix="/chunks") @router.post("/retrieve") -async def retrieve(store: StoreDep, request: RetrieveRequest) -> RetrieveResponse: - """Retrieve statements based on a given query.""" +async def retrieve_chunks(store: StoreDep, request: RetrieveRequest) -> RetrieveResponse: + """Retrieve chunks based on a given query.""" from llama_index.response_synthesizers import ResponseMode @@ -30,15 +30,15 @@ async def retrieve(store: StoreDep, request: RetrieveRequest) -> RetrieveRespons return RetrieveResponse( summary=results.response, - statements=statements if request.include_statements else [], + chunks=statements if request.include_statements else [], ) -def node_to_statement(node: NodeWithScore) -> Union[TextStatement, ImageStatement]: +def node_to_statement(node: NodeWithScore) -> Union[TextChunk, ImageChunk]: from llama_index.schema import ImageNode, TextNode if isinstance(node.node, TextNode): - return TextStatement( + return TextChunk( raw=True, score=node.score, text=node.node.text, @@ -46,7 +46,7 @@ def node_to_statement(node: NodeWithScore) -> Union[TextStatement, ImageStatemen end_char_idx=node.node.end_char_idx, ) elif isinstance(node.node, ImageNode): - return ImageStatement( + return ImageChunk( score=node.score, text=node.node.text if node.node.text else None, image=node.node.image, diff --git a/app/collections/models.py b/app/collections/models.py new file mode 100644 index 0000000..d83fbf6 --- /dev/null +++ b/app/collections/models.py @@ -0,0 +1,21 @@ + +from pydantic import BaseModel, ConfigDict, TypeAdapter + +class Collection(BaseModel): + """A collection of indexed documents.""" + + model_config = ConfigDict(from_attributes=True) + + id: int + """The ID of the collection.""" + + name: str + """The name of the collection.""" + +collection_validator = TypeAdapter(Collection) + + +class CollectionCreate(BaseModel): + """The request to create a collection.""" + + name: str + """The name of the collection.""" \ No newline at end of file diff --git a/app/collections/router.py b/app/collections/router.py index 070ae1b..8ccf2fd 100644 --- a/app/collections/router.py +++ b/app/collections/router.py @@ -1,35 +1,37 @@ from typing import Annotated, List -from fastapi import APIRouter, Path +from fastapi import APIRouter, HTTPException, Path, status -from sqlmodel import Session, select -from app.common.schema import Collection, EngineDep +from app.common.db import
PgConnectionDep +from app.collections.models import Collection, CollectionCreate -router = APIRouter(tags=["collections"], prefix="/collections") +router = APIRouter(prefix="/collections") @router.put("/") -async def add(engine: EngineDep, collection: Collection) -> Collection: +async def add_collection(conn: PgConnectionDep, collection: CollectionCreate) -> Collection: """Create a collection.""" - with Session(engine) as session: - session.add(collection) - session.commit() - session.refresh(collection) - return collection + result = await conn.fetchrow(""" + INSERT INTO collection (name) VALUES ($1) + RETURNING id, name + """, + collection.name) + return Collection.model_validate(dict(result)) @router.get("/") -async def list(engine: EngineDep) -> List[Collection]: +async def list_collections(conn: PgConnectionDep) -> List[Collection]: """List collections.""" - with Session(engine) as session: - return session.exec(select(Collection)).all() + results = await conn.fetch("SELECT id, name FROM collection") + return [Collection.model_validate(dict(result)) for result in results] PathCollectionId = Annotated[int, Path(..., description="The collection ID.")] @router.get("/{id}") -async def get(id: PathCollectionId, engine: EngineDep) -> Collection: +async def get_collection(id: PathCollectionId, conn: PgConnectionDep) -> Collection: """Get a specific collection.""" - with Session(engine) as session: - return session.get(Collection, id) + result = await conn.fetchrow("SELECT id, name FROM collection WHERE id = $1", id) + if result is None: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Collection not found") + return Collection.model_validate(dict(result)) diff --git a/app/common/db.py b/app/common/db.py new file mode 100644 index 0000000..ec1c54d --- /dev/null +++ b/app/common/db.py @@ -0,0 +1,32 @@ +import contextlib +from typing import Annotated, AsyncIterator +import asyncpg + +from fastapi import Depends, Request + +@contextlib.asynccontextmanager +async def create_pool(dsn: str) -> AsyncIterator[asyncpg.Pool]: + """ + Create a postgres connection pool. + + Arguments: + - dsn: Connection arguments specified as a single string in + the following format: + `postgres://user:pass@host:port/database?option=value`. + """ + pool = await asyncpg.create_pool(dsn) + try: + yield pool + finally: + await pool.close() + +def _pg_pool(request: Request) -> asyncpg.Pool: + return request.state.pg_pool + +PgPoolDep = Annotated[asyncpg.Pool, Depends(_pg_pool)] + +async def _pg_connection(pool: PgPoolDep) -> AsyncIterator[asyncpg.Connection]: + async with pool.acquire() as connection: + yield connection + +PgConnectionDep = Annotated[asyncpg.Connection, Depends(_pg_connection)] \ No newline at end of file diff --git a/app/common/schema.py b/app/common/schema.py deleted file mode 100644 index 7fdae38..0000000 --- a/app/common/schema.py +++ /dev/null @@ -1,53 +0,0 @@ -from enum import Enum -from typing import Annotated, Optional - -from fastapi import Depends, Request -from sqlalchemy import Engine, UniqueConstraint -from sqlmodel import Field, SQLModel - - -class Collection(SQLModel, table=True): - id: Optional[int] = Field(default=None, primary_key=True) - - # TODO: We may want this to be unique per-tenant rather than globally unique names. - name: str = Field(index=True, unique=True) - -class IngestState(Enum): - UNKNOWN = "unknown" - """Document is in an unknown state.""" - - PENDING = "pending" - """Document is pending ingestion.""" - - INGESTED = "ingested" - """Document has been ingested.""" - - FAILED = "failed" - """Document failed to be ingested.
See `ingest_errors` for details.""" - -class Document(SQLModel, table=True): - """Schema for documents in the SQL DB.""" - - __table_args__ = ( - UniqueConstraint("collection_id", "url"), - UniqueConstraint("collection_id", "doc_id"), - ) - - id: Optional[int] = Field(default=None, primary_key=True) - collection_id: Optional[int] = Field(foreign_key="collection.id") - - url: str = Field(index=True) - doc_id: Optional[str] = Field(default=None) - - ingest_state: IngestState = Field(default=IngestState.UNKNOWN) - """The state of the document ingestion.""" - - ingest_error: Optional[str] = Field(default=None) - """Errors which occurred during ingestion, if any.""" - - -def _db(request: Request) -> Engine: - return request.state.engine - - -EngineDep = Annotated[Engine, Depends(_db)] diff --git a/app/config.py b/app/config.py index 27fb64c..3848f9a 100644 --- a/app/config.py +++ b/app/config.py @@ -1,7 +1,7 @@ from typing import Any, Optional from fastapi.routing import APIRoute -from pydantic import RedisDsn, ValidationInfo, field_validator +from pydantic import RedisDsn, PostgresDsn, ValidationInfo, field_validator from pydantic_core import Url from pydantic_settings import BaseSettings @@ -19,8 +19,11 @@ class Config: env_file = ".env" env_file_encoding = "utf-8" - DB: str = "sqlite:///database.db?check_same_thread=false" - """The database to connect to.""" + DB: PostgresDsn + """The Postgres database to connect to.""" + + APPLY_MIGRATIONS: bool = False + """Whether migrations should be applied to the database.""" ENVIRONMENT: Environment = Environment.PRODUCTION """The environment the application is running in.""" @@ -115,16 +118,6 @@ def custom_generate_unique_id_function(route: APIRoute) -> str: from a variety of sources -- documents, web pages, audio, etc. """ -STATEMENTS_DESCRIPTION: str = """Operations for retrieving statements. - -Statements include chunks of raw-text, images, and tables from documents, -as well as extracted propositions (facts) and other information from -the documents. - -Additionally, a summary of retrieved statements may be requested as well -as the statements or instead of the statements. -""" - app_configs: dict[str, Any] = { "title": "Dewy Knowledge Base API", "summary": "Knowledge curation for Retrieval Augmented Generation", @@ -132,20 +125,6 @@ def custom_generate_unique_id_function(route: APIRoute) -> str: "servers": [ {"url": "http://localhost:8000", "description": "Local server"}, ], - "openapi_tags": [ - { - "name": "documents", - "description": "Operations on specific documents, including ingestion.", - }, - { - "name": "statements", - "description": STATEMENTS_DESCRIPTION, - }, - { - "name": "collections", - "description": "Operations related to collections of documents.", - }, - ], "generate_unique_id_function": custom_generate_unique_id_function, } diff --git a/app/documents/models.py b/app/documents/models.py new file mode 100644 index 0000000..227a542 --- /dev/null +++ b/app/documents/models.py @@ -0,0 +1,26 @@ +from enum import Enum +from typing import Optional + +from pydantic import BaseModel + + +class IngestState(Enum): + PENDING = "pending" + """Document is pending ingestion.""" + + INGESTED = "ingested" + """Document has been ingested.""" + + FAILED = "failed" + """Document failed to be ingested. 
See `ingest_error` for details.""" + +class Document(BaseModel): + """Schema for documents in the SQL DB.""" + + id: Optional[int] = None + collection_id: int + + url: str + + ingest_state: Optional[IngestState] = None + ingest_error: Optional[str] = None \ No newline at end of file diff --git a/app/documents/router.py b/app/documents/router.py index 055cf0e..40ca4f0 100644 --- a/app/documents/router.py +++ b/app/documents/router.py @@ -1,85 +1,97 @@ from typing import Annotated, List +import asyncpg from fastapi import APIRouter, BackgroundTasks, Body, HTTPException, Path, status from loguru import logger -from sqlalchemy import Engine -from sqlmodel import Session, select +from app.collections.router import PathCollectionId -from app.common.schema import Document, EngineDep, IngestState +from app.common.db import PgConnectionDep, PgPoolDep +from app.documents.models import Document from app.ingest.extract import extract from app.ingest.extract.source import ExtractSource from app.ingest.store import Store, StoreDep -router = APIRouter(tags=["documents"], prefix="/documents") +# TODO: Move this to `/documents`. Will require figuring out +# how to specify the collection for create, list, etc. +router = APIRouter(prefix="/collections/{collection_id}/documents") -async def ingest_document(id: int, store: Store, engine: Engine): +# We can't use the request-scoped connection here because it is released as +# soon as the request completes. So we pass the pool and acquire a connection +# for the duration of the background task. +async def ingest_document(id: int, store: Store, pg_pool: asyncpg.Pool): # Load the content. - with Session(engine) as session: - document = session.get(Document, id) + async with pg_pool.acquire() as conn: + url = await conn.fetchval("SELECT url FROM document WHERE id = $1", id) - logger.debug("Loading content from {}", document.url) + logger.debug("Loading content for document {} from {}", id, url) documents = await extract( ExtractSource( - document.url, + url, ) ) - logger.debug("Loaded {} pages from {}", len(documents), document.url) + logger.debug("Loaded {} pages from {}", len(documents), url) if not documents: raise HTTPException( status_code=status.HTTP_412_PRECONDITION_FAILED, - detail=f"No content retrieved from '{document.url}'", + detail=f"No content retrieved from '{url}'", ) - logger.debug("Inserting {} documents from {}", len(documents), document.url) + logger.debug("Inserting {} documents from {}", len(documents), url) nodes = await store.ingestion_pipeline.arun(documents=documents) logger.debug("Done. Inserted {} nodes", len(nodes)) - document.ingest_state = IngestState.INGESTED - document.ingest_error = None - session.add(document) - session.commit() + await conn.execute(""" + UPDATE document + SET ingest_state = 'ingested', ingest_error = NULL + WHERE id = $1 + """, id) @router.put("/") -async def add( +async def add_document( + collection_id: PathCollectionId, store: StoreDep, - engine: EngineDep, + pg_pool: PgPoolDep, background: BackgroundTasks, url: Annotated[str, Body(..., description="The URL of the document to add.")], ) -> Document: """Add a document.""" - # Update the document in the DB. - document = Document( - url = url - ) - with Session(engine) as session: - # TODO: Support update (and fail if the document doesn't exist/etc.) - - document.ingest_state = IngestState.PENDING - document.ingest_error = None - - session.add(document) - session.commit() - session.refresh(document) - - # Create the background task to update the state.
- background.add_task(ingest_document, document.id, store, engine) + async with pg_pool.acquire() as conn: + row = await conn.fetchrow(""" + INSERT INTO document (collection_id, url, ingest_state) VALUES ($1, $2, 'pending') + RETURNING id, collection_id, url, ingest_state, ingest_error + """, collection_id, url) + document = Document.model_validate(dict(row)) + background.add_task(ingest_document, document.id, store, pg_pool) return document PathDocumentId = Annotated[int, Path(..., description="The document ID.")] @router.get("/") -async def list(engine: EngineDep) -> List[Document]: +async def list_documents(collection_id: PathCollectionId, conn: PgConnectionDep) -> List[Document]: """List documents.""" - with Session(engine) as session: - return session.exec(select(Document)).all() + # TODO: Test + results = await conn.fetch(""" + SELECT id, collection_id, url, ingest_state, ingest_error + FROM document WHERE collection_id = $1 + """, collection_id) + return [Document.model_validate(dict(result)) for result in results] @router.get("/{id}") -async def get( - engine: EngineDep, +async def get_document( + conn: PgConnectionDep, + collection_id: PathCollectionId, id: PathDocumentId ) -> Document: - with Session(engine) as session: - return session.get(Document, id) + # TODO: Test + result = await conn.fetchrow( + """ + SELECT id, collection_id, url, ingest_state, ingest_error + FROM document WHERE id = $1 AND collection_id = $2 + """, id, collection_id) + if result is None: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Document not found") + return Document.model_validate(dict(result)) diff --git a/app/main.py b/app/main.py index 6435d6d..b54e018 100644 --- a/app/main.py +++ b/app/main.py @@ -4,7 +4,9 @@ from fastapi import FastAPI -from sqlalchemy import Engine -from sqlmodel import SQLModel, create_engine +from loguru import logger +from app.common import db from app.config import app_configs, settings from app.ingest.store import Store from app.routes import api_router @@ -18,15 +20,33 @@ class State(TypedDict): @contextlib.asynccontextmanager async def lifespan(_app: FastAPI) -> AsyncIterator[State]: """Function creating instances used during the lifespan of the service.""" - engine = create_engine(settings.DB, echo=True) - SQLModel.metadata.create_all(engine) - state = { - "store": Store(), - "engine": engine, - } + # if settings.APPLY_MIGRATIONS: + # from yoyo import get_backend, read_migrations + # backend = get_backend(settings.DB.unicode_string()) + # migrations = read_migrations('migrations') + # with backend.lock(): + # outstanding = backend.to_apply(migrations) - yield state + # logger.info("Applying {} migrations", len(outstanding)) + + # # Apply any outstanding migrations + # backend.apply_migrations(outstanding) + + # logger.info("Done applying migrations.") + + async with db.create_pool(settings.DB.unicode_string()) as pg_pool: + if settings.APPLY_MIGRATIONS: + # Naively apply the base schema; this assumes an empty database. + async with pg_pool.acquire() as conn: + with open("migrations/0001_schema.sql") as schema_file: + schema = schema_file.read() + await conn.execute(schema) + state = { + "store": Store(), + "pg_pool": pg_pool, + } + + yield state app = FastAPI(lifespan=lifespan, **app_configs) diff --git a/app/routes.py b/app/routes.py index 4643fc6..ce9c5e2 100644 --- a/app/routes.py +++ b/app/routes.py @@ -2,10 +2,10 @@ from app.collections.router import router as collections_router from app.documents.router import router as documents_router -from app.statements.router import router as statements_router +from app.chunks.router import router as chunks_router api_router =
APIRouter(prefix="/api") api_router.include_router(collections_router) api_router.include_router(documents_router) -api_router.include_router(statements_router) +api_router.include_router(chunks_router) diff --git a/docker-compose.yml b/docker-compose.yml index b18cdf3..c57a7c6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,7 +8,8 @@ services: REDIS: "redis://default:testing123@redis:6379" LLAMA_INDEX_CACHE_DIR: "/tmp/cache/llama_index" HF_HOME: "/tmp/cache/hf" - # DB: "sqlite:///var/db/database.db?check_same_thread=false" + DB: "postgresql://dewydbuser:dewydbpwd@postgres/dewydb" + APPLY_MIGRATIONS: "true" env_file: - .env build: @@ -20,9 +21,27 @@ services: - kb-network depends_on: - redis + - postgres volumes: - llama-cache:/tmp/cache - - db:/var/db + + postgres: + image: ankane/pgvector + ports: + - 5432:5432 + restart: always + environment: + - POSTGRES_DB=dewydb + - POSTGRES_USER=dewydbuser + - POSTGRES_PASSWORD=dewydbpwd + - POSTGRES_HOST_AUTH_METHOD=trust + volumes: + - db:/var/lib/postgresql/data + networks: + - kb-network + healthcheck: + test: ['CMD-SHELL', 'pg_isready -U dewydbuser'] + interval: 60s redis: build: diff --git a/example_notebook.ipynb b/example_notebook.ipynb index f834a40..0ee22a4 100644 --- a/example_notebook.ipynb +++ b/example_notebook.ipynb @@ -13,6 +13,20 @@ " })" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "response = client.put(f\"/collections/\",\n", + " json = {\n", + " \"name\": \"my_collection\",\n", + " })\n", + "response.raise_for_status()\n", + "collection_id = response.json()['id']" + ] + }, { "cell_type": "code", "execution_count": null, @@ -20,7 +34,7 @@ "outputs": [], "source": [ "# Add \"Query Rewriting for Retrieval-Augmented Large Language Models\"\n", - "response = client.put(f\"/documents/\",\n", + "response = client.put(f\"/collections/{collection_id}/documents/\",\n", " content = \"\\\"https://arxiv.org/pdf/2305.14283.pdf\\\"\")\n", "response.raise_for_status()\n", "print(response.json())\n", @@ -34,7 +48,7 @@ "outputs": [], "source": [ "# Report the status of the document ingestion.\n", - "response = client.get(f\"/documents/{document_id}\")\n", + "response = client.get(f\"/collections/{collection_id}/documents/{document_id}\")\n", "print(response.raise_for_status().json())" ] }, @@ -45,17 +59,24 @@ "outputs": [], "source": [ "# Retrieve 4 items with no summary.\n", - "results = client.post(f\"/statements/retrieve\",\n", - " json = {\n", - " \"query\": \"retrieval augmented generation\",\n", - " \"n\": 4\n", - " },\n", - " timeout = None)\n", + "results = client.post(f\"/chunks/retrieve\",\n", + " json = {\n", + " \"query\": \"retrieval augmented generation\",\n", + " \"n\": 4\n", + " },\n", + " timeout = None)\n", "results.raise_for_status()\n", "\n", "results.json()" ] }, { "cell_type": "code", "execution_count": null, @@ -63,7 +84,7 @@ "outputs": [], "source": [ "# Retrieve 32 items, and include the summary.\n", - "results = client.post(f\"/statements/retrieve\",\n", + "results = client.post(f\"/chunks/retrieve\",\n", " json = {\n", " \"query\": \"How does query-rewriting improve Retrieval-Augmented-Generation?\",\n", " \"n\": 32,\n", diff --git a/migrations/0001_schema.sql b/migrations/0001_schema.sql new file mode 100644 index 0000000..247865f --- /dev/null +++ b/migrations/0001_schema.sql @@ -0,0 +1,91 @@ +-- Apply the base schema.
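+-- +-- NOTE: when `APPLY_MIGRATIONS` is set, `app/main.py` currently executes this +-- file directly at startup. These statements are not idempotent (no +-- `IF NOT EXISTS` guards), so that path assumes an empty database; the +-- yoyo-migrations path (commented out in `app/main.py`) would instead track +-- which migrations have already been applied.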
+ +CREATE TABLE collection ( + id SERIAL NOT NULL, + name VARCHAR NOT NULL, + + PRIMARY KEY (id) +); + +CREATE TYPE ingest_state AS ENUM ('pending', 'ingested', 'failed'); +CREATE TABLE document( + id SERIAL NOT NULL, + collection_id INTEGER, + url VARCHAR NOT NULL, + + -- The state of the most recent ingestion of this document. + -- TODO: Should we have a separate `ingestion` table and associate + -- many ingestions with each document ID? + ingest_state ingest_state, + -- The error (if any) resulting from the most recent ingestion. + ingest_error VARCHAR, + + PRIMARY KEY (id), + UNIQUE (collection_id, url), + FOREIGN KEY(collection_id) REFERENCES collection (id) +); + +CREATE TYPE chunk_kind AS ENUM ( + -- This is a chunk representing text. + 'text' +); +CREATE TABLE chunk( + id SERIAL NOT NULL, + + -- The document containing this chunk. + -- + -- TODO: We may want to allow chunks to be associated with + -- multiple documents. + document_id INTEGER, + + -- The kind of chunk. + kind chunk_kind NULL, + + -- The text associated with the chunk, if any. + -- + -- The existence of text does not imply the chunk is textual. For instance, + -- image chunks may set the text to a description of the image. + text VARCHAR, + + PRIMARY KEY (id), + FOREIGN KEY(document_id) REFERENCES document (id) +); + +-- CREATE TABLE ingestion( +-- id SERIAL NOT NULL, +-- -- The document ID this ingestion is associated with. +-- document_id INTEGER, +-- -- The state of this ingestion. +-- -- +-- -- NULL indicates unknown. +-- state ingest_state, +-- -- Concatenated errors reported by this ingestion. +-- error VARCHAR, +-- FOREIGN KEY(document_id) REFERENCES document (id) +-- ) + +CREATE TYPE embedding_kind AS ENUM ( + -- This is an embedding of the original text of the chunk. + -- + -- `key_text` will be NULL -- see the `text` of the original chunk. + 'text', + -- This is an embedding of the computed summary of the chunk. + -- + -- `key_text` will contain the computed summary. + 'computed_summary', + -- This is an embedding of the computed title. + -- + -- `key_text` will contain the computed title.
'computed_title' +); +CREATE TABLE embedding( + id SERIAL NOT NULL, + + chunk_id INTEGER, + + -- The kind of this embedding (see `embedding_kind` above). + kind embedding_kind, + + key_text VARCHAR, + + PRIMARY KEY (id), + FOREIGN KEY(chunk_id) REFERENCES chunk (id) +); \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index af30168..390d5b1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -220,6 +220,78 @@ files = [ {file = "async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, ] +[[package]] +name = "asyncpg" +version = "0.29.0" +description = "An asyncio PostgreSQL driver" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "asyncpg-0.29.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72fd0ef9f00aeed37179c62282a3d14262dbbafb74ec0ba16e1b1864d8a12169"}, + {file = "asyncpg-0.29.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52e8f8f9ff6e21f9b39ca9f8e3e33a5fcdceaf5667a8c5c32bee158e313be385"}, + {file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9e6823a7012be8b68301342ba33b4740e5a166f6bbda0aee32bc01638491a22"}, + {file = "asyncpg-0.29.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:746e80d83ad5d5464cfbf94315eb6744222ab00aa4e522b704322fb182b83610"}, + {file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ff8e8109cd6a46ff852a5e6bab8b0a047d7ea42fcb7ca5ae6eaae97d8eacf397"}, + {file = "asyncpg-0.29.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:97eb024685b1d7e72b1972863de527c11ff87960837919dac6e34754768098eb"}, + {file = "asyncpg-0.29.0-cp310-cp310-win32.whl", hash = "sha256:5bbb7f2cafd8d1fa3e65431833de2642f4b2124be61a449fa064e1a08d27e449"}, + {file = "asyncpg-0.29.0-cp310-cp310-win_amd64.whl", hash = "sha256:76c3ac6530904838a4b650b2880f8e7af938ee049e769ec2fba7cd66469d7772"}, + {file = "asyncpg-0.29.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4900ee08e85af01adb207519bb4e14b1cae8fd21e0ccf80fac6aa60b6da37b4"}, + {file = "asyncpg-0.29.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a65c1dcd820d5aea7c7d82a3fdcb70e096f8f70d1a8bf93eb458e49bfad036ac"}, + {file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5b52e46f165585fd6af4863f268566668407c76b2c72d366bb8b522fa66f1870"}, + {file = "asyncpg-0.29.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dc600ee8ef3dd38b8d67421359779f8ccec30b463e7aec7ed481c8346decf99f"}, + {file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:039a261af4f38f949095e1e780bae84a25ffe3e370175193174eb08d3cecab23"}, + {file = "asyncpg-0.29.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:6feaf2d8f9138d190e5ec4390c1715c3e87b37715cd69b2c3dfca616134efd2b"}, + {file = "asyncpg-0.29.0-cp311-cp311-win32.whl", hash = "sha256:1e186427c88225ef730555f5fdda6c1812daa884064bfe6bc462fd3a71c4b675"}, + {file = "asyncpg-0.29.0-cp311-cp311-win_amd64.whl", hash = "sha256:cfe73ffae35f518cfd6e4e5f5abb2618ceb5ef02a2365ce64f132601000587d3"}, + {file = "asyncpg-0.29.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:6011b0dc29886ab424dc042bf9eeb507670a3b40aece3439944006aafe023178"}, + {file = "asyncpg-0.29.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b544ffc66b039d5ec5a7454667f855f7fec08e0dfaf5a5490dfafbb7abbd2cfb"}, + {file = "asyncpg-0.29.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d84156d5fb530b06c493f9e7635aa18f518fa1d1395ef240d211cb563c4e2364"}, + {file = 
"asyncpg-0.29.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:54858bc25b49d1114178d65a88e48ad50cb2b6f3e475caa0f0c092d5f527c106"}, + {file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:bde17a1861cf10d5afce80a36fca736a86769ab3579532c03e45f83ba8a09c59"}, + {file = "asyncpg-0.29.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:37a2ec1b9ff88d8773d3eb6d3784dc7e3fee7756a5317b67f923172a4748a175"}, + {file = "asyncpg-0.29.0-cp312-cp312-win32.whl", hash = "sha256:bb1292d9fad43112a85e98ecdc2e051602bce97c199920586be83254d9dafc02"}, + {file = "asyncpg-0.29.0-cp312-cp312-win_amd64.whl", hash = "sha256:2245be8ec5047a605e0b454c894e54bf2ec787ac04b1cb7e0d3c67aa1e32f0fe"}, + {file = "asyncpg-0.29.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0009a300cae37b8c525e5b449233d59cd9868fd35431abc470a3e364d2b85cb9"}, + {file = "asyncpg-0.29.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5cad1324dbb33f3ca0cd2074d5114354ed3be2b94d48ddfd88af75ebda7c43cc"}, + {file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:012d01df61e009015944ac7543d6ee30c2dc1eb2f6b10b62a3f598beb6531548"}, + {file = "asyncpg-0.29.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:000c996c53c04770798053e1730d34e30cb645ad95a63265aec82da9093d88e7"}, + {file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e0bfe9c4d3429706cf70d3249089de14d6a01192d617e9093a8e941fea8ee775"}, + {file = "asyncpg-0.29.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:642a36eb41b6313ffa328e8a5c5c2b5bea6ee138546c9c3cf1bffaad8ee36dd9"}, + {file = "asyncpg-0.29.0-cp38-cp38-win32.whl", hash = "sha256:a921372bbd0aa3a5822dd0409da61b4cd50df89ae85150149f8c119f23e8c408"}, + {file = "asyncpg-0.29.0-cp38-cp38-win_amd64.whl", hash = "sha256:103aad2b92d1506700cbf51cd8bb5441e7e72e87a7b3a2ca4e32c840f051a6a3"}, + {file = "asyncpg-0.29.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5340dd515d7e52f4c11ada32171d87c05570479dc01dc66d03ee3e150fb695da"}, + {file = "asyncpg-0.29.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e17b52c6cf83e170d3d865571ba574577ab8e533e7361a2b8ce6157d02c665d3"}, + {file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f100d23f273555f4b19b74a96840aa27b85e99ba4b1f18d4ebff0734e78dc090"}, + {file = "asyncpg-0.29.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48e7c58b516057126b363cec8ca02b804644fd012ef8e6c7e23386b7d5e6ce83"}, + {file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f9ea3f24eb4c49a615573724d88a48bd1b7821c890c2effe04f05382ed9e8810"}, + {file = "asyncpg-0.29.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8d36c7f14a22ec9e928f15f92a48207546ffe68bc412f3be718eedccdf10dc5c"}, + {file = "asyncpg-0.29.0-cp39-cp39-win32.whl", hash = "sha256:797ab8123ebaed304a1fad4d7576d5376c3a006a4100380fb9d517f0b59c1ab2"}, + {file = "asyncpg-0.29.0-cp39-cp39-win_amd64.whl", hash = "sha256:cce08a178858b426ae1aa8409b5cc171def45d4293626e7aa6510696d46decd8"}, + {file = "asyncpg-0.29.0.tar.gz", hash = "sha256:d1c49e1f44fffafd9a55e1a9b101590859d881d639ea2922516f5d9c512d354e"}, +] + +[package.dependencies] +async-timeout = {version = ">=4.0.3", markers = "python_version < \"3.12.0\""} + +[package.extras] +docs = ["Sphinx (>=5.3.0,<5.4.0)", "sphinx-rtd-theme (>=1.2.2)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] +test = ["flake8 (>=6.1,<7.0)", "uvloop (>=0.15.3)"] + +[[package]] +name = "asyncpg-stubs" +version = 
"0.29.1" +description = "asyncpg stubs" +optional = false +python-versions = ">=3.8,<4.0" +files = [ + {file = "asyncpg_stubs-0.29.1-py3-none-any.whl", hash = "sha256:cce994d5a19394249e74ae8d252bde3c77cee0ddfc776cc708b724fdb4adebb6"}, + {file = "asyncpg_stubs-0.29.1.tar.gz", hash = "sha256:686afcc0af3a2f3c8e393cd850e0de430e5a139ce82b2f28ef8f693ecdf918bf"}, +] + +[package.dependencies] +asyncpg = ">=0.29,<0.30" +typing-extensions = ">=4.7.0,<5.0.0" + [[package]] name = "attrs" version = "23.2.0" @@ -1200,6 +1272,25 @@ files = [ {file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"}, ] +[[package]] +name = "importlib-metadata" +version = "7.0.1" +description = "Read metadata from Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "importlib_metadata-7.0.1-py3-none-any.whl", hash = "sha256:4805911c3a4ec7c3966410053e9ec6a1fecd629117df5adee56dfc9432a1081e"}, + {file = "importlib_metadata-7.0.1.tar.gz", hash = "sha256:f238736bb06590ae52ac1fab06a3a9ef1d8dce2b7a35b5ab329371d6c8f5d2cc"}, +] + +[package.dependencies] +zipp = ">=0.5" + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] +perf = ["ipython"] +testing = ["flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)", "pytest-ruff"] + [[package]] name = "iopath" version = "0.1.10" @@ -2832,6 +2923,104 @@ files = [ [package.extras] test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] +[[package]] +name = "psycopg" +version = "3.1.17" +description = "PostgreSQL database adapter for Python" +optional = false +python-versions = ">=3.7" +files = [ + {file = "psycopg-3.1.17-py3-none-any.whl", hash = "sha256:96b7b13af6d5a514118b759a66b2799a8a4aa78675fa6bb0d3f7d52d67eff002"}, + {file = "psycopg-3.1.17.tar.gz", hash = "sha256:437e7d7925459f21de570383e2e10542aceb3b9cb972ce957fdd3826ca47edc6"}, +] + +[package.dependencies] +psycopg-binary = {version = "3.1.17", optional = true, markers = "implementation_name != \"pypy\" and extra == \"binary\""} +typing-extensions = ">=4.1" +tzdata = {version = "*", markers = "sys_platform == \"win32\""} + +[package.extras] +binary = ["psycopg-binary (==3.1.17)"] +c = ["psycopg-c (==3.1.17)"] +dev = ["black (>=23.1.0)", "codespell (>=2.2)", "dnspython (>=2.1)", "flake8 (>=4.0)", "mypy (>=1.4.1)", "types-setuptools (>=57.4)", "wheel (>=0.37)"] +docs = ["Sphinx (>=5.0)", "furo (==2022.6.21)", "sphinx-autobuild (>=2021.3.14)", "sphinx-autodoc-typehints (>=1.12)"] +pool = ["psycopg-pool"] +test = ["anyio (>=3.6.2,<4.0)", "mypy (>=1.4.1)", "pproxy (>=2.7)", "pytest (>=6.2.5)", "pytest-cov (>=3.0)", "pytest-randomly (>=3.5)"] + +[[package]] +name = "psycopg-binary" +version = "3.1.17" +description = "PostgreSQL database adapter for Python -- C optimisation distribution" +optional = false +python-versions = ">=3.7" +files = [ + {file = "psycopg_binary-3.1.17-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f9ba559eabb0ba1afd4e0504fa0b10e00a212cac0c4028b8a1c3b087b5c1e5de"}, + {file = "psycopg_binary-3.1.17-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2b2a689eaede08cf91a36b10b0da6568dd6e4669200f201e082639816737992b"}, + {file = "psycopg_binary-3.1.17-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:a16abab0c1abc58feb6ab11d78d0f8178a67c3586bd70628ec7c0218ec04c4ef"}, + {file = "psycopg_binary-3.1.17-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:73e7097b81cad9ae358334e3cec625246bb3b8013ae6bb287758dd6435e12f65"}, + {file = "psycopg_binary-3.1.17-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:67a5b93101bc85a95a189c0a23d02a29cf06c1080a695a0dedfdd50dd734662a"}, + {file = "psycopg_binary-3.1.17-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:751b31c2faae0348f87f22b45ef58f704bdcfc2abdd680fa0c743c124071157e"}, + {file = "psycopg_binary-3.1.17-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b447ea765e71bc33a82cf070bba814b1efa77967442d116b95ccef8ce5da7631"}, + {file = "psycopg_binary-3.1.17-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:d2e9ed88d9a6a475c67bf70fc8285e88ccece0391727c7701e5a512e0eafbb05"}, + {file = "psycopg_binary-3.1.17-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:a89f36bf7b612ff6ed3e789bd987cbd0787cf0d66c49386fa3bad816dd7bee87"}, + {file = "psycopg_binary-3.1.17-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5ccbe8b2ec444763a51ecb1213befcbb75defc1ef36e7dd5dff501a23d7ce8cf"}, + {file = "psycopg_binary-3.1.17-cp310-cp310-win_amd64.whl", hash = "sha256:adb670031b27949c9dc5cf585c4a5a6b4469d3879fd2fb9d39b6d53e5f66b9bc"}, + {file = "psycopg_binary-3.1.17-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0227885686c2cc0104ceb22d6eebc732766e9ad48710408cb0123237432e5435"}, + {file = "psycopg_binary-3.1.17-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:9124b6db07e8d8b11f4512b8b56cbe136bf1b7d0417d1280e62291a9dcad4408"}, + {file = "psycopg_binary-3.1.17-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c8a46f77ba0ca7c5a5449b777170a518fa7820e1710edb40e777c9798f00d033"}, + {file = "psycopg_binary-3.1.17-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5f5f5bcbb772d8c243d605fc7151beec760dd27532d42145a58fb74ef9c5fbf2"}, + {file = "psycopg_binary-3.1.17-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:267a82548c21476120e43dc72b961f1af52c380c0b4c951bdb34cf14cb26bd35"}, + {file = "psycopg_binary-3.1.17-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b20013051f1fd7d02b8d0766cfe8d009e8078babc00a6d39bc7e2d50a7b96af"}, + {file = "psycopg_binary-3.1.17-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:8c5c38129cc79d7e3ba553035b9962a442171e9f97bb1b8795c0885213f206f3"}, + {file = "psycopg_binary-3.1.17-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:d01c4faae66de60fcd3afd3720dcc8ffa03bc2087f898106da127774db12aac5"}, + {file = "psycopg_binary-3.1.17-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e6ae27b0617ad3809449964b5e901b21acff8e306abacb8ba71d5ee7c8c47eeb"}, + {file = "psycopg_binary-3.1.17-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:40af298b209dd77ca2f3e7eb3fbcfb87a25999fc015fcd14140bde030a164c7e"}, + {file = "psycopg_binary-3.1.17-cp311-cp311-win_amd64.whl", hash = "sha256:7b4e4c2b05f3b431e9026e82590b217e87696e7a7548f512ae8059d59fa8af3b"}, + {file = "psycopg_binary-3.1.17-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:ea425a8dcd808a7232a5417d2633bfa543da583a2701b5228e9e29989a50deda"}, + {file = "psycopg_binary-3.1.17-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a3f1196d76860e72d338fab0d2b6722e8d47e2285d693e366ae36011c4a5898a"}, + {file = "psycopg_binary-3.1.17-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:e1e867c2a729348df218a14ba1b862e627177fd57c7b4f3db0b4c708f6d03696"}, + {file = "psycopg_binary-3.1.17-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b0711e46361ea3047cd049868419d030c8236a9dea7e9ed1f053cbd61a853ec9"}, + {file = "psycopg_binary-3.1.17-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d1c0115bdf80cf6c8c9109cb10cf6f650fd1a8d841f884925e8cb12f34eb5371"}, + {file = "psycopg_binary-3.1.17-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3d0d154c780cc7b28a3a0886e8a4b18689202a1dbb522b3c771eb3a1289cf7c3"}, + {file = "psycopg_binary-3.1.17-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:f4028443bf25c1e04ecffdc552c0a98d826903dec76a1568dfddf5ebbbb03db7"}, + {file = "psycopg_binary-3.1.17-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bf424d92dd7e94705b31625b02d396297a7c8fab4b6f7de8dba6388323a7b71c"}, + {file = "psycopg_binary-3.1.17-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:00377f6963ee7e4bf71cab17c2c235ef0624df9483f3b615d86aa24cde889d42"}, + {file = "psycopg_binary-3.1.17-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9690a535d9ccd361bbc3590bfce7fe679e847f44fa7cc97f3b885f4744ca8a2c"}, + {file = "psycopg_binary-3.1.17-cp312-cp312-win_amd64.whl", hash = "sha256:6b2ae342d69684555bfe77aed5546d125b4a99012e0b83a8b3da68c8829f0935"}, + {file = "psycopg_binary-3.1.17-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:86bb3656c8d744cc1e42003414cd6c765117d70aa23da6c0f4ff2b826e0fd0fd"}, + {file = "psycopg_binary-3.1.17-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c10b7713e3ed31df7319c2a72d5fea5a2536476d7695a3e1d18a1f289060997c"}, + {file = "psycopg_binary-3.1.17-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:12eab8bc91b4ba01b2ecee3b5b80501934b198f6e1f8d4b13596f3f38ba6e762"}, + {file = "psycopg_binary-3.1.17-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6a728beefd89b430ebe2729d04ba10e05036b5e9d01648da60436000d2fcd242"}, + {file = "psycopg_binary-3.1.17-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61104b8e7a43babf2bbaa36c08e31a12023e2f967166e99d6b052b11a4c7db06"}, + {file = "psycopg_binary-3.1.17-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:02cd2eb62ffc56f8c847d68765cbf461b3d11b438fe48951e44b6c563ec27d18"}, + {file = "psycopg_binary-3.1.17-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:ca1757a6e080086f7234dc45684e81a47a66a6dd492a37d6ce38c58a1a93e9ff"}, + {file = "psycopg_binary-3.1.17-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:6e3543edc18553e31a3884af3cd7eea43d6c44532d8b9b16f3e743cdf6cfe6c5"}, + {file = "psycopg_binary-3.1.17-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:914254849486e14aa931b0b3382cd16887f1507068ffba775cbdc5a55fe9ef19"}, + {file = "psycopg_binary-3.1.17-cp37-cp37m-win_amd64.whl", hash = "sha256:92fad8f1aa80a5ab316c0493dc6d1b54c1dba21937e43eea7296ff4a0ccc071e"}, + {file = "psycopg_binary-3.1.17-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6d4f2e15d33ed4f9776fdf23683512d76f4e7825c4b80677e9e3ce6c1b193ff2"}, + {file = "psycopg_binary-3.1.17-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4fa26836ce074a1104249378727e1f239a01530f36bae16e77cf6c50968599b4"}, + {file = "psycopg_binary-3.1.17-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d54bcf2dfc0880bf13f38512d44b194c092794e4ee9e01d804bc6cd3eed9bfb7"}, + {file = "psycopg_binary-3.1.17-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash 
= "sha256:7e28024204dc0c61094268c682041d2becfedfea2e3b46bed5f6138239304d98"}, + {file = "psycopg_binary-3.1.17-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0b1ec6895cab887b92c303565617f994c9b9db53befda81fa2a31b76fe8a3ab1"}, + {file = "psycopg_binary-3.1.17-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:420c1eb1626539c261cf3fbe099998da73eb990f9ce1a34da7feda414012ea5f"}, + {file = "psycopg_binary-3.1.17-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:83404a353240fdff5cfe9080665fdfdcaa2d4d0c5112e15b0a2fe2e59200ed57"}, + {file = "psycopg_binary-3.1.17-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:a0c4ba73f9e7721dd6cc3e6953016652dbac206f654229b7a1a8ac182b16e689"}, + {file = "psycopg_binary-3.1.17-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:f6898bf1ca5aa01115807643138e3e20ec603b17a811026bc4a49d43055720a7"}, + {file = "psycopg_binary-3.1.17-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6b40fa54a02825d3d6a8009d9a82a2b4fad80387acf2b8fd6d398fd2813cb2d9"}, + {file = "psycopg_binary-3.1.17-cp38-cp38-win_amd64.whl", hash = "sha256:78ebb43dca7d5b41eee543cd005ee5a0256cecc74d84acf0fab4f025997b837e"}, + {file = "psycopg_binary-3.1.17-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:02ac573f5a6e79bb6df512b3a6279f01f033bbd45c47186e8872fee45f6681d0"}, + {file = "psycopg_binary-3.1.17-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:704f6393d758b12a4369887fe956b2a8c99e4aced839d9084de8e3f056015d40"}, + {file = "psycopg_binary-3.1.17-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0340ef87a888fd940796c909e038426f4901046f61856598582a817162c64984"}, + {file = "psycopg_binary-3.1.17-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a880e4113af3ab84d6a0991e3f85a2424924c8a182733ab8d964421df8b5190a"}, + {file = "psycopg_binary-3.1.17-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:93921178b9a40c60c26e47eb44970f88c49fe484aaa3bb7ec02bb8b514eab3d9"}, + {file = "psycopg_binary-3.1.17-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a05400e9314fc30bc1364865ba9f6eaa2def42b5e7e67f71f9a4430f870023e"}, + {file = "psycopg_binary-3.1.17-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:3e2cc2bbf37ff1cf11e8b871c294e3532636a3cf7f0c82518b7537158923d77b"}, + {file = "psycopg_binary-3.1.17-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:a343261701a8f63f0d8268f7fd32be40ffe28d24b65d905404ca03e7281f7bb5"}, + {file = "psycopg_binary-3.1.17-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:dceb3930ec426623c0cacc78e447a90882981e8c49d6fea8d1e48850e24a0170"}, + {file = "psycopg_binary-3.1.17-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d613a23f8928f30acb2b6b2398cb7775ba9852e8968e15df13807ba0d3ebd565"}, + {file = "psycopg_binary-3.1.17-cp39-cp39-win_amd64.whl", hash = "sha256:d90c0531e9d591bde8cea04e75107fcddcc56811b638a34853436b23c9a3cb7d"}, +] + [[package]] name = "ptyprocess" version = "0.7.0" @@ -4067,6 +4256,22 @@ files = [ pydantic = ">=1.10.13,<3.0.0" SQLAlchemy = ">=2.0.0,<2.1.0" +[[package]] +name = "sqlparse" +version = "0.4.4" +description = "A non-validating SQL parser." 
+optional = false +python-versions = ">=3.5" +files = [ + {file = "sqlparse-0.4.4-py3-none-any.whl", hash = "sha256:5430a4fe2ac7d0f93e66f1efc6e1338a41884b7ddf2a350cedd20ccc4d9d28f3"}, + {file = "sqlparse-0.4.4.tar.gz", hash = "sha256:d446183e84b8349fa3061f0fe7f06ca94ba65b426946ffebe6e3e8295332420c"}, +] + +[package.extras] +dev = ["build", "flake8"] +doc = ["sphinx"] +test = ["pytest", "pytest-cov"] + [[package]] name = "stack-data" version = "0.6.3" @@ -5043,7 +5248,43 @@ files = [ idna = ">=2.0" multidict = ">=4.0" +[[package]] +name = "yoyo-migrations" +version = "8.2.0" +description = "Database migrations with SQL" +optional = false +python-versions = "*" +files = [ + {file = "yoyo-migrations-8.2.0.tar.gz", hash = "sha256:820606a03e262cf1cd4f59e256c28fa446425224d5b82a3d1275fd78178523e4"}, + {file = "yoyo_migrations-8.2.0-py3-none-any.whl", hash = "sha256:27dabe7432859288b0bd771093f593e3dd2ff6dd4e3b8438992a07c9a7154660"}, +] + +[package.dependencies] +importlib-metadata = ">=3.6.0" +sqlparse = "*" +tabulate = "*" + +[package.extras] +mysql = ["PyMySQL"] +postgres = ["psycopg2"] +pyodbc = ["pyodbc"] + +[[package]] +name = "zipp" +version = "3.17.0" +description = "Backport of pathlib-compatible object wrapper for zip files" +optional = false +python-versions = ">=3.8" +files = [ + {file = "zipp-3.17.0-py3-none-any.whl", hash = "sha256:0e923e726174922dce09c53c59ad483ff7bbb8e572e00c7f7c46b88556409f31"}, + {file = "zipp-3.17.0.tar.gz", hash = "sha256:84e64a1c28cf7e91ed2078bb8cc8c259cb19b76942096c8d7b84947690cabaf0"}, +] + +[package.extras] +docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (<7.2.5)", "sphinx (>=3.5)", "sphinx-lint"] +testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] + [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "aa0a28eaacce7d47a4b2d70fd329066c65aa38af15507182db97374699473e16" +content-hash = "00a6dc6f81282aa336da14c1b035105ee85258b285b6f28f1e4d44882963b78a" diff --git a/pyproject.toml b/pyproject.toml index 1605108..d558f5b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,18 +12,20 @@ pydantic-settings = "^2.1.0" fastapi = "^0.108.0" uvicorn = "^0.25.0" pymupdf = "^1.23.8" -unstructured = {extras = ["all-docs"], version = "^0.11.8"} llama-index = "^0.9.28" loguru = "^0.7.2" redis = "^5.0.1" accelerate = "^0.26.1" safetensors = "^0.4.1" sqlmodel = "^0.0.14" +asyncpg = "^0.29.0" [tool.poetry.group.dev.dependencies] ruff = "^0.1.11" mypy = "^1.8.0" ipykernel = "^6.28.0" +yoyo-migrations = "^8.2.0" +asyncpg-stubs = "^0.29.1" [build-system] requires = ["poetry-core"]
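To smoke-test the new asyncpg wiring outside of FastAPI, a minimal sketch (assuming the compose `postgres` service above is running with port 5432 published on localhost and the schema already applied; the `"smoke-test"` collection name is just an illustration):

import asyncio

import asyncpg


async def main() -> None:
    # DSN mirrors docker-compose.yml, with the service host swapped for localhost.
    pool = await asyncpg.create_pool(
        "postgresql://dewydbuser:dewydbpwd@localhost:5432/dewydb"
    )
    try:
        async with pool.acquire() as conn:
            # Exercise the same INSERT ... RETURNING shape used by `add_collection`.
            row = await conn.fetchrow(
                "INSERT INTO collection (name) VALUES ($1) RETURNING id, name",
                "smoke-test",
            )
            print(dict(row))
    finally:
        await pool.close()


asyncio.run(main())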