Quick & dirty GET endpoints for chunks

This makes some changes to the chunk data model to make it easier to return as a resource - I expect this will evolve further as we start actually populating the DB with chunks.
DewyKB · Jan 24, 2024 · 4597a22 · 4597a22
1 parent e8c7e3d
commit 4597a22
Show file tree

Hide file tree

Showing 2 changed files with 55 additions and 15 deletions.
diff --git a/app/chunks/models.py b/app/chunks/models.py
@@ -1,4 +1,4 @@
-from typing import Literal, Optional, Sequence, Union
+from typing import Literal, Optional, Sequence, Union, Annotated
 
 from pydantic import BaseModel, Field
 
@@ -27,16 +27,12 @@ class RetrieveRequest(BaseModel):
     """Whether to include a generated summary."""
 
 
-class BaseChunk(BaseModel):
-    kind: Literal["text", "raw_text", "image"]
-
-    score: Optional[float] = None
-    """The similarity score of this chunk."""
-
-
-class TextChunk(BaseChunk):
+class TextChunk(BaseModel):
+    id: Optional[int] = None
+    document_id: int
     kind: Literal["text"] = "text"
     raw: bool
+
     text: str = Field(default="", description="Text content of the chunk.")
     start_char_idx: Optional[int] = Field(
         default=None, description="Start char index of the chunk."
@@ -46,20 +42,31 @@ class TextChunk(BaseChunk):
     )
 
 
-class ImageChunk(BaseChunk):
+class ImageChunk(BaseModel):
+    """A chunk of kind "image" that belongs to a document."""
+
+    id: Optional[int] = None
+    document_id: int
     kind: Literal["image"] = "image"
+
+    # Field(...) marks each field below as required: a value must always be
+    # supplied explicitly, although that value may be None.
     text: Optional[str] = Field(..., description="Textual description of the image.")
     image: Optional[str] = Field(..., description="Image of the node.")
     image_mimetype: Optional[str] = Field(..., description="Mimetype of the image.")
     image_path: Optional[str] = Field(..., description="Path of the image.")
     image_url: Optional[str] = Field(..., description="URL of the image.")
 
+# Discriminated union: the "kind" field selects between TextChunk and ImageChunk.
+Chunk = Annotated[Union[TextChunk, ImageChunk], Field(discriminator='kind')]
+
+class RetrieveResult(BaseModel):
+    """A single retrieved chunk paired with its optional similarity score."""
+
+    score: Optional[float] = None
+    """The similarity score of this chunk."""
+
+    chunk: Chunk
+    """The retrieved chunk."""
 
 class RetrieveResponse(BaseModel):
     """The response from a chunk retrieval request."""
 
     summary: Optional[str]
     """Summary of the retrieved chunks."""
 
-    chunks: Sequence[Union[TextChunk, ImageChunk]]
-    """Retrieved chunks."""
+    results: Sequence[RetrieveResult]
+    """Retrieved chunks, each wrapped in a RetrieveResult with its score."""
diff --git a/app/chunks/router.py b/app/chunks/router.py
@@ -1,15 +1,48 @@
-from typing import Union
+from typing import Union, Annotated, List
 
-from fastapi import APIRouter
+from fastapi import APIRouter, Query, Path
 from llama_index.schema import NodeWithScore
 from loguru import logger
 
+from app.common.db import PgConnectionDep, PgPoolDep
 from app.ingest.store import StoreDep
 
-from .models import ImageChunk, RetrieveRequest, RetrieveResponse, TextChunk
+from .models import Chunk, ImageChunk, RetrieveRequest, RetrieveResponse, TextChunk
 
 router = APIRouter(prefix="/chunks")
 
+@router.get("/")
+async def list_chunks(
+    conn: PgConnectionDep,
+    collection_id: Annotated[int | None, Query(description="Limit to chunks associated with this collection")] = None,
+    document_id: Annotated[int | None, Query(description="Limit to chunks associated with this document")] = None,
+) -> List[Chunk]:
+    """List chunks, optionally restricted to a single document.
+
+    Args:
+        conn: Database connection used to run the query.
+        collection_id: Accepted but not applied yet (see the TODO below).
+        document_id: When given, only chunks whose ``document_id`` column
+            matches are returned.
+
+    Returns:
+        The selected rows validated against the ``Chunk`` union.
+    """
+    # `Chunk` is an `Annotated[Union[...]]` alias rather than a model class, so
+    # it has no `model_validate`; validating against it needs a TypeAdapter.
+    from pydantic import TypeAdapter
+
+    # TODO: handle collection ID (how chunks relate to collections is not
+    # visible from this module).
+    # NOTE(review): the query selects only id, document_id, kind and text, but
+    # TextChunk.raw and the ImageChunk image fields are required by the models;
+    # confirm the selected columns against the chunk table.
+    query = """
+        SELECT id, document_id, kind, text
+        FROM chunk
+        """
+    if document_id is None:
+        rows = await conn.fetch(query)
+    else:
+        # Parameterized filter; the value is never interpolated into the SQL.
+        rows = await conn.fetch(query + " WHERE document_id = $1", document_id)
+    return TypeAdapter(List[Chunk]).validate_python([dict(row) for row in rows])
+
+PathChunkId = Annotated[int, Path(..., description="The chunk ID.")]
+
+@router.get("/{id}")
+async def get_chunk(
+    conn: PgConnectionDep, id: PathChunkId
+) -> Chunk:
+    """Fetch a single chunk by its ID.
+
+    Args:
+        conn: Database connection used to run the query.
+        id: The chunk ID taken from the URL path.
+
+    Returns:
+        The matching row validated against the ``Chunk`` union.
+
+    Raises:
+        HTTPException: With status 404 when no chunk has the given ID.
+    """
+    from fastapi import HTTPException
+    from pydantic import TypeAdapter
+
+    row = await conn.fetchrow(
+        """
+        SELECT id, document_id, kind, text
+        FROM chunk WHERE id = $1
+        """,
+        id,
+    )
+    # fetchrow yields None when nothing matches; report that as "not found"
+    # instead of letting dict(None) fail with a TypeError.
+    if row is None:
+        raise HTTPException(status_code=404, detail="Chunk not found")
+    # `Chunk` is an `Annotated[Union[...]]` alias rather than a model class, so
+    # it has no `model_validate`; validating against it needs a TypeAdapter.
+    return TypeAdapter(Chunk).validate_python(dict(row))
 
 @router.post("/retrieve")
 async def retrieve_chunks(