Skip to content

Commit

Permalink
Quick & dirty get endpoints for chunks
Browse files Browse the repository at this point in the history
This changes the chunk data model so that chunks are easier to
return as a resource. I expect the model to evolve further once we start
actually populating the DB with chunks.
  • Loading branch information
kerinin committed Jan 24, 2024
1 parent e8c7e3d commit 4597a22
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 15 deletions.
31 changes: 19 additions & 12 deletions app/chunks/models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Literal, Optional, Sequence, Union
from typing import Literal, Optional, Sequence, Union, Annotated

from pydantic import BaseModel, Field

Expand Down Expand Up @@ -27,16 +27,12 @@ class RetrieveRequest(BaseModel):
"""Whether to include a generated summary."""


# Common base for chunk models: a `kind` tag plus an optional similarity score.
class BaseChunk(BaseModel):
    # Tag naming which kind of chunk this is.
    kind: Literal["text", "raw_text", "image"]

    # Left as None when no score has been assigned.
    score: Union[float, None] = None
    """The similarity score of this chunk."""


class TextChunk(BaseChunk):
class TextChunk(BaseModel):
id: Optional[int] = None
document_id: int
kind: Literal["text"] = "text"
raw: bool

text: str = Field(default="", description="Text content of the chunk.")
start_char_idx: Optional[int] = Field(
default=None, description="Start char index of the chunk."
Expand All @@ -46,20 +42,31 @@ class TextChunk(BaseChunk):
)


class ImageChunk(BaseChunk):
class ImageChunk(BaseModel):
    """A chunk whose content is an image.

    `kind` is fixed to ``"image"`` so the model can take part in a union
    discriminated on that field.  Every image-related field is optional and
    defaults to ``None``: a record that carries only ``id``, ``document_id``,
    ``kind`` and ``text`` must still validate, and this matches how the
    optional fields on ``TextChunk`` are declared.
    """

    # None until the chunk has been assigned an ID.
    id: Optional[int] = None
    document_id: int
    kind: Literal["image"] = "image"

    # `default=None` (rather than `...`) makes these genuinely optional;
    # with `...` pydantic treats them as required-but-nullable.
    text: Optional[str] = Field(
        default=None, description="Textual description of the image."
    )
    image: Optional[str] = Field(default=None, description="Image of the node.")
    image_mimetype: Optional[str] = Field(
        default=None, description="Mimetype of the image."
    )
    image_path: Optional[str] = Field(default=None, description="Path of the image.")
    image_url: Optional[str] = Field(default=None, description="URL of the image.")

# Union of the concrete chunk models, discriminated on their `kind` field.
Chunk = Annotated[
    Union[TextChunk, ImageChunk],
    Field(discriminator="kind"),
]

class RetrieveResult(BaseModel):
    """A single retrieval hit: one chunk plus its optional similarity score."""

    score: Optional[float] = None
    """The similarity score of this chunk."""

    chunk: Chunk
    """The retrieved chunk."""

class RetrieveResponse(BaseModel):
"""The response from a chunk retrieval request."""

summary: Optional[str]
"""Summary of the retrieved chunks."""

chunks: Sequence[Union[TextChunk, ImageChunk]]
"""Retrieved chunks."""
results: Sequence[RetrieveResult]
"""Retrieved results."""
39 changes: 36 additions & 3 deletions app/chunks/router.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,48 @@
from typing import Union
from typing import Union, Annotated, List

from fastapi import APIRouter
from fastapi import APIRouter, Query, Path
from llama_index.schema import NodeWithScore
from loguru import logger

from app.common.db import PgConnectionDep, PgPoolDep
from app.ingest.store import StoreDep

from .models import ImageChunk, RetrieveRequest, RetrieveResponse, TextChunk
from .models import Chunk, ImageChunk, RetrieveRequest, RetrieveResponse, TextChunk

router = APIRouter(prefix="/chunks")

@router.get("/")
async def list_chunks(
    conn: PgConnectionDep,
    collection_id: Annotated[
        int | None,
        Query(description="Limit to chunks associated with this collection"),
    ] = None,
    document_id: Annotated[
        int | None,
        Query(description="Limit to chunks associated with this document"),
    ] = None,
) -> List[Chunk]:
    """List chunks, optionally restricted to a single document.

    Args:
        conn: Database connection used to run the query.
        collection_id: Accepted but not applied yet (see TODO below).
        document_id: When given, only chunks with this ``document_id`` are
            returned.

    Returns:
        The matching rows validated into chunk models.

    Raises:
        pydantic.ValidationError: If a row does not match any chunk model.
    """
    # Local import so this handler does not rely on edits to the module's
    # import block.
    from pydantic import TypeAdapter

    # TODO: handle collection ID -- the column/join that links a chunk to a
    # collection is not visible from this module.
    if document_id is None:
        rows = await conn.fetch(
            """
            SELECT id, document_id, kind, text
            FROM chunk
            """
        )
    else:
        rows = await conn.fetch(
            """
            SELECT id, document_id, kind, text
            FROM chunk
            WHERE document_id = $1
            """,
            document_id,
        )

    # NOTE(review): only id/document_id/kind/text are selected; any chunk
    # model that declares further required fields will fail validation
    # here -- reconcile the query with the models.
    #
    # `Chunk` is an `Annotated[Union[...]]` alias, not a BaseModel, so it has
    # no `model_validate`; a TypeAdapter is the way to validate against it.
    # The adapter is built per call to keep this function self-contained;
    # hoist it to module scope if this endpoint becomes hot.
    return TypeAdapter(List[Chunk]).validate_python([dict(row) for row in rows])

# Annotated type for a required integer chunk-ID path parameter.
PathChunkId = Annotated[
    int,
    Path(..., description="The chunk ID."),
]

@router.get("/{id}")
async def get_chunk(conn: PgConnectionDep, id: PathChunkId) -> Chunk:
    """Fetch a single chunk by its ID.

    Args:
        conn: Database connection used to run the query.
        id: The chunk ID taken from the URL path.  The name shadows the
            builtin but must match the ``{id}`` path parameter.

    Returns:
        The matching row validated into a chunk model.

    Raises:
        HTTPException: With status 404 when no chunk has the given ID.
        pydantic.ValidationError: If the row does not match any chunk model.
    """
    # Local imports so this handler does not rely on edits to the module's
    # import block.
    from fastapi import HTTPException
    from pydantic import TypeAdapter

    row = await conn.fetchrow(
        """
        SELECT id, document_id, kind, text
        FROM chunk WHERE id = $1
        """,
        id,
    )
    # Presumably `fetchrow` yields None when nothing matches (asyncpg-style
    # API -- confirm); answer with 404 instead of crashing on `dict(None)`.
    if row is None:
        raise HTTPException(status_code=404, detail="Chunk not found")

    # NOTE(review): only id/document_id/kind/text are selected; any chunk
    # model that declares further required fields will fail validation
    # here -- reconcile the query with the models.
    #
    # `Chunk` is an `Annotated[Union[...]]` alias, not a BaseModel, so it has
    # no `model_validate`; a TypeAdapter is the way to validate against it.
    return TypeAdapter(Chunk).validate_python(dict(row))

@router.post("/retrieve")
async def retrieve_chunks(
Expand Down

0 comments on commit 4597a22

Please sign in to comment.