-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ref: Move chunks/documents out of unstructured (#4)
* Eliminate the top-level collection for simpler ingest / retrieval.
* Eliminate the "unstructured" distinction.
* Separate documents and chunks.
* Add more OpenAPI annotations for client generation.

Co-authored-by: Ryan Michael <kerinin@gmail.com>
- Loading branch information
1 parent
4549505
commit 093be1f
Showing
13 changed files
with
198 additions
and
152 deletions.
There are no files selected for viewing
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
from typing import Optional, Sequence | ||
|
||
from pydantic import BaseModel | ||
|
||
from app.common.models import Chunk | ||
|
||
class RetrieveResponse(BaseModel):
    """The response from a chunk retrieval request."""

    # Explicit default: without it Pydantic v2 treats an `Optional[...]`
    # annotation as "required but nullable", which contradicts the
    # "if requested" contract described below.
    synthesized_text: Optional[str] = None
    """Synthesized text across all chunks, if requested."""

    chunks: Sequence[Chunk]
    """Retrieved chunks."""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
from fastapi import APIRouter | ||
|
||
from app.common.models import Chunk, RetrieveRequest | ||
from app.ingest.store import StoreDep | ||
from .models import RetrieveResponse | ||
|
||
# Router for the chunk endpoints; every route registered on it is served
# under the "/chunks" prefix and carries the "chunks" tag.
router = APIRouter(prefix="/chunks", tags=["chunks"])
|
||
@router.post("/retrieve")
async def retrieve(
    store: StoreDep, request: RetrieveRequest
) -> RetrieveResponse:
    """Retrieve chunks based on a given query."""

    query_engine = store.index.as_query_engine(
        similarity_top_k=request.n,
        response_mode=request.synthesis_mode.value,
        # TODO: metadata filters / ACLs
    )

    # This handler is a coroutine, so use the engine's async entry point:
    # the synchronous `.query()` would block the event loop for the whole
    # retrieval / synthesis round trip and stall every other request.
    results = await query_engine.aquery(request.query)

    # Convert the source nodes returned by the engine into the API's model.
    chunks = [Chunk.from_llama_index(node) for node in results.source_nodes]
    return RetrieveResponse(
        synthesized_text=results.response,
        chunks=chunks,
    )
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
from typing import Optional, Sequence | ||
|
||
from pydantic import BaseModel | ||
|
||
from app.common.models import Chunk | ||
|
||
class RetrievedDocument(BaseModel):
    """A single document in a retrieval response, with its retrieved chunks."""

    chunks: Sequence[Chunk]
    """Retrieved chunks in the given document."""
|
||
class RetrieveResponse(BaseModel):
    """The response from a document retrieval request."""

    # Explicit default: without it Pydantic v2 treats an `Optional[...]`
    # annotation as "required but nullable", which contradicts the
    # "if requested" contract described below.
    synthesized_text: Optional[str] = None
    """Synthesized text across all documents, if requested."""

    documents: Sequence[RetrievedDocument]
    """Retrieved documents."""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
from typing import Annotated | ||
|
||
from fastapi import APIRouter, Body, HTTPException, status | ||
from loguru import logger | ||
|
||
from app.common.models import RetrieveRequest | ||
from app.documents.models import RetrieveResponse | ||
from app.ingest.extract import extract | ||
from app.ingest.extract.source import ExtractSource | ||
from app.ingest.store import StoreDep | ||
|
||
# Router for the document endpoints; every route registered on it is served
# under the "/documents" prefix and carries the "documents" tag.
router = APIRouter(prefix="/documents", tags=["documents"])
|
||
@router.put("/")
async def add(
    store: StoreDep,
    url: Annotated[str, Body(..., description="The URL of the document to add.")],
) -> None:
    """Add a document to the store.

    The content at `url` is extracted and run through the ingestion pipeline.

    Parameters:
    - url: The URL of the document to add.

    Responds with HTTP 412 if no content could be retrieved from `url`.
    """

    # Load the content.
    logger.debug("Loading content from {}", url)
    documents = await extract(
        ExtractSource(
            url,
        )
    )
    logger.debug("Loaded {} pages from {}", len(documents), url)

    # Nothing was extracted: report it to the client instead of running the
    # ingestion pipeline on an empty batch.
    if not documents:
        raise HTTPException(
            status_code=status.HTTP_412_PRECONDITION_FAILED,
            detail=f"No content retrieved from '{url}'",
        )

    # Run the extracted documents through the store's ingestion pipeline.
    logger.debug("Inserting {} documents from {}", len(documents), url)
    nodes = await store.ingestion_pipeline.arun(documents=documents)
    logger.debug("Done. Inserted {} nodes", len(nodes))
|
||
@router.post("/retrieve")
async def retrieve(
    _store: StoreDep, _request: RetrieveRequest
) -> RetrieveResponse:
    """Retrieve documents based on a given query.

    Not implemented yet: every call is answered with HTTP 501.
    """
    # A bare NotImplementedError would reach the client as an opaque
    # 500 Internal Server Error; 501 states explicitly that the endpoint
    # exists but has no implementation yet.
    raise HTTPException(
        status_code=status.HTTP_501_NOT_IMPLEMENTED,
        detail="Document retrieval is not implemented yet.",
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,9 @@ | ||
from fastapi import APIRouter | ||
|
||
from app.unstructured.router import router as unstructured_router | ||
from app.chunks.router import router as chunks_router | ||
from app.documents.router import router as documents_router | ||
|
||
# Top-level router: every router included below is served under the "/api" prefix.
# NOTE(review): this hunk looks like a rendered diff that interleaves the removed
# `unstructured_router` include with the added documents/chunks includes —
# confirm which `include_router` calls belong to the final file.
api_router = APIRouter(prefix="/api") | ||
|
||
api_router.include_router(unstructured_router) | ||
api_router.include_router(documents_router) | ||
api_router.include_router(chunks_router) |
Oops, something went wrong.