Skip to content

Commit

Permalink
Block out an admin UI (#30)
Browse files Browse the repository at this point in the history
* Block out an admin UI

* Move docs

* Back out collection name

---------

Co-authored-by: Ben Chambers <35960+bjchambers@users.noreply.github.com>
  • Loading branch information
kerinin and bjchambers authored Jan 29, 2024
1 parent f07de0e commit 4993b5a
Show file tree
Hide file tree
Showing 13 changed files with 345 additions and 29 deletions.
1 change: 1 addition & 0 deletions dewy/chunks/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ class TextChunk(BaseModel):
kind: Literal["text"] = "text"

raw: bool
text: str
start_char_idx: Optional[int] = Field(
default=None, description="Start char index of the chunk."
)
Expand Down
16 changes: 12 additions & 4 deletions dewy/chunks/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from dewy.common.collection_embeddings import CollectionEmbeddings
from dewy.common.db import PgPoolDep

from .models import Chunk, RetrieveRequest, RetrieveResponse
from .models import Chunk, RetrieveRequest, RetrieveResponse, TextChunk

router = APIRouter(prefix="/chunks")

Expand All @@ -19,22 +19,29 @@ async def list_chunks(
document_id: Annotated[
int | None, Query(description="Limit to chunks associated with this document")
] = None,
page: int | None = 1,
perPage: int | None = 10,
) -> List[Chunk]:
"""List chunks."""

# TODO: handle collection & document ID
results = await pg_pool.fetch(
"""
SELECT chunk.id, chunk.document_id, chunk.kind, chunk.text
SELECT chunk.id, chunk.document_id, chunk.kind, TRUE as raw, chunk.text
FROM chunk
JOIN document ON document.id = chunk.document_id
WHERE document.collection_id = coalesce($1, document.collection_id)
AND chunk.document_id = coalesce($2, chunk.document_id)
JOIN document ON document.id = chunk.document_id
ORDER BY chunk.id
OFFSET $4
LIMIT $3
""",
collection_id,
document_id,
perPage,
page,
)
return [Chunk.model_validate(dict(result)) for result in results]
return [TextChunk.model_validate(dict(result)) for result in results]


PathChunkId = Annotated[int, Path(..., description="The chunk ID.")]
Expand Down Expand Up @@ -74,5 +81,6 @@ async def retrieve_chunks(
return RetrieveResponse(
summary=None,
text_results=text_results if request.include_text_chunks else [],
image_results=[],
# image_results=image_results if request.include_image_chunks else [],
)
7 changes: 5 additions & 2 deletions dewy/common/collection_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,10 @@ async def retrieve_text_chunks(self, query: str, n: int = 10) -> List[TextResult
async with self._pg_pool.acquire() as conn:
logger.info("Executing SQL query for chunks from {}", self.collection_id)
embeddings = await conn.fetch(
self.collection_id, self._retrieve_chunks, embedded_query, n
self._retrieve_chunks,
self.collection_id,
embedded_query,
n
)
embeddings = [
TextResult(
Expand Down Expand Up @@ -219,7 +222,7 @@ async def ingest(self, document_id: int, url: str) -> None:
INSERT INTO chunk (document_id, kind, text)
VALUES ($1, $2, $3);
""",
[(document_id, "text", text_chunk) for text_chunk in text_chunks],
[(document_id, "text", text_chunk.encode('utf-8').decode('utf-8', 'ignore').replace("\x00", "\uFFFD")) for text_chunk in text_chunks],
)

# Then, embed each of those chunks.
Expand Down
4 changes: 2 additions & 2 deletions dewy/common/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,15 +61,15 @@ async def extract(
"""Extract documents from a local or remote URL."""
import httpx

async with httpx.AsyncClient() as client:
async with httpx.AsyncClient(follow_redirects=True) as client:
# Determine the extension by requesting the headers.
response = await client.head(url)
response.raise_for_status()
content_type = response.headers["content-type"]
logger.debug("Content type of {} is {}", url, content_type)

# Load the content.
if content_type == "application/pdf":
if content_type.startswith("application/pdf"):
from tempfile import NamedTemporaryFile

with NamedTemporaryFile(suffix=".pdf") as temp_file:
Expand Down
7 changes: 3 additions & 4 deletions dewy/documents/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,11 @@


class CreateRequest(BaseModel):
"""The name of the collection the document should be added to."""
collection_id: Optional[int] = None
"""The id of the collection the document should be added to. Either `collection` or `collection_id` must be provided"""

collection_id: int

"""The URL of the document to add."""
url: str
"""The URL of the document to add."""


class IngestState(Enum):
Expand Down
2 changes: 1 addition & 1 deletion dewy/documents/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ async def add_document(
) -> Document:
"""Add a document."""

row = None
async with pg_pool.acquire() as conn:
row = None
row = await conn.fetchrow(
"""
INSERT INTO document (collection_id, url, ingest_state)
Expand Down
38 changes: 26 additions & 12 deletions frontend/src/App.tsx
Original file line number Diff line number Diff line change
@@ -1,13 +1,21 @@
import {
Admin,
Resource,
ListGuesser,
EditGuesser,
ShowGuesser,
CustomRoutes,
houseLightTheme as lightTheme,
houseDarkTheme as darkTheme,
Menu
} from "react-admin";
import FolderIcon from '@mui/icons-material/Folder';
import ArticleIcon from '@mui/icons-material/Article';
import SegmentIcon from '@mui/icons-material/Segment';
import { Route } from "react-router-dom";
import { dataProvider } from "./dataProvider";
import { CollectionList, CollectionCreate, CollectionEdit } from "./Collection";
import { DocumentList, DocumentCreate, DocumentEdit } from "./Document";
import { ChunkList } from "./Chunk";
import { Search } from "./Search";
import { MyLayout } from "./MyLayout";

export const App = () => (
<Admin
Expand All @@ -16,24 +24,30 @@ export const App = () => (
theme={lightTheme}
darkTheme={darkTheme}
defaultTheme="light"
layout={MyLayout}
>
<Resource
name="collections"
list={ListGuesser}
edit={EditGuesser}
show={ShowGuesser}
list={CollectionList}
create={CollectionCreate}
recordRepresentation={(record) => record.name}
icon={FolderIcon}
/>
<Resource
name="documents"
list={ListGuesser}
edit={EditGuesser}
show={ShowGuesser}
list={DocumentList}
edit={DocumentEdit}
create={DocumentCreate}
recordRepresentation={(record) => record.url}
icon={ArticleIcon}
/>
<Resource
name="chunks"
list={ListGuesser}
edit={EditGuesser}
show={ShowGuesser}
list={ChunkList}
icon={SegmentIcon}
/>
<CustomRoutes>
<Route path="/search" element={<Search />} />
</CustomRoutes>
</Admin>
);
69 changes: 69 additions & 0 deletions frontend/src/Chunk.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import {
List,
ListBase,
TopToolbar,
FilterButton,
Pagination,
SearchInput,
TextInput,
WithListContext,
useListContext,
TextField,
RichTextField,
ChipField,
ReferenceInput,
RecordContextProvider,
WrapperField,
ReferenceField,
ListToolbar,
Title,
SimpleShowLayout,
simpleList
} from 'react-admin';

import { Stack, Typography, Paper, Card, Accordion } from '@mui/material';

/**
 * Shape of a chunk record as rendered by the chunk list.
 *
 * NOTE(review): field types mirror what the backend list query selects
 * (id, document_id, kind, raw, text) — confirm against the data provider.
 */
type Chunk = {
  id: number;
  kind: string;
  // Id of the document this chunk belongs to; shown through a ReferenceField.
  document_id: number;
  // Whether the chunk holds raw text.
  raw: boolean;
  // Chunk contents displayed in the card body.
  text: string;
};

/** Toolbar for the chunk list; it only offers the filter picker. */
function ListActions() {
  return (
    <TopToolbar>
      <FilterButton />
    </TopToolbar>
  );
}

// Filters offered on the chunk list: chunk kind, owning collection and
// owning document. Passed to the list toolbar as its `filters` prop.
const listFilters = [
  <TextInput
    label="Kind"
    source="kind"
    defaultValue="all"
  />,
  <ReferenceInput
    source="collection_id"
    reference="collections"
  />,
  <ReferenceInput
    source="document_id"
    reference="documents"
  />,
];

/**
 * Renders each chunk of the current list page as its own card.
 *
 * Reads `data` and `isLoading` from the enclosing list context. Renders
 * nothing while the list is loading or when the context holds no data.
 */
const ChunkListView = () => {
  const { data, isLoading } = useListContext();
  // `data` may still be missing once loading has finished (for example after
  // a failed fetch), so guard before mapping to avoid a TypeError.
  if (isLoading || !data) return null;

  return (
    <>
      {data.map((chunk) => (
        // Each card gets its own record context so the fields below can
        // resolve their `source` against that chunk.
        <RecordContextProvider key={chunk.id} value={chunk}>
          <Card sx={{padding: 2, margin: 1}}>
            <SimpleShowLayout>
              <ChipField source="kind"/>
              <RichTextField source="text"/>
              <ReferenceField source="document_id" reference="documents" />
            </SimpleShowLayout>
          </Card>
        </RecordContextProvider>
      ))}
    </>
  );
};

/**
 * Chunk list page: page title, filter toolbar, one card per chunk and
 * pagination controls, all placed inside a `ListBase`.
 */
export const ChunkList = () => {
  return (
    <ListBase>
      <Title title="Chunks" />
      <ListToolbar
        actions={<ListActions />}
        filters={listFilters}
      />
      <ChunkListView />
      <Pagination />
    </ListBase>
  );
};
80 changes: 80 additions & 0 deletions frontend/src/Collection.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import {
List,
Datagrid,
TextField,
TopToolbar,
FilterButton,
SearchInput,
EditButton,
CreateButton,
Create,
Edit,
SimpleForm,
CheckboxGroupInput,
TextInput,
SelectInput,
required
} from 'react-admin';

/** Toolbar for the collection list; it only offers the create button. */
function ListActions() {
  return (
    <TopToolbar>
      <CreateButton />
    </TopToolbar>
  );
}
/**
 * Collection list page: a data grid with one column per collection setting
 * (name, text embedding model, text distance metric, LLM model).
 */
export const CollectionList = () => {
  const actions = <ListActions />;
  return (
    <List actions={actions}>
      <Datagrid>
        <TextField source="name" />
        <TextField source="text_embedding_model" />
        <TextField source="text_distance_metric" />
        <TextField source="llm_model" />
      </Datagrid>
    </List>
  );
};

/**
 * Two checkbox groups for a collection: which chunk kinds to extract
 * (`extract`) and which representations to retrieve with (`index`).
 */
export const ChunkingConfig = () => {
  // Choices for the "Chunks to Extract" group.
  const extractChoices = [
    {id: "snippets", name: "Snippets"},
    {id: "summaries", name: "Summaries"},
    {id: "images", name: "Images"},
  ];
  // Choices for the "Retrieve Using" group.
  const indexChoices = [
    {id: "text", name: "Text"},
    {id: "questions_answered", name: "Questions Answered"},
    {id: "statements", name: "Statements"},
  ];

  return (
    <>
      <CheckboxGroupInput
        label="Chunks to Extract"
        source="extract"
        defaultValue={["snippets"]}
        choices={extractChoices}
      />
      <CheckboxGroupInput
        label="Retrieve Using"
        source="index"
        defaultValue={["text"]}
        choices={indexChoices}
      />
    </>
  );
};

/**
 * Form body shared by the collection create and edit pages: a required name,
 * selects for the embedding model, distance metric and LLM, and the chunking
 * configuration.
 */
const Form = () => {
  const embeddingModelChoices = [
    {id: 'hf:BAAI/bge-small-en', name: 'BAAI/bge-small-en'},
    {id: 'openai:text-embedding-ada-002', name: 'OpenAI/text_embedding_ada_002'},
  ];
  const distanceMetricChoices = [
    {id: 'cosine', name: 'Cosine'},
    {id: 'ip', name: 'Inner Product'},
    {id: 'l2', name: 'L2-Norm'},
  ];
  const llmModelChoices = [
    {id: 'huggingface:StabilityAI/stablelm-tuned-alpha-3b', name: 'stablelm-tuned-alpha-3b'},
    {id: 'openai:gpt-3.5-turbo', name: 'gpt-3.5-turbo'},
  ];

  return (
    <SimpleForm>
      <TextInput source="name" validate={[required()]} fullWidth />
      <SelectInput
        source="text_embedding_model"
        defaultValue="hf:BAAI/bge-small-en"
        choices={embeddingModelChoices}
      />
      <SelectInput
        source="text_distance_metric"
        defaultValue="cosine"
        choices={distanceMetricChoices}
      />
      <SelectInput
        source="llm_model"
        defaultValue="huggingface:StabilityAI/stablelm-tuned-alpha-3b"
        choices={llmModelChoices}
      />
      <ChunkingConfig />
    </SimpleForm>
  );
};

/** Create page for collections; redirects to the list view. */
export const CollectionCreate = () => {
  return (
    <Create redirect="list">
      <Form />
    </Create>
  );
};

/** Edit page for collections; redirects to the list view. */
export const CollectionEdit = () => {
  return (
    <Edit redirect="list">
      <Form />
    </Edit>
  );
};
Loading

0 comments on commit 4993b5a

Please sign in to comment.