-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Block out an admin UI #30
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,7 @@ | |
from dewy.common.collection_embeddings import CollectionEmbeddings | ||
from dewy.common.db import PgPoolDep | ||
|
||
from .models import Chunk, RetrieveRequest, RetrieveResponse | ||
from .models import Chunk, RetrieveRequest, RetrieveResponse, TextChunk | ||
|
||
router = APIRouter(prefix="/chunks") | ||
|
||
|
@@ -19,22 +19,29 @@ async def list_chunks( | |
document_id: Annotated[ | ||
int | None, Query(description="Limit to chunks associated with this document") | ||
] = None, | ||
page: int | None = 1, | ||
perPage: int | None = 10, | ||
) -> List[Chunk]: | ||
"""List chunks.""" | ||
|
||
# TODO: handle collection & document ID | ||
results = await pg_pool.fetch( | ||
""" | ||
SELECT chunk.id, chunk.document_id, chunk.kind, chunk.text | ||
SELECT chunk.id, chunk.document_id, chunk.kind, TRUE as raw, chunk.text | ||
FROM chunk | ||
JOIN document ON document.id = chunk.document_id | ||
WHERE document.collection_id = coalesce($1, document.collection_id) | ||
AND chunk.document_id = coalesce($2, chunk.document_id) | ||
JOIN document ON document.id = chunk.document_id | ||
ORDER BY chunk.id | ||
OFFSET $4 | ||
LIMIT $3 | ||
""", | ||
collection_id, | ||
document_id, | ||
perPage, | ||
page, | ||
) | ||
return [Chunk.model_validate(dict(result)) for result in results] | ||
return [TextChunk.model_validate(dict(result)) for result in results] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Creating a union from a dict doesn't seem to work. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Interesting. Mypy also complains about this. I suspect we may need to revisit how we convert from |
||
|
||
|
||
PathChunkId = Annotated[int, Path(..., description="The chunk ID.")] | ||
|
@@ -74,5 +81,6 @@ async def retrieve_chunks( | |
return RetrieveResponse( | ||
summary=None, | ||
text_results=text_results if request.include_text_chunks else [], | ||
image_results=[], | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Required field. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I believe we could set a default in the response if we wanted. But this is more explicit, and will likely better align with the implementation we eventually have anyway. |
||
# image_results=image_results if request.include_image_chunks else [], | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -172,7 +172,10 @@ async def retrieve_text_chunks(self, query: str, n: int = 10) -> List[TextResult | |
async with self._pg_pool.acquire() as conn: | ||
logger.info("Executing SQL query for chunks from {}", self.collection_id) | ||
embeddings = await conn.fetch( | ||
self.collection_id, self._retrieve_chunks, embedded_query, n | ||
self._retrieve_chunks, | ||
self.collection_id, | ||
embedded_query, | ||
n | ||
) | ||
embeddings = [ | ||
TextResult( | ||
|
@@ -219,7 +222,7 @@ async def ingest(self, document_id: int, url: str) -> None: | |
INSERT INTO chunk (document_id, kind, text) | ||
VALUES ($1, $2, $3); | ||
""", | ||
[(document_id, "text", text_chunk) for text_chunk in text_chunks], | ||
[(document_id, "text", text_chunk.encode('utf-8').decode('utf-8', 'ignore').replace("\x00", "\uFFFD")) for text_chunk in text_chunks], | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Handling invalid UTF-8 and NUL (`\x00`) characters. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fun. Feels like we may want to make that into a clearer utility at some point if this continues to be common. Please add a comment on that and maybe put it in a helper function already... I suspect we'll want to remember why we're doing that at some point... |
||
) | ||
|
||
# Then, embed each of those chunks. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,12 +5,11 @@ | |
|
||
|
||
class CreateRequest(BaseModel): | ||
"""The name of the collection the document should be added to.""" | ||
collection_id: Optional[int] = None | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would it be more idiomatic to do |
||
"""The id of the collection the document should be added to. Either `collection` or `collection_id` must be provided""" | ||
|
||
collection_id: int | ||
|
||
"""The URL of the document to add.""" | ||
url: str | ||
"""The URL of the document to add.""" | ||
|
||
|
||
class IngestState(Enum): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import { | ||
List, | ||
ListBase, | ||
TopToolbar, | ||
FilterButton, | ||
Pagination, | ||
SearchInput, | ||
TextInput, | ||
WithListContext, | ||
useListContext, | ||
TextField, | ||
RichTextField, | ||
ChipField, | ||
ReferenceInput, | ||
RecordContextProvider, | ||
WrapperField, | ||
ReferenceField, | ||
ListToolbar, | ||
Title, | ||
SimpleShowLayout, | ||
simpleList | ||
} from 'react-admin'; | ||
|
||
import { Stack, Typography, Paper, Card, Accordion } from '@mui/material'; | ||
|
||
type Chunk = { | ||
id: number; | ||
kind: string; | ||
}; | ||
|
||
const ListActions = () => ( | ||
<TopToolbar > | ||
<FilterButton/> | ||
</TopToolbar> | ||
); | ||
|
||
const listFilters = [ | ||
<TextInput label="Kind" source="kind" defaultValue="all"/>, | ||
<ReferenceInput source="collection_id" reference="collections"/>, | ||
<ReferenceInput source="document_id" reference="documents"/>, | ||
]; | ||
|
||
const ChunkListView = () => { | ||
const { data, isLoading } = useListContext(); | ||
if (isLoading) return null; | ||
|
||
return ( | ||
<> | ||
{data.map((chunk) => <RecordContextProvider key={chunk.id} value={chunk}> | ||
<Card sx={{padding: 2, margin: 1}}> | ||
<SimpleShowLayout> | ||
<ChipField source="kind"/> | ||
<RichTextField source="text"/> | ||
<ReferenceField source="document_id" reference="documents" /> | ||
</SimpleShowLayout> | ||
</Card> | ||
</RecordContextProvider>)} | ||
</> | ||
) | ||
}; | ||
|
||
export const ChunkList = () => ( | ||
<ListBase > | ||
<Title title="Chunks"/> | ||
<ListToolbar actions={<ListActions/>} filters={listFilters}/> | ||
<ChunkListView /> | ||
<Pagination /> | ||
</ListBase> | ||
); |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
import { | ||
List, | ||
Datagrid, | ||
TextField, | ||
TopToolbar, | ||
FilterButton, | ||
SearchInput, | ||
EditButton, | ||
CreateButton, | ||
Create, | ||
Edit, | ||
SimpleForm, | ||
CheckboxGroupInput, | ||
TextInput, | ||
SelectInput, | ||
required | ||
} from 'react-admin'; | ||
|
||
const ListActions = () => ( | ||
<TopToolbar> | ||
<CreateButton/> | ||
</TopToolbar> | ||
); | ||
export const CollectionList = () => ( | ||
<List actions={<ListActions/>} > | ||
<Datagrid> | ||
<TextField source="name" /> | ||
<TextField source="text_embedding_model" /> | ||
<TextField source="text_distance_metric" /> | ||
<TextField source="llm_model" /> | ||
</Datagrid> | ||
</List> | ||
); | ||
|
||
export const ChunkingConfig = () => ( | ||
<> | ||
<CheckboxGroupInput label="Chunks to Extract" source="extract" defaultValue={["snippets"]} choices={[ | ||
{id: "snippets", name: "Snippets"}, | ||
{id: "summaries", name: "Summaries"}, | ||
{id: "images", name: "Images"} | ||
]}/> | ||
<CheckboxGroupInput label="Retrieve Using" source="index" defaultValue={["text"]} choices={[ | ||
{id: "text", name: "Text"}, | ||
{id: "questions_answered", name: "Questions Answered"}, | ||
{id: "statements", name: "Statements"}, | ||
]}/> | ||
</> | ||
) | ||
|
||
const Form = () => ( | ||
<SimpleForm> | ||
<TextInput source="name" validate={[required()]} fullWidth /> | ||
<SelectInput source="text_embedding_model" defaultValue="hf:BAAI/bge-small-en" choices={[ | ||
{id: 'hf:BAAI/bge-small-en', name: 'BAAI/bge-small-en'}, | ||
{id: 'openai:text-embedding-ada-002', name: 'OpenAI/text_embedding_ada_002'}, | ||
]}/> | ||
<SelectInput source="text_distance_metric" defaultValue="cosine" choices={[ | ||
{id: 'cosine', name: 'Cosine'}, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As a note -- OpenAI suggests |
||
{id: 'ip', name: 'Inner Product'}, | ||
{id: 'l2', name: 'L2-Norm'}, | ||
]}/> | ||
<SelectInput source="llm_model" defaultValue="huggingface:StabilityAI/stablelm-tuned-alpha-3b" choices={[ | ||
{id: 'huggingface:StabilityAI/stablelm-tuned-alpha-3b', name: 'stablelm-tuned-alpha-3b'}, | ||
{id: 'openai:gpt-3.5-turbo', name: 'gpt-3.5-turbo'}, | ||
]}/> | ||
<ChunkingConfig /> | ||
</SimpleForm> | ||
) | ||
|
||
export const CollectionCreate = () => ( | ||
<Create redirect="list"> | ||
<Form/> | ||
</Create> | ||
); | ||
|
||
export const CollectionEdit = () => ( | ||
<Edit redirect="list"> | ||
<Form/> | ||
</Edit> | ||
); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Missing text field