Skip to content

Commit

Permalink
feat: Reset the document's state when a file is uploaded. (#92)
Browse files Browse the repository at this point in the history
* Reset the document's state when a file is uploaded.

This allows the content associated with a file to be changed.
Delete any existing chunks, embeddings, and data from the document row.

* Refactor tests to use shared functions, and name more accurately

* Linting & formatting

* Comments

* Comments

* Formatting
  • Loading branch information
kerinin authored Feb 21, 2024
1 parent e4226fc commit 8db2e71
Show file tree
Hide file tree
Showing 5 changed files with 164 additions and 18 deletions.
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,10 @@ Don't forget to give the project a star! Thanks again!
```sh
cd frontend && npm install && npm run build
```
1. Build the client
```sh
cd dewy-client && poetry install
```
1. Run the Dewy service
```sh
poetry run dewy
Expand All @@ -254,7 +258,7 @@ If you're in a `poetry shell`, you can omit the `poetry run`:
* Running tests: `poetry run pytest`
* Linting (and formatting): `poetry run ruff check --fix`
* Formatting: `poetry run ruff format`
* Type Checking: `poetry run mypy app`
* Type Checking: `poetry run mypy dewy`
<p align="right">(<a href="#readme-top">back to top</a>)</p>
Expand Down
65 changes: 49 additions & 16 deletions dewy/document/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,22 +141,55 @@ async def upload_document_content(

document = None
async with pg_pool.acquire() as conn:
document = await get_document(conn, document_id)

content_bytes = await content.read()
background.add_task(
ingest_document,
document.id,
pg_pool,
config,
IngestContent(
filename=content.filename,
content_type=content.content_type,
size=content.size,
content_bytes=content_bytes,
),
)
return document
async with conn.transaction():
# Delete any existing embeddings
await conn.execute(
"""
DELETE FROM embedding e
USING chunk c
WHERE e.chunk_id = c.id
AND c.document_id = $1
""",
document_id,
)
# Delete any existing chunks
await conn.execute(
"""
DELETE from chunk c
WHERE c.document_id = $1
""",
document_id,
)
# Update the document if it exists
await conn.execute(
"""
UPDATE document
SET
ingest_state = 'pending',
url = NULL,
ingest_error = NULL,
extracted_text = NULL
WHERE id = $1
""",
document_id,
)

document = await get_document(conn, document_id)

content_bytes = await content.read()
background.add_task(
ingest_document,
document.id,
pg_pool,
config,
IngestContent(
filename=content.filename,
content_type=content.content_type,
size=content.size,
content_bytes=content_bytes,
),
)
return document


PathDocumentId = Annotated[int, Path(..., description="The document ID.")]
Expand Down
Binary file added test_data/nearly_empty2.pdf
Binary file not shown.
8 changes: 8 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,14 @@
with open(NEARLY_EMPTY_PATH, "rb") as file:
NEARLY_EMPTY_BYTES = file.read()

# Second fixture PDF: used to check that re-uploading content replaces the
# text previously extracted for a document.
NEARLY_EMPTY_PATH2 = os.path.join(TEST_DATA_DIR, "nearly_empty2.pdf")
# Fail at import time, with an obvious location, if the fixture file is missing.
assert os.path.isfile(NEARLY_EMPTY_PATH2)

# Raw bytes of the fixture, read once when the module is imported.
with open(NEARLY_EMPTY_PATH2, "rb") as file:
    NEARLY_EMPTY_BYTES2 = file.read()

# Text the tests compare against the document's extracted text after ingestion.
NEARLY_EMPTY_TEXT2 = " This is another nearly empty document. \n"


@pytest.fixture(scope="session")
async def app(pg, event_loop):
Expand Down
103 changes: 102 additions & 1 deletion tests/test_documents.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import asyncio
import json
import random
import string
Expand All @@ -12,8 +13,22 @@
get_document_status,
list_chunks,
list_documents,
upload_document_content,
)
from dewy_client.models import (
AddDocumentRequest,
BodyUploadDocumentContent,
CollectionCreate,
IngestState,
)
from dewy_client.types import File

from tests.conftest import (
NEARLY_EMPTY_BYTES,
NEARLY_EMPTY_BYTES2,
NEARLY_EMPTY_TEXT,
NEARLY_EMPTY_TEXT2,
)
from dewy_client.models import AddDocumentRequest, CollectionCreate, IngestState


@dataclass
Expand Down Expand Up @@ -47,6 +62,36 @@ async def doc_fixture(client) -> DocFixture:
)


async def upload_test_pdf(client, document_id, payload) -> None:
    """Upload `payload` as a PDF for `document_id` and check the document was reset.

    Sends the bytes through the `upload_document_content` client call and
    asserts that the returned document is pending again, with no extracted
    text, URL, or ingest error.

    Args:
        client: Client passed through to `upload_document_content.asyncio`.
        document_id: ID of the document whose content is being uploaded.
        payload: File payload forwarded to `File(payload=...)`.
    """
    document = await upload_document_content.asyncio(
        client=client,
        document_id=document_id,
        body=BodyUploadDocumentContent(
            content=File(
                payload=payload,
                # f-strings interpolate with `{...}`; a `$` prefix (JavaScript
                # template syntax) would appear literally in the file name.
                file_name=f"file-{document_id}.pdf",
                mime_type="application/pdf",
            ),
        ),
    )
    assert document
    assert document.extracted_text is None
    assert document.url is None
    assert document.ingest_state == IngestState.PENDING
    assert document.ingest_error is None


async def document_ingested(client, document_id, *, timeout=60.0, poll_interval=0.1):
    """Poll a document's status until it leaves the pending state, then verify it.

    Args:
        client: Client passed through to `get_document_status.asyncio`.
        document_id: ID of the document being polled.
        timeout: Maximum number of seconds to keep polling before failing.
        poll_interval: Seconds to sleep between status requests.

    Raises:
        AssertionError: If the document is still pending when `timeout`
            elapses, or if the final status is not a successful ingestion of
            `document_id`.
    """
    loop = asyncio.get_running_loop()
    deadline = loop.time() + timeout

    status = await get_document_status.asyncio(document_id, client=client)
    # A missing status (no `ingest_state` attribute) is treated as "pending" so
    # polling continues; the deadline keeps a stuck ingestion from hanging the
    # test run indefinitely.
    while getattr(status, "ingest_state", IngestState.PENDING) == IngestState.PENDING:
        assert loop.time() < deadline, (
            f"document {document_id} still pending after {timeout} seconds"
        )
        await asyncio.sleep(poll_interval)
        status = await get_document_status.asyncio(document_id, client=client)

    assert status
    assert status.id == document_id
    assert status.ingest_state == IngestState.INGESTED
    assert status.ingest_error is None


async def test_list_documents_filtered(client, doc_fixture):
docs = await list_documents.asyncio(client=client, collection=doc_fixture.collection_name)

Expand Down Expand Up @@ -161,3 +206,59 @@ async def test_add_document_ingest_error(client):
client=client, collection=collection.name, document_id=document.id
)
assert len(chunks) == 0


async def test_upload_document_unknown_document_id(client, doc_fixture):
    """Uploading content for a document ID that does not exist returns 404."""
    pdf = File(
        payload=NEARLY_EMPTY_BYTES,
        file_name="file-name-1",
        mime_type="application/pdf",
    )
    upload_body = BodyUploadDocumentContent(content=pdf)

    # Use the detailed variant so the HTTP status code can be inspected.
    result = await upload_document_content.asyncio_detailed(
        client=client,
        document_id=1_000_000,
        body=upload_body,
    )

    assert result.status_code == 404


async def test_document_lifecycle(client, doc_fixture):
    """Upload a PDF, wait for ingestion, then replace it and check the reset."""
    doc_id = doc_fixture.doc1

    # Step 1: attach a PDF to one of the fixture documents; the helper asserts
    # that the document goes back to "pending".
    await upload_test_pdf(client, doc_id, NEARLY_EMPTY_BYTES)

    # Step 2: poll until ingestion finishes (hooking into the queue would be
    # nicer) and check the first PDF was ingested correctly.
    await document_ingested(client, doc_id)

    first_version = await get_document.asyncio(doc_id, client=client)
    assert first_version
    assert first_version.id == doc_id
    assert first_version.extracted_text == NEARLY_EMPTY_TEXT
    assert first_version.url is None
    assert first_version.ingest_state == IngestState.INGESTED
    assert first_version.ingest_error is None

    first_chunks = await list_chunks.asyncio(client=client, document_id=doc_id)
    assert first_chunks

    # Step 3: upload a revised PDF; the document must return to "pending".
    await upload_test_pdf(client, doc_id, NEARLY_EMPTY_BYTES2)

    # Step 4: wait for the replacement to be ingested and check its contents.
    await document_ingested(client, doc_id)

    second_version = await get_document.asyncio(doc_id, client=client)
    assert second_version
    assert second_version.id == doc_id
    assert second_version.extracted_text == NEARLY_EMPTY_TEXT2
    assert second_version.url is None
    assert second_version.ingest_state == IngestState.INGESTED
    assert second_version.ingest_error is None

    second_chunks = await list_chunks.asyncio(client=client, document_id=doc_id)
    assert second_chunks

    # None of the chunk IDs from the first upload may appear after the second.
    ids_before = {chunk.id for chunk in first_chunks}
    ids_after = {chunk.id for chunk in second_chunks}
    assert ids_before.isdisjoint(ids_after)

0 comments on commit 8db2e71

Please sign in to comment.