From 742c8ee96bccc7b95a555e0da47d0b57e94cd531 Mon Sep 17 00:00:00 2001 From: Ben Chambers <35960+bjchambers@users.noreply.github.com> Date: Mon, 29 Jan 2024 15:24:02 -0800 Subject: [PATCH] add openai api key --- .github/workflows/ci.yml | 2 ++ dewy/common/collection_embeddings.py | 4 ++++ tests/test_e2e.py | 4 ++-- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3a83d39..8b2a9b3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -36,6 +36,8 @@ jobs: run: poetry install - name: pytest run: poetry run pytest -v + env: + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} python_lint: runs-on: ubuntu-latest diff --git a/dewy/common/collection_embeddings.py b/dewy/common/collection_embeddings.py index 8151372..62f3583 100644 --- a/dewy/common/collection_embeddings.py +++ b/dewy/common/collection_embeddings.py @@ -214,6 +214,10 @@ async def ingest(self, document_id: int, url: str) -> None: async with conn.transaction(): def encode_chunk(c: str) -> str: + # We believe that either invalid unicode or the occurrence of nulls was + # causing problems that *looked* like only the first page from a PDF was + # being indexed (https://github.com/DewyKB/dewy/issues/20). We do not + # know that all of this is truly necessary. encoded = c.encode("utf-8").decode("utf-8", "ignore") return encoded.replace("\x00", "\uFFFD") diff --git a/tests/test_e2e.py b/tests/test_e2e.py index 5c0e66c..fdabad4 100644 --- a/tests/test_e2e.py +++ b/tests/test_e2e.py @@ -22,7 +22,7 @@ async def create_collection(client, text_embedding_model: str) -> int: async def ingest(client, collection: int, url: str) -> int: add_request = AddDocumentRequest(collection_id=collection, url=url) add_response = await client.put( - "/api/documents/", data=add_request.model_dump_json() + "/api/documents/", content=add_request.model_dump_json() ) assert add_response.status_code == 200 @@ -52,7 +52,7 @@ async def retrieve(client, collection: int, query: str) -> RetrieveResponse: collection_id=collection, query=query, include_image_chunks=False ) - response = await client.post("/api/chunks/retrieve", data=request.model_dump_json()) + response = await client.post("/api/chunks/retrieve", content=request.model_dump_json()) assert response.status_code == 200 return RetrieveResponse.model_validate_json(response.content)