DewyKB · bjchambers · Jan 29, 2024 · Jan 29, 2024
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,61 @@
+# Continuous integration: runs pytest plus the Ruff lint and format checks.
+name: CI
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+
+# Cancel a still-running run of this workflow when a newer one starts on the same ref.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  python_test:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Python
+        id: setup_python
+        uses: actions/setup-python@v4
+        # see details (matrix, python-version, python-version-file, etc.)
+        # https://github.com/actions/setup-python
+        with:
+          python-version: '3.11'
+      - name: Install poetry
+        uses: abatilo/actions-poetry@v2
+      - name: Setup a local virtual environment (if no poetry.toml file)
+        run: |
+          poetry config virtualenvs.create true --local
+          poetry config virtualenvs.in-project true --local
+      - name: Define a cache for the virtual environment based on the dependencies lock file
+        uses: actions/cache@v3
+        with:
+          path: ./.venv
+          # Key on the OS and the exact interpreter version as well as the lock file,
+          # so a venv built against a different Python is never restored.
+          key: venv-${{ runner.os }}-${{ steps.setup_python.outputs.python-version }}-${{ hashFiles('poetry.lock') }}
+      - name: Install the project dependencies
+        run: poetry install
+      - name: pytest
+        run: poetry run pytest -v
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+
+  python_lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Ruff Lint
+        uses: chartboost/ruff-action@v1
+
+  python_format:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Ruff Format (Check)
+        uses: chartboost/ruff-action@v1
+        with:
+          args: format --check
diff --git a/.github/workflows/lint-pr.yml b/.github/workflows/lint-pr.yml
@@ -0,0 +1,15 @@
+# Runs amannn/action-semantic-pull-request when a pull request is opened, edited, or synchronized.
+name: Lint PR
+
+on:
+  pull_request:
+    types: [opened, edited, synchronize]
+
+jobs:
+  lint_pr:
+    runs-on: ubuntu-latest
+    steps:
+      # Pass the workflow's GITHUB_TOKEN secret to the action through its environment.
+      - uses: amannn/action-semantic-pull-request@v5.4.0
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/dewy/chunks/models.py b/dewy/chunks/models.py
@@ -10,7 +10,7 @@ class TextChunk(BaseModel):
     text: str
 
     raw: bool
-    text: str 
+    text: str
     start_char_idx: Optional[int] = Field(
         default=None, description="Start char index of the chunk."
     )

diff --git a/dewy/common/collection_embeddings.py b/dewy/common/collection_embeddings.py
@@ -212,13 +212,26 @@ async def ingest(self, document_id: int, url: str) -> None:
         # TODO: support indirect embeddings
         async with self._pg_pool.acquire() as conn:
             async with conn.transaction():
+
+                def encode_chunk(c: str) -> str:
+                    # Sanitize chunk text before it is inserted. Invalid unicode
+                    # or NULs are the suspected (unconfirmed) cause of only the
+                    # first PDF page appearing indexed
+                    # (https://github.com/DewyKB/dewy/issues/20). "ignore" goes
+                    # on encode(): a strict encode raises on lone surrogates.
+                    encoded = c.encode("utf-8", "ignore").decode("utf-8")
+                    return encoded.replace("\x00", "\uFFFD")
+
                 # First, insert the chunks.
                 await conn.executemany(
                     """
                     INSERT INTO chunk (document_id, kind, text)
                     VALUES ($1, $2, $3);
                     """,
-                    [(document_id, "text", text_chunk.encode('utf-8').decode('utf-8', 'ignore').replace("\x00", "\uFFFD")) for text_chunk in text_chunks],
+                    [
+                        (document_id, "text", encode_chunk(text_chunk))
+                        for text_chunk in text_chunks
+                    ],
                 )
 
                 # Then, embed each of those chunks.

diff --git a/dewy/documents/models.py b/dewy/documents/models.py
@@ -6,7 +6,7 @@
 
 class AddDocumentRequest(BaseModel):
     collection_id: Optional[int] = None
-    """The id of the collection the document should be added to. Either `collection` or `collection_id` must be provided"""
+    """The id of the collection the document should be added to."""
 
     url: str
     """The URL of the document to add."""

diff --git a/tests/test_e2e.py b/tests/test_e2e.py
@@ -22,7 +22,7 @@ async def create_collection(client, text_embedding_model: str) -> int:
 async def ingest(client, collection: int, url: str) -> int:
     add_request = AddDocumentRequest(collection_id=collection, url=url)
     add_response = await client.put(
-        "/api/documents/", data=add_request.model_dump_json()
+        "/api/documents/", content=add_request.model_dump_json()
     )
     assert add_response.status_code == 200
 
@@ -37,21 +37,24 @@ async def ingest(client, collection: int, url: str) -> int:
 
     return document_id
 
+
 async def list_chunks(client, collection: int, document: int):
-    response = await client.get("/api/chunks/", params = {
-        'collection_id': collection,
-        'document_id': document
-    })
+    response = await client.get(
+        "/api/chunks/", params={"collection_id": collection, "document_id": document}
+    )
     assert response.status_code == 200
     ta = TypeAdapter(List[Chunk])
     return ta.validate_json(response.content)
 
+
 async def retrieve(client, collection: int, query: str) -> RetrieveResponse:
     request = RetrieveRequest(
         collection_id=collection, query=query, include_image_chunks=False
     )
 
-    response = await client.post("/api/chunks/retrieve", data=request.model_dump_json())
+    response = await client.post(
+        "/api/chunks/retrieve", content=request.model_dump_json()
+    )
     assert response.status_code == 200
     return RetrieveResponse.model_validate_json(response.content)