Skip to content

Commit

Permalink
ci: Setup github actions
Browse files Browse the repository at this point in the history
Notes:

- This only tests under Python 3.11. We currently don't support older
  versions.
- This adds a "lint-pr" step that ensures "conventional commits" so we
  can use PRs to generate the change log (e.g., new features, fixes,
  etc.).
  • Loading branch information
bjchambers committed Jan 29, 2024
1 parent cfde7a7 commit f845ca5
Show file tree
Hide file tree
Showing 6 changed files with 96 additions and 9 deletions.
56 changes: 56 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Continuous integration workflow: runs the test suite, the Ruff linter, and
# the Ruff format check as three independent jobs.
name: CI

# Triggered by pushes to master and by every pull request.
on:
push:
branches:
- master
pull_request:

# Runs are grouped by workflow name and git ref; starting a new run in a group
# cancels the one already in progress.
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
# Installs the project with poetry on Python 3.11 and runs pytest.
python_test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Install Python
uses: actions/setup-python@v4
# see details (matrix, python-version, python-version-file, etc.)
# https://github.com/actions/setup-python
with:
python-version: '3.11'
- name: Install poetry
uses: abatilo/actions-poetry@v2
- name: Setup a local virtual environment (if no poetry.toml file)
run: |
poetry config virtualenvs.create true --local
poetry config virtualenvs.in-project true --local
# The in-project virtualenv (./.venv) is cached, keyed on the hash of
# poetry.lock, so the key changes whenever the lock file changes.
- uses: actions/cache@v3
name: Define a cache for the virtual environment based on the dependencies lock file
with:
path: ./.venv
key: venv-${{ hashFiles('poetry.lock') }}
- name: Install the project dependencies
run: poetry install
- name: pytest
run: poetry run pytest -v
env:
# Exposes the repository secret to the test process.
# NOTE(review): secrets are presumably unavailable to pull requests opened
# from forks -- confirm the tests cope with an empty key in that case.
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

# Runs the Ruff action with its default arguments.
python_lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Ruff Lint
uses: chartboost/ruff-action@v1

# Runs the Ruff action with `format --check`.
python_format:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Ruff Format (Check)
uses: chartboost/ruff-action@v1
with:
args: format --check
15 changes: 15 additions & 0 deletions .github/workflows/lint-pr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Pull-request lint workflow: runs the semantic-pull-request action so PRs
# follow "conventional commits" and can be used to generate the change log.
name: "Lint PR"
# Triggered for the pull_request activity types listed below.
on:
pull_request:
types:
- opened
- edited
- synchronize

jobs:
lint_pr:
runs-on: ubuntu-latest
steps:
# The action version is pinned to an exact release (v5.4.0).
- uses: amannn/action-semantic-pull-request@v5.4.0
env:
# Passes the workflow's GITHUB_TOKEN secret to the action.
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
2 changes: 1 addition & 1 deletion dewy/chunks/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class TextChunk(BaseModel):
text: str

raw: bool
text: str
text: str
start_char_idx: Optional[int] = Field(
default=None, description="Start char index of the chunk."
)
Expand Down
15 changes: 14 additions & 1 deletion dewy/common/collection_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,13 +212,26 @@ async def ingest(self, document_id: int, url: str) -> None:
# TODO: support indirect embeddings
async with self._pg_pool.acquire() as conn:
async with conn.transaction():

def encode_chunk(c: str) -> str:
    """Return ``c`` sanitized for insertion into the ``chunk`` table.

    Code points that cannot be encoded as UTF-8 (e.g. lone surrogates)
    are dropped, and every NUL character is replaced with U+FFFD.

    Args:
        c: The raw chunk text.

    Returns:
        The sanitized text; valid, NUL-free input is returned unchanged.
    """
    # We believe that either invalid unicode or the occurrence
    # of nulls was causing problems that *looked* like only the
    # first page from a PDF was being indexed
    # (https://github.com/DewyKB/dewy/issues/20). We do not know
    # that all of this is truly necessary.
    #
    # The "ignore" handler must be on the *encode* step: encoding is
    # where un-encodable code points raise, whereas decoding bytes we
    # just produced can never fail.
    encoded = c.encode("utf-8", "ignore").decode("utf-8")
    return encoded.replace("\x00", "\uFFFD")

# First, insert the chunks.
await conn.executemany(
"""
INSERT INTO chunk (document_id, kind, text)
VALUES ($1, $2, $3);
""",
[(document_id, "text", text_chunk.encode('utf-8').decode('utf-8', 'ignore').replace("\x00", "\uFFFD")) for text_chunk in text_chunks],
[
(document_id, "text", encode_chunk(text_chunk))
for text_chunk in text_chunks
],
)

# Then, embed each of those chunks.
Expand Down
2 changes: 1 addition & 1 deletion dewy/documents/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

class AddDocumentRequest(BaseModel):
collection_id: Optional[int] = None
"""The id of the collection the document should be added to. Either `collection` or `collection_id` must be provided"""
"""The id of the collection the document should be added to."""

url: str
"""The URL of the document to add."""
Expand Down
15 changes: 9 additions & 6 deletions tests/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ async def create_collection(client, text_embedding_model: str) -> int:
async def ingest(client, collection: int, url: str) -> int:
add_request = AddDocumentRequest(collection_id=collection, url=url)
add_response = await client.put(
"/api/documents/", data=add_request.model_dump_json()
"/api/documents/", content=add_request.model_dump_json()
)
assert add_response.status_code == 200

Expand All @@ -37,21 +37,24 @@ async def ingest(client, collection: int, url: str) -> int:

return document_id


async def list_chunks(client, collection: int, document: int):
response = await client.get("/api/chunks/", params = {
'collection_id': collection,
'document_id': document
})
response = await client.get(
"/api/chunks/", params={"collection_id": collection, "document_id": document}
)
assert response.status_code == 200
ta = TypeAdapter(List[Chunk])
return ta.validate_json(response.content)


async def retrieve(client, collection: int, query: str) -> RetrieveResponse:
request = RetrieveRequest(
collection_id=collection, query=query, include_image_chunks=False
)

response = await client.post("/api/chunks/retrieve", data=request.model_dump_json())
response = await client.post(
"/api/chunks/retrieve", content=request.model_dump_json()
)
assert response.status_code == 200
return RetrieveResponse.model_validate_json(response.content)

Expand Down

0 comments on commit f845ca5

Please sign in to comment.