Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ci: Setup github actions #39

Merged
merged 1 commit into from
Jan 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Continuous integration: runs the test suite plus Ruff lint and format checks.
name: CI

on:
  push:
    branches:
      - master
  pull_request:

# One run per workflow and ref: a newer run cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  python_test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install Python
        # The id exposes the resolved interpreter version to the cache key below.
        id: setup-python
        uses: actions/setup-python@v4
        # see details (matrix, python-version, python-version-file, etc.)
        # https://github.com/actions/setup-python
        with:
          python-version: '3.11'
      - name: Install poetry
        uses: abatilo/actions-poetry@v2
      - name: Setup a local virtual environment (if no poetry.toml file)
        run: |
          poetry config virtualenvs.create true --local
          poetry config virtualenvs.in-project true --local
      - uses: actions/cache@v3
        name: Define a cache for the virtual environment based on the dependencies lock file
        with:
          path: ./.venv
          # The venv links to a specific interpreter, so key on the runner OS
          # and the exact Python version as well as the lock file; otherwise a
          # stale venv can be restored after the runner image or Python patch
          # release changes.
          key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('poetry.lock') }}
      - name: Install the project dependencies
        run: poetry install
      - name: pytest
        run: poetry run pytest -v
        env:
          # NOTE(review): repository secrets are not passed to runs triggered
          # from forks, so this is empty there - confirm the tests tolerate it.
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

  python_lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Ruff Lint
        uses: chartboost/ruff-action@v1

  python_format:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Ruff Format (Check)
        uses: chartboost/ruff-action@v1
        with:
          # Check only: report formatting differences without rewriting files.
          args: format --check
15 changes: 15 additions & 0 deletions .github/workflows/lint-pr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Runs amannn/action-semantic-pull-request whenever a pull request is opened,
# edited, or receives new commits.
name: "Lint PR"
on:
  pull_request:
    types:
      - opened
      - edited
      - synchronize

# Least privilege: the job only reads pull request metadata, so do not grant
# GITHUB_TOKEN the repository's default scopes.
permissions:
  pull-requests: read

jobs:
  lint_pr:
    runs-on: ubuntu-latest
    steps:
      - uses: amannn/action-semantic-pull-request@v5.4.0
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
2 changes: 1 addition & 1 deletion dewy/chunks/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class TextChunk(BaseModel):
text: str

raw: bool
text: str
text: str
start_char_idx: Optional[int] = Field(
default=None, description="Start char index of the chunk."
)
Expand Down
15 changes: 14 additions & 1 deletion dewy/common/collection_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,13 +212,26 @@ async def ingest(self, document_id: int, url: str) -> None:
# TODO: support indirect embeddings
async with self._pg_pool.acquire() as conn:
async with conn.transaction():

def encode_chunk(c: str) -> str:
    """Return *c* with unencodable code points dropped and NULs replaced.

    Lone surrogates (which cannot be encoded as UTF-8) are removed, and
    every ``"\\x00"`` is replaced with U+FFFD.

    Args:
        c: The chunk text to sanitise.

    Returns:
        The sanitised text; valid text without NULs is returned unchanged.
    """
    # We believe that either invalid unicode or the occurrence
    # of nulls was causing problems that *looked* like only the
    # first page from a PDF was being indexed
    # (https://github.com/DewyKB/dewy/issues/20). We do not know
    # that all of this is truly necessary.
    #
    # The "ignore" handler has to be on the encode step: a str can only be
    # invalid by holding lone surrogates, and a strict encode raises
    # UnicodeEncodeError on those before any decode-side handler could run.
    encoded = c.encode("utf-8", "ignore").decode("utf-8")
    return encoded.replace("\x00", "\uFFFD")

# First, insert the chunks.
await conn.executemany(
"""
INSERT INTO chunk (document_id, kind, text)
VALUES ($1, $2, $3);
""",
[(document_id, "text", text_chunk.encode('utf-8').decode('utf-8', 'ignore').replace("\x00", "\uFFFD")) for text_chunk in text_chunks],
[
(document_id, "text", encode_chunk(text_chunk))
for text_chunk in text_chunks
],
)

# Then, embed each of those chunks.
Expand Down
2 changes: 1 addition & 1 deletion dewy/documents/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

class AddDocumentRequest(BaseModel):
collection_id: Optional[int] = None
"""The id of the collection the document should be added to. Either `collection` or `collection_id` must be provided"""
"""The id of the collection the document should be added to."""

url: str
"""The URL of the document to add."""
Expand Down
15 changes: 9 additions & 6 deletions tests/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ async def create_collection(client, text_embedding_model: str) -> int:
async def ingest(client, collection: int, url: str) -> int:
add_request = AddDocumentRequest(collection_id=collection, url=url)
add_response = await client.put(
"/api/documents/", data=add_request.model_dump_json()
"/api/documents/", content=add_request.model_dump_json()
)
assert add_response.status_code == 200

Expand All @@ -37,21 +37,24 @@ async def ingest(client, collection: int, url: str) -> int:

return document_id


async def list_chunks(client, collection: int, document: int):
    """Fetch the chunks of one document through the chunks API.

    Args:
        client: Async HTTP client used to call the API (presumably the
            test client for the app - confirm against the fixtures).
        collection: Sent as the ``collection_id`` query parameter.
        document: Sent as the ``document_id`` query parameter.

    Returns:
        The response body validated as ``List[Chunk]``.

    Raises:
        AssertionError: If the response status is not 200.
    """
    # Issue the request once; the block previously carried both the old and
    # the reformatted spelling of this call, so it ran twice.
    response = await client.get(
        "/api/chunks/", params={"collection_id": collection, "document_id": document}
    )
    assert response.status_code == 200
    ta = TypeAdapter(List[Chunk])
    return ta.validate_json(response.content)


async def retrieve(client, collection: int, query: str) -> RetrieveResponse:
    """Run a retrieval query against a collection through the chunks API.

    Args:
        client: Async HTTP client used to call the API (presumably the
            test client for the app - confirm against the fixtures).
        collection: Id of the collection to query.
        query: The query text.

    Returns:
        The response body parsed as a ``RetrieveResponse``.

    Raises:
        AssertionError: If the response status is not 200.
    """
    request = RetrieveRequest(
        collection_id=collection, query=query, include_image_chunks=False
    )

    # Send the serialised JSON once, as the raw body via ``content=``. The
    # superseded ``data=`` call is dropped: it posted the same payload a
    # second time (NOTE(review): ``data=`` is presumably the form-data
    # parameter of an httpx-style client - confirm).
    response = await client.post(
        "/api/chunks/retrieve", content=request.model_dump_json()
    )
    assert response.status_code == 200
    return RetrieveResponse.model_validate_json(response.content)

Expand Down
Loading