Skip to content

Commit

Permalink
Merge pull request #21 from Sinaptik-AI/release/v0.1.2
Browse files Browse the repository at this point in the history
Release v0.2.0
  • Loading branch information
gventuri authored Oct 14, 2024
2 parents cfa51d6 + 19eed60 commit 36d8d46
Show file tree
Hide file tree
Showing 94 changed files with 2,701 additions and 11,396 deletions.
15 changes: 13 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,21 @@ jobs:
cd backend
poetry install --all-extras --with dev --verbose
- name: Run python tests
- name: Run python tests with coverage
run: |
cd backend
poetry run pytest
poetry run pytest --cov=app --cov-report=xml --cov-report=term-missing
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v4
with:
file: ./backend/coverage.xml
token: ${{ secrets.CODECOV_TOKEN }}

- name: Run Ruff
run: |
cd backend
poetry run ruff check .
- name: Install frontend dependencies
run: |
Expand Down
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
.env

.DS_Store
.DS_Store
46 changes: 46 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files

- repo: https://github.com/psf/black
rev: 24.3.0
hooks:
- id: black
language_version: python3.11

- repo: https://github.com/pycqa/isort
rev: 5.13.2
hooks:
- id: isort
args: ["--profile", "black"]

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.3.4
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]

- repo: local
hooks:
- id: pytest-check
name: pytest-check
entry: bash -c 'cd backend && poetry run pytest'
language: system
pass_filenames: false
always_run: false
files: ^(backend/|tests/)
types: [python]

- id: coverage-check
name: coverage-check
entry: bash -c 'cd backend && poetry run coverage erase && poetry run coverage run -m pytest && poetry run coverage report -m && poetry run coverage xml'
language: system
pass_filenames: false
always_run: false
files: ^(backend/|tests/)
types: [python]
20 changes: 20 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"editor.formatOnSave": true,
"[javascript]": {
"editor.defaultFormatter": "esbenp.prettier-vscode"
},
"[typescript]": {
"editor.defaultFormatter": "esbenp.prettier-vscode"
},
"[typescriptreact]": {
"editor.defaultFormatter": "esbenp.prettier-vscode"
},
"prettier.configPath": "frontend/.prettierrc",
"prettier.ignorePath": "frontend/.prettierignore",
"[python]": {
"editor.defaultFormatter": "charliermarsh.ruff"
},
"ruff.organizeImports": true,
"ruff.fixAll": true,
"ruff.path": ["backend"]
}
16 changes: 16 additions & 0 deletions backend/.coveragerc
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
[run]
source = app
omit =
*/tests/*
*/__init__.py

[report]
exclude_lines =
pragma: no cover
def __repr__
if self.debug:
if __name__ == .__main__.:
raise NotImplementedError
pass
except ImportError:
def __str__
5 changes: 5 additions & 0 deletions backend/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,8 @@ __pycache__/
!/instance/
uploads/
processed/


# coverage
.coverage
coverage.xml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import sqlite

# revision identifiers, used by Alembic.
revision: str = "0371afa4fbf0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import sqlite

# revision identifiers, used by Alembic.
revision: str = "883990c30c41"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
Expand Down
1 change: 0 additions & 1 deletion backend/alembic/versions/e1e475154dfc_add_process_step.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import sqlite


# revision identifiers, used by Alembic.
Expand Down
137 changes: 116 additions & 21 deletions backend/app/api/v1/chat.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
import traceback
from typing import Optional

from app.config import settings
from app.database import get_db
from app.logger import Logger
from app.models.asset_content import AssetProcessingStatus
from app.repositories import project_repository, user_repository
from app.repositories import conversation_repository
from app.repositories import (
conversation_repository,
project_repository,
user_repository,
)
from app.requests import chat_query
from app.utils import clean_text
from app.vectorstore.chroma import ChromaDB
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
from sqlalchemy.orm import Session
from app.config import settings


chat_router = APIRouter()

Expand All @@ -24,22 +28,46 @@ class ChatRequest(BaseModel):
logger = Logger()


def group_by_start_end(references):
    """Group text references by the (start, end) span they annotate.

    References sharing the same ``start``/``end`` land in one group. Inside a
    group, references that point at the same ``asset_id`` and ``page_number``
    are merged into a single entry whose ``source`` list is the concatenation
    of the individual ``source`` lists, in input order.

    The input is never mutated: every stored entry is a shallow copy of the
    first matching reference with its own ``source`` list, so extending it
    while merging cannot leak back into the caller's dictionaries.

    Args:
        references: Iterable of dicts, each providing at least the keys
            ``start``, ``end``, ``asset_id``, ``page_number`` and ``source``
            (a list). Any additional keys are carried over unchanged.

    Returns:
        A list of ``{"start": ..., "end": ..., "references": [...]}`` dicts,
        ordered by the first appearance of each span in ``references``.
    """
    grouped_references = {}
    for ref in references:
        span = (ref["start"], ref["end"])
        group = grouped_references.setdefault(
            span, {"start": ref["start"], "end": ref["end"], "references": []}
        )

        # Look for an entry in this span that already covers the same page of
        # the same asset; if found, the new sources are folded into it.
        already_listed = next(
            (
                entry
                for entry in group["references"]
                if entry["asset_id"] == ref["asset_id"]
                and entry["page_number"] == ref["page_number"]
            ),
            None,
        )

        if already_listed is None:
            # Copy the dict and its source list so later merges only touch
            # our own data, not the caller's reference objects.
            group["references"].append({**ref, "source": list(ref["source"])})
        else:
            already_listed["source"].extend(ref["source"])

    return list(grouped_references.values())


@chat_router.post("/project/{project_id}", status_code=200)
def chat(project_id: int, chat_request: ChatRequest, db: Session = Depends(get_db)):
try:
vectorstore = ChromaDB(f"panda-etl-{project_id}")
docs = vectorstore.get_relevant_docs(

docs, doc_ids, _ = vectorstore.get_relevant_segments(
chat_request.query, settings.max_relevant_docs
)

file_names = project_repository.get_assets_filename(
db, [metadata["doc_id"] for metadata in docs["metadatas"][0]]
)
extracted_documents = docs["documents"][0]
unique_doc_ids = list(set(doc_ids))
file_names = project_repository.get_assets_filename(db, unique_doc_ids)

doc_id_to_filename = {
doc_id: filename for doc_id, filename in zip(unique_doc_ids, file_names)
}

ordered_file_names = [doc_id_to_filename[doc_id] for doc_id in doc_ids]

docs_formatted = [
{"filename": filename, "quote": quote}
for filename, quote in zip(file_names, extracted_documents)
for filename, quote in zip(ordered_file_names, docs)
]

api_key = user_repository.get_user_api_key(db)
Expand All @@ -63,34 +91,98 @@ def chat(project_id: int, chat_request: ChatRequest, db: Session = Depends(get_d
)
conversation_id = str(conversation.id)

content = response["response"]
content_length = len(content)
clean_content = clean_text(content)
text_references = []
not_exact_matched_refs = []

for reference in response["references"]:
sentence = reference["sentence"]

for reference_content in reference["references"]:
original_filename = reference_content["file"]
original_sentence = reference_content["sentence"]

doc_sent, doc_ids, doc_metadata = vectorstore.get_relevant_segments(
original_sentence,
k=5,
num_surrounding_sentences=0,
metadata_filter={"filename": original_filename},
)

# Search for exact match
best_match_index = 0

for index, sent in enumerate(doc_sent):
if clean_text(original_sentence) in clean_text(sent):
best_match_index = index

metadata = doc_metadata[best_match_index]
sent = doc_sent[best_match_index]

index = clean_content.find(clean_text(sentence))

if index != -1:
text_reference = {
"asset_id": metadata["asset_id"],
"project_id": metadata["project_id"],
"page_number": metadata["page_number"],
"filename": original_filename,
"source": [sent],
"start": index,
"end": index + len(sentence),
}
text_references.append(text_reference)
else:
no_exact_reference = {
"asset_id": metadata["asset_id"],
"project_id": metadata["project_id"],
"page_number": metadata["page_number"],
"filename": original_filename,
"source": [sent],
"start": 0,
"end": content_length,
}
not_exact_matched_refs.append(no_exact_reference)

# group text references based on start and end
if len(text_references) > 0:
refs = group_by_start_end(text_references)
else:
refs = group_by_start_end(not_exact_matched_refs)

conversation_repository.create_conversation_message(
db,
conversation_id=conversation_id,
query=chat_request.query,
response=response["response"],
response=content,
)

return {
"status": "success",
"message": "chat response successfully returned!",
"message": "Chat response successfully generated.",
"data": {
"conversation_id": conversation_id,
"response": response["response"],
"response": content,
"response_references": refs,
},
}

except HTTPException:
raise

except Exception as e:
logger.error(traceback.print_exc())
raise HTTPException(status_code=400, detail="Unable to process query!")
except Exception:
logger.error(traceback.format_exc())
raise HTTPException(
status_code=400,
detail="Unable to process the chat query. Please try again.",
)


@chat_router.get("/project/{project_id}/status", status_code=200)
def chat_status(project_id: int, db: Session = Depends(get_db)):
try:

asset_contents = project_repository.get_assets_without_content(
db=db, project_id=project_id
)
Expand All @@ -109,13 +201,16 @@ def chat_status(project_id: int, db: Session = Depends(get_db)):

return {
"status": "success",
"message": "Chat response successfully returned!",
"message": "Chat message successfully generated.",
"data": {"status": status},
}

except HTTPException:
raise

except Exception as e:
logger.error(traceback.print_exc())
raise HTTPException(status_code=400, detail="Unable to process query!")
except Exception:
logger.error(traceback.format_exc())
raise HTTPException(
status_code=400,
detail="Unable to process the chat query. Please try again.",
)
Loading

0 comments on commit 36d8d46

Please sign in to comment.