Skip to content

Commit

Permalink
feat[PdfSource]: implementation for extraction sources in pdf (#12)
Browse files Browse the repository at this point in the history
* fix[projectList]: filtering with `is` statement not working

* feat[PdfSource]: implementation for extraction sources in pdf

* fix[PdfHighlight]: ci test failure

* fix[ci_errors]: ignore public folder for prettier

* prettier pdf worker file

* feat[pdf]: minor improvements

* feat[pdf]: improve code and add additional checks

* fix[pdf_viewer]: custom size for highlight pdf

* refactor(pdf): move file to ee folder
  • Loading branch information
ArslanSaleem authored Oct 7, 2024
1 parent f906d35 commit 1f4bbd7
Show file tree
Hide file tree
Showing 15 changed files with 79,951 additions and 156 deletions.
1 change: 1 addition & 0 deletions backend/app/api/v1/process_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def get_process_step_output_reference(
"data": {
"id": process_step.id,
"process_id": process_step.process_id,
"project_id": process_step.process.project_id,
"asset_id": process_step.asset_id,
"output_reference": process_step.output_references,
},
Expand Down
8 changes: 6 additions & 2 deletions backend/app/processing/file_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,12 @@ def process_segmentation(project_id: int, asset_content_id: int, api_key: str):
vectorstore.add_docs(
docs=asset_content.content["content"],
metadatas=[
{"asset_id": asset_content.asset_id, "project_id": project_id}
for _ in asset_content.content["content"]
{
"asset_id": asset_content.asset_id,
"project_id": project_id,
"page_number": asset_content.content["page_number_data"][index],
}
for index, _ in enumerate(asset_content.content["content"])
],
)

Expand Down
28 changes: 25 additions & 3 deletions backend/app/processing/process_queue.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ def process_step_task(
else:
# Handle non-extractive summary process
pdf_content = ""
vectorstore = ChromaDB(
f"panda-etl-{process.project_id}", similary_threshold=3
)
if (
(
"multiple_fields" not in process.details
Expand All @@ -136,9 +139,6 @@ def process_step_task(
and asset_content.content
and asset_content.content["word_count"] > 500
):
vectorstore = ChromaDB(
f"panda-etl-{process.project_id}", similary_threshold=3
)

for field in process.details["fields"]:
relevant_docs = vectorstore.get_relevant_docs(
Expand Down Expand Up @@ -188,6 +188,28 @@ def process_step_task(
pdf_content=pdf_content if pdf_content else None,
)

for context in data["context"]:
for sources in context:
page_numbers = []
for source in sources["sources"]:
relevant_docs = vectorstore.get_relevant_docs(
source,
where={
"$and": [
{"asset_id": process_step.asset.id},
{"project_id": process.project_id},
]
},
k=1,
)

if len(relevant_docs["metadatas"][0]) > 0:
page_numbers.append(
relevant_docs["metadatas"][0][0]["page_number"]
)

sources["page_numbers"] = page_numbers

# Update process step output outside the expensive operations
with SessionLocal() as db:
process_repository.update_process_step_status(
Expand Down
2 changes: 1 addition & 1 deletion backend/app/repositories/project_repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def get_projects(db: Session, page: int = 1, page_size: int = 20):
models.Project,
func.count(models.Asset.id).label("asset_count"), # Count the assets
)
.filter(models.Project.deleted_at is None)
.filter(models.Project.deleted_at == None)
.outerjoin(models.Asset, models.Project.id == models.Asset.project_id)
.group_by(models.Project.id)
.offset((page - 1) * page_size)
Expand Down
19 changes: 17 additions & 2 deletions frontend/.lintstagedrc.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,19 @@
module.exports = {
"*.{js,jsx,ts,tsx}": ["eslint --fix", "prettier --write"],
"*.{json,css,md}": ["prettier --write"],
"**/*.{js,jsx,ts,tsx}": (filenames) => {
const filteredFiles = filenames.filter(
(file) => !file.startsWith("public/"),
);
if (filteredFiles.length === 0) return [];
return [
`eslint --fix ${filteredFiles.join(" ")}`,
`prettier --write ${filteredFiles.join(" ")}`,
];
},
"**/*.{css,scss,md}": (filenames) => {
const filteredFiles = filenames.filter(
(file) => !file.startsWith("public/"),
);
if (filteredFiles.length === 0) return [];
return [`prettier --write ${filteredFiles.join(" ")}`];
},
};
Loading

0 comments on commit 1f4bbd7

Please sign in to comment.