feat[SIN-169]: use sentence match for smaller references
gventuri committed Oct 20, 2024
1 parent 16103f6 commit f0cb37b
Showing 2 changed files with 127 additions and 38 deletions.
106 changes: 70 additions & 36 deletions backend/app/processing/process_queue.py
@@ -25,6 +25,7 @@
 
 from app.utils import clean_text
 from app.vectorstore.chroma import ChromaDB
+import re
 
 executor = ThreadPoolExecutor(max_workers=5)
 
@@ -289,10 +290,7 @@ def extract_process(api_key, process, process_step, asset_content):
     pdf_content = ""
     vectorstore = ChromaDB(f"panda-etl-{process.project_id}", similary_threshold=3)
     if (
-        (
-            "multiple_fields" not in process.details
-            or not process.details["multiple_fields"]
-        )
+        ("multiple_fields" not in process.details or not process.details["multiple_fields"])
         and asset_content.content
         and asset_content.content.get("word_count", 0) > 500
     ):
@@ -310,13 +308,13 @@ def extract_process(api_key, process, process_step, asset_content):
 
         for index, metadata in enumerate(relevant_docs["metadatas"][0]):
             segment_data = [relevant_docs["documents"][0][index]]
-            if metadata["previous_sentence_id"] != -1:
+            if metadata.get("previous_sentence_id", -1) != -1:
                 prev_sentence = vectorstore.get_relevant_docs_by_id(
                     ids=[metadata["previous_sentence_id"]]
                 )
                 segment_data = [prev_sentence["documents"][0]] + segment_data
 
-            if metadata["next_sentence_id"] != -1:
+            if metadata.get("next_sentence_id", -1) != -1:
                 next_sentence = vectorstore.get_relevant_docs_by_id(
                     ids=[metadata["next_sentence_id"]]
                 )
@@ -338,48 +336,84 @@ def extract_process(api_key, process, process_step, asset_content):
         pdf_content=pdf_content if pdf_content else None,
     )
 
     vectorstore = ChromaDB(f"panda-etl-{process.project_id}", similary_threshold=3)
+    all_relevant_docs = []
 
     for context in data["context"]:
         for sources in context:
             page_numbers = []
             for source_index, source in enumerate(sources["sources"]):
-
-                relevant_docs = vectorstore.get_relevant_docs(
-                    source,
-                    where={
-                        "$and": [
-                            {"asset_id": process_step.asset.id},
-                            {"project_id": process.project_id},
-                        ]
-                    },
-                    k=5,
-                )
-
-                most_relevant_index = 0
-                match = False
-                clean_source = clean_text(source)
-                # search for exact match Index
-                for index, relevant_doc in enumerate(relevant_docs["documents"][0]):
-                    if clean_source in clean_text(relevant_doc):
-                        most_relevant_index = index
-                        match = True
-
-                if not match and len(relevant_docs["documents"][0]) > 0:
-                    sources["sources"][source_index] = relevant_docs["documents"][0][0]
-
-                if len(relevant_docs["metadatas"][0]) > 0:
-                    page_numbers.append(
-                        relevant_docs["metadatas"][0][most_relevant_index][
-                            "page_number"
-                        ]
-                    )
+                if len(source) < 30:
+                    best_match = find_best_match_for_short_reference(
+                        source,
+                        all_relevant_docs,
+                        process_step.asset.id,
+                        process.project_id
+                    )
+                    if best_match:
+                        sources["sources"][source_index] = best_match["text"]
+                        page_numbers.append(best_match["page_number"])
+                else:
+                    relevant_docs = vectorstore.get_relevant_docs(
+                        source,
+                        where={
+                            "$and": [
+                                {"asset_id": process_step.asset.id},
+                                {"project_id": process.project_id},
+                            ]
+                        },
+                        k=5,
+                    )
+                    all_relevant_docs.append(relevant_docs)
+
+                    most_relevant_index = 0
+                    match = False
+                    clean_source = clean_text(source)
+                    # search for exact match Index
+                    for index, relevant_doc in enumerate(relevant_docs["documents"][0]):
+                        if clean_source in clean_text(relevant_doc):
+                            most_relevant_index = index
+                            match = True
+                            break
+
+                    if not match and len(relevant_docs["documents"][0]) > 0:
+                        sources["sources"][source_index] = relevant_docs["documents"][0][0]
+
+                    if len(relevant_docs["metadatas"][0]) > 0:
+                        page_numbers.append(
+                            relevant_docs["metadatas"][0][most_relevant_index]["page_number"]
+                        )
 
-            sources["page_numbers"] = page_numbers
+            if page_numbers:
+                sources["page_numbers"] = page_numbers
 
     return {
         "fields": data["fields"],
         "context": data["context"],
     }
 
+
+def find_best_match_for_short_reference(source, all_relevant_docs, asset_id, project_id):
+    source_words = set(re.findall(r'\w+', source.lower()))
+    if not source_words:
+        return None  # Return None if the source is empty
+
+    best_match = None
+    best_match_score = 0
+    threshold = 0.8
+
+    for relevant_docs in all_relevant_docs:
+        for doc, metadata in zip(relevant_docs["documents"][0], relevant_docs["metadatas"][0]):
+            if metadata["asset_id"] == asset_id and metadata["project_id"] == project_id:
+                doc_words = set(re.findall(r'\w+', doc.lower()))
+                common_words = source_words.intersection(doc_words)
+                match_score = len(common_words) / len(source_words)
+
+                if match_score > best_match_score:
+                    best_match_score = match_score
+                    best_match = {"text": doc, "page_number": metadata["page_number"]}
+
+    return best_match if best_match_score >= threshold else None
+
 
 def update_process_step_status(
     db, process_step, status, output=None, output_references=None
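
The new helper scores a short reference against previously retrieved chunks by bag-of-words containment: the fraction of the reference's words (lowercased, split on \w+) that also occur in a candidate chunk, accepting the best-scoring chunk only if at least 80% of the reference's words are covered. Below is a minimal, self-contained sketch of that scoring; the overlap_score helper and the sample strings are illustrative, not part of the commit.

    import re

    def overlap_score(source: str, doc: str) -> float:
        # Fraction of the reference's words that also appear in the candidate
        # chunk, mirroring the scoring in find_best_match_for_short_reference.
        source_words = set(re.findall(r"\w+", source.lower()))
        if not source_words:
            return 0.0
        doc_words = set(re.findall(r"\w+", doc.lower()))
        return len(source_words & doc_words) / len(source_words)

    chunk = "This is a long document about AI and machine learning."
    print(overlap_score("AI and machine learning", chunk))   # 1.0 -> accepted (>= 0.8)
    print(overlap_score("AI and quantum computing", chunk))  # 0.5 -> rejected (< 0.8)

Because the score is computed only over the reference's own words, any chunk containing all of them scores 1.0 regardless of chunk length, which is presumably why this path is gated to sources shorter than 30 characters.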
59 changes: 57 additions & 2 deletions backend/tests/processing/test_process_queue.py
@@ -5,6 +5,7 @@
     extractive_summary_process,
     extract_process,
     update_process_step_status,
+    find_best_match_for_short_reference,
 )
 from app.exceptions import CreditLimitExceededException
 from app.models import ProcessStepStatus
@@ -66,13 +67,13 @@ def test_extract_process(mock_chroma, mock_extract_data):
         "metadatas": [[{
             "page_number": 1,
             "previous_sentence_id": -1,
-            "next_sentence_id": -1 # Add this line
+            "next_sentence_id": -1
         }]],
         "documents": [["Test document"]]
     }
     mock_extract_data.return_value = {
         "fields": {"field1": "value1"},
-        "context": [[{"sources": ["source1"]}]]
+        "context": [[{"sources": ["source1"], "page_numbers": [1]}]]
     }
 
     process = Mock(id=1, project_id=1, details={"fields": [{"key": "field1"}]})
@@ -105,3 +106,57 @@ def test_update_process_step_status():
         output=mock_output,
         output_references=mock_output_references
     )
+
+@patch('app.processing.process_queue.re.findall')
+def test_find_best_match_for_short_reference(mock_findall):
+    mock_findall.side_effect = [
+        ['ai', 'and', 'machine', 'learning'],
+        ['this', 'is', 'a', 'long', 'document', 'about', 'ai', 'and', 'machine', 'learning'],  # For the document text
+        ['quantum', 'computing'],
+        ['this', 'is', 'a', 'long', 'document', 'about', 'ai', 'and', 'machine', 'learning'],  # For the document text again
+        ['another', 'document', 'talking', 'about', 'natural', 'language', 'processing'],  # For the second document
+        [],
+        ['this', 'is', 'a', 'long', 'document', 'about', 'ai', 'and', 'machine', 'learning']  # For the document text one more time
+    ]
+    all_relevant_docs = [
+        {
+            "documents": [["This is a long document about AI and machine learning."]],
+            "metadatas": [[{"asset_id": 1, "project_id": 1, "page_number": 1}]]
+        },
+        {
+            "documents": [["Another document talking about natural language processing."]],
+            "metadatas": [[{"asset_id": 1, "project_id": 1, "page_number": 2}]]
+        }
+    ]
+
+    # Test with a good match
+    result = find_best_match_for_short_reference("AI and machine learning", all_relevant_docs, 1, 1)
+    assert result is not None
+    assert "text" in result
+    assert "page_number" in result
+    assert "AI" in result["text"] and "machine learning" in result["text"]
+
+    # Test with a poor match
+    result = find_best_match_for_short_reference("Quantum computing", all_relevant_docs, 1, 1)
+    assert result is None
+
+    assert mock_findall.call_count == 6
+
+@patch('app.processing.process_queue.ChromaDB')
+@patch('app.processing.process_queue.extract_data')
+def test_chroma_db_initialization(mock_extract_data, mock_chroma):
+    mock_chroma_instance = Mock()
+    mock_chroma.return_value = mock_chroma_instance
+    mock_extract_data.return_value = {
+        "fields": {"field1": "value1"},
+        "context": [[{"sources": ["source1"], "page_numbers": [1]}]]
+    }
+
+    process = Mock(id=1, project_id=1, details={"fields": [{"key": "field1"}]})
+    process_step = Mock(id=1, asset=Mock(id=1))
+    asset_content = Mock(content={"word_count": 100, "content": ["Short content"]})
+
+    extract_process("api_key", process, process_step, asset_content)
+
+    mock_chroma.assert_called_with(f"panda-etl-{process.project_id}", similary_threshold=3)
+    assert mock_chroma.call_count >= 1
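One observation on the new test: patching re.findall couples the fixture order to the helper's internal call sequence (the source first, then each candidate document), which is easy to desynchronize. Since find_best_match_for_short_reference is a pure function of its arguments, it can also be exercised without mocks; the sketch below is a hypothetical alternative using the same fixtures, not part of the commit.

    def test_find_best_match_without_mocks():
        all_relevant_docs = [{
            "documents": [["This is a long document about AI and machine learning."]],
            "metadatas": [[{"asset_id": 1, "project_id": 1, "page_number": 1}]],
        }]

        # Every word of the short reference appears in the chunk: score 1.0 >= 0.8.
        hit = find_best_match_for_short_reference(
            "AI and machine learning", all_relevant_docs, 1, 1
        )
        assert hit == {
            "text": "This is a long document about AI and machine learning.",
            "page_number": 1,
        }

        # No word overlap at all: score 0.0, rejected by the 0.8 threshold.
        assert find_best_match_for_short_reference(
            "Quantum computing", all_relevant_docs, 1, 1
        ) is None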