Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MDS-6105] Permit Condition Extraction Improvements #3230

Merged
merged 19 commits into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions services/permits/.env-example
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,6 @@ AZURE_DEPLOYMENT_NAME=mds-permits-turbo
AZURE_BASE_URL=https://emli-mdsopenai.openai.azure.com/
DEBUG_MODE=true
OAUTHLIB_INSECURE_TRANSPORT=1
DOCUMENTINTELLIGENCE_ENDPOINT=https://mds-doc-intelligence.cognitiveservices.azure.com/
DOCUMENTINTELLIGENCE_API_KEY=
DOCUMENTINTELLIGENCE_API_VERSION=2023-07-31
71 changes: 48 additions & 23 deletions services/permits/app/compare_extraction_results.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
###
# Utility script to compare extracted permit conditions from CSV files to generate a csv and html report of how well they match
# Usage: python compare_extraction_results.py --csv_pairs <auto_extracted_csv> <manual_extracted_csv> --csv_pairs <auto_extracted_csv> <manual_extracted_csv> ...
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just included a couple of tweaks to the report generation so that the extracted meta dict is included in the HTML report

# CSV files should have the following columns: section_title, section_paragraph, paragraph_title, subparagraph, clause, subclause, page_number, condition_text
# CSV files should have the following columns: section_title, section_paragraph, condition_title, subparagraph, clause, subclause, page_number, condition_text
###
import argparse
import json
import logging
import os
from difflib import SequenceMatcher

import numpy as np
import pandas as pd
from app.permit_conditions.validator.permit_condition_model import PermitCondition
from diff_match_patch import diff_match_patch
from fuzzywuzzy import fuzz
from jinja2 import Environment, FileSystemLoader
from natsort import natsorted
from pydantic import ValidationError

logger = logging.getLogger(__name__)
Expand All @@ -22,29 +25,47 @@ def create_content_instances(df):
content_list = []
for _, row in df.iterrows():
try:
try:
if isinstance(row.get("meta"), str):
meta = row.get("meta").replace("\"\"", "\"")
meta = json.loads(meta)
else:
meta = row.get('meta', {"bounding_box": {}})

except json.JSONDecodeError:
logger.error(f"Failed parsing of permit condition meta: {row.get('meta')}")
raise
content = PermitCondition(
section_title=row["section_title"],
section_paragraph=row["section_paragraph"],
paragraph_title=row["paragraph_title"],
section=row["section"],
paragraph=row["paragraph"],
subparagraph=row["subparagraph"],
clause=row["clause"],
subclause=row["subclause"],
subclause=row.get("subclause", ''),
subsubclause=row.get('subsubclause', ''),
condition_title=row.get("condition_title"),
page_number=int(row["page_number"]) if (row.get("page_number") and row['page_number'] != '') else 0,
condition_text=row["condition_text"],
original_condition_text=row["condition_text"],
meta=meta
)
except ValidationError as e:
logger.error(f"Failed parsing of permit condition: {e}")
logger.error(row)
raise

# This will be used as the text for comparison purposes
text = f"""
{content.section_paragraph}. {content.section_title}
{content.subparagraph}. {content.paragraph_title}
{"("+content.clause + ")" if content.clause else ""} {"("+content.subclause + ")" if content.subclause else ""}

{content.condition_text}
if content.condition_title:
txt = f"{content.condition_title}\n\n{content.condition_text}"
else:
txt = content.condition_text

section = '.'.join(filter(None, [content.section, content.paragraph, content.subparagraph, content.clause, content.subclause, content.subsubclause]))

text = f"""
{section}
{txt}
"""

content.condition_text = text
Expand All @@ -59,10 +80,12 @@ def create_comparison_key(condition):
filter(
None,
[
condition.section_paragraph,
condition.section,
condition.paragraph,
condition.subparagraph,
condition.clause,
condition.subclause,
condition.subsubclause,
],
)
)
Expand Down Expand Up @@ -153,17 +176,18 @@ def validate_condition(csv_pairs):
"DiffHTML": diff_html,
"state": "missing",
"match_percentage": 0,
"metadata": {}
}
)

comparison_results.append(
{
"Key": key,
"auto_section_title": "",
"auto_paragraph_title": "",
"auto_condition_title": "",
"auto_extracted_condition": "",
"manual_section_title": manual_content_dict[key].section_title,
"manual_paragraph_title": manual_content_dict[key].paragraph_title,
"manual_condition_title": manual_content_dict[key].condition_title,
"manual_extracted_condition": manual_content_dict[key].original_condition_text,
"match_percentage": 0,
"is_match": False,
Expand All @@ -178,17 +202,18 @@ def validate_condition(csv_pairs):
"DiffHTML": diff_html,
"state": "added",
"match_percentage": 0,
"metadata": auto_content_dict[key].meta if auto_content_dict[key].meta else {"bounding_box": {}}
}
)

comparison_results.append(
{
"Key": key,
"auto_section_title": auto_content_dict[key].section_title,
"auto_paragraph_title": auto_content_dict[key].paragraph_title,
"auto_condition_title": auto_content_dict[key].condition_title,
"auto_extracted_condition": auto_content_dict[key].original_condition_text,
"manual_section_title": "",
"manual_paragraph_title": "",
"manual_condition_title": "",
"manual_extracted_condition": "",
"manual_extracted_condition": "",
"match_percentage": 0,
Expand All @@ -205,13 +230,11 @@ def validate_condition(csv_pairs):
context["comparison_results"] += match_results["context_comparison_results"]
comparison_results += match_results["comparison_results"]

context["all_conditions"] = sorted(
(
context["comparison_results"]
+ context["missing_conditions"]
+ context["added_conditions"]
),
key=lambda c: c.get("Key"),
context["all_conditions"] = natsorted(
context["comparison_results"]
+ context["missing_conditions"]
+ context["added_conditions"],
key=lambda x: x["Key"],
)

# 5. Calculate the overall match_percentage (how many conditions match 100% between the two csvs)
Expand Down Expand Up @@ -264,6 +287,7 @@ def compare_matching_conditions(
auto_condition_text = auto_content_dict[key].condition_text
manual_condition_text = manual_content_dict[key].condition_text
match_percentage = fuzz.ratio(auto_condition_text.replace('\n', ''), manual_condition_text.replace('\n', ''))

is_match = match_percentage >= 100

if is_match:
Expand All @@ -273,10 +297,10 @@ def compare_matching_conditions(
{
"Key": key,
"auto_section_title": auto_content_dict[key].section_title,
"auto_paragraph_title": auto_content_dict[key].paragraph_title,
"auto_condition_title": auto_content_dict[key].condition_title,
"auto_extracted_condition": auto_content_dict[key].original_condition_text,
"manual_section_title": manual_content_dict[key].section_title,
"manual_paragraph_title": manual_content_dict[key].paragraph_title,
"manual_condition_title": manual_content_dict[key].condition_title,
"manual_extracted_condition": manual_content_dict[key].original_condition_text,
"match_percentage": match_percentage,
"is_match": is_match,
Expand All @@ -290,6 +314,7 @@ def compare_matching_conditions(
"DiffHTML": diff_html,
"state": "match" if is_match else "nomatch",
"match_percentage": match_percentage,
"metadata": auto_content_dict[key].meta if auto_content_dict[key].meta else {"bounding_box": {}}
}
)

Expand Down
25 changes: 22 additions & 3 deletions services/permits/app/extract_and_validate_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import os
from time import sleep

import oauthlib
from app.compare_extraction_results import validate_condition
from dotenv import find_dotenv, load_dotenv
from oauthlib.oauth2 import BackendApplicationClient
Expand Down Expand Up @@ -42,12 +43,31 @@ def authenticate_with_oauth():
)
return oauth_session

def refresh_token(oauth_session):
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added automatic refreshing of the auth token here, as it would sometimes time out due to how long the extraction process takes

oauth_session.fetch_token(
TOKEN_URL,
client_secret=PERMITS_CLIENT_SECRET,
)

return oauth_session

def request(oauth_session, url, method, **kwargs):
    """Send a request through the OAuth session, retrying once after a token refresh.

    Args:
        oauth_session: Session object exposing one method per HTTP verb
            (looked up by name via ``getattr``).
        url: Target URL, passed as the first positional argument to the verb method.
        method: Name of the session method to call, e.g. ``'get'`` or ``'post'``.
        **kwargs: Forwarded unchanged to the session's verb method.

    Returns:
        The response object returned by the session, after
        ``raise_for_status()`` has been called on it.

    Raises:
        Whatever ``raise_for_status()`` raises for the response. Errors raised
        by the single retry attempt are not caught and propagate to the caller.
    """

    def _send(session):
        # Resolve the verb method by name and surface HTTP errors immediately.
        reply = getattr(session, method)(url, **kwargs)
        reply.raise_for_status()
        return reply

    try:
        return _send(oauth_session)
    except oauthlib.oauth2.rfc6749.errors.TokenExpiredError:
        # Only an expired token triggers a retry, and only once; the second
        # attempt runs on the session handed back by refresh_token().
        print('Token expired. Refreshing token...')
        return _send(refresh_token(oauth_session))

def extract_conditions_from_pdf(pdf_path, oauth_session):
# Kick off the permit conditions extraction process
with open(pdf_path, "rb") as pdf_file:
files = {"file": (os.path.basename(pdf_path), pdf_file, "application/pdf")}
response = oauth_session.post(f"{PERMIT_SERVICE_ENDPOINT}/permit_conditions", files=files)
response = request(oauth_session, f"{PERMIT_SERVICE_ENDPOINT}/permit_conditions", 'post', files=files)
response.raise_for_status()

task_id = response.json().get('id')
Expand All @@ -60,12 +80,11 @@ def extract_conditions_from_pdf(pdf_path, oauth_session):
# Poll the status endpoint until the task is complete
while status not in ("SUCCESS", "FAILURE"):
sleep(3)
status_response = oauth_session.get(f"{PERMIT_SERVICE_ENDPOINT}/permit_conditions/status", params={"task_id": task_id})
status_response = request(oauth_session, f"{PERMIT_SERVICE_ENDPOINT}/permit_conditions/status", 'get', params={"task_id": task_id})
status_response.raise_for_status()

status = status_response.json().get('status')

print(json.dumps(status_response.json(), indent=2))

if status != "SUCCESS":
raise Exception(f"Failed to extract conditions from PDF. Task status: {status}")
Expand Down
Loading
Loading