bcgov · simensma-fresh · Sep 3, 2024 · Aug 6, 2024 · Aug 6, 2024 · Aug 7, 2024
diff --git a/services/permits/.env-example b/services/permits/.env-example
@@ -21,3 +21,6 @@ AZURE_DEPLOYMENT_NAME=mds-permits-turbo
 AZURE_BASE_URL=https://emli-mdsopenai.openai.azure.com/
 DEBUG_MODE=true
 OAUTHLIB_INSECURE_TRANSPORT=1
+DOCUMENTINTELLIGENCE_ENDPOINT=https://mds-doc-intelligence.cognitiveservices.azure.com/
+DOCUMENTINTELLIGENCE_API_KEY=
+DOCUMENTINTELLIGENCE_API_VERSION=2023-07-31
diff --git a/services/permits/app/compare_extraction_results.py b/services/permits/app/compare_extraction_results.py
@@ -1,18 +1,21 @@
 ###
 # Utility script to compare extracted permit conditions from CSV files to generate a csv and html report of how well they match
 # Usage: python compare_extraction_results.py --csv_pairs <auto_extracted_csv> <manual_extracted_csv> --csv_pairs <auto_extracted_csv> <manual_extracted_csv> ...
-# CSV files should have the following columns: section_title, section_paragraph, paragraph_title, subparagraph, clause, subclause, page_number, condition_text
+# CSV files should have the following columns: section_title, section, paragraph, subparagraph, clause, subclause, subsubclause, condition_title, page_number, condition_text, meta (optional JSON string)
 ###
 import argparse
+import json
 import logging
 import os
+from difflib import SequenceMatcher
 
 import numpy as np
 import pandas as pd
 from app.permit_conditions.validator.permit_condition_model import PermitCondition
 from diff_match_patch import diff_match_patch
 from fuzzywuzzy import fuzz
 from jinja2 import Environment, FileSystemLoader
+from natsort import natsorted
 from pydantic import ValidationError
 
 logger = logging.getLogger(__name__)
@@ -22,29 +25,47 @@ def create_content_instances(df):
     content_list = []
     for _, row in df.iterrows():
         try:
+            try:
+                if isinstance(row.get("meta"), str):
+                    meta = row.get("meta").replace("\"\"", "\"")
+                    meta = json.loads(meta)
+                else:
+                    meta = row.get('meta', {"bounding_box": {}})
+
+            except json.JSONDecodeError:
+                logger.error(f"Failed parsing of permit condition meta: {row.get('meta')}")
+                raise
             content = PermitCondition(
                 section_title=row["section_title"],
-                section_paragraph=row["section_paragraph"],
-                paragraph_title=row["paragraph_title"],
+                section=row["section"],
+                paragraph=row["paragraph"],
                 subparagraph=row["subparagraph"],
                 clause=row["clause"],
-                subclause=row["subclause"],
+                subclause=row.get("subclause", ''),
+                subsubclause=row.get('subsubclause', ''),
+                condition_title=row.get("condition_title"),
                 page_number=int(row["page_number"]) if (row.get("page_number") and row['page_number'] != '') else 0,
                 condition_text=row["condition_text"],
                 original_condition_text=row["condition_text"],
+                meta=meta
             )
         except ValidationError as e:
             logger.error(f"Failed parsing of permit condition: {e}")
             logger.error(row)
             raise
 
         # This will be used as the text for comparison purposes
-        text = f"""
-            {content.section_paragraph}. {content.section_title}
-            {content.subparagraph}. {content.paragraph_title}
-            {"("+content.clause + ")" if content.clause else ""} {"("+content.subclause + ")" if content.subclause else ""}
 
-            {content.condition_text}
+        if content.condition_title:
+            txt = f"{content.condition_title}\n\n{content.condition_text}"
+        else:
+            txt = content.condition_text
+
+        section = '.'.join(filter(None, [content.section, content.paragraph, content.subparagraph, content.clause, content.subclause, content.subsubclause]))
+
+        text = f"""
+            {section}
+            {txt}
         """
 
         content.condition_text = text
@@ -59,10 +80,12 @@ def create_comparison_key(condition):
         filter(
             None,
             [
-                condition.section_paragraph,
+                condition.section,
+                condition.paragraph,
                 condition.subparagraph,
                 condition.clause,
                 condition.subclause,
+                condition.subsubclause,
             ],
         )
     )
@@ -153,17 +176,18 @@ def validate_condition(csv_pairs):
                     "DiffHTML": diff_html,
                     "state": "missing",
                     "match_percentage": 0,
+                    "metadata": {}
                 }
             )
 
             comparison_results.append(
                 {
                     "Key": key,
                     "auto_section_title": "",
-                    "auto_paragraph_title": "",
+                    "auto_condition_title": "",
                     "auto_extracted_condition": "",
                     "manual_section_title": manual_content_dict[key].section_title,
-                    "manual_paragraph_title": manual_content_dict[key].paragraph_title,
+                    "manual_condition_title": manual_content_dict[key].condition_title,
                     "manual_extracted_condition": manual_content_dict[key].original_condition_text,
                     "match_percentage": 0,
                     "is_match": False,
@@ -178,17 +202,18 @@ def validate_condition(csv_pairs):
                     "DiffHTML": diff_html,
                     "state": "added",
                     "match_percentage": 0,
+                    "metadata": auto_content_dict[key].meta if auto_content_dict[key].meta else {"bounding_box": {}}
                 }
             )
 
             comparison_results.append(
                 {
                     "Key": key,
                     "auto_section_title": auto_content_dict[key].section_title,
-                    "auto_paragraph_title": auto_content_dict[key].paragraph_title,
+                    "auto_condition_title": auto_content_dict[key].condition_title,
                     "auto_extracted_condition": auto_content_dict[key].original_condition_text,
                     "manual_section_title": "",
-                    "manual_paragraph_title": "",
+                    "manual_condition_title": "",
                     "manual_extracted_condition": "",
                     "manual_extracted_condition": "",
                     "match_percentage": 0,
@@ -205,13 +230,11 @@ def validate_condition(csv_pairs):
         context["comparison_results"] += match_results["context_comparison_results"]
         comparison_results += match_results["comparison_results"]
 
-        context["all_conditions"] = sorted(
-            (
-                context["comparison_results"]
-                + context["missing_conditions"]
-                + context["added_conditions"]
-            ),
-            key=lambda c: c.get("Key"),
+        context["all_conditions"] = natsorted(
+            context["comparison_results"]
+            + context["missing_conditions"]
+            + context["added_conditions"],
+            key=lambda x: x["Key"],
         )
 
         # 5. Calculate the overall match_percentage (how many conditions match 100% between the two csvs)
@@ -264,6 +287,7 @@ def compare_matching_conditions(
             auto_condition_text = auto_content_dict[key].condition_text
             manual_condition_text = manual_content_dict[key].condition_text
             match_percentage = fuzz.ratio(auto_condition_text.replace('\n', ''), manual_condition_text.replace('\n', ''))
+
             is_match = match_percentage >= 100
 
             if is_match:
@@ -273,10 +297,10 @@ def compare_matching_conditions(
                 {
                     "Key": key,
                     "auto_section_title": auto_content_dict[key].section_title,
-                    "auto_paragraph_title": auto_content_dict[key].paragraph_title,
+                    "auto_condition_title": auto_content_dict[key].condition_title,
                     "auto_extracted_condition": auto_content_dict[key].original_condition_text,
                     "manual_section_title": manual_content_dict[key].section_title,
-                    "manual_paragraph_title": manual_content_dict[key].paragraph_title,
+                    "manual_condition_title": manual_content_dict[key].condition_title,
                     "manual_extracted_condition": manual_content_dict[key].original_condition_text,
                     "match_percentage": match_percentage,
                     "is_match": is_match,
@@ -290,6 +314,7 @@ def compare_matching_conditions(
                     "DiffHTML": diff_html,
                     "state": "match" if is_match else "nomatch",
                     "match_percentage": match_percentage,
+                    "metadata": auto_content_dict[key].meta if auto_content_dict[key].meta else {"bounding_box": {}}
                 }
             )
 

diff --git a/services/permits/app/extract_and_validate_pdf.py b/services/permits/app/extract_and_validate_pdf.py
@@ -8,6 +8,7 @@
 import os
 from time import sleep
 
+import oauthlib
 from app.compare_extraction_results import validate_condition
 from dotenv import find_dotenv, load_dotenv
 from oauthlib.oauth2 import BackendApplicationClient
@@ -42,12 +43,31 @@ def authenticate_with_oauth():
     )
     return oauth_session
 
+def refresh_token(oauth_session):  # Fetch a new token on the given session and return that same session object
+    oauth_session.fetch_token(
+        TOKEN_URL,
+        client_secret=PERMITS_CLIENT_SECRET,
+    )
+
+    return oauth_session
+
+def request(oauth_session, url, method, **kwargs):  # Send a request through the session; on an expired token, refresh and retry once
+    try:
+        response = getattr(oauth_session, method)(url, **kwargs)  # method is the session method name (callers pass 'post' / 'get')
+        response.raise_for_status()  # HTTP error statuses are raised here, so the returned response is always a non-error one
+    except oauthlib.oauth2.rfc6749.errors.TokenExpiredError:
+        print('Token expired. Refreshing token...')
+        oauth_session = refresh_token(oauth_session)
+        response = getattr(oauth_session, method)(url, **kwargs)  # single retry only; any error from this attempt propagates to the caller
+        response.raise_for_status()
+
+    return response
 
 def extract_conditions_from_pdf(pdf_path, oauth_session):
     # Kick off the permit conditions extraction process
     with open(pdf_path, "rb") as pdf_file:
         files = {"file": (os.path.basename(pdf_path), pdf_file, "application/pdf")}
-        response = oauth_session.post(f"{PERMIT_SERVICE_ENDPOINT}/permit_conditions", files=files)
+        response = request(oauth_session, f"{PERMIT_SERVICE_ENDPOINT}/permit_conditions", 'post', files=files)
         response.raise_for_status()
 
     task_id = response.json().get('id')
@@ -60,12 +80,11 @@ def extract_conditions_from_pdf(pdf_path, oauth_session):
     # Poll the status endpoint until the task is complete
     while status not in ("SUCCESS", "FAILURE"):
         sleep(3)
-        status_response = oauth_session.get(f"{PERMIT_SERVICE_ENDPOINT}/permit_conditions/status", params={"task_id": task_id})
+        status_response = request(oauth_session, f"{PERMIT_SERVICE_ENDPOINT}/permit_conditions/status", 'get', params={"task_id": task_id})
         status_response.raise_for_status()
 
         status = status_response.json().get('status')
 
-        print(json.dumps(status_response.json(), indent=2))
 
     if status != "SUCCESS":
         raise Exception(f"Failed to extract conditions from PDF. Task status: {status}")