From 209eec6e92258aee32942d8fe7ca28c4b19e8e8a Mon Sep 17 00:00:00 2001 From: vaclav Date: Fri, 13 Dec 2024 11:52:49 +0100 Subject: [PATCH] Simplified logic, removed the regex matching based on the discussion on slack. --- .../rossum-formulas/serverless-functions.md | 58 ++++--------------- 1 file changed, 11 insertions(+), 47 deletions(-) diff --git a/docs/learn/rossum-formulas/serverless-functions.md b/docs/learn/rossum-formulas/serverless-functions.md index 55a5be77..e6c065bd 100644 --- a/docs/learn/rossum-formulas/serverless-functions.md +++ b/docs/learn/rossum-formulas/serverless-functions.md @@ -355,57 +355,26 @@ The config examples are numbered for easier orientation: ``` -## Fetch and Analyze OCR Text for Pattern Matches +## Fetch OCR Text (page_data) -This function retrieves and processes textual data from Rossum's page_data API for an annotation. It is designed to: +This function retrieves textual data from Rossum's page_data API for an annotation and processes it to: -1. Fetch Text Data: Make an HTTP GET request to the page_data endpoint of a specific annotation using the provided rossum_authorization_token. +1. Fetch OCR Document Content: Make an HTTP GET request to the page_data endpoint of a specific annotation using the provided rossum_authorization_token. 1. Retry Mechanism: Handle transient network or server issues by retrying up to 3 times in case of a non-200 HTTP response or exceptions. -1. Analyze Text Content: Parse the fetched text content for specific patterns defined in the find_variants function (e.g., formats like xxxxxx.xx.xxx or xxxxxx xx xxx). -1. Return Matches: Return a structured list of matches, including the page ID, the matched text, and the extracted patterns. +1. Process Text Content: Iterate through the fetched text content for custom manipulations or pattern analysis. ```py -import re import requests -def find_strings(text): - """ - Find specific patterns in the given text. - - :param text: String to search patterns in. 
- :return: List of matches for defined patterns. - """ - # Example patterns to match, change these for your use case - patterns = [ - r'\b[\w\d]{6}\.\w{2}\.\w{3}\b', # xxxxxx.xx.xxx - r'\b[\w\d]{6}\.\w{3}\.\w{2}\b', # xxxxxx.xxx.xx - r'\b[\w\d]{6} \w{2} \w{3}\b', # xxxxxx xx xxx (spaces instead of dots) - r'\b[\w\d]{6} \w{3} \w{2}\b', # xxxxxx xxx xx (spaces instead of dots) - r'\b[\w\d]{6}[-_\s]?\w{2}[-_\s]?\w{3}\b', # xxxxxx xx xxx with optional - or _ - r'\b[\w\d]{6}[-_\s]?\w{3}[-_\s]?\w{2}\b', # xxxxxx xxx xx with optional - or _ - ] - - # Find all matches for each pattern - matches = [] - for pattern in patterns: - matches.extend(re.findall(pattern, text)) - - return matches -def fetch_and_analyze_ocr_text(payload): +def get_ocr_document_content(payload): """ - Fetch page_data from annotation and check for text matching variants. - + Fetch page_data from annotation. :param payload: Dictionary containing the payload with annotation information. - :return: List of matches found in the page_data text. """ - matches = [] token = payload.get("rossum_authorization_token") annotation_url = payload.get("annotation", {}).get("url") - if not token or not annotation_url: - return matches - retries = 3 for attempt in range(retries): try: @@ -417,23 +386,18 @@ def fetch_and_analyze_ocr_text(payload): if page_req.status_code == 200: results = page_req.json().get("results", []) + # This part is optional iteration through all the text nodes for page in results: for item in page.get("items", []): ocr_text = item.get("text", "") if ocr_text: - ocr_matches = find_strings(ocr_text) - if ocr_matches: - matches.append({ - "page_id": page.get("id"), - "ocr_text": ocr_text, - "ocr_matches": ocr_matches - }) + # Here will be any kind of manipulation with the text you need to do. + print(ocr_text) + break # Exit retry loop if request is successful else: print(f"Attempt {attempt + 1} failed with status code {page_req.status_code}. 
Retrying...") except requests.RequestException as e: print(f"Attempt {attempt + 1} encountered an exception: {e}. Retrying...") - - return matches -``` +``` \ No newline at end of file