From 209eec6e92258aee32942d8fe7ca28c4b19e8e8a Mon Sep 17 00:00:00 2001
From: vaclav <vaclav.rut@rossum.ai>
Date: Fri, 13 Dec 2024 11:52:49 +0100
Subject: [PATCH] Simplified logic, removed the regex matching based on the
 discussion on slack.

---
 .../rossum-formulas/serverless-functions.md   | 58 ++++---------------
 1 file changed, 11 insertions(+), 47 deletions(-)
diff --git a/docs/learn/rossum-formulas/serverless-functions.md b/docs/learn/rossum-formulas/serverless-functions.md
index 55a5be77..e6c065bd 100644
--- a/docs/learn/rossum-formulas/serverless-functions.md
+++ b/docs/learn/rossum-formulas/serverless-functions.md
@@ -355,57 +355,26 @@ The config examples are numbered for easier orientation:
 ```
 </details>
 
-## Fetch and Analyze OCR Text for Pattern Matches
+## Fetch OCR Text (page_data)
 
-This function retrieves and processes textual data from Rossum's page_data API for an annotation. It is designed to:
+This function retrieves textual data from Rossum's page_data API for an annotation. It is designed to:
 
-1. Fetch Text Data: Make an HTTP GET request to the page_data endpoint of a specific annotation using the provided rossum_authorization_token.
+1. Fetch OCR Document Content: Make an HTTP GET request to the page_data endpoint of a specific annotation using the provided rossum_authorization_token.
 1. Retry Mechanism: Handle transient network or server issues by retrying up to 3 times in case of a non-200 HTTP response or exceptions.
-1. Analyze Text Content: Parse the fetched text content for specific patterns defined in the find_variants function (e.g., formats like xxxxxx.xx.xxx or xxxxxx xx xxx).
-1. Return Matches: Return a structured list of matches, including the page ID, the matched text, and the extracted patterns.
+1. Process Text Content: Iterate through the fetched text items so you can apply custom manipulations or pattern analysis (the example below simply prints each item).
 
 ```py
-import re
 import requests
 
-def find_strings(text):
-    """
-    Find specific patterns in the given text.
-
-    :param text: String to search patterns in.
-    :return: List of matches for defined patterns.
-    """
-    # Example patterns to match, change these for your use case
-    patterns = [
-        r'\b[\w\d]{6}\.\w{2}\.\w{3}\b',  # xxxxxx.xx.xxx
-        r'\b[\w\d]{6}\.\w{3}\.\w{2}\b',  # xxxxxx.xxx.xx
-        r'\b[\w\d]{6} \w{2} \w{3}\b',      # xxxxxx xx xxx (spaces instead of dots)
-        r'\b[\w\d]{6} \w{3} \w{2}\b',      # xxxxxx xxx xx (spaces instead of dots)
-        r'\b[\w\d]{6}[-_\s]?\w{2}[-_\s]?\w{3}\b',  # xxxxxx xx xxx with optional - or _
-        r'\b[\w\d]{6}[-_\s]?\w{3}[-_\s]?\w{2}\b',  # xxxxxx xxx xx with optional - or _
-    ]
-
-    # Find all matches for each pattern
-    matches = []
-    for pattern in patterns:
-        matches.extend(re.findall(pattern, text))
-
-    return matches
 
-def fetch_and_analyze_ocr_text(payload):
+def get_ocr_document_content(payload):
     """
-    Fetch page_data from annotation and check for text matching variants.
-
+    Fetch page_data from annotation.
     :param payload: Dictionary containing the payload with annotation information.
-    :return: List of matches found in the page_data text.
     """
-    matches = []
     token = payload.get("rossum_authorization_token")
     annotation_url = payload.get("annotation", {}).get("url")
 
-    if not token or not annotation_url:
-        return matches
-
     retries = 3
     for attempt in range(retries):
         try:
@@ -417,23 +386,18 @@ def fetch_and_analyze_ocr_text(payload):
 
             if page_req.status_code == 200:
                 results = page_req.json().get("results", [])
+                # Optional: iterate through all the text nodes
                 for page in results:
                     for item in page.get("items", []):
                         ocr_text = item.get("text", "")
                         if ocr_text:
-                            ocr_matches = find_strings(ocr_text)
-                            if ocr_matches:
-                                matches.append({
-                                    "page_id": page.get("id"),
-                                    "ocr_text": ocr_text,
-                                    "ocr_matches": ocr_matches
-                                })
+                            # Apply whatever manipulation of the text you need here.
+                            print(ocr_text)
+
                 break  # Exit retry loop if request is successful
             else:
                 print(f"Attempt {attempt + 1} failed with status code {page_req.status_code}. Retrying...")
 
         except requests.RequestException as e:
             print(f"Attempt {attempt + 1} encountered an exception: {e}. Retrying...")
-
-    return matches
-```
+```
\ No newline at end of file