Skip to content

Commit

Permalink
feat(ocr_utils): Update docstrings and typing
Browse files (browse the repository at this point in the history)
  • Loading branch information
flooie committed May 14, 2024
1 parent e3855f0 commit 6dd78f1
Showing 1 changed file with 17 additions and 49 deletions.
66 changes: 17 additions & 49 deletions doctor/lib/ocr_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,12 @@
from PIL import Image


def deskew(obj) -> bool:
def deskew(obj: dict) -> bool:
"""Remove skewed text from a page
CTM stands for current transformation matrix.
pdfplumber has a method to calculate the angle of text, which we use here.
Traditionally this is only seen in circular stamps, which confuse the
content, or in perpendicular text of the Ninth Circuit courts, which also
confuses the text.
Expand All @@ -25,7 +28,7 @@ def deskew(obj) -> bool:
return True


def get_page_text(page: pdfplumber.PDF.pages, strip_margin: bool):
def get_page_text(page: pdfplumber.PDF.pages, strip_margin: bool) -> str:
"""Extract page text
Using pdf plumber extract out the text of the document that is not
Expand Down Expand Up @@ -58,7 +61,7 @@ def get_page_text(page: pdfplumber.PDF.pages, strip_margin: bool):
return doc_text


def page_images(page):
def page_images(page: pdfplumber.pdf.Page) -> bool:
"""Does the page have images of a certain size
Meant to exclude images that might be lines
Expand All @@ -72,7 +75,7 @@ def page_images(page):
return False


def page_annotations(page):
def page_annotations(page: pdfplumber.pdf.Page) -> bool:
"""Does the page have annotations which could contain text
:param page: pdfplumber page
Expand All @@ -86,7 +89,7 @@ def page_annotations(page):
return False


def find_average_char_width(block_data):
def find_average_char_width(block_data: pd.Series) -> int:
"""Average character width for a block of text
:param block_data:
Expand All @@ -96,50 +99,15 @@ def find_average_char_width(block_data):
return (fd.width / fd.text.str.len()).mean()


# def validate_ocr_text(row, img):
# """Review OCR results for low confidence and reprocess if necessary
#
# :param row:
# :param img:
# :return:
# """
# # If low confidence in the margins of drop character as likely artifact
# if row["left"] < 370 and row["conf"] <= 40:
# row["text"] = " " * len(row["text"])
# # if very low confidence and small - reprocess word with OCR for single line
# # this will give us a better chance to get the word right or remove junk
# elif row["conf"] < 10 and len(row["text"]) >= 3:
# # Give us a buffer around the word to increase OCR-ability
# bbox = (
# row["left"] - 5,
# row["top"] - 3,
# row["left"] + row["width"] + 5,
# row["top"] + row["height"] + 3,
# )
# config = "f'-c preserve_interword_spaces=1x1 --psm 7 -l eng'"
# word_df = pd.DataFrame(
# pytesseract.image_to_data(
# img.crop(bbox), config=config, output_type=Output.DICT
# )
# )
# # If new word above low confidence - use new word
# new_words = " ".join(
# word_df.loc[word_df["conf"] > 10, "text"].tolist()
# )
# if new_words:
# row["text"] = new_words
# else:
# # Otherwise identify unknown word/words with empty box
# row["text"] = "□" * len(row["text"])
# return row


def validate_ocr_text(row, img):
"""
def validate_ocr_text(row: pd.Series, img: Image) -> pd.Series:
"""Validate the OCR results
:param row:
:param img:
:return:
Review the OCR results, remove the bad ones,
and, where possible, reprocess words one by one.
:param row: row of ocr results
:param img: Page image
:return: Updated row if necessary
"""
if row["left"] < 370 and row["conf"] <= 40:
row["text"] = " " * len(row["text"])
Expand Down Expand Up @@ -170,7 +138,7 @@ def validate_ocr_text(row, img):
def add_newlines(row: pd.Series, state: dict) -> dict:
"""Add new linebreaks into the ocr'd page
identify where line breaks should be added
Identify where line breaks should be added
:param row: the row of data from tesseract
:param state: the location data used to decide where line breaks should be
Expand Down

0 comments on commit 6dd78f1

Please sign in to comment.