Skip to content

Commit

Permalink
Merge pull request #9 from eriknovak/feature/file-reader
Browse files Browse the repository at this point in the history
Re-implement file reading methods + add unit tests
  • Loading branch information
eriknovak authored May 29, 2024
2 parents 0b6967a + 483510a commit 19c2340
Show file tree
Hide file tree
Showing 7 changed files with 216 additions and 19 deletions.
109 changes: 91 additions & 18 deletions anonipy/utils/file_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,41 +2,114 @@
The file system utilities
"""

import os
import re
import json
from typing import Union, Any
from typing import Union

import textract
from docx import Document
from pypdf import PdfReader


# Prefix-to-URI namespace map handed to findall() so that the "w:" prefix used
# in the Word table lookups (w:tr, w:tc, w:p) resolves to a full namespace URI.
WORD_NAMESPACES = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

# =====================================
# Helper functions
# =====================================


def text_decode(
    text: Union[str, bytes], decode: Union[str, bool] = True
) -> Union[str, bytes]:
    """Decode a bytes payload into text.

    Args:
        text: The payload to decode. A value that is already a ``str`` is
            returned unchanged instead of failing on a missing ``decode``
            method.
        decode: Controls the decoding. A falsy value returns ``text``
            untouched; a string is used as the codec name; any other truthy
            value (typically ``True``) selects UTF-8.

    Returns:
        The decoded string, or ``text`` unchanged when decoding is disabled
        or unnecessary.
    """
    if not decode or isinstance(text, str):
        return text
    # Every truthy non-string flag means "use the default codec", so no
    # combination of arguments falls through and yields an implicit None.
    encoding = decode if isinstance(decode, str) else "utf-8"
    return text.decode(encoding)
def remove_extra_spaces(text: str) -> str:
    """Normalise the spacing of a piece of text.

    Leading and trailing whitespace is stripped, every run of spaces is
    collapsed to a single space, and every run of two or more newlines is
    collapsed to exactly two newlines.

    Args:
        text: The text to clean up.

    Returns:
        The text with the redundant spaces and blank lines removed.
    """
    single_spaced = re.sub(" +", " ", text.strip())
    return re.sub("\n{2,}", "\n\n", single_spaced)


def remove_page_numbers(text: str) -> str:
    """Drop number-only lines and trim the lines that remain.

    A line is discarded when the pattern matches from its start, i.e. when
    the line holds nothing but digits with optional surrounding whitespace.
    Every kept line is stripped of leading and trailing whitespace.

    Args:
        text: The text to filter, one candidate per line.

    Returns:
        The kept lines joined with newline characters.
    """
    number_only_line = re.compile(r"^\s*\d+\s*$|\s*\d+\s*$")
    kept_lines = []
    for raw_line in text.splitlines():
        if number_only_line.match(raw_line) is not None:
            continue
        kept_lines.append(raw_line.strip())
    return "\n".join(kept_lines)


# =====================================
# PDF extractor
# =====================================


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of every page of a PDF file.

    Each page is read with the "layout" extraction mode, cleaned with
    remove_page_numbers and remove_extra_spaces, and the cleaned pages are
    joined with a newline.

    Args:
        pdf_path: The path of the PDF file to read.

    Returns:
        The cleaned text of all pages as a single string.
    """
    pdf_reader = PdfReader(pdf_path)

    pages_text = []
    for page in pdf_reader.pages:
        text = page.extract_text(extraction_mode="layout")
        # Number-only lines are removed first; that step also strips each
        # line, so the spacing clean-up runs on already trimmed lines.
        text = remove_page_numbers(text)
        text = remove_extra_spaces(text)
        pages_text.append(text)
    return "\n".join(pages_text)


def get_variable_name(var: Any) -> Union[str, None]:
    """Find the module-level name bound to an object.

    The lookup compares by identity against the values in this module's
    globals and reports the first name that matches.

    Args:
        var: The object to look up.

    Returns:
        The first global name bound to ``var``, or ``None`` when no global
        refers to it.
    """
    return next(
        (name for name, value in globals().items() if value is var),
        None,
    )


# =====================================
# Word extractor
# =====================================


def _word_process_paragraph(p) -> str:
    """Return the text of a paragraph element.

    Args:
        p: The paragraph element; only its ``text`` attribute is read.

    Returns:
        The value of ``p.text``.
    """
    paragraph_text = p.text
    return paragraph_text


def _word_process_table(t) -> str:
    """Flatten a table element into plain text.

    Rows, cells and paragraphs are located with descendant searches
    (".//w:tr", ".//w:tc", ".//w:p"). Paragraph texts of a cell are joined
    with a space, the cells of a row with a space, and the rows with a
    newline.

    Args:
        t: The table element to flatten.

    Returns:
        One line of text per table row.
    """
    rendered_rows = []
    for row in t.findall(".//w:tr", WORD_NAMESPACES):
        rendered_cells = [
            " ".join([p.text for p in cell.findall(".//w:p", WORD_NAMESPACES)])
            for cell in row.findall(".//w:tc", WORD_NAMESPACES)
        ]
        rendered_rows.append(" ".join(rendered_cells))
    return "\n".join(rendered_rows)


def extract_text_from_word(doc_path: str) -> str:
    """Extract the text of a Word document, tables included.

    The children of the document body are visited in order. Elements whose
    tag ends with "p" are handled as paragraphs, elements whose tag ends
    with "tbl" are handled as tables, and everything else is ignored.

    Args:
        doc_path: The path of the Word document to read.

    Returns:
        The text of the visited elements joined with newline characters.
    """
    body = Document(doc_path).element.body
    pieces = []
    for child in body:
        tag = child.tag
        if tag.endswith("p"):
            # paragraph element
            pieces.append(_word_process_paragraph(child))
        elif tag.endswith("tbl"):
            # table element
            pieces.append(_word_process_table(child))
    return "\n".join(pieces)


# =====================================
# Main functions
# =====================================


def open_file(file_path: str, encode: Union[str, bool] = True) -> str:
    """Read a file through textract and decode the result.

    Args:
        file_path: The path of the file to read.
        encode: Forwarded to text_decode as its decoding flag.

    Returns:
        Whatever text_decode returns for the output of textract.process.
    """
    raw_content = textract.process(file_path)
    return text_decode(raw_content, encode)
def open_file(file_path: str) -> str:
    """Read a file and return its text, choosing the reader by extension.

    The extension is compared case-insensitively: ".pdf" goes to
    extract_text_from_pdf, ".doc" and ".docx" go to extract_text_from_word,
    and ".txt" is read directly as UTF-8.

    Args:
        file_path: The path of the file to read.

    Returns:
        The text content of the file.

    Raises:
        ValueError: If the file extension is not one of the supported ones.
    """
    file_extension = os.path.splitext(file_path)[1]
    normalized_extension = file_extension.lower()

    if normalized_extension == ".pdf":
        return extract_text_from_pdf(file_path)
    # NOTE(review): ".doc" is routed to the same reader as ".docx" — confirm
    # that the Word reader can actually open legacy .doc files.
    if normalized_extension in [".doc", ".docx"]:
        return extract_text_from_word(file_path)
    if normalized_extension == ".txt":
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    # The message reports the extension exactly as it appears in the path.
    raise ValueError(f"The file extension is not supported: {file_extension}")


def open_json(file_path: str) -> dict:
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ lingua-language-detector
guidance==0.1.14
sentencepiece
# File readers
textract
pypdf
python-docx
# Monitoring
tqdm
Binary file added test/resources/example.docx
Binary file not shown.
Binary file added test/resources/example.pdf
Binary file not shown.
22 changes: 22 additions & 0 deletions test/resources/example.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
Medical Record

Patient Name: John Doe
Date of Birth: 15-01-1985
Date of Examination: 20-05-2024
Social Security Number: 123-45-6789

Examination Procedure:

John Doe underwent a routine physical examination. The procedure included measuring vital signs
(blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress
test. The patient also reported occasional headaches and dizziness, prompting a neurological
assessment and an MRI scan to rule out any underlying issues.

Medication Prescribed:

Ibuprofen 200 mg Take one tablet every 6-8 hours as needed for headache and pain relief.
Lisinopril 10 mg Take one tablet daily to manage high blood pressure.

Next Examination Date:

15-11-2024
67 changes: 67 additions & 0 deletions test/resources/example_outputs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Expected result of open_file() for the sample Word document
# (test/resources/example.docx).
WORD_TEXT = """\
Medical Record
Patient Name: John Doe
Date of Birth: 15-01-1985
Date of Examination: 20-05-2024
Social Security Number: 123-45-6789
Examination Procedure:
John Doe underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.
Medication Prescribed:
Ibuprofen 200 mg Take one tablet every 6-8 hours as needed for headache and pain relief.
Lisinopril 10 mg Take one tablet daily to manage high blood pressure.
Next Examination Date:
15-11-2024
""".strip()

# Expected result of open_file() for the sample PDF
# (test/resources/example.pdf).
PDF_TEXT = """\
Medical Record
Patient Name: John Doe
Date of Birth: 15-01-1985
Date of Examination: 20-05-2024
Social Security Number: 123-45-6789
Examination Procedure:
John Doe underwent a routine physical examination. The procedure included measuring vital signs
(blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress
test. The patient also reported occasional headaches and dizziness, prompting a neurological
assessment and an MRI scan to rule out any underlying issues.
Medication Prescribed:
Ibuprofen 200 mg Take one tablet every 6-8 hours as needed for headache and pain relief.
Lisinopril 10 mg Take one tablet daily to manage high blood pressure.
Next Examination Date:
15-11-2024
""".strip()

# Expected result of open_file() for the sample text file
# (test/resources/example.txt).
# NOTE(review): open_file() returns a .txt file's content verbatim, and the
# example.txt fixture appears to separate its sections with blank lines that
# this literal does not contain — confirm the two really match.
TXT_TEXT = """\
Medical Record
Patient Name: John Doe
Date of Birth: 15-01-1985
Date of Examination: 20-05-2024
Social Security Number: 123-45-6789
Examination Procedure:
John Doe underwent a routine physical examination. The procedure included measuring vital signs
(blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress
test. The patient also reported occasional headaches and dizziness, prompting a neurological
assessment and an MRI scan to rule out any underlying issues.
Medication Prescribed:
Ibuprofen 200 mg Take one tablet every 6-8 hours as needed for headache and pain relief.
Lisinopril 10 mg Take one tablet daily to manage high blood pressure.
Next Examination Date:
15-11-2024
""".strip()
34 changes: 34 additions & 0 deletions test/test_file_system.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import unittest

from anonipy.utils.file_system import open_file

# =====================================
# Helper functions
# =====================================

from test.resources.example_outputs import WORD_TEXT, PDF_TEXT, TXT_TEXT

# Maps a short format key to the sample file that open_file() is tested with.
# NOTE(review): the paths are relative ("./test/..."), so they resolve against
# the current working directory — presumably the repository root; confirm how
# the test suite is launched.
resources = {
    "word": "./test/resources/example.docx",
    "pdf": "./test/resources/example.pdf",
    "txt": "./test/resources/example.txt",
}

# =====================================
# Test File System
# =====================================


class TestFileSystem(unittest.TestCase):
    """Checks open_file() against the expected text of each sample file."""

    def test_open_file_word(self):
        """The Word sample must yield exactly WORD_TEXT."""
        extracted = open_file(resources["word"])
        self.assertEqual(extracted, WORD_TEXT)

    def test_open_file_pdf(self):
        """The PDF sample must yield exactly PDF_TEXT."""
        extracted = open_file(resources["pdf"])
        self.assertEqual(extracted, PDF_TEXT)

    def test_open_file_txt(self):
        """The plain-text sample must yield exactly TXT_TEXT."""
        extracted = open_file(resources["txt"])
        self.assertEqual(extracted, TXT_TEXT)


# Run the test cases when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

0 comments on commit 19c2340

Please sign in to comment.