Merge pull request #9 from eriknovak/feature/file-reader

Re-implement file reading methods + add unit tests
eriknovak · May 29, 2024 · 19c2340 · 19c2340
2 parents 0b6967a + 483510a
commit 19c2340
Show file tree

Hide file tree

Showing 7 changed files with 216 additions and 19 deletions.
diff --git a/anonipy/utils/file_system.py b/anonipy/utils/file_system.py
@@ -2,41 +2,114 @@
 The file system utilities
 """
 
+import os
+import re
 import json
-from typing import Union, Any
+from typing import Union
 
-import textract
+from docx import Document
+from pypdf import PdfReader
+
+
+# Namespace map passed to `findall` so that the `w:` prefix used in the table
+# lookups (`w:tr`, `w:tc`, `w:p`) resolves to the WordprocessingML namespace URI.
+WORD_NAMESPACES = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
 
 # =====================================
 # Helper functions
 # =====================================
 
 
-def text_decode(text: str, decode: Union[str, bool] = True) -> str:
-    if not decode:
-        return text
-    if isinstance(decode, str):
-        return text.decode(decode)
-    if isinstance(decode, bool):
-        return text.decode("utf-8")
+def remove_extra_spaces(text: str) -> str:
+    """Normalize the spacing of `text`.
+
+    Leading and trailing whitespace is stripped, every run of space
+    characters is collapsed into a single space, and every run of two or
+    more newlines is collapsed into exactly two (one blank line).
+
+    Args:
+        text: The text to clean.
+
+    Returns:
+        The cleaned text.
+    """
+    stripped = text.strip()
+    # only literal spaces are collapsed; tabs and other whitespace are kept
+    single_spaced = re.sub(" +", " ", stripped)
+    return re.sub("\n{2,}", "\n\n", single_spaced)
+
+
+def remove_page_numbers(text: str) -> str:
+    """Drop the lines of `text` that consist only of digits.
+
+    A line is dropped when, apart from surrounding whitespace, it contains
+    nothing but digits. Every remaining line is stripped of its leading and
+    trailing whitespace before the lines are joined back with newlines.
+
+    Args:
+        text: The text to filter, typically the text of a single page.
+
+    Returns:
+        The text without the digit-only lines.
+    """
+    # NOTE(review): `match` anchors at the start of the line, so the second
+    # alternative accepts exactly the same lines as the first one.
+    page_number_pattern = re.compile(r"^\s*\d+\s*$|\s*\d+\s*$")
+    kept_lines = []
+    for raw_line in text.splitlines():
+        if page_number_pattern.match(raw_line) is None:
+            kept_lines.append(raw_line.strip())
+    return "\n".join(kept_lines)
+
+
+# =====================================
+# PDF extractor
+# =====================================
+
+
+def extract_text_from_pdf(pdf_path: str) -> str:
+    """Extract the text of a PDF file, page by page.
+
+    Args:
+        pdf_path: Path of the PDF file handed to `PdfReader`.
+
+    Returns:
+        The cleaned text of all pages, joined with single newlines.
+    """
+    pdf_reader = PdfReader(pdf_path)
 
+    pages_text = []
+    for page in pdf_reader.pages:
+        # NOTE(review): "layout" is a pypdf extraction mode; presumably it
+        # keeps the visual spacing of the page - confirm against the pypdf docs
+        text = page.extract_text(extraction_mode="layout")
+        # the cleaning is applied per page, before the pages are joined
+        text = remove_page_numbers(text)
+        text = remove_extra_spaces(text)
+        pages_text.append(text)
+    document_text = "\n".join(pages_text)
 
-def get_variable_name(var: Any) -> str:
-    for name, value in globals().items():
-        if value is var:
-            return name
-    return None
+    return document_text
+
+
+# =====================================
+# Word extractor
+# =====================================
+
+
+def _word_process_paragraph(p) -> str:
+    """Return the `text` attribute of the paragraph element `p`.
+
+    NOTE(review): `p` is presumably a python-docx paragraph element taken
+    from the document body - confirm against the caller.
+    """
+    paragraph_text = p.text
+    return paragraph_text
+
+
+def _word_process_table(t) -> str:
+    """Flatten the table element `t` into plain text.
+
+    The paragraphs of a cell are joined with a space, the cells of a row are
+    joined with a space, and the rows are joined with a newline.
+
+    Args:
+        t: The table element; its rows, cells and paragraphs are looked up
+            with the `w:tr`, `w:tc` and `w:p` paths.
+
+    Returns:
+        One line of text per table row.
+    """
+    rows = []
+    for tr in t.findall(".//w:tr", WORD_NAMESPACES):
+        cells = [
+            " ".join(par.text for par in tc.findall(".//w:p", WORD_NAMESPACES))
+            for tc in tr.findall(".//w:tc", WORD_NAMESPACES)
+        ]
+        rows.append(" ".join(cells))
+    return "\n".join(rows)
+
+
+def extract_text_from_word(doc_path: str) -> str:
+    """Extract the text of a Word document in body order.
+
+    Body elements whose tag ends with "p" are handled as paragraphs and those
+    whose tag ends with "tbl" as tables; every other element is ignored.
+
+    Args:
+        doc_path: Path of the document handed to `Document`.
+
+    Returns:
+        The text of the handled elements, joined with newlines.
+    """
+    body = Document(doc_path).element.body
+    content = []
+    for element in body:
+        tag = element.tag
+        if tag.endswith("p"):
+            # paragraph element
+            content.append(_word_process_paragraph(element))
+            continue
+        if tag.endswith("tbl"):
+            # table element
+            content.append(_word_process_table(element))
+    return "\n".join(content)
 
 
 # =====================================
 # Main functions
 # =====================================
 
 
-def open_file(file_path: str, encode: Union[str, bool] = True) -> str:
-    text = textract.process(file_path)
-    text = text_decode(text, encode)
-    return text
+def open_file(file_path: str) -> str:
+    """Read the text of a file, choosing the reader by file extension.
+
+    The extension is compared case-insensitively: ".pdf" goes to the PDF
+    extractor, ".doc" and ".docx" go to the Word extractor, and ".txt" is
+    read as UTF-8 text.
+
+    NOTE(review): ".doc" is routed to the python-docx based extractor;
+    confirm that it can actually read legacy ".doc" files.
+
+    Args:
+        file_path: Path of the file to read.
+
+    Returns:
+        The text content of the file.
+
+    Raises:
+        ValueError: If the file extension is not one of the supported ones.
+    """
+    file_extension = os.path.splitext(file_path)[1]
+    normalized = file_extension.lower()
+    if normalized == ".pdf":
+        return extract_text_from_pdf(file_path)
+    if normalized in (".doc", ".docx"):
+        return extract_text_from_word(file_path)
+    if normalized == ".txt":
+        with open(file_path, "r", encoding="utf-8") as f:
+            return f.read()
+    # the message reports the extension exactly as it appears in the path
+    raise ValueError(f"The file extension is not supported: {file_extension}")
 
 
 def open_json(file_path: str) -> dict:

diff --git a/requirements.txt b/requirements.txt
@@ -8,6 +8,7 @@ lingua-language-detector
 guidance==0.1.14
 sentencepiece
 # File readers
-textract
+pypdf
+python-docx
 # Monitoring
 tqdm
diff --git a/test/resources/example.docx b/test/resources/example.docx
diff --git a/test/resources/example.pdf b/test/resources/example.pdf
diff --git a/test/resources/example.txt b/test/resources/example.txt
@@ -0,0 +1,22 @@
+Medical Record
+
+Patient Name: John Doe
+Date of Birth: 15-01-1985
+Date of Examination: 20-05-2024
+Social Security Number: 123-45-6789
+
+Examination Procedure:
+
+John Doe underwent a routine physical examination. The procedure included measuring vital signs
+(blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress
+test. The patient also reported occasional headaches and dizziness, prompting a neurological
+assessment and an MRI scan to rule out any underlying issues.
+
+Medication Prescribed:
+
+Ibuprofen 200 mg Take one tablet every 6-8 hours as needed for headache and pain relief.
+Lisinopril 10 mg Take one tablet daily to manage high blood pressure.
+
+Next Examination Date:
+
+15-11-2024
diff --git a/test/resources/example_outputs.py b/test/resources/example_outputs.py
@@ -0,0 +1,67 @@
+# Text the test suite expects `open_file` to return for example.docx.
+WORD_TEXT = """\
+Medical Record
+
+Patient Name: John Doe
+Date of Birth: 15-01-1985
+Date of Examination: 20-05-2024
+Social Security Number: 123-45-6789
+Examination Procedure:
+John Doe underwent a routine physical examination. The procedure included measuring vital signs (blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress test. The patient also reported occasional headaches and dizziness, prompting a neurological assessment and an MRI scan to rule out any underlying issues.
+
+Medication Prescribed:
+Ibuprofen 200 mg Take one tablet every 6-8 hours as needed for headache and pain relief.
+Lisinopril 10 mg Take one tablet daily to manage high blood pressure.
+
+Next Examination Date:
+15-11-2024
+""".strip()
+
+# Text the test suite expects `open_file` to return for example.pdf.
+PDF_TEXT = """\
+Medical Record
+
+Patient Name: John Doe
+Date of Birth: 15-01-1985
+Date of Examination: 20-05-2024
+Social Security Number: 123-45-6789
+
+Examination Procedure:
+
+John Doe underwent a routine physical examination. The procedure included measuring vital signs
+(blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress
+test. The patient also reported occasional headaches and dizziness, prompting a neurological
+assessment and an MRI scan to rule out any underlying issues.
+
+Medication Prescribed:
+
+Ibuprofen 200 mg Take one tablet every 6-8 hours as needed for headache and pain relief.
+Lisinopril 10 mg Take one tablet daily to manage high blood pressure.
+
+Next Examination Date:
+
+15-11-2024
+""".strip()
+
+# Text the test suite expects `open_file` to return for example.txt.
+TXT_TEXT = """\
+Medical Record
+
+Patient Name: John Doe
+Date of Birth: 15-01-1985
+Date of Examination: 20-05-2024
+Social Security Number: 123-45-6789
+
+Examination Procedure:
+
+John Doe underwent a routine physical examination. The procedure included measuring vital signs
+(blood pressure, heart rate, temperature), a comprehensive blood panel, and a cardiovascular stress
+test. The patient also reported occasional headaches and dizziness, prompting a neurological
+assessment and an MRI scan to rule out any underlying issues.
+
+Medication Prescribed:
+
+Ibuprofen 200 mg Take one tablet every 6-8 hours as needed for headache and pain relief.
+Lisinopril 10 mg Take one tablet daily to manage high blood pressure.
+
+Next Examination Date:
+
+15-11-2024
+""".strip()
diff --git a/test/test_file_system.py b/test/test_file_system.py
@@ -0,0 +1,34 @@
+import unittest
+
+from anonipy.utils.file_system import open_file
+
+# =====================================
+# Helper functions
+# =====================================
+
+from test.resources.example_outputs import WORD_TEXT, PDF_TEXT, TXT_TEXT
+
+# Sample documents opened by the tests, keyed by format.
+# NOTE(review): the paths are relative, so they presumably assume the tests
+# are run from the repository root - confirm.
+resources = {
+    "word": "./test/resources/example.docx",
+    "pdf": "./test/resources/example.pdf",
+    "txt": "./test/resources/example.txt",
+}
+
+# =====================================
+# Test File System
+# =====================================
+
+
+class TestFileSystem(unittest.TestCase):
+    """Check that `open_file` returns the expected text for each sample file."""
+
+    def test_open_file_word(self):
+        extracted = open_file(resources["word"])
+        self.assertEqual(extracted, WORD_TEXT)
+
+    def test_open_file_pdf(self):
+        extracted = open_file(resources["pdf"])
+        self.assertEqual(extracted, PDF_TEXT)
+
+    def test_open_file_txt(self):
+        extracted = open_file(resources["txt"])
+        self.assertEqual(extracted, TXT_TEXT)
+
+
+if __name__ == "__main__":
+    unittest.main()