Random Forest to address PDF parsing issues. #23 #28 #11 #4 #39

harmonydata · Jul 19, 2024 · a1c4561 · a1c4561
1 parent 6608f58
commit a1c4561
Show file tree

Hide file tree

Showing 5 changed files with 183 additions and 153 deletions.
diff --git a/src/harmony/parsing/20240719_pdf_question_extraction_sklearn_crf_model.pkl b/src/harmony/parsing/20240719_pdf_question_extraction_sklearn_crf_model.pkl
diff --git a/src/harmony/parsing/crf_text_model.pkl b/src/harmony/parsing/crf_text_model.pkl
diff --git a/src/harmony/parsing/pdf_parser.py b/src/harmony/parsing/pdf_parser.py
@@ -24,32 +24,59 @@
 SOFTWARE.
 
 '''
+
+import pathlib
 import pickle as pkl
 import re
 
-import numpy as np
-
 import harmony
+from harmony.parsing.util.feature_extraction import convert_text_to_features
 from harmony.parsing.util.tika_wrapper import parse_pdf_to_plain_text
-# from harmony.parsing.util.tesseract_wrapper import parse_image_pdf_to_plain_text
-# from harmony.parsing.util.camelot_wrapper import parse_pdf_to_tables
 from harmony.schemas.requests.text import RawFile, Instrument
 
-re_initial_num = re.compile(r'(^\d+)')
-re_initial_num_dot = re.compile(r'(^\d+\.)')
-re_word = re.compile(r'(?i)(\b[\w\']+\b)')
-re_alpha = re.compile(r'(^[a-zA-Z]+)')
-re_bracket = re.compile(r'(?:\(|\))')
-import pathlib
-
 model_containing_folder = pathlib.Path(__file__).parent.resolve()
 
-with open(f"{model_containing_folder}/rf_table_model.pkl", "rb") as f:
-    rf_table_model = pkl.load(f)
-
-with open(f"{model_containing_folder}/crf_text_model.pkl", "rb") as f:
+with open(f"{model_containing_folder}/20240719_pdf_question_extraction_sklearn_crf_model.pkl", "rb") as f:
     crf_text_model = pkl.load(f)
 
+# Predict method is taken from the training repo. Use the training repo as the master copy of the predict method.
+# All training code is in https://github.com/harmonydata/pdf-questionnaire-extraction
+def predict(test_text):  # Return the question strings that crf_text_model tags in test_text, in document order.
+    token_texts, token_start_char_indices, token_end_char_indices, token_properties = convert_text_to_features(
+        test_text)
+    # The model is called with a batch of one sequence: this document's list of per-token feature dicts.
+    X = []
+    X.append(token_properties)
+    # y_pred[0][i] is the tag predicted for token i; only "O", "B" and "anything else" are distinguished below.
+    y_pred = crf_text_model.predict(X)
+    # Extracted question strings, appended in the order their spans start.
+    questions_from_text = []
+    # Indices of tokens already absorbed into a span, so that they cannot open another one.
+    tokens_already_used = set()
+    # Tag of the previous token; starts as "O" so that a non-"O" first token can open a span.
+    last_token_category = "O"
+
+    for idx in range(len(X[0])):
+        # A span opens on an unused non-"O" token that either follows an "O" token or is itself tagged "B".
+        if y_pred[0][idx] != "O" and idx not in tokens_already_used:
+            if last_token_category == "O" or y_pred[0][idx] == "B":
+                start_idx = token_start_char_indices[idx]
+                end_idx = len(test_text)  # fallback: the span runs to the end of the text
+                for j in range(idx + 1, len(X[0])):
+                    if y_pred[0][j] == "O" or y_pred[0][j] == "B":
+                        end_idx = token_end_char_indices[j - 1]  # stop at the end of the last token in the span
+                        break
+                    tokens_already_used.add(j)
+                # Cut the span out of the raw text, collapse each whitespace run to one space, and trim the ends.
+                question_text = test_text[start_idx:end_idx]
+                question_text = re.sub(r'\s+', ' ', question_text)
+                question_text = question_text.strip()
+                questions_from_text.append(question_text)
+        # Remember this token's tag for the span-start test on the next token.
+        last_token_category = y_pred[0][idx]
+
+    return questions_from_text
+
 
 def convert_pdf_to_instruments(file: RawFile) -> Instrument:
     # file is an object containing these properties:
@@ -60,136 +87,8 @@ def convert_pdf_to_instruments(file: RawFile) -> Instrument:
     if not file.text_content:
         file.text_content = parse_pdf_to_plain_text(file.content)  # call Tika to convert the PDF to plain text
 
-    # TODO: New PDF parsing algorithm should go here, together with return statement.
-
-    table_cell_texts = []
-    page_tables = file.tables
-    questions_from_tables = []
-    if len(page_tables) > 0:
-        for page_table in page_tables:
-            tables = page_table['tables']
-            for row in tables:
-                for item in row:
-                    if len(item.strip()) > 0:
-                        table_cell_texts.append(item)
-
-        X = []
-        for idx in range(len(table_cell_texts)):
-            t = table_cell_texts[idx]
-            features = [len(t),
-                        len(re_initial_num.findall(t)),
-                        len(re_initial_num_dot.findall(t))]
-            X.append(features)
-
-        if len(X) > 0:
-            X = np.asarray(X)
-
-            y_pred = rf_table_model.predict(X)
-
-            questions_from_tables = []
-            for idx in range(len(table_cell_texts)):
-                if y_pred[idx] == 1:
-                    questions_from_tables.append(table_cell_texts[idx])
-
-
-    if True:  # text CRF model
-        questions_from_text = []
-        X = []
-
-        token_texts = []
-        token_properties = []
-
-        text = file.text_content
-        char_indices_of_newlines = set()
-        for idx, c in enumerate(text):
-            if c == "\n":
-                char_indices_of_newlines.add(idx)
-
-        char_indices_of_question_marks = set()
-        for idx, c in enumerate(text):
-            if c == "?":
-                char_indices_of_question_marks.add(idx)
-
-        tokens = list(re_word.finditer(text))
-
-        last_token_properties = {}
-
-        for token in tokens:
-            is_number = len(re_initial_num.findall(token.group()))
-            is_number_dot = len(re_initial_num_dot.findall(token.group()))
-            is_alpha = len(re_alpha.findall(token.group()))
-
-            dist_to_newline = token.start()
-            for c in range(token.start(), 1, -1):
-                if c in char_indices_of_newlines:
-                    dist_to_newline = token.start() - c
-                    break
-
-            dist_to_question_mark = len(text) - token.start()
-            for c in range(token.start(), len(text)):
-                if c in char_indices_of_question_marks:
-                    dist_to_question_mark = c - token.start()
-                    break
-
-            is_capital = int(token.group()[0] != token.group()[0].lower())
-
-            this_token_properties = {"length": len(token.group()), "is_number": is_number,
-                                     "is_alpha": is_alpha,
-                                     "is_capital": is_capital,
-                                     "is_number_dot": is_number_dot,
-                                     "dist_to_newline": dist_to_newline, "dist_to_question_mark": dist_to_question_mark,
-                                     "char_index": token.start()}
-
-            this_token_properties["prev_length"] = last_token_properties.get("length", 0)
-            this_token_properties["prev_is_alpha"] = last_token_properties.get("is_alpha", 0)
-            this_token_properties["prev_is_number"] = last_token_properties.get("is_number", 0)
-            this_token_properties["prev_is_number_dot"] = last_token_properties.get("is_number_dot", 0)
-            this_token_properties["prev_is_capital"] = last_token_properties.get("is_capital", 0)
-
-            this_token_properties["prev_prev_length"] = last_token_properties.get("prev_length", 0)
-            this_token_properties["prev_prev_is_alpha"] = last_token_properties.get("prev_is_alpha", 0)
-            this_token_properties["prev_prev_is_number"] = last_token_properties.get("prev_is_number", 0)
-            this_token_properties["prev_prev_is_number_dot"] = last_token_properties.get("prev_is_number_dot", 0)
-            this_token_properties["prev_prev_is_capital"] = last_token_properties.get("prev_is_capital", 0)
-
-            token_texts.append(token.group())
-
-            token_properties.append(this_token_properties)
-
-            last_token_properties = this_token_properties
-
-        X.append(token_properties)
-
-        y_pred = crf_text_model.predict(X)
-
-        last_token_category = "O"
-        for idx in range(len(X[0])):
-
-            if y_pred[0][idx] != "O":
-                if last_token_category == "O" or y_pred[0][idx] == "B":
-                    start_idx = tokens[idx].start()
-                    end_idx = len(text)
-                    for j in range(idx + 1, len(X[0])):
-                        if y_pred[0][j] == "O" or y_pred[0][j] == "B":
-                            end_idx = tokens[j - 1].end()
-                            break
-
-                    question_text = text[start_idx:end_idx]
-                    question_text = re.sub(r'\s+', ' ', question_text)
-                    question_text = question_text.strip()
-                    questions_from_text.append(question_text)
-
-            last_token_category = y_pred[0][idx]
+    questions_from_text = predict(file.text_content)
 
-    if len(questions_from_text) > len(questions_from_tables):
-        print ("Source of parsing was text CRF")
-        instrument = harmony.create_instrument_from_list(questions_from_text, instrument_name=file.file_name, file_name=file.file_name)
-        print(instrument)
-        return [instrument]
-    elif len(questions_from_tables) > 0:
-        instrument = harmony.create_instrument_from_list(questions_from_tables, instrument_name=file.file_name, file_name=file.file_name)
-        return [instrument]
-    else:
-        return []
-
-    # return convert_text_to_instruments(file)
+    instrument = harmony.create_instrument_from_list(questions_from_text, instrument_name=file.file_name,
+                                                     file_name=file.file_name)
+    return [instrument]
diff --git a/src/harmony/parsing/util/feature_extraction.py b/src/harmony/parsing/util/feature_extraction.py
@@ -0,0 +1,127 @@
+'''
+MIT License
+
+Copyright (c) 2023 Ulster University (https://www.ulster.ac.uk).
+Project: Harmony (https://harmonydata.ac.uk)
+Maintainer: Thomas Wood (https://fastdatascience.com)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+'''
+
+import json
+import re
+
+re_word = re.compile(r'(?i)(\S+)')
+
+re_initial_num = re.compile(r'(^\d+)')
+re_contains_num = re.compile(r'\d')
+re_initial_num_dot = re.compile(r'(^\d+\.)')
+re_alpha = re.compile(r'(^[a-zA-Z]+)')
+re_bracket = re.compile(r'(?:\(|\))')
+
+
+def convert_text_to_features(text):  # Tokenise text and build one feature dict per token; returns four parallel lists.
+    token_texts = []
+    token_start_char_indices = []
+    token_end_char_indices = []
+    token_properties = []
+    # Character offsets of every "\n" in text, used for the distance-to-previous-newline feature.
+    char_indices_of_newlines = set()
+    for idx, c in enumerate(text):
+        if c == "\n":
+            char_indices_of_newlines.add(idx)
+    # Character offsets of every "?" in text, used for the distance-to-next-question-mark feature.
+    char_indices_of_question_marks = set()
+    for idx, c in enumerate(text):
+        if c == "?":
+            char_indices_of_question_marks.add(idx)
+    # Tokens are the successive matches of re_word over text.
+    tokens = list(re_word.finditer(text))
+    # Initialised empty so that all_property_names (computed after the loop) is empty when text has no tokens.
+    this_token_properties = {}
+
+    for token in tokens:
+        is_number = len(re_initial_num.findall(token.group()))
+        is_number_dot = len(re_initial_num_dot.findall(token.group()))
+        num_nums = len(re_contains_num.findall(token.group()))
+        is_alpha = len(re_alpha.findall(token.group()))
+        is_bracket = len(re_bracket.findall(token.group()))
+        # Characters back to the nearest earlier newline; stays at token.start() when the scan finds none.
+        dist_to_prev_newline = token.start()
+        for c in range(token.start(), 1, -1):  # NOTE(review): stops at index 2, so a newline at index 1 is missed - confirm this matches training
+            if c in char_indices_of_newlines:
+                dist_to_prev_newline = token.start() - c
+                break
+        # Characters forward to the next "?"; stays at the distance to the end of text when there is none.
+        dist_to_next_question_mark = len(text) - token.start()
+        for c in range(token.start(), len(text)):
+            if c in char_indices_of_question_marks:
+                dist_to_next_question_mark = c - token.start()
+                break
+        # 1 when lower-casing the token's first character changes it, otherwise 0.
+        is_capital = int(token.group()[0] != token.group()[0].lower())
+        # 1 when the token starts with ASCII letters (is_alpha) and also contains at least one digit, otherwise 0.
+        is_letters_and_numbers = int(is_alpha and num_nums > 0)
+        # Features describing this token alone; neighbour features are added in the second pass below.
+        this_token_properties = {"length": len(token.group()), "is_number": is_number,
+                                 "is_alpha": is_alpha,
+                                 "is_capital": is_capital,
+                                 "is_letters_and_numbers": is_letters_and_numbers,
+                                 "is_bracket": is_bracket,
+                                 "is_number_dot": is_number_dot,
+                                 "num_nums": num_nums,
+                                 "dist_to_prev_newline": dist_to_prev_newline,
+                                 "dist_to_next_question_mark": dist_to_next_question_mark,
+                                 "char_index": token.start()}
+        # Record the token text and its character span in lists that stay parallel to token_properties.
+        token_texts.append(token.group())
+        token_start_char_indices.append(token.start())
+        token_end_char_indices.append(token.end())
+        token_properties.append(this_token_properties)
+    # Base feature names, read from the last token's dict (all token dicts share these keys at this point).
+    all_property_names = list(sorted(this_token_properties))
+
+    for idx in range(len(token_properties)):
+        focus_dict = token_properties[idx]
+        # Generate features including prev and next token.
+        # There was no increase in performance associated with increasing this window. (TW 19/07/2024)
+        for offset in range(-1, 2):
+            if offset == 0:
+                continue
+            j = idx + offset
+            if j >= 0 and j < len(token_properties):
+                offset_dict = token_properties[j]
+            else:
+                offset_dict = {}
+
+            for property_name in all_property_names:
+                focus_dict[f"{property_name}_{offset}"] = offset_dict.get(property_name, 0)  # e.g. "length_-1"; 0 when there is no neighbour
+
+    return token_texts, token_start_char_indices, token_end_char_indices, token_properties
+
+
+if __name__ == "__main__":  # Manual smoke test: print the tokens, character spans and features for a sample string.
+    test_text = "this is a test123 a)"
+    token_texts, token_start_char_indices, token_end_char_indices, token_properties = convert_text_to_features(
+        test_text)
+    print(token_texts)
+    print(token_start_char_indices)
+    print(token_end_char_indices)
+    print(json.dumps(token_properties, indent=4))
diff --git a/tests/test_pdf_tables.py b/tests/test_pdf_tables.py
@@ -55,12 +55,16 @@
 
 class TestConvertPdfTables(unittest.TestCase):
 
-    def test_empty_pdf(self):
-
-        self.assertEqual(0, len(convert_pdf_to_instruments(pdf_empty_table)))
-
-    def test_two_questions(self):
-        self.assertEqual(2, len(convert_pdf_to_instruments(pdf_non_empty_table)[0].questions))
+    pass
+
+    # Not using tables at the moment
+    #
+    # def test_empty_pdf(self):
+    #
+    #     self.assertEqual(0, len(convert_pdf_to_instruments(pdf_empty_table)))
+    #
+    # def test_two_questions(self):
+    #     self.assertEqual(2, len(convert_pdf_to_instruments(pdf_non_empty_table)[0].questions))
 
 
 if __name__ == '__main__':