Merge pull request #49 from harmonydata/pdf

Replace PDF parsing
harmonydata · Jul 19, 2024 · 7f648c3 · 7f648c3
2 parents 39d38b5 + 3d68229
commit 7f648c3
Show file tree

Hide file tree

Showing 19 changed files with 225 additions and 1,171 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -5,7 +5,8 @@ lxml==4.9.2
 langdetect==1.0.9
 XlsxWriter==3.0.9
 openpyxl==3.1.2
-spacy==3.5.3
 wget==3.2
 sentence-transformers==2.2.2
 numpy==1.26.4
+sklearn-crfsuite==0.5.0
+scikit-learn==1.5.0
diff --git a/src/harmony/parsing/20240719_pdf_question_extraction_sklearn_crf_model.pkl b/src/harmony/parsing/20240719_pdf_question_extraction_sklearn_crf_model.pkl
diff --git a/src/harmony/parsing/pdf_parser.py b/src/harmony/parsing/pdf_parser.py
@@ -25,21 +25,70 @@
 
 '''
 
-from harmony.parsing.text_parser import convert_text_to_instruments
+import pathlib
+import pickle as pkl
+import re
+
+import harmony
+from harmony.parsing.util.feature_extraction import convert_text_to_features
 from harmony.parsing.util.tika_wrapper import parse_pdf_to_plain_text
-# from harmony.parsing.util.tesseract_wrapper import parse_image_pdf_to_plain_text
-# from harmony.parsing.util.camelot_wrapper import parse_pdf_to_tables
 from harmony.schemas.requests.text import RawFile, Instrument
 
+# Directory that contains this module; the pickled model file is opened relative to it.
+model_containing_folder = pathlib.Path(__file__).parent.resolve()
+
+# Load the CRF question-extraction model once, when this module is imported.
+# NOTE(review): unpickling can execute arbitrary code, so this file must only ever come from a
+# trusted source (presumably the model file shipped next to this module) — confirm.
+with open(f"{model_containing_folder}/20240719_pdf_question_extraction_sklearn_crf_model.pkl", "rb") as f:
+    crf_text_model = pkl.load(f)
+
+# Predict method is taken from the training repo. Use the training repo as the master copy of the predict method.
+# All training code is in https://github.com/harmonydata/pdf-questionnaire-extraction
+def predict(test_text):
+    """Extract question strings from plain text using the CRF token tagger.
+
+    The text is converted to per-token features, the tokens are tagged by
+    ``crf_text_model``, and every run of tokens tagged something other than "O"
+    is cut out of ``test_text`` by character offsets. A "B" tag always starts a
+    new run, so two adjacent questions are kept separate.
+
+    Args:
+        test_text: the plain text to scan for questions.
+
+    Returns:
+        A list of question strings in document order, each with runs of
+        whitespace collapsed to a single space and surrounding whitespace removed.
+    """
+    token_texts, token_start_char_indices, token_end_char_indices, token_properties = convert_text_to_features(
+        test_text)
+
+    # The model is called with a one-element batch; y_pred[0] holds the tags for this document.
+    X = []
+    X.append(token_properties)
+
+    y_pred = crf_text_model.predict(X)
+
+    questions_from_text = []
+
+    # Indices of tokens already consumed as the continuation of an earlier question.
+    tokens_already_used = set()
+
+    # Tag of the previous token; starts as "O" so a leading non-"O" token opens a question.
+    last_token_category = "O"
+
+    for idx in range(len(X[0])):
+
+        if y_pred[0][idx] != "O" and idx not in tokens_already_used:
+            if last_token_category == "O" or y_pred[0][idx] == "B":
+                start_idx = token_start_char_indices[idx]
+                # Default: the question runs to the end of the text if no closing tag follows.
+                end_idx = len(test_text)
+                # Scan forward to the first token tagged "O" or "B"; the question ends
+                # at the end of the token just before it.
+                for j in range(idx + 1, len(X[0])):
+                    if y_pred[0][j] == "O" or y_pred[0][j] == "B":
+                        end_idx = token_end_char_indices[j - 1]
+                        break
+                    tokens_already_used.add(j)
+
+                question_text = test_text[start_idx:end_idx]
+                # Collapse every whitespace run (including line breaks) to a single space.
+                question_text = re.sub(r'\s+', ' ', question_text)
+                question_text = question_text.strip()
+                questions_from_text.append(question_text)
+
+        last_token_category = y_pred[0][idx]
+
+    return questions_from_text
+
+
 def convert_pdf_to_instruments(file: RawFile) -> Instrument:
     # file is an object containing these properties:
     # content: str - The raw file contents so if it's a PDF this is a byte sequence in base 64 encoding
     # text_content: str - this is empty but we will use Tika to populate this in this method
     # tables: list - this is a list of all the tables in the document. The front end has populated this field.
 
     if not file.text_content:
-        file.text_content = parse_pdf_to_plain_text(file.content) # call Tika to convert the PDF to plain text
+        file.text_content = parse_pdf_to_plain_text(file.content)  # call Tika to convert the PDF to plain text
 
-    # TODO: New PDF parsing algorithm should go here, together with return statement.
+    # Tag the plain text with the CRF model and collect the extracted question strings.
+    questions_from_text = predict(file.text_content)
 
-    return convert_text_to_instruments(file)
+    # Build a single instrument from the questions; the file name is also used as the instrument name.
+    instrument = harmony.create_instrument_from_list(questions_from_text, instrument_name=file.file_name,
+                                                     file_name=file.file_name)
+    # NOTE(review): the signature is annotated "-> Instrument" but a one-element list is returned here —
+    # confirm which one callers expect and align the annotation.
+    return [instrument]
diff --git a/src/harmony/parsing/rf_table_model.pkl b/src/harmony/parsing/rf_table_model.pkl
diff --git a/src/harmony/parsing/text_extraction/__init__.py b/src/harmony/parsing/text_extraction/__init__.py