Skip to content

Commit

Permalink
Merge pull request #49 from harmonydata/pdf
Browse files Browse the repository at this point in the history
Replace PDF parsing
  • Loading branch information
woodthom2 committed Jul 19, 2024
2 parents 39d38b5 + 3d68229 commit 7f648c3
Show file tree
Hide file tree
Showing 19 changed files with 225 additions and 1,171 deletions.
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ lxml==4.9.2
langdetect==1.0.9
XlsxWriter==3.0.9
openpyxl==3.1.2
spacy==3.5.3
wget==3.2
sentence-transformers==2.2.2
numpy==1.26.4
sklearn-crfsuite==0.5.0
scikit-learn==1.5.0
Binary file not shown.
61 changes: 55 additions & 6 deletions src/harmony/parsing/pdf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,70 @@
'''

from harmony.parsing.text_parser import convert_text_to_instruments
import pathlib
import pickle as pkl
import re

import harmony
from harmony.parsing.util.feature_extraction import convert_text_to_features
from harmony.parsing.util.tika_wrapper import parse_pdf_to_plain_text
# from harmony.parsing.util.tesseract_wrapper import parse_image_pdf_to_plain_text
# from harmony.parsing.util.camelot_wrapper import parse_pdf_to_tables
from harmony.schemas.requests.text import RawFile, Instrument

# Directory that holds this module; the pickled model file is opened from the same directory.
model_containing_folder = pathlib.Path(__file__).parent.resolve()

# Load the pickled model once, at import time, so every call to predict() reuses it.
# NOTE(review): unpickling executes arbitrary code, so this file must only ever be
# a trusted artefact shipped with the package.
with open(
        f"{model_containing_folder}/20240719_pdf_question_extraction_sklearn_crf_model.pkl",
        "rb",
) as f:
    crf_text_model = pkl.load(f)

# Predict method is taken from the training repo. Use the training repo as the master copy of the predict method.
# All training code is in https://github.com/harmonydata/pdf-questionnaire-extraction
def predict(test_text):
    """Extract question strings from plain text using the token tagger.

    The text is converted to per-token features, the module-level
    ``crf_text_model`` assigns one label to every token, and each run of
    labelled tokens is cut out of ``test_text`` as one question.

    Label handling, as implemented below:

    * ``"O"`` marks a token that is outside any question.
    * ``"B"`` always opens a new question.
    * any other label opens a question only when the previous token was
      ``"O"``; otherwise it continues the question already open.

    :param test_text: the plain text to search for questions.
    :return: list of question strings, in document order, with every run of
        whitespace collapsed to a single space and the ends stripped.
    """
    _token_texts, span_starts, span_ends, features = convert_text_to_features(
        test_text)

    # The model works on a batch of sequences; this is a batch of one.
    sequences = [features]
    predicted = crf_text_model.predict(sequences)
    token_count = len(sequences[0])

    questions = []
    consumed = set()  # indices already absorbed into an earlier question
    previous_tag = "O"

    for pos in range(token_count):
        tag = predicted[0][pos]

        opens_question = (
            tag != "O"
            and pos not in consumed
            and (previous_tag == "O" or tag == "B")
        )
        if opens_question:
            char_from = span_starts[pos]
            # If no closing token is found the question runs to the end of the text.
            char_to = len(test_text)

            nxt = pos + 1
            while nxt < token_count:
                if predicted[0][nxt] in ("O", "B"):
                    # The question ends with the token just before the boundary.
                    char_to = span_ends[nxt - 1]
                    break
                consumed.add(nxt)
                nxt += 1

            fragment = test_text[char_from:char_to]
            questions.append(re.sub(r'\s+', ' ', fragment).strip())

        previous_tag = tag

    return questions


def convert_pdf_to_instruments(file: RawFile) -> Instrument:
    """Convert an uploaded PDF into a one-element list holding its instrument.

    Steps:

    1. If ``file.text_content`` is empty, populate it by sending
       ``file.content`` through Tika (``parse_pdf_to_plain_text``).
    2. Extract the question strings from the plain text with ``predict``.
    3. Build one instrument from those questions, using ``file.file_name`` as
       both the instrument name and the file name.

    NOTE(review): the return annotation says ``Instrument`` but the value
    returned is a list containing one instrument. The annotation is left as
    it was so the visible signature does not change; confirm against callers
    before correcting it.

    :param file: the uploaded file; ``text_content`` is filled in as a side
        effect when it was empty.
    :return: a list with the single instrument built from the PDF.
    """
    # file is an object containing these properties:
    # content: str - The raw file contents so if it's a PDF this is a byte sequence in base 64 encoding
    # text_content: str - this is empty but we will use Tika to populate this in this method
    # tables: list - this is a list of all the tables in the document. The front end has populated this field.

    # Only call Tika when the text has not been extracted already.
    if not file.text_content:
        file.text_content = parse_pdf_to_plain_text(file.content)  # call Tika to convert the PDF to plain text

    questions_from_text = predict(file.text_content)

    instrument = harmony.create_instrument_from_list(questions_from_text, instrument_name=file.file_name,
                                                     file_name=file.file_name)
    return [instrument]
Binary file added src/harmony/parsing/rf_table_model.pkl
Binary file not shown.
27 changes: 0 additions & 27 deletions src/harmony/parsing/text_extraction/__init__.py

This file was deleted.

Loading

0 comments on commit 7f648c3

Please sign in to comment.