Update library
woodthom2 committed Jul 1, 2023
1 parent 2cf1cba commit 5fa7384
Showing 5 changed files with 244 additions and 63 deletions.
17 changes: 17 additions & 0 deletions README.md
@@ -37,6 +37,23 @@ You can install from [PyPI](https://pypi.org/project/harmonydata/0.1.0/).
pip install harmonydata
```

## Loading all models

Harmony uses spaCy to help with text extraction from PDFs. spaCy models can be downloaded with the following command in Python:

```
from harmony import download_models
download_models()
```

## Optional environment variables

As an alternative to downloading models, you can set environment variables so that Harmony calls spaCy on a remote server. This is only necessary if you are making a server deployment of Harmony. An example of setting these variables follows the list below.

- `HARMONY_CLASSIFIER_ENDPOINT` - this can be an Azure Functions deployment of the text triage spaCy model. Example: https://twspacytest.azurewebsites.net/api/triage
- `HARMONY_NER_ENDPOINT` - this can be an Azure Functions deployment of the NER spaCy model. Example: https://twspacytest.azurewebsites.net/api/ner
- `HARMONY_DATA_PATH` - determines where model files are stored. Defaults to `HOME DIRECTORY/harmony`
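
For example, in a Python-based server deployment you could set these variables before Harmony parses any documents. This is a minimal sketch, reusing the sample Azure Functions endpoints above; in practice you would point them at your own deployment, and the data path shown is only an illustrative value:

```
import os

# Sample endpoints from this README; replace with your own deployment
os.environ["HARMONY_NER_ENDPOINT"] = "https://twspacytest.azurewebsites.net/api/ner"
os.environ["HARMONY_CLASSIFIER_ENDPOINT"] = "https://twspacytest.azurewebsites.net/api/triage"

# Optional: store downloaded model files somewhere other than the default ~/harmony
os.environ["HARMONY_DATA_PATH"] = "/srv/harmony-models"  # illustrative path, not a required location
```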

## Loading instruments from PDFs

If you have a local file, you can load it into a list of `Instrument` instances:
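
A minimal sketch of what this looks like, assuming a loader named `load_instruments_from_local_file` and a local file called `questionnaire.pdf` (both names are placeholders here):

```
from harmony import load_instruments_from_local_file

# Hypothetical local PDF; any questionnaire file path would do
instruments = load_instruments_from_local_file("questionnaire.pdf")
print(len(instruments), "instrument(s) loaded")
```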
1 change: 1 addition & 0 deletions src/harmony/__init__.py
@@ -9,6 +9,7 @@
from .parsing import *
from .schemas import *
from .matching.matcher import match_instruments_with_function
from .util.model_helper import download_models
try:
    from .matching.default_matcher import match_instruments
except:
@@ -1,32 +1,44 @@
import spacy
import numpy as np
import re
import operator
from harmony.parsing.text_extraction.spacy_wrapper import mark_is_all_letters, \
get_candidate_questions_and_mark_as_spans, set_is_numbered_bullet, mark_candidate_options_as_spans
from harmony.parsing.text_extraction.smart_document_parser import parse_document, nlp, convert_to_dataframe, get_questions, add_candidate_options
from harmony.schemas.requests.text import RawFile, Instrument, Question
from harmony.parsing.text_extraction.options_words import OPTIONS_WORDS
from harmony.parsing.text_extraction.smart_table_analyser import get_questions_from_tables
from harmony.parsing.text_extraction.dictionary_options_matcher import options_matcher
import json
import os
import re

import numpy as np
import requests
import urllib
import json
from spacy.tokens import DocBin
import spacy
from harmony.parsing.text_extraction.smart_document_parser import nlp, convert_to_dataframe, \
get_questions, add_candidate_options
from spacy.tokens import DocBin

from harmony.parsing.text_extraction.dictionary_options_matcher import options_matcher
from harmony.parsing.text_extraction.options_words import OPTIONS_WORDS
from harmony.parsing.text_extraction.smart_table_analyser import get_questions_from_tables
from harmony.parsing.text_extraction.spacy_wrapper import mark_is_all_letters, \
get_candidate_questions_and_mark_as_spans, set_is_numbered_bullet, mark_candidate_options_as_spans
from harmony.schemas.requests.text import Question

nlp = spacy.blank("en")

# data_path = os.getenv("DATA_PATH")
spacy_models = {"ner":None, "triage":None}

def load_spacy_models():
    if spacy_models["ner"] is None:
        if os.environ.get("HARMONY_NER_ENDPOINT") is None or os.environ.get("HARMONY_NER_ENDPOINT") == "":
            path = os.getenv("HARMONY_DATA_PATH", os.path.expanduser("~") + "/harmony") + '/11_ner_0_spacy/model-best'
            if not os.path.isdir(path):
                print(f"Could not find model at {path}")
                print("Please run:\nfrom harmony import download_models\ndownload_models()")
                raise Exception()
            spacy_models["ner"] = spacy.load(path)

# # The trained NER recogniser
# nlp = spacy.load(
#     data_path + '/11_ner_0_spacy/model-best')
#
#
# nlp_final_classifier = spacy.load(
#     data_path + '/29_classifier_spacy/model-best')
    if spacy_models["classifier"] is None:
        if os.environ.get("HARMONY_CLASSIFIER_ENDPOINT") is None or os.environ.get("HARMONY_CLASSIFIER_ENDPOINT") == "":
            path = os.getenv("HARMONY_DATA_PATH", os.path.expanduser("~") + "/harmony") + '/29_classifier_spacy/model-best'
            if not os.path.isdir(path):
                print(f"Could not find model at {path}")
                print ("Please run:\nfrom harmony import download_models\ndownload_models()")
                raise Exception()
            spacy_models["classifier"] = spacy.load(path)


def add_manual_features(doc):
@@ -37,12 +49,17 @@ def add_manual_features(doc):


def annotate_document(page_text):
    response = requests.get(
        'https://twspacytest.azurewebsites.net/api/ner', json={"text": json.dumps([page_text])})
    doc_bin = DocBin().from_bytes(response.content)
    doc = list(doc_bin.get_docs(nlp.vocab))[0]
    load_spacy_models()

    # This function reads named entity spans (doc.ents), so it needs the NER endpoint/model.
    if os.environ.get("HARMONY_NER_ENDPOINT") is not None and os.environ.get(
            "HARMONY_NER_ENDPOINT") != "":
        response = requests.get(
            os.environ.get("HARMONY_NER_ENDPOINT"), json={"text": json.dumps([page_text])})
        doc_bin = DocBin().from_bytes(response.content)
        doc = list(doc_bin.get_docs(nlp.vocab))[0]
    else:
        doc = spacy_models["ner"](page_text)

    # doc = nlp(page_text)
    add_manual_features(doc)

    df = convert_to_dataframe(doc)
@@ -51,7 +68,7 @@ def annotate_document(page_text):

    add_candidate_options(df, doc)

    token_classes = np.zeros((2,len(doc, )))
    token_classes = np.zeros((2, len(doc, )))

    for span in doc.ents:
        for ctr, token in enumerate(span):
@@ -71,17 +88,11 @@

    return token_classes, doc, df

def extract_questions(page_text, tables):
    all_annotations, doc,df = annotate_document(page_text)

def extract_questions(page_text, tables):
    all_annotations, doc, df = annotate_document(page_text)

    questions = []
    # call to rule-based only
    # for idx in range(len(df)):
    #     if df.is_question_to_include.iloc[idx]:
    #         questions.append(Question(question_text = re.sub(r'\n', ' ', df.span.iloc[idx].text)))
    # if len(questions) > 0:
    #     return questions, all_annotations, df

    cur_question_text = None

@@ -106,12 +117,15 @@ def extract_questions(page_text, tables):
            if cur_question_text is not None:
                cur_question_text = re.sub(r'^- +', '', re.sub(r'\s+', ' ', cur_question_text).strip())
                if cur_question_text.lower() not in OPTIONS_WORDS:
                    questions.append(Question(question_text = cur_question_text, question_intro="", question_no=f"{len(questions)+1}", options=[]))
                    questions.append(Question(question_text=cur_question_text, question_intro="",
                                              question_no=f"{len(questions) + 1}", options=[]))
                cur_question_text = None
    if cur_question_text is not None:
        cur_question_text = re.sub(r'^- +', '', re.sub(r'\s+', ' ', cur_question_text).strip())
        if cur_question_text.lower() not in OPTIONS_WORDS:
            questions.append(Question(question_text=cur_question_text, question_intro="", question_no=f"{len(questions)+1}", options=[]))
            questions.append(
                Question(question_text=cur_question_text, question_intro="", question_no=f"{len(questions) + 1}",
                         options=[]))

    # If any tables were detected in the PDF, extract questions from tables.
    if len(tables) > 0:
@@ -122,35 +136,20 @@ def extract_questions(page_text, tables):
        questions = questions_from_tables

    questions_triaged = []
    response = requests.get(
        'https://twspacytest.azurewebsites.net/api/triage', json={"text": json.dumps([q.question_text for q in questions])})
    doc_bin = DocBin().from_bytes(response.content)

    for question, question_as_doc in zip(questions, doc_bin.get_docs(nlp.vocab)):
    # The triage step reads document categories (doc.cats), so it needs the classifier endpoint/model.
    if os.environ.get("HARMONY_CLASSIFIER_ENDPOINT") is not None and os.environ.get("HARMONY_CLASSIFIER_ENDPOINT") != "":
        response = requests.get(
            os.environ.get("HARMONY_CLASSIFIER_ENDPOINT"), json={"text": json.dumps([q.question_text for q in questions])})
        doc_bin = DocBin().from_bytes(response.content)
        docs = doc_bin.get_docs(nlp.vocab)
    else:
        docs = spacy_models["classifier"].pipe([q.question_text for q in questions])

    for question, question_as_doc in zip(questions, docs):
        if question_as_doc.cats["1"] > 0.5:
            questions_triaged.append(question)
        else:
            print ("Excluding question", question.question_text)
            print("Excluding question", question.question_text)
    if len(questions_triaged) > len(questions) / 2 and len(questions_triaged) > 5:
        questions = questions_triaged

# Remove common suffixes
# from collections import Counter
# suffixes = Counter()
# for q in questions:
# toks = q.question_text.split(" ")
# for i in range(1, 4):
# if i < len(toks) - 2:
# suffix = " ".join(toks[-i:])
# suffixes[suffix] += 1
# if len(suffixes) > 0:
# sorted_suffixes = sorted(suffixes.items(), key = operator.itemgetter(1))
# if sorted_suffixes[0][1] > len(questions) / 2 and sorted_suffixes[0][1] > 4:
# print ("Removing", sorted_suffixes[0][1])
# for q in questions:
# try:
# q.question_text = re.sub(sorted_suffixes[0][1] + "$", "", q.question_text)
# except:
# pass

    return questions, all_annotations, df
108 changes: 108 additions & 0 deletions src/harmony/parsing/text_extraction/smart_document_parser.py
@@ -0,0 +1,108 @@
import re

import numpy as np
import pandas as pd
from spacy.tokens import Span
from harmony.parsing.text_extraction.sequence_finder import find_longest_uninterrupted_sequence
from harmony.parsing.text_extraction.spacy_wrapper import nlp
from harmony.schemas.requests.text import Question
from harmony.parsing.text_extraction.options_extractor import add_candidate_options


def normalise(text):
    return re.sub(r'\W', '', text.lower())


def clean_question(text):
    return re.sub(r'^\s*(-|\))\s*|\s*(-|\()\s*$', '', re.sub(r'\s+', ' ', text)).strip()


def get_question_from_span(question_span):
    """
    Get the text of a question, excluding any of the leading or trailing Likert options
    :param question_span:
    :return:
    """
    doc = question_span.doc
    tokens_to_include = set(range(question_span.start, question_span.end))

    # Logic to delete Likert options from end of text
    tokens_to_exclude = set()
    for option_span in doc.spans['CANDIDATE_OPTION']:
        for i in range(option_span.start, option_span.end):
            tokens_to_exclude.add(i)

    for i in tokens_to_exclude:
        if i + 1 in tokens_to_exclude or i - 1 in tokens_to_exclude:
            if i in tokens_to_include:
                tokens_to_include.remove(i)

    if len(tokens_to_include) == 0:
        return ""
    start = question_span.start
    end = max(tokens_to_include) + 1
    if start < end:
        question_span = doc[start:end]

    return clean_question(question_span.text)


def convert_to_dataframe(doc, is_training=False):
    df = pd.DataFrame({"span": list(doc.spans['CANDIDATE_QUESTION'])})

    if is_training:
        # The "question" column is only added further down, so read the ground truth from the spans themselves.
        df["ground_truth"] = df.span.apply(lambda span: span._.ground_truth)

    # df["question"] = df["span"].apply(lambda span: clean_question(span.text))
    df["question"] = df["span"].apply(lambda span: get_question_from_span(span))

    df["preceding_bullet_value"] = df["span"].apply(lambda span: span._.preceding_bullet_value)

    return df


def is_acceptable_span(span: Span) -> bool:
    if span.end - span.start < 2:
        return False
    question = get_question_from_span(span)
    non_whitespace_text = re.sub(r'\W', '', question)
    if len(non_whitespace_text) < 10:
        return False
    return True


def get_questions(df):
    preceding_bullet_values = list(df.preceding_bullet_value)
    longest_uninterrupted_sequence = find_longest_uninterrupted_sequence(preceding_bullet_values)

    if longest_uninterrupted_sequence is not None:
        is_question_to_include = np.zeros((len(df),), dtype=bool)
        for idx, seq_type, value in longest_uninterrupted_sequence:
            is_question_to_include[idx] = 1
        df["is_question_to_include"] = is_question_to_include
    else:
        # df["prediction"] = list(predictions)
        # df["is_question_to_include"] = df["prediction"] == 2
        df["is_question_to_include"] = df.span.apply(is_acceptable_span)

    df_pred = df[df["is_question_to_include"]]
    df_pred.rename(columns={"preceding_bullet_value": "question_no"}, inplace=True)

    return df_pred


def parse_document(text):
    doc = nlp(text)
    df = convert_to_dataframe(doc)

    df = get_questions(df)
    add_candidate_options(df, doc)

    questions = []
    for idx in range(len(df)):
        if df.is_question_to_include.iloc[idx]:
            options = df.options.iloc[idx]
            question = Question(question_no=df.question_no.iloc[idx], question_intro="", question_text=df.question.iloc[idx], options=list(options))
            questions.append(question)

    return questions
56 changes: 56 additions & 0 deletions src/harmony/util/model_helper.py
@@ -0,0 +1,56 @@
import os
import requests

files = ["11_ner_0_spacy/model-best/config.cfg",
"11_ner_0_spacy/model-best/meta.json",
"11_ner_0_spacy/model-best/ner/cfg",
"11_ner_0_spacy/model-best/ner/model",
"11_ner_0_spacy/model-best/ner/moves",
"11_ner_0_spacy/model-best/tok2vec/.gitattributes",
"11_ner_0_spacy/model-best/tok2vec/cfg",
"11_ner_0_spacy/model-best/tok2vec/model",
"11_ner_0_spacy/model-best/tokenizer",
"11_ner_0_spacy/model-best/vocab/key2row",
"11_ner_0_spacy/model-best/vocab/lookups.bin",
"11_ner_0_spacy/model-best/vocab/strings.json",
"11_ner_0_spacy/model-best/vocab/vectors",
"11_ner_0_spacy/model-best/vocab/vectors.cfg",
"29_classifier_spacy/model-best/.gitattributes",
"29_classifier_spacy/model-best/config.cfg",
"29_classifier_spacy/model-best/meta.json",
"29_classifier_spacy/model-best/textcat/cfg",
"29_classifier_spacy/model-best/textcat/model",
"29_classifier_spacy/model-best/tok2vec/cfg",
"29_classifier_spacy/model-best/tok2vec/model",
"29_classifier_spacy/model-best/tokenizer",
"29_classifier_spacy/model-best/vocab/key2row",
"29_classifier_spacy/model-best/vocab/lookups.bin",
"29_classifier_spacy/model-best/vocab/strings.json",
"29_classifier_spacy/model-best/vocab/vectors",
"29_classifier_spacy/model-best/vocab/vectors.cfg",
]
def download_models(is_force=False):
    """
    Downloads spaCy models to local.
    """
    local_path = os.getenv("HARMONY_DATA_PATH", os.path.expanduser("~") + "/harmony")

    print("Downloading spaCy models to " + local_path + ".\nSet environment variable HARMONY_DATA_PATH if you want to change model file location.")

    remote_base = "https://raw.githubusercontent.com/harmonydata/models/main/"

    for file_to_download in files:
        url = remote_base + file_to_download
        local_filename = local_path + "/" + file_to_download
        if os.path.exists(local_filename) and not is_force:
            print("File exists: ", local_filename)
            print("Exiting.\nRun download_models(True) to force redownload.")
            break

        r = requests.get(url)

        if not os.path.isdir(os.path.dirname(local_filename)):
            os.makedirs(os.path.dirname(local_filename))

        with open(local_filename, 'wb') as f:
            f.write(r.content)
