Update library
woodthom2 committed Jul 1, 2023
1 parent 2cf1cba commit 5fa7384
Showing 5 changed files with 244 additions and 63 deletions.
17 changes: 17 additions & 0 deletions README.md
@@ -37,6 +37,23 @@ You can install from [PyPI](https://pypi.org/project/harmonydata/0.1.0/).
pip install harmonydata
```

## Loading all models

Harmony uses spaCy to help with text extraction from PDFs. spaCy models can be downloaded with the following command in Python:

```
from harmony import download_models
download_models()
```

## Optional environment variables

As an alternative to downloading models, you can set environment variables so that Harmony calls spaCy on a remote server. This is only necessary if you are making a server deployment of Harmony. An example of setting these variables follows the list below.

- `HARMONY_CLASSIFIER_ENDPOINT` - this can be an Azure Functions deployment of the text triage spaCy model. Example: https://twspacytest.azurewebsites.net/api/triage
- `HARMONY_NER_ENDPOINT` - this can be an Azure Functions deployment of the NER spaCy model. Example: https://twspacytest.azurewebsites.net/api/ner
- `HARMONY_DATA_PATH` - determines where model files are stored. Defaults to `HOME DIRECTORY/harmony`
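
For example, in a Python-based server deployment you could set these variables before Harmony parses any documents. This is a minimal sketch, reusing the sample Azure Functions endpoints above; in practice you would point them at your own deployment, and the data path shown is only an illustrative value:

```
import os

# Sample endpoints from this README; replace with your own deployment
os.environ["HARMONY_NER_ENDPOINT"] = "https://twspacytest.azurewebsites.net/api/ner"
os.environ["HARMONY_CLASSIFIER_ENDPOINT"] = "https://twspacytest.azurewebsites.net/api/triage"

# Optional: store downloaded model files somewhere other than the default ~/harmony
os.environ["HARMONY_DATA_PATH"] = "/srv/harmony-models"  # illustrative path, not a required location
```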

## Loading instruments from PDFs

If you have a local file, you can load it into a list of `Instrument` instances:
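
A minimal sketch of what this looks like, assuming a loader named `load_instruments_from_local_file` and a local file called `questionnaire.pdf` (both names are placeholders here):

```
from harmony import load_instruments_from_local_file

# Hypothetical local PDF; any questionnaire file path would do
instruments = load_instruments_from_local_file("questionnaire.pdf")
print(len(instruments), "instrument(s) loaded")
```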
1 change: 1 addition & 0 deletions src/harmony/__init__.py
@@ -9,6 +9,7 @@
from .parsing import *
from .schemas import *
from .matching.matcher import match_instruments_with_function
from .util.model_helper import download_models
try:
    from .matching.default_matcher import match_instruments
except:
@@ -1,32 +1,44 @@
import spacy
import numpy as np
import re
import operator
from harmony.parsing.text_extraction.spacy_wrapper import mark_is_all_letters, \
get_candidate_questions_and_mark_as_spans, set_is_numbered_bullet, mark_candidate_options_as_spans
from harmony.parsing.text_extraction.smart_document_parser import parse_document, nlp, convert_to_dataframe, get_questions, add_candidate_options
from harmony.schemas.requests.text import RawFile, Instrument, Question
from harmony.parsing.text_extraction.options_words import OPTIONS_WORDS
from harmony.parsing.text_extraction.smart_table_analyser import get_questions_from_tables
from harmony.parsing.text_extraction.dictionary_options_matcher import options_matcher
import json
import os
import re

import numpy as np
import requests
import urllib
import json
from spacy.tokens import DocBin
import spacy
from harmony.parsing.text_extraction.smart_document_parser import nlp, convert_to_dataframe, \
get_questions, add_candidate_options
from spacy.tokens import DocBin

from harmony.parsing.text_extraction.dictionary_options_matcher import options_matcher
from harmony.parsing.text_extraction.options_words import OPTIONS_WORDS
from harmony.parsing.text_extraction.smart_table_analyser import get_questions_from_tables
from harmony.parsing.text_extraction.spacy_wrapper import mark_is_all_letters, \
get_candidate_questions_and_mark_as_spans, set_is_numbered_bullet, mark_candidate_options_as_spans
from harmony.schemas.requests.text import Question

nlp = spacy.blank("en")

# data_path = os.getenv("DATA_PATH")
spacy_models = {"ner":None, "triage":None}

def load_spacy_models():
    if spacy_models["ner"] is None:
        if os.environ.get("HARMONY_NER_ENDPOINT") is None or os.environ.get("HARMONY_NER_ENDPOINT") == "":
            path = os.getenv("HARMONY_DATA_PATH", os.path.expanduser("~") + "/harmony") + '/11_ner_0_spacy/model-best'
            if not os.path.isdir(path):
                print(f"Could not find model at {path}")
                print("Please run:\nfrom harmony import download_models\ndownload_models()")
                raise Exception()
            spacy_models["ner"] = spacy.load(path)

# # The trained NER recogniser
# nlp = spacy.load(
#     data_path + '/11_ner_0_spacy/model-best')
#
#
# nlp_final_classifier = spacy.load(
#     data_path + '/29_classifier_spacy/model-best')
    if spacy_models["classifier"] is None:
        if os.environ.get("HARMONY_CLASSIFIER_ENDPOINT") is None or os.environ.get("HARMONY_CLASSIFIER_ENDPOINT") == "":
            path = os.getenv("HARMONY_DATA_PATH", os.path.expanduser("~") + "/harmony") + '/29_classifier_spacy/model-best'
            if not os.path.isdir(path):
                print(f"Could not find model at {path}")
                print ("Please run:\nfrom harmony import download_models\ndownload_models()")
                raise Exception()
            spacy_models["classifier"] = spacy.load(path)


def add_manual_features(doc):
@@ -37,12 +49,17 @@ def add_manual_features(doc):


def annotate_document(page_text):
    response = requests.get(
        'https://twspacytest.azurewebsites.net/api/ner', json={"text": json.dumps([page_text])})
    doc_bin = DocBin().from_bytes(response.content)
    doc = list(doc_bin.get_docs(nlp.vocab))[0]
    load_spacy_models()

    # This function reads named entity spans (doc.ents), so it needs the NER endpoint/model.
    if os.environ.get("HARMONY_NER_ENDPOINT") is not None and os.environ.get(
            "HARMONY_NER_ENDPOINT") != "":
        response = requests.get(
            os.environ.get("HARMONY_NER_ENDPOINT"), json={"text": json.dumps([page_text])})
        doc_bin = DocBin().from_bytes(response.content)
        doc = list(doc_bin.get_docs(nlp.vocab))[0]
    else:
        doc = spacy_models["ner"](page_text)

    # doc = nlp(page_text)
    add_manual_features(doc)

    df = convert_to_dataframe(doc)
@@ -51,7 +68,7 @@ def annotate_document(page_text):

    add_candidate_options(df, doc)

    token_classes = np.zeros((2,len(doc, )))
    token_classes = np.zeros((2, len(doc, )))

    for span in doc.ents:
        for ctr, token in enumerate(span):
@@ -71,17 +88,11 @@

    return token_classes, doc, df

def extract_questions(page_text, tables):
    all_annotations, doc,df = annotate_document(page_text)

def extract_questions(page_text, tables):
    all_annotations, doc, df = annotate_document(page_text)

    questions = []
    # call to rule-based only
    # for idx in range(len(df)):
    #     if df.is_question_to_include.iloc[idx]:
    #         questions.append(Question(question_text = re.sub(r'\n', ' ', df.span.iloc[idx].text)))
    # if len(questions) > 0:
    #     return questions, all_annotations, df

    cur_question_text = None

@@ -106,12 +117,15 @@ def extract_questions(page_text, tables):
            if cur_question_text is not None:
                cur_question_text = re.sub(r'^- +', '', re.sub(r'\s+', ' ', cur_question_text).strip())
                if cur_question_text.lower() not in OPTIONS_WORDS:
                    questions.append(Question(question_text = cur_question_text, question_intro="", question_no=f"{len(questions)+1}", options=[]))
                    questions.append(Question(question_text=cur_question_text, question_intro="",
                                              question_no=f"{len(questions) + 1}", options=[]))
                cur_question_text = None
    if cur_question_text is not None:
        cur_question_text = re.sub(r'^- +', '', re.sub(r'\s+', ' ', cur_question_text).strip())
        if cur_question_text.lower() not in OPTIONS_WORDS:
            questions.append(Question(question_text=cur_question_text, question_intro="", question_no=f"{len(questions)+1}", options=[]))
            questions.append(
                Question(question_text=cur_question_text, question_intro="", question_no=f"{len(questions) + 1}",
                         options=[]))

    # If any tables were detected in the PDF, extract questions from tables.
    if len(tables) > 0:
@@ -122,35 +136,20 @@ def extract_questions(page_text, tables):
        questions = questions_from_tables

    questions_triaged = []
    response = requests.get(
        'https://twspacytest.azurewebsites.net/api/triage', json={"text": json.dumps([q.question_text for q in questions])})
    doc_bin = DocBin().from_bytes(response.content)

    for question, question_as_doc in zip(questions, doc_bin.get_docs(nlp.vocab)):
    # The triage step reads document categories (doc.cats), so it needs the classifier endpoint/model.
    if os.environ.get("HARMONY_CLASSIFIER_ENDPOINT") is not None and os.environ.get("HARMONY_CLASSIFIER_ENDPOINT") != "":
        response = requests.get(
            os.environ.get("HARMONY_CLASSIFIER_ENDPOINT"), json={"text": json.dumps([q.question_text for q in questions])})
        doc_bin = DocBin().from_bytes(response.content)
        docs = doc_bin.get_docs(nlp.vocab)
    else:
        docs = spacy_models["classifier"].pipe([q.question_text for q in questions])

    for question, question_as_doc in zip(questions, docs):
        if question_as_doc.cats["1"] > 0.5:
            questions_triaged.append(question)
        else:
            print ("Excluding question", question.question_text)
            print("Excluding question", question.question_text)
    if len(questions_triaged) > len(questions) / 2 and len(questions_triaged) > 5:
        questions = questions_triaged

# Remove common suffixes
# from collections import Counter
# suffixes = Counter()
# for q in questions:
# toks = q.question_text.split(" ")
# for i in range(1, 4):
# if i < len(toks) - 2:
# suffix = " ".join(toks[-i:])
# suffixes[suffix] += 1
# if len(suffixes) > 0:
# sorted_suffixes = sorted(suffixes.items(), key = operator.itemgetter(1))
# if sorted_suffixes[0][1] > len(questions) / 2 and sorted_suffixes[0][1] > 4:
# print ("Removing", sorted_suffixes[0][1])
# for q in questions:
# try:
# q.question_text = re.sub(sorted_suffixes[0][1] + "$", "", q.question_text)
# except:
# pass

    return questions, all_annotations, df
108 changes: 108 additions & 0 deletions src/harmony/parsing/text_extraction/smart_document_parser.py
@@ -0,0 +1,108 @@
import re

import numpy as np
import pandas as pd
from spacy.tokens import Span
from harmony.parsing.text_extraction.sequence_finder import find_longest_uninterrupted_sequence
from harmony.parsing.text_extraction.spacy_wrapper import nlp
from harmony.schemas.requests.text import Question
from harmony.parsing.text_extraction.options_extractor import add_candidate_options


def normalise(text):
    return re.sub(r'\W', '', text.lower())


def clean_question(text):
    return re.sub(r'^\s*(-|\))\s*|\s*(-|\()\s*$', '', re.sub(r'\s+', ' ', text)).strip()


def get_question_from_span(question_span):
    """
    Get the text of a question, excluding any of the leading or trailing Likert options
    :param question_span:
    :return:
    """
    doc = question_span.doc
    tokens_to_include = set(range(question_span.start, question_span.end))

    # Logic to delete Likert options from end of text
    tokens_to_exclude = set()
    for option_span in doc.spans['CANDIDATE_OPTION']:
        for i in range(option_span.start, option_span.end):
            tokens_to_exclude.add(i)

    for i in tokens_to_exclude:
        if i + 1 in tokens_to_exclude or i - 1 in tokens_to_exclude:
            if i in tokens_to_include:
                tokens_to_include.remove(i)

    if len(tokens_to_include) == 0:
        return ""
    start = question_span.start
    end = max(tokens_to_include) + 1
    if start < end:
        question_span = doc[start:end]

    return clean_question(question_span.text)


def convert_to_dataframe(doc, is_training=False):
    df = pd.DataFrame({"span": list(doc.spans['CANDIDATE_QUESTION'])})

    if is_training:
        # The "question" column is only added further down, so read the ground truth from the spans themselves.
        df["ground_truth"] = df.span.apply(lambda span: span._.ground_truth)

    # df["question"] = df["span"].apply(lambda span: clean_question(span.text))
    df["question"] = df["span"].apply(lambda span: get_question_from_span(span))

    df["preceding_bullet_value"] = df["span"].apply(lambda span: span._.preceding_bullet_value)

    return df


def is_acceptable_span(span: Span) -> bool:
    if span.end - span.start < 2:
        return False
    question = get_question_from_span(span)
    non_whitespace_text = re.sub(r'\W', '', question)
    if len(non_whitespace_text) < 10:
        return False
    return True


def get_questions(df):
    preceding_bullet_values = list(df.preceding_bullet_value)
    longest_uninterrupted_sequence = find_longest_uninterrupted_sequence(preceding_bullet_values)

    if longest_uninterrupted_sequence is not None:
        is_question_to_include = np.zeros((len(df),), dtype=bool)
        for idx, seq_type, value in longest_uninterrupted_sequence:
            is_question_to_include[idx] = 1
        df["is_question_to_include"] = is_question_to_include
    else:
        # df["prediction"] = list(predictions)
        # df["is_question_to_include"] = df["prediction"] == 2
        df["is_question_to_include"] = df.span.apply(is_acceptable_span)

    df_pred = df[df["is_question_to_include"]]
    df_pred.rename(columns={"preceding_bullet_value": "question_no"}, inplace=True)

    return df_pred


def parse_document(text):
    doc = nlp(text)
    df = convert_to_dataframe(doc)

    df = get_questions(df)
    add_candidate_options(df, doc)

    questions = []
    for idx in range(len(df)):
        if df.is_question_to_include.iloc[idx]:
            options = df.options.iloc[idx]
            question = Question(question_no=df.question_no.iloc[idx], question_intro="", question_text=df.question.iloc[idx], options=list(options))
            questions.append(question)

    return questions
56 changes: 56 additions & 0 deletions src/harmony/util/model_helper.py
@@ -0,0 +1,56 @@
import os
import requests

files = ["11_ner_0_spacy/model-best/config.cfg",
"11_ner_0_spacy/model-best/meta.json",
"11_ner_0_spacy/model-best/ner/cfg",
"11_ner_0_spacy/model-best/ner/model",
"11_ner_0_spacy/model-best/ner/moves",
"11_ner_0_spacy/model-best/tok2vec/.gitattributes",
"11_ner_0_spacy/model-best/tok2vec/cfg",
"11_ner_0_spacy/model-best/tok2vec/model",
"11_ner_0_spacy/model-best/tokenizer",
"11_ner_0_spacy/model-best/vocab/key2row",
"11_ner_0_spacy/model-best/vocab/lookups.bin",
"11_ner_0_spacy/model-best/vocab/strings.json",
"11_ner_0_spacy/model-best/vocab/vectors",
"11_ner_0_spacy/model-best/vocab/vectors.cfg",
"29_classifier_spacy/model-best/.gitattributes",
"29_classifier_spacy/model-best/config.cfg",
"29_classifier_spacy/model-best/meta.json",
"29_classifier_spacy/model-best/textcat/cfg",
"29_classifier_spacy/model-best/textcat/model",
"29_classifier_spacy/model-best/tok2vec/cfg",
"29_classifier_spacy/model-best/tok2vec/model",
"29_classifier_spacy/model-best/tokenizer",
"29_classifier_spacy/model-best/vocab/key2row",
"29_classifier_spacy/model-best/vocab/lookups.bin",
"29_classifier_spacy/model-best/vocab/strings.json",
"29_classifier_spacy/model-best/vocab/vectors",
"29_classifier_spacy/model-best/vocab/vectors.cfg",
]
def download_models(is_force=False):
    """
    Downloads spaCy models to local.
    """
    local_path = os.getenv("HARMONY_DATA_PATH", os.path.expanduser("~") + "/harmony")

    print("Downloading spaCy models to " + local_path + ".\nSet environment variable HARMONY_DATA_PATH if you want to change model file location.")

    remote_base = "https://raw.githubusercontent.com/harmonydata/models/main/"

    for file_to_download in files:
        url = remote_base + file_to_download
        local_filename = local_path + "/" + file_to_download
        if os.path.exists(local_filename) and not is_force:
            print("File exists: ", local_filename)
            print("Exiting.\nRun download_models(True) to force redownload.")
            break

        r = requests.get(url)

        if not os.path.isdir(os.path.dirname(local_filename)):
            os.makedirs(os.path.dirname(local_filename))

        with open(local_filename, 'wb') as f:
            f.write(r.content)
