Commit
fix JSON serialization and encoding to prepare query; updated setup for PY37; updated doc and example for process huge text
Lucaterre committed Aug 1, 2022
1 parent 7e13a22 commit cb19878
Showing 4 changed files with 22 additions and 7 deletions.
13 changes: 9 additions & 4 deletions README.md
@@ -466,10 +466,15 @@ doc._.metadata

### How to process a long text?

When processing a long text it is possible to raise an error due to the limit set by `nlp.max_length`.
It is possible to apply spaCy fishing on a very long text with [`nlp.pipe()`](https://spacy.io/api/language#pipe) method.
We provide an example with the script [`process_long_text.py`](examples/process_long_text.py) that guarantees
to pass all the context of the entities to be disambiguated in entity-fishing.
Processing NER and disambiguating entities in a long text can be really tricky.
In fact, spaCy can raise an exception due to the default limit parameter `nlp.max_length`.
The strategy here is to pass the text as a batch of sentences with the [`nlp.pipe()`](https://spacy.io/api/language#pipe) method and
then pass the entities to spacyfishing with the full context (not only the sentences, to help disambiguation) and
with continuous character offsets (start and end character positions are re-calculated).
You can use the provided script [`process_long_text.py`](examples/process_long_text.py) to help process huge texts.
For example, a text with `2 073` sentences and `12 901` entities to disambiguate can be processed in about a minute (with no extra information)
and in less than 1 minute 30 (with extra information and the properties filter applied).
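The offset re-calculation described above can be sketched as follows. This is a minimal, self-contained illustration (not the actual script): the sentence-local entity spans are hypothetical stand-ins for what a spaCy NER component would produce per sentence via `nlp.pipe()`, and the `+1` accounts for the space used when joining sentences back into the full text.

```python
# Hypothetical sentence-local (text, start, end) spans, as an NER
# component would yield them for each sentence in nlp.pipe().
sentences = ["Barack Obama was born in Hawaii.", "He was elected in 2008."]
local_entities = [
    [("Barack Obama", 0, 12), ("Hawaii", 25, 31)],
    [("2008", 18, 22)],
]

full_text = " ".join(sentences)
global_entities = []
offset = 0
for sent, ents in zip(sentences, local_entities):
    for text, start, end in ents:
        # shift sentence-local offsets into full-text coordinates
        global_entities.append((text, offset + start, offset + end))
    offset += len(sent) + 1  # +1 for the joining space

# every re-calculated span must point at its surface form in the full text
for text, start, end in global_entities:
    assert full_text[start:end] == text
```

With continuous offsets like these, the whole concatenated text can be sent to entity-fishing as context while each entity still maps exactly onto it.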


## Configuration parameters

7 changes: 7 additions & 0 deletions examples/process_long_text.py
@@ -18,6 +18,8 @@
3. Apply complete pipeline on text and retrieve results.
"""

import time

import spacy
from spacy import Language
from spacy.tokens import Doc
@@ -67,6 +69,7 @@ def __call__(self, doc: Doc):


if __name__ == '__main__':
start_time = time.time()
# Set model, language, file that contains text to analyze
model = "en_core_web_sm"
language = "en"
@@ -76,6 +79,9 @@ def __call__(self, doc: Doc):
sentences = text_preprocessor(open_file(filename))
huge_text = " ".join(sentences)

print(f"* Total characters in document : {len(huge_text)}")
print(f"* Total sentences in document : {len(sentences)}")

# Create pipeline
huge_pipeline_linking = spacy.blank(language)
huge_pipeline_linking.add_pipe('custom_ner', config={"model_name": model, "sentences_to_process": sentences})
@@ -88,3 +94,4 @@ def __call__(self, doc: Doc):
for ent in doc_linked.ents:
print(ent.text, ent.label_, ent._.kb_qid)

print("--- %s seconds ---" % (time.time() - start_time))
2 changes: 1 addition & 1 deletion setup.py
@@ -35,7 +35,7 @@
install_requires=install_requires,
packages=find_packages(),
classifiers=CLASSIFIERS,
python_requires='>=3.8',
python_requires='>=3.7',
entry_points={
'spacy_factories': 'entityfishing = spacyfishing.entity_fishing_linker:EntityFishing'
},
7 changes: 5 additions & 2 deletions spacyfishing/entity_fishing_linker.py
@@ -8,6 +8,7 @@

import logging
from typing import Tuple
import json

import requests

@@ -186,8 +187,9 @@ def prepare_data(text: str, terms: str, entities: list, language: dict, full: bo
Returns:
dict (dict): data ready to send.
"""

return {
"query": str({
"query": json.dumps({
"text": text,
"shortText": terms,
"language": language,
@@ -201,7 +203,7 @@ def prepare_data(text: str, terms: str, entities: list, language: dict, full: bo
"mentions": [],
"customisation": "generic",
"full": "true" if full else "false"
})
}, ensure_ascii=False)
}

def updated_entities(self, doc: Doc, response: list) -> None:
@@ -339,6 +341,7 @@ def main_disambiguation_process(self,
entities=entities,
language=self.language,
full=self.flag_extra)

req = self.disambiguate_text(files=data_to_post)
res, metadata = self.process_response(response=req)
try:
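The diff above replaces `str(...)` with `json.dumps(..., ensure_ascii=False)` when building the `query` field. This matters because Python's `str()` on a dict produces a Python repr (single quotes, `True`/`False`), which is not valid JSON, while `ensure_ascii=False` keeps accented characters intact instead of `\uXXXX` escapes. A minimal sketch with a hypothetical payload:

```python
import json

payload = {"text": "Déjà vu", "full": "true", "mentions": []}

# str() yields a Python repr with single quotes -- not parseable as JSON,
# so a server expecting JSON cannot reliably decode it.
as_str = str(payload)

# json.dumps() emits standards-compliant JSON; ensure_ascii=False keeps
# "Déjà" as-is rather than escaping it to \u00e9 sequences.
as_json = json.dumps(payload, ensure_ascii=False)

json.loads(as_json)  # round-trips cleanly
try:
    json.loads(as_str)
except json.JSONDecodeError:
    print("str() output is not valid JSON")
```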
