Skip to content

Commit

Permalink
Merge branch '59_fix_missaligned' into 'develop'
Browse files Browse the repository at this point in the history
59 fix missaligned

Closes #59

See merge request llave-en-mano/ia2/ia2-cli!47
  • Loading branch information
cuococarlos committed May 18, 2021
2 parents 1c6763e + c4a4913 commit 1ccc4d8
Show file tree
Hide file tree
Showing 4 changed files with 194 additions and 20 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ El archivo de configuración `train_config.json` se debe generar a partir de `ex
- `path_data_testing`: directorio de la data de testing para evaluar el modelo. Si está incluida esta opción los conjuntos de entrenamiento y validación serán combinados y utilizados para entrenamiento. Ver [F. Chollet, _Deep Learning in Python_ , cap. 4.2](https://livebook.manning.com/book/deep-learning-with-python/chapter-4/44)
- `evaluate`: valor que determina qué conjunto de datos usar para evaluar el modelo. Opciones `test` / `val`. `val` es el valor por defecto y no es necesario incluirlo
- `is_raw`: valor booleano que determina si el archivo será convertido (cuando is_raw sea True)
- `train_subset`: si el valor es diferente de cero, se usará un subset del dataset tomado aleatoriamente (número entero)
- `save_misaligneds_to_file`: valor booleano que determina si se guardarán en un archivo json las annotations que estén desalineadas y provoquen que el documento analizado se ignore para su uso
- `model_path`: directorio del modelo custom a utilizar
- `save_model_path`: directorio donde se guardará el modelo generado a partir del entrenamiento
- `entities`: entidades a ser usadas para el entrenamiento.
Expand Down
Empty file removed data/.gitkeep
Empty file.
2 changes: 2 additions & 0 deletions example_train_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"path_data_validation": "data/your_validation_data.json",
"path_data_testing": "data/unified/testing.json",
"is_raw": false,
"save_misaligneds_to_file": false,
"train_subset": 0,
"model_path": "models/your_base_model/",
"entities": "PER, LOC, DIRECCIÓN, OCUPACIÓN/PROFESIÓN, PATENTE/DOMINIO",
Expand Down Expand Up @@ -34,6 +35,7 @@
"path_data_training": "data/your_train_data.json",
"path_data_validation": "data/your_validation_data.json",
"is_raw": false,
"save_misaligneds_to_file": false,
"train_subset": 0,
"model_path": "models/your_base_model/",
"entities": "PER, LOC, DIRECCIÓN, OCUPACIÓN/PROFESIÓN, PATENTE/DOMINIO",
Expand Down
210 changes: 191 additions & 19 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import time
import shutil
import utils
import functools
from spacy.util import minibatch, compounding, decaying
from spacy.scorer import Scorer
from spacy.gold import GoldParse
Expand Down Expand Up @@ -50,7 +51,6 @@
logger_fh.setFormatter(formatter)
logger.addHandler(logger_fh)


def convert_dataturks_to_spacy(dataturks_JSON_file_path, entity_list):
try:
training_data = []
Expand Down Expand Up @@ -225,6 +225,71 @@ def convert_dataturks_to_train_file(
except Exception:
logging.exception(f'An error occured writing the output file at "{output_file_path}".')

def calculate_by_entity(self, totalizer, entities):
    """
    Count entity occurrences by label, accumulating into `totalizer`.

    :param totalizer: dict mapping entity label -> occurrence count; updated in place.
    :param entities: iterable of entity spans shaped like (start, end, label).
    """
    for span in entities:
        label = span[2]
        # dict.get replaces the original bare `except:` counter idiom, which
        # could silently mask unrelated errors (e.g. a malformed span).
        totalizer[label] = totalizer.get(label, 0) + 1


def remove_misaligned_annotations_from(self, model_path: str, input_files_path: str, data_type: str):
    """
    Given a Spacy model path, a data file path and a data type (it should be one of:
    training, validation, testing), removes misaligned annotations from the given file
    and saves the depured data back to the same path.
    Before removing misaligned annotations, it creates a backup copy of the involved file.
    :param model_path: A model path
    :param input_files_path: An input file path to be "depured"
    :param data_type: file data type (it should be one of: training, validation, testing)
    """

    # we create a backup copy of the file to be modified
    dest = input_files_path.replace('.json', '_copy_with_misaligned.json')
    shutil.copyfile(input_files_path, dest)

    nlp = spacy.load(model_path)

    misaligned_docs_qty = 0
    misaligned_lost_by_entities = {}
    only_misaligneds = 0
    # starts at 1 so the percentage computed below never divides by zero
    total_lost_misaligned = 1

    with open(input_files_path, 'r') as f:
        data = json.load(f)

    # assumes `data` is a list of [text, {"entities": [...]}] pairs (Spacy training
    # format) — matches how evaluate_multiple consumes the same files
    for text, annotations in data:
        doc_gold_text = nlp.make_doc(text)
        annotation_ents = annotations.get('entities')
        alignment_values = spacy.gold.biluo_tags_from_offsets(doc_gold_text, annotation_ents)
        if '-' not in alignment_values:
            continue

        text_array = nlp(text)
        misaligned_texts = self.get_misaligned_texts(alignment_values, text_array)

        # BUGFIX: the original iterated with enumerate() and popped by index,
        # which skips the element right after every removal. We rebuild the
        # list instead and splice it back so `data` (dumped below) is updated.
        kept_ents = []
        for annot in annotation_ents:
            annotation_text = str(text[annot[0]:annot[1]])
            if any(annotation_text.replace(' ', '') in mis_text for mis_text in misaligned_texts):
                only_misaligneds += 1
            else:
                kept_ents.append(annot)
        annotation_ents[:] = kept_ents

        misaligned_docs_qty += 1
        # counts the entities that REMAIN in misaligned docs (same call position
        # as before: after removal)
        self.calculate_by_entity(misaligned_lost_by_entities, annotation_ents)

    with open(input_files_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

    if misaligned_docs_qty:
        # `or total_lost_misaligned` keeps the 1 fallback when every entity of the
        # misaligned docs was removed (the original reduce() crashed on that case)
        total_lost_misaligned = sum(misaligned_lost_by_entities.values()) or total_lost_misaligned

    print(f'\n\nMisaligned docs for {data_type} data: {misaligned_docs_qty}/{len(data)} ({round(100*misaligned_docs_qty/len(data),2)}%).')
    print(f'Entities that could be lost because of misaligned: {total_lost_misaligned}.')
    print(f'Misaligned annotations removed: {only_misaligneds} ({round(only_misaligneds/total_lost_misaligned*100, 2)}% of total entities in related docs).')
    print(f'Data without misaligned annotations saved in: {input_files_path}')


# =================================
# Model Training functions
# =================================
Expand All @@ -236,6 +301,7 @@ def get_best_model(
n_iter,
training_data,
path_best_model,
save_misaligneds_to_file,
validation_data=[],
testing_data=[],
callbacks={},
Expand Down Expand Up @@ -307,7 +373,6 @@ def get_best_model(
for batch in batches:
num_batches += 1
texts, annotations = zip(*batch)

# bz.append(len(texts))
nlp.update(
texts, # batch of raw texts
Expand All @@ -320,25 +385,25 @@ def get_best_model(
# run batch callbacks
for cb in callbacks["on_batch"]:
state = cb(state, logger, nlp, optimizer, disabled_pipes)

log_annotations = True if state["i"] == 0 else False
try:
# compute validation scores
val_f_score, val_precision_score, val_recall_score, val_per_type_score = -1, -1, -1, -1
if settings["evaluate"] == "val":
logger.info("Evaluating docs from validation data")
val_f_score, val_precision_score, val_recall_score, val_per_type_score = self.evaluate_multiple(
optimizer, nlp, val_texts, val_annotations
optimizer, nlp, val_texts, val_annotations, "validation", save_misaligneds_to_file, log_annotations
)

# train data score
logger.info("Evaluating docs from training data")
f_score, precision_score, recall_score, per_type_score = self.evaluate_multiple(
optimizer, nlp, tr_texts, tr_annotations
optimizer, nlp, tr_texts, tr_annotations, "training", save_misaligneds_to_file, log_annotations
)

numero_losses = losses.get("ner")

except Exception:
except Exception as e:
logger.exception("The batch has no training data.")

utils.save_state_history(
Expand Down Expand Up @@ -369,7 +434,7 @@ def get_best_model(
if settings["evaluate"] == "test" and state["evaluate_test"]:
logger.info("Evaluating docs from testing data")
test_f_score, test_precision_score, test_recall_score, test_per_type_score = self.evaluate_multiple(
optimizer, nlp, test_texts, test_annotations
optimizer, nlp, test_texts, test_annotations, "test", save_misaligneds_to_file, log_annotations
)
logger.info("############################################################")
logger.info("Evaluating saved model with test data")
Expand Down Expand Up @@ -475,6 +540,7 @@ def train(self, config: str):
0, # prefered save_best_model callback
train_config["is_raw"],
train_config["path_data_validation"],
train_config["save_misaligneds_to_file"],
path_data_testing=test_ds,
callbacks=c,
settings=s,
Expand All @@ -491,6 +557,7 @@ def train_model(
max_losses: float,
is_raw: bool = True,
path_data_validation: str = "",
save_misaligneds_to_file: bool = False,
path_data_testing: str = "",
callbacks={},
settings={},
Expand Down Expand Up @@ -574,7 +641,6 @@ def train_model(

# this initial save remove the pipelines from base model and is not restoring them
#
#
# we save the first model and then we update it if there is a better version of it
# logger.info(f"💾 Saving initial model")
# nlp.to_disk(model_path)
Expand Down Expand Up @@ -606,6 +672,7 @@ def train_model(
n_iter,
training_data,
path_best_model,
save_misaligneds_to_file,
validation_data=validation_data,
testing_data=testing_data,
callbacks=callbacks,
Expand Down Expand Up @@ -645,47 +712,153 @@ def evaluate(self, nlp, text: str, entity_ocurrences: list):
try:
doc_gold_text = nlp.make_doc(text)
alignment_values = spacy.gold.biluo_tags_from_offsets(doc_gold_text, entity_ocurrences.get("entities"))
is_missaligned_doc = True if "-" in alignment_values else False
is_misaligned_doc = True if '-' in alignment_values else False
gold = GoldParse(doc_gold_text, entities=entity_ocurrences.get("entities"))
pred_value = nlp(text)
scorer.score(pred_value, gold)
return scorer.scores, is_missaligned_doc
return scorer.scores, is_misaligned_doc, alignment_values
except Exception as e:
print(e)

def evaluate_multiple(self, optimizer, nlp, texts: list, entity_occurences: list):
def get_misaligned_texts(self, misaligned_array, text_array):
    """
    Collect the token texts that correspond to misaligned annotations.

    Consecutive misaligned token positions are treated as a single fragment:
    their texts are concatenated and every space is stripped out, so each
    returned string can be matched against annotation texts with spaces removed.

    :param misaligned_array: list of BILUO alignment tags for a doc; "-" marks
        a misaligned token.
    :param text_array: tokenized doc; each element exposes a `.text` attribute.
    :return: list of space-free strings, one per run of consecutive "-" tokens.
    """
    positions = [pos for pos, tag in enumerate(misaligned_array) if tag == "-"]

    # Group consecutive positions into runs: [0, 1, 3] -> [[0, 1], [3]]
    runs = []
    current_run = []
    previous = None
    for pos in positions:
        if previous is not None and pos - previous == 1:
            current_run.append(pos)
        else:
            if current_run:
                runs.append(current_run)
            current_run = [pos]
        previous = pos
    if current_run:
        runs.append(current_run)

    # Join each run's token texts and drop all spaces (including any inside a token).
    return [
        "".join(text_array[pos].text for pos in run).replace(' ', '')
        for run in runs
    ]

def get_total_misaligneds(self, nlp, save_it_to_file = False, filename="misaligned.json", log_annotations=True , misaligned_docs=[]):
    """
    IMPORTANT! This function is being called when training the model.
    Given a filename and an array of misaligned docs, it finds for every doc only the
    matching misaligned annotations and, optionally, saves them to a json file with a
    structure similar to the one used by Spacy for training data (json object with
    text and annotations). The file/s are being saved in logs folder.
    :param nlp: loaded Spacy pipeline used to tokenize each doc.
    :param save_it_to_file: whether to collect the misaligned annotations for dumping.
    :param filename: A filename for the json file that is going to be saved.
    :param log_annotations: when False, the file is not written even if save_it_to_file is True.
    :param misaligned_docs: A list of misaligned docs (dicts with "text", "entities",
        "alignment_values"). NOTE: the shared [] default is only read, never mutated.
    :return: total number of misaligned fragments found across all docs.
    """
    # BUGFIX: the `filename` parameter was ignored and the path was a literal.
    path = f"logs/{filename}"
    data = []
    total_misaligneds = 0
    for doc in misaligned_docs:
        text_raw = doc["text"]
        text_array = nlp(text_raw)
        misaligned_texts = self.get_misaligned_texts(doc["alignment_values"], text_array)
        total_misaligneds += len(misaligned_texts)

        if save_it_to_file:
            annotations = doc["entities"]["entities"]
            misaligned_annotations = []

            # distinct loop variable: the original reused `i`, shadowing the
            # outer doc index
            for annot in annotations:
                annotation_text = str(text_raw[annot[0]:annot[1]])
                if any(annotation_text.replace(' ', '') in mis_text for mis_text in misaligned_texts):
                    annotation = list(annot)
                    annotation.append(annotation_text)
                    misaligned_annotations.append(annotation)
            # we save a part of the text in order to be able to search it in the full text file
            data.append({"text": text_raw[0:700], "annotations": misaligned_annotations})

    if save_it_to_file and log_annotations:
        srsly.write_json(path, data)
        logger.info(f"💾 {path} file saved!")

    return total_misaligneds


def evaluate_multiple(self, optimizer, nlp, texts: list, entity_occurences: list, data_type: str, save_misaligneds_to_file: bool, log_annotations: bool):
    """
    Score the model over a set of docs and report misalignment statistics.

    NOTE(review): this body removes leftover pre-merge lines that made the function
    unpack two values from evaluate() (which returns three) and double-count docs.
    :param optimizer: the training optimizer; its averaged weights are used for scoring.
    :param nlp: loaded Spacy pipeline.
    :param texts: list of raw doc texts.
    :param entity_occurences: per-doc annotation dicts with an "entities" key,
        parallel to `texts`.
    :param data_type: label for logs/filenames (e.g. "training", "validation", "test").
    :param save_misaligneds_to_file: whether misaligned annotations are dumped to a json file.
    :param log_annotations: whether annotation statistics are logged this iteration.
    :return: (mean F-score, mean precision, mean recall, mean F-score per entity type).
    """
    f_score_sum = 0
    precision_score_sum = 0
    recall_score_sum = 0
    ents_per_type_sum = {}
    misaligned_docs = []
    total_by_entities = {}
    misaligned_lost_by_entities = {}
    only_misaligneds = 0
    # starts at 1 so the percentage below never divides by zero
    total_lost_misaligned = 1

    for idx in range(len(texts)):
        text = texts[idx]
        entities_for_text = entity_occurences[idx]
        with nlp.use_params(optimizer.averages):
            # evaluate() returns (scores, is_misaligned_doc, alignment_values)
            scores, is_misaligned_doc, alignment_values = self.evaluate(nlp, text, entities_for_text)
            recall_score_sum += scores.get("ents_r")
            precision_score_sum += scores.get("ents_p")
            f_score_sum += scores.get("ents_f")

            self.calculate_by_entity(total_by_entities, entities_for_text['entities'])

            if is_misaligned_doc:
                misaligned_docs.append({"text": text, "entities": entities_for_text, "alignment_values": alignment_values})
                self.calculate_by_entity(misaligned_lost_by_entities, entities_for_text['entities'])

            for key, value in scores["ents_per_type"].items():
                if key not in ents_per_type_sum:
                    ents_per_type_sum[key] = value["f"]
                else:
                    ents_per_type_sum[key] += value["f"]

    if len(misaligned_docs):
        only_misaligneds = self.get_total_misaligneds(nlp, save_misaligneds_to_file, f"{data_type}_misaligned_docs.json", log_annotations, misaligned_docs)
        total_lost_misaligned = functools.reduce(lambda a, b: a + b, misaligned_lost_by_entities.values())

    if log_annotations:
        logger.info(f'Misaligned docs for {data_type} data: {len(misaligned_docs)}/{len(texts)} ({round(100*len(misaligned_docs)/len(texts),2)}%).')
        logger.info(f'🔍 ANNOTATIONS! 🔍')
        logger.info(f'Total by entities: {total_by_entities}.')
        logger.info(f'Lost entities because of misaligned: {total_lost_misaligned}.')
        logger.info(f'Misaligned: {only_misaligneds} ({round(only_misaligneds/total_lost_misaligned*100, 2)}%).')

    for key, value in ents_per_type_sum.items():
        ents_per_type_sum[key] = value / len(texts)

    return (
        f_score_sum / len(texts),
        precision_score_sum / len(texts),
        recall_score_sum / len(texts),
        ents_per_type_sum
    )

def build_model_package(
Expand Down Expand Up @@ -822,6 +995,5 @@ def show_text(self, files_path: str, entity: str, context_words: int = 0):
for text in texts:
print(text)


if __name__ == "__main__":
    # Expose every public SpacyUtils method as a CLI subcommand via Google Fire.
    fire.Fire(SpacyUtils)

0 comments on commit 1ccc4d8

Please sign in to comment.