def get_added_tokens(self):
    """Return the tokens that were added to the tokenizer on top of its
    base vocabulary, ordered by their token id.

    ``self.tokenizer.added_tokens_encoder`` maps token string -> token id,
    where the ids continue where the base vocab ends. The caller appends
    the returned list to ``list(self.tokenizer.vocab)`` to build the label
    list for the "lm" task, so this must return the token *strings*
    sorted by their *id* — that way each label sits at the index matching
    its embedding row.

    :return: list of added token strings, sorted ascending by token id
    """
    # token string -> token id
    dictionary = self.tokenizer.added_tokens_encoder
    # Sort by id (the value), NOT the token string: the label list is
    # positional, so order must follow the embedding indices. Return the
    # token strings (the keys), not their integer ids.
    sorted_tuples = sorted(dictionary.items(), key=lambda x: x[1])
    return [x[0] for x in sorted_tuples]