WIP: Simplify processors - add Fasttokenizers (#649)
* increase transformers version

* Make fast tokenizers possible

* refactor QA processing

* Move all fcts into dataset from dicts for QA

* refactor doc classification

* refactor bert_style_lm

* refactor inference_processor


Co-authored-by: Bogdan Kostić <bogdankostic@web.de>
Co-authored-by: brandenchan <brandenchan@icloud.com>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
4 people authored Dec 23, 2020
1 parent fa08f9d commit 18e7fc7
Showing 30 changed files with 2,516 additions and 1,527 deletions.
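The commit message says this PR makes fast tokenizers possible. As a rough illustration of what that enables, here is a minimal sketch of loading one through FARM's `Tokenizer.load`, assuming the `use_fast` flag that appears in the diffs below is the switch between the slow Python tokenizer and the Rust-based fast implementation:

```python
from farm.modeling.tokenization import Tokenizer

# Minimal sketch, not taken verbatim from this commit.
# Assumption: use_fast=True selects the Rust-based "fast" tokenizer, while
# use_fast=False (as pinned in the natural_questions example below) keeps
# the classic Python implementation.
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path="bert-base-cased",
    do_lower_case=False,
    use_fast=True,
)
```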
3 changes: 1 addition & 2 deletions azure-pipelines.yml
@@ -10,8 +10,7 @@ trigger:
 pr:
   branches:
     include:
-    - '*'
-
+    - '*'
 jobs:
 - job: 'Test'
   pool:
15 changes: 8 additions & 7 deletions examples/lm_finetuning.py
@@ -19,22 +19,22 @@ def lm_finetuning():
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
)

+ next_sent_pred_style = "bert-style"
+ next_sent_pred=True
set_all_seeds(seed=42)
ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(
experiment_name="Public_FARM", run_name="Run_minimal_example_lm"
experiment_name="LM_refactoring", run_name=f"new, nsp: {next_sent_pred}, {next_sent_pred_style}"
)
##########################
########## Settings
##########################
- device, n_gpu = initialize_device_settings(use_cuda=False)
+ device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 1
batch_size = 32
- evaluate_every = 30
+ evaluate_every = 1000
lang_model = "bert-base-cased"
do_lower_case = False
- next_sent_pred_style = "bert-style"

# 1.Create a tokenizer
tokenizer = Tokenizer.load(
@@ -46,7 +46,7 @@ def lm_finetuning():
data_dir=Path("../data/lm_finetune_nips"),
tokenizer=tokenizer,
max_seq_len=128,
- max_docs=20, # We have set max_docs to 20 to speed up data processing
+ max_docs=None, # You can set max_docs here to limit the number of docs in the dataset and speed up this example
next_sent_pred_style=next_sent_pred_style
)

@@ -74,7 +74,7 @@ def lm_finetuning():
learning_rate=2e-5,
device=device,
n_batches=len(data_silo.loaders["train"]),
- n_epochs=n_epochs,
+ n_epochs=n_epochs
)

# 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
@@ -87,6 +87,7 @@ def lm_finetuning():
lr_schedule=lr_schedule,
evaluate_every=evaluate_every,
device=device,
+ eval_report=False
)

# 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
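For orientation, the max_docs and next_sent_pred_style arguments changed above belong to the language-model processor set up earlier in this example. A sketch of that call, under the assumption that the example still uses FARM's BertStyleLMProcessor with the keyword arguments visible in the hunk:

```python
from pathlib import Path

from farm.data_handler.processor import BertStyleLMProcessor
from farm.modeling.tokenization import Tokenizer

tokenizer = Tokenizer.load(
    pretrained_model_name_or_path="bert-base-cased", do_lower_case=False
)

# "bert-style" builds the classic next-sentence-prediction pairs; the example
# above passes it through the next_sent_pred_style variable defined at the top.
processor = BertStyleLMProcessor(
    data_dir=Path("../data/lm_finetune_nips"),
    tokenizer=tokenizer,
    max_seq_len=128,
    max_docs=None,  # set an integer here to cap the corpus and speed the example up
    next_sent_pred_style="bert-style",
)
```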
2 changes: 1 addition & 1 deletion examples/natural_questions.py
@@ -42,7 +42,7 @@ def question_answering():

# 1.Create a tokenizer
tokenizer = Tokenizer.load(
- pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case
+ pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case, use_fast=False,
)

# Add HTML tag tokens to the tokenizer vocabulary, so they do not get split apart
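The Natural Questions example explicitly pins use_fast=False, and the context line above adds the dataset's HTML markup tokens to the vocabulary so the tokenizer keeps them intact. A sketch of that pattern; the model name and tag list here are placeholders, not the values shipped in the example:

```python
from farm.modeling.tokenization import Tokenizer

# The NQ example keeps the slow tokenizer (use_fast=False).
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path="bert-base-cased",  # placeholder for the example's lang_model
    do_lower_case=False,
    use_fast=False,
)

# Register HTML tags as whole tokens so they do not get split apart.
# Placeholder tags for illustration; the real list lives in the example script.
tokenizer.add_tokens(["<P>", "</P>", "<Table>", "</Table>"])
```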
13 changes: 6 additions & 7 deletions farm/data_handler/data_silo.py
@@ -129,7 +129,7 @@ def _dataset_from_chunk(cls, chunk, processor):
"""
dicts = [d[1] for d in chunk]
indices = [x[0] for x in chunk]
- dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts=dicts, indices=indices, return_problematic=True)
+ dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts=dicts, indices=indices)
return dataset, tensor_names, problematic_sample_ids

def _get_dataset(self, filename, dicts=None):
@@ -176,6 +176,7 @@ def _get_dataset(self, filename, dicts=None):
results = map(partial(self._dataset_from_chunk, processor=self.processor), grouper(dicts, num_dicts))

datasets = []
+ problematic_ids_all = set()

desc = f"Preprocessing Dataset"
if filename:
@@ -185,8 +186,9 @@
datasets.append(dataset)
# update progress bar (last step can have less dicts than actual chunk_size)
pbar.update(min(multiprocessing_chunk_size, pbar.total-pbar.n))
- self.processor.problematic_sample_ids.update(problematic_samples)
- self.processor.log_problematic()
+ problematic_ids_all.update(problematic_samples)
+
+ self.processor.log_problematic(problematic_ids_all)
# _dataset_from_chunk can return a None in cases where downsampling has occurred
datasets = [d for d in datasets if d]
concat_datasets = ConcatDataset(datasets)
@@ -221,7 +223,6 @@ def _load_data(self, train_dicts=None, dev_dicts=None, test_dicts=None):
else:
logger.info("No train set is being loaded")
self.data["train"] = None
- self.processor.log_problematic()

# dev data
logger.info("")
@@ -243,7 +244,6 @@ def _load_data(self, train_dicts=None, dev_dicts=None, test_dicts=None):
else:
logger.info("No dev set is being loaded")
self.data["dev"] = None
- self.processor.log_problematic()

logger.info("")
logger.info("LOADING TEST DATA")
@@ -264,7 +264,6 @@ def _load_data(self, train_dicts=None, dev_dicts=None, test_dicts=None):
else:
logger.info("No test set is being loaded")
self.data["test"] = None
- self.processor.log_problematic()

if self.caching:
self._save_dataset_to_cache()
@@ -724,7 +723,7 @@ def _dataset_from_chunk(self, chunk):
logger.info("Skipping a dict chunk as it contains less than 2 documents ...")
return None, None
indices = [x[0] for x in chunk]
- datasets, tensor_names = self.processor.dataset_from_dicts(dicts=dicts, indices=indices)
+ datasets, tensor_names, _ = self.processor.dataset_from_dicts(dicts=dicts, indices=indices)
return datasets, tensor_names

def shuffle_files(self, files, seed=None):
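The data_silo changes above reflect a unified processor interface: dataset_from_dicts now returns a (dataset, tensor_names, problematic_sample_ids) triple, and problematic ids are collected across chunks and logged once instead of once per split. A minimal caller-side sketch of that contract, assuming a FARM Processor that exposes exactly the methods shown in the diff:

```python
from torch.utils.data import ConcatDataset


def dataset_from_chunks(processor, chunks):
    """Build one ConcatDataset from chunked dicts, mirroring DataSilo._get_dataset.

    Assumptions: `chunks` is an iterable of (indices, dicts) pairs and
    `processor` is a FARM Processor with the dataset_from_dicts /
    log_problematic API shown above.
    """
    datasets = []
    problematic_ids_all = set()
    tensor_names = None
    for indices, dicts in chunks:
        dataset, tensor_names, problematic_ids = processor.dataset_from_dicts(
            dicts=dicts, indices=indices
        )
        if dataset:  # a chunk may come back empty, e.g. after downsampling
            datasets.append(dataset)
        problematic_ids_all.update(problematic_ids)
    # Log every problematic sample once, rather than once per data split.
    processor.log_problematic(problematic_ids_all)
    return ConcatDataset(datasets), tensor_names
```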