WIP: Simplify processors - add Fasttokenizers (#649)
* increase transformers version

* Make fast tokenizers possible

* refactor QA processing

* Move all fcts into dataset from dicts for QA

* refactor doc classification

* refactor bert_style_lm

* refactor inference_processor


Co-authored-by: Bogdan Kostić <bogdankostic@web.de>
Co-authored-by: brandenchan <brandenchan@icloud.com>
Co-authored-by: Malte Pietsch <malte.pietsch@deepset.ai>
4 people authored Dec 23, 2020
1 parent fa08f9d commit 18e7fc7
Showing 30 changed files with 2,516 additions and 1,527 deletions.
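The commit message says this PR makes fast tokenizers possible. As a rough illustration of what that enables, here is a minimal sketch of loading one through FARM's `Tokenizer.load`, assuming the `use_fast` flag that appears in the diffs below is the switch between the slow Python tokenizer and the Rust-based fast implementation:

```python
from farm.modeling.tokenization import Tokenizer

# Minimal sketch, not taken verbatim from this commit.
# Assumption: use_fast=True selects the Rust-based "fast" tokenizer, while
# use_fast=False (as pinned in the natural_questions example below) keeps
# the classic Python implementation.
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path="bert-base-cased",
    do_lower_case=False,
    use_fast=True,
)
```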
3 changes: 1 addition & 2 deletions azure-pipelines.yml
@@ -10,8 +10,7 @@ trigger:
 pr:
   branches:
     include:
-    - '*'
-
+    - '*'
 jobs:
 - job: 'Test'
   pool:
15 changes: 8 additions & 7 deletions examples/lm_finetuning.py
@@ -19,22 +19,22 @@ def lm_finetuning():
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
)

+ next_sent_pred_style = "bert-style"
+ next_sent_pred=True
set_all_seeds(seed=42)
ml_logger = MLFlowLogger(tracking_uri="https://public-mlflow.deepset.ai/")
ml_logger.init_experiment(
experiment_name="Public_FARM", run_name="Run_minimal_example_lm"
experiment_name="LM_refactoring", run_name=f"new, nsp: {next_sent_pred}, {next_sent_pred_style}"
)
##########################
########## Settings
##########################
- device, n_gpu = initialize_device_settings(use_cuda=False)
+ device, n_gpu = initialize_device_settings(use_cuda=True)
n_epochs = 1
batch_size = 32
- evaluate_every = 30
+ evaluate_every = 1000
lang_model = "bert-base-cased"
do_lower_case = False
- next_sent_pred_style = "bert-style"

# 1.Create a tokenizer
tokenizer = Tokenizer.load(
@@ -46,7 +46,7 @@ def lm_finetuning():
data_dir=Path("../data/lm_finetune_nips"),
tokenizer=tokenizer,
max_seq_len=128,
- max_docs=20, # We have set max_docs to 20 to speed up data processing
+ max_docs=None, # You can set max_docs here to limit the number of docs in the dataset and speed up this example
next_sent_pred_style=next_sent_pred_style
)

@@ -74,7 +74,7 @@ def lm_finetuning():
learning_rate=2e-5,
device=device,
n_batches=len(data_silo.loaders["train"]),
- n_epochs=n_epochs,
+ n_epochs=n_epochs
)

# 6. Feed everything to the Trainer, which takes care of growing our model into a powerful plant and evaluates it from time to time
@@ -87,6 +87,7 @@ def lm_finetuning():
lr_schedule=lr_schedule,
evaluate_every=evaluate_every,
device=device,
+ eval_report=False
)

# 7. Let it grow! Watch the tracked metrics live on the public mlflow server: https://public-mlflow.deepset.ai
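For orientation, the max_docs and next_sent_pred_style arguments changed above belong to the language-model processor set up earlier in this example. A sketch of that call, under the assumption that the example still uses FARM's BertStyleLMProcessor with the keyword arguments visible in the hunk:

```python
from pathlib import Path

from farm.data_handler.processor import BertStyleLMProcessor
from farm.modeling.tokenization import Tokenizer

tokenizer = Tokenizer.load(
    pretrained_model_name_or_path="bert-base-cased", do_lower_case=False
)

# "bert-style" builds the classic next-sentence-prediction pairs; the example
# above passes it through the next_sent_pred_style variable defined at the top.
processor = BertStyleLMProcessor(
    data_dir=Path("../data/lm_finetune_nips"),
    tokenizer=tokenizer,
    max_seq_len=128,
    max_docs=None,  # set an integer here to cap the corpus and speed the example up
    next_sent_pred_style="bert-style",
)
```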
2 changes: 1 addition & 1 deletion examples/natural_questions.py
@@ -42,7 +42,7 @@ def question_answering():

# 1.Create a tokenizer
tokenizer = Tokenizer.load(
- pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case
+ pretrained_model_name_or_path=lang_model, do_lower_case=do_lower_case, use_fast=False,
)

# Add HTML tag tokens to the tokenizer vocabulary, so they do not get split apart
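The Natural Questions example explicitly pins use_fast=False, and the context line above adds the dataset's HTML markup tokens to the vocabulary so the tokenizer keeps them intact. A sketch of that pattern; the model name and tag list here are placeholders, not the values shipped in the example:

```python
from farm.modeling.tokenization import Tokenizer

# The NQ example keeps the slow tokenizer (use_fast=False).
tokenizer = Tokenizer.load(
    pretrained_model_name_or_path="bert-base-cased",  # placeholder for the example's lang_model
    do_lower_case=False,
    use_fast=False,
)

# Register HTML tags as whole tokens so they do not get split apart.
# Placeholder tags for illustration; the real list lives in the example script.
tokenizer.add_tokens(["<P>", "</P>", "<Table>", "</Table>"])
```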
13 changes: 6 additions & 7 deletions farm/data_handler/data_silo.py
@@ -129,7 +129,7 @@ def _dataset_from_chunk(cls, chunk, processor):
"""
dicts = [d[1] for d in chunk]
indices = [x[0] for x in chunk]
- dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts=dicts, indices=indices, return_problematic=True)
+ dataset, tensor_names, problematic_sample_ids = processor.dataset_from_dicts(dicts=dicts, indices=indices)
return dataset, tensor_names, problematic_sample_ids

def _get_dataset(self, filename, dicts=None):
@@ -176,6 +176,7 @@ def _get_dataset(self, filename, dicts=None):
results = map(partial(self._dataset_from_chunk, processor=self.processor), grouper(dicts, num_dicts))

datasets = []
+ problematic_ids_all = set()

desc = f"Preprocessing Dataset"
if filename:
@@ -185,8 +186,9 @@
datasets.append(dataset)
# update progress bar (last step can have less dicts than actual chunk_size)
pbar.update(min(multiprocessing_chunk_size, pbar.total-pbar.n))
- self.processor.problematic_sample_ids.update(problematic_samples)
- self.processor.log_problematic()
+ problematic_ids_all.update(problematic_samples)
+
+ self.processor.log_problematic(problematic_ids_all)
# _dataset_from_chunk can return a None in cases where downsampling has occurred
datasets = [d for d in datasets if d]
concat_datasets = ConcatDataset(datasets)
@@ -221,7 +223,6 @@ def _load_data(self, train_dicts=None, dev_dicts=None, test_dicts=None):
else:
logger.info("No train set is being loaded")
self.data["train"] = None
- self.processor.log_problematic()

# dev data
logger.info("")
@@ -243,7 +244,6 @@ def _load_data(self, train_dicts=None, dev_dicts=None, test_dicts=None):
else:
logger.info("No dev set is being loaded")
self.data["dev"] = None
- self.processor.log_problematic()

logger.info("")
logger.info("LOADING TEST DATA")
@@ -264,7 +264,6 @@ def _load_data(self, train_dicts=None, dev_dicts=None, test_dicts=None):
else:
logger.info("No test set is being loaded")
self.data["test"] = None
- self.processor.log_problematic()

if self.caching:
self._save_dataset_to_cache()
@@ -724,7 +723,7 @@ def _dataset_from_chunk(self, chunk):
logger.info("Skipping a dict chunk as it contains less than 2 documents ...")
return None, None
indices = [x[0] for x in chunk]
- datasets, tensor_names = self.processor.dataset_from_dicts(dicts=dicts, indices=indices)
+ datasets, tensor_names, _ = self.processor.dataset_from_dicts(dicts=dicts, indices=indices)
return datasets, tensor_names

def shuffle_files(self, files, seed=None):
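The data_silo changes above reflect a unified processor interface: dataset_from_dicts now returns a (dataset, tensor_names, problematic_sample_ids) triple, and problematic ids are collected across chunks and logged once instead of once per split. A minimal caller-side sketch of that contract, assuming a FARM Processor that exposes exactly the methods shown in the diff:

```python
from torch.utils.data import ConcatDataset


def dataset_from_chunks(processor, chunks):
    """Build one ConcatDataset from chunked dicts, mirroring DataSilo._get_dataset.

    Assumptions: `chunks` is an iterable of (indices, dicts) pairs and
    `processor` is a FARM Processor with the dataset_from_dicts /
    log_problematic API shown above.
    """
    datasets = []
    problematic_ids_all = set()
    tensor_names = None
    for indices, dicts in chunks:
        dataset, tensor_names, problematic_ids = processor.dataset_from_dicts(
            dicts=dicts, indices=indices
        )
        if dataset:  # a chunk may come back empty, e.g. after downsampling
            datasets.append(dataset)
        problematic_ids_all.update(problematic_ids)
    # Log every problematic sample once, rather than once per data split.
    processor.log_problematic(problematic_ids_all)
    return ConcatDataset(datasets), tensor_names
```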