Add support for Camembert-like models #396

Merged · 6 commits · Jun 15, 2020

13 changes: 13 additions & 0 deletions farm/data_handler/input_features.py
@@ -358,6 +358,8 @@ def sample_to_features_qa(sample, tokenizer, max_seq_len, answer_type_list=None,
# seq_2_start_t is the index of the first token in the second text sequence (e.g. passage)
if tokenizer.__class__.__name__ in ["RobertaTokenizer", "XLMRobertaTokenizer"]:
seq_2_start_t = get_roberta_seq_2_start(input_ids)
elif tokenizer.__class__.__name__ == "CamembertTokenizer":
seq_2_start_t = get_camembert_seq_2_start(input_ids)
else:
seq_2_start_t = segment_ids.index(1)

@@ -514,6 +516,17 @@ def get_roberta_seq_2_start(input_ids):
second_backslash_s = input_ids.index(2, first_backslash_s + 1)
return second_backslash_s + 1

def get_camembert_seq_2_start(input_ids):
# CamembertTokenizer.encode_plus returns only zeros in token_type_ids (same as RobertaTokenizer),
# so this helper finds the start of the second sequence another way (analogous to get_roberta_seq_2_start).
# Camembert input sequences have the following format: <s> P1 </s> </s> P2 </s>
# In the CamemBERT vocabulary, <s> has token id 5 and </s> has token id 6. To find the beginning of the
# second sequence, this function first finds the index of the second </s>.
first_backslash_s = input_ids.index(6)
second_backslash_s = input_ids.index(6, first_backslash_s + 1)
return second_backslash_s + 1

def sample_to_features_squadOLD(
sample, tokenizer, max_seq_len, doc_stride, max_query_length, tasks,
):
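For illustration (not part of the diff), here is a minimal sketch of the added helper on a hand-built id sequence; the special-token ids follow the comment above (<s> = 5, </s> = 6) and the remaining ids are placeholders:

# Hypothetical question/passage pair in CamemBERT's format: <s> P1 </s> </s> P2 </s>
input_ids = [5, 100, 101, 6, 6, 200, 201, 202, 6]

# The helper finds the second </s> (position 4) and returns the position right after it,
# i.e. the index of the first token of the second sequence.
assert get_camembert_seq_2_start(input_ids) == 5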
1 change: 0 additions & 1 deletion farm/modeling/adaptive_model.py
@@ -256,7 +256,6 @@ def __init__(
self.lm_output_types = (
[lm_output_types] if isinstance(lm_output_types, str) else lm_output_types
)
assert len(self.lm_output_types) == len(self.prediction_heads)
self.log_params()
# default loss aggregation function is a simple sum (without using any of the optional params)
if not loss_aggregation_fn:
66 changes: 62 additions & 4 deletions farm/modeling/language_model.py
@@ -41,13 +41,13 @@
from transformers.modeling_xlm_roberta import XLMRobertaModel, XLMRobertaConfig
from transformers.modeling_distilbert import DistilBertModel, DistilBertConfig
from transformers.modeling_electra import ElectraModel, ElectraConfig
from transformers.modeling_camembert import CamembertModel, CamembertConfig
from transformers.modeling_utils import SequenceSummary
from transformers.tokenization_bert import load_vocab

from farm.modeling import wordembedding_utils
from farm.modeling.wordembedding_utils import s3e_pooling


# These are the names of the attributes in various model configs which refer to the number of dimensions
# in the output vectors
OUTPUT_DIM_NAMES = ["dim", "hidden_size", "d_model"]
@@ -134,6 +134,8 @@ def load(cls, pretrained_model_name_or_path, n_added_tokens=0, language_model_cl
language_model_class = 'XLMRoberta'
elif 'roberta' in pretrained_model_name_or_path:
language_model_class = 'Roberta'
elif 'camembert' in pretrained_model_name_or_path or 'umberto' in pretrained_model_name_or_path:
language_model_class = "Camembert"
elif 'albert' in pretrained_model_name_or_path:
language_model_class = 'Albert'
elif 'distilbert' in pretrained_model_name_or_path:
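
Illustration only (not part of the diff): the substring checks above, applied to a few example names; the UmBERTo identifier is a placeholder.

for name in ["camembert-base", "some-org/umberto-commoncrawl", "roberta-base"]:
    if "xlm" in name and "roberta" in name:
        cls_name = "XLMRoberta"
    elif "roberta" in name:
        cls_name = "Roberta"
    elif "camembert" in name or "umberto" in name:
        cls_name = "Camembert"  # UmBERTo models reuse the CamemBERT architecture
    else:
        cls_name = "Bert"  # simplified fallback for this sketch
    print(name, "->", cls_name)  # -> Camembert, Camembert, Roberta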
@@ -234,19 +236,31 @@ def _infer_language_from_name(cls, name):
"multilingual",
)
matches = [lang for lang in known_languages if lang in name]
if len(matches) == 0:
if "camembert" in name:
language = "french"
logger.info(
f"Automatically detected language from language model name: {language}"
)
elif "umberto" in name:
language = "italian"
logger.info(
f"Automatically detected language from language model name: {language}"
)
elif len(matches) == 0:
language = "english"
logger.warning(
"Could not automatically detect from language model name what language it is. \n"
"\t We guess it's an *ENGLISH* model ... \n"
"\t If not: Init the language model by supplying the 'language' param."
)
elif len(matches) > 1:
raise ValueError(
logger.warning(
"Could not automatically detect from language model name what language it is.\n"
f"\t Found multiple matches: {matches}\n"
"\t Please init the language model by manually supplying the 'language' as a parameter.\n"
f"\t Using {matches[0]} as language parameter for now.\n"
)
language = matches[0]
else:
language = matches[0]
logger.info(
@@ -879,7 +893,7 @@ def forward(
)
# XLNet also only returns the sequence_output (one vec per token)
# We need to manually aggregate that to get a pooled output (one vec per seq)
#TODO verify that this is really doing correct pooling
# TODO verify that this is really doing correct pooling
pooled_output = self.pooler(output_tuple[0])

if self.model.output_hidden_states == True:
@@ -1282,3 +1296,47 @@ def enable_hidden_states_output(self):
def disable_hidden_states_output(self):
self.model.config.output_hidden_states = False


class Camembert(Roberta):
"""
A CamemBERT model that wraps Hugging Face's implementation
(https://github.com/huggingface/transformers) to fit the LanguageModel class.
"""
def __init__(self):
super(Camembert, self).__init__()
self.model = None
self.name = "camembert"

@classmethod
def load(cls, pretrained_model_name_or_path, language=None, **kwargs):
"""
Load a language model either by supplying

* the name of a remote model on s3 ("camembert-base" ...)
* or a local path of a model trained via transformers ("some_dir/huggingface_model")
* or a local path of a model trained via FARM ("some_dir/farm_model")

:param pretrained_model_name_or_path: name or path of a model
:param language: (Optional) Name of language the model was trained for (e.g. "german").
If not supplied, FARM will try to infer it from the model name.
:return: Language Model

"""
camembert = cls()
if "farm_lm_name" in kwargs:
camembert.name = kwargs["farm_lm_name"]
else:
camembert.name = pretrained_model_name_or_path
# We need to differentiate between loading a model saved in the FARM format and one saved in the Pytorch-Transformers format
farm_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json"
if os.path.exists(farm_lm_config):
# FARM style
config = CamembertConfig.from_pretrained(farm_lm_config)
farm_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin"
camembert.model = CamembertModel.from_pretrained(farm_lm_model, config=config, **kwargs)
camembert.language = camembert.model.config.language
else:
# Huggingface Transformers style
camembert.model = CamembertModel.from_pretrained(str(pretrained_model_name_or_path), **kwargs)
camembert.language = cls._get_or_infer_language_from_name(language, pretrained_model_name_or_path)
return camembert
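
A brief usage sketch (illustration, not part of the diff) of how the new class is expected to be reached through FARM's generic loaders; only the public model name "camembert-base" is assumed here (see also the tokenization.py change below):

from farm.modeling.language_model import LanguageModel
from farm.modeling.tokenization import Tokenizer

# "camembert-base" contains "camembert", so LanguageModel.load should dispatch to the
# new Camembert class, and _infer_language_from_name should set language = "french".
language_model = LanguageModel.load("camembert-base")
tokenizer = Tokenizer.load("camembert-base")  # resolved to CamembertTokenizer

print(language_model.language)  # expected: "french"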
5 changes: 5 additions & 0 deletions farm/modeling/tokenization.py
@@ -31,6 +31,7 @@
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.tokenization_xlm_roberta import XLMRobertaTokenizer
from transformers.tokenization_xlnet import XLNetTokenizer
from transformers.tokenization_camembert import CamembertTokenizer

from farm.modeling.wordembedding_utils import load_from_cache, EMBEDDING_VOCAB_FILES_MAP, run_split_on_punc

@@ -69,6 +70,8 @@ def load(cls, pretrained_model_name_or_path, tokenizer_class=None, **kwargs):
tokenizer_class = "XLMRobertaTokenizer"
elif "roberta" in pretrained_model_name_or_path.lower():
tokenizer_class = "RobertaTokenizer"
elif "camembert" in pretrained_model_name_or_path.lower() or "umberto" in pretrained_model_name_or_path:
tokenizer_class = "CamembertTokenizer"
elif "distilbert" in pretrained_model_name_or_path.lower():
tokenizer_class = "DistilBertTokenizer"
elif "bert" in pretrained_model_name_or_path.lower():
Expand Down Expand Up @@ -104,6 +107,8 @@ def load(cls, pretrained_model_name_or_path, tokenizer_class=None, **kwargs):
ret = ElectraTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif tokenizer_class == "EmbeddingTokenizer":
ret = EmbeddingTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
elif tokenizer_class == "CamembertTokenizer":
ret = CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
if ret is None:
raise Exception("Unable to load tokenizer")
else:
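To connect this back to the input_features.py change (illustration only; the texts are placeholders):

from farm.modeling.tokenization import Tokenizer

# CamembertTokenizer returns only zeros in token_type_ids (as noted in the comment added to
# input_features.py), so segment_ids.index(1) cannot locate the passage start; that is why
# sample_to_features_qa now dispatches to get_camembert_seq_2_start for this tokenizer.
tokenizer = Tokenizer.load("camembert-base")
encoded = tokenizer.encode_plus("Qui a écrit ce livre ?", "Un court passage.")
assert all(t == 0 for t in encoded["token_type_ids"])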
4 changes: 3 additions & 1 deletion readme.rst
@@ -66,7 +66,7 @@ Core features
- Simple **deployment** and **visualization** to showcase your model

+------------------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+
| Task | BERT | RoBERTa | XLNet | ALBERT | DistilBERT | XLMRoBERTa |
| Task | BERT | RoBERTa* | XLNet | ALBERT | DistilBERT | XLMRoBERTa |
+==============================+===================+===================+===================+===================+===================+===================+
| Text classification | x | x | x | x | x | x |
+------------------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+
@@ -89,6 +89,8 @@ Core features
| Passage Ranking | x | x | x | x | x | x |
+------------------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+

\* including CamemBERT and UmBERTo

****NEW**** Interested in doing Question Answering at scale? Check out `Haystack <https://github.com/deepset-ai/haystack>`_!

Resources