From be458e3ededf4bed0163f50ac6a83dc5d7dfbea8 Mon Sep 17 00:00:00 2001
From: Yifan Mai
Date: Fri, 1 Dec 2023 17:45:44 -0800
Subject: [PATCH] Fixes

---
 .../tokenizers/huggingface_tokenizer_utils.py | 40 ++++++++++---------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/src/helm/proxy/tokenizers/huggingface_tokenizer_utils.py b/src/helm/proxy/tokenizers/huggingface_tokenizer_utils.py
index a082db7a2d8..4453c1db96b 100644
--- a/src/helm/proxy/tokenizers/huggingface_tokenizer_utils.py
+++ b/src/helm/proxy/tokenizers/huggingface_tokenizer_utils.py
@@ -20,24 +20,28 @@ def create_tokenizer(pretrained_model_name_or_path: str, **kwargs) -> PreTrained
         # TODO: Figure out if we actually need this.
         os.environ["TOKENIZERS_PARALLELISM"] = "False"
 
-        try:
-            # From the Hugging Face documentation, "local_files_only(defaults to False) —
-            # Whether or not to only look at local files".
-            # Running `local_files_only=False` requires an internet connection even if the files are downloaded
-            # and cached. We need to first run with `local_files_only=True` just in case the machine
-            # we are running this code has connection issues. If the tokenizer files are not cached,
-            # we attempt to download them from HuggingFace.
-            # From https://huggingface.co/course/chapter6/3, "slow tokenizers are those written in Python inside
-            # the Hugging Face Transformers library, while the fast versions are the ones provided by Hugging Face
-            # Tokenizers, which are written in Rust." So, use the "fast" version of the tokenizers if available.
-            return AutoTokenizer.from_pretrained(
-                pretrained_model_name_or_path, local_files_only=True, use_fast=True, **kwargs
-            )
-        except OSError:
-            hlog(f"Local files do not exist for HuggingFace tokenizer: {pretrained_model_name_or_path}. Downloading...")
-            return AutoTokenizer.from_pretrained(
-                pretrained_model_name_or_path, local_files_only=False, use_fast=True, **kwargs
-            )
+        with HuggingFacePreTrainedTokenizerFactory._tokenizers_lock:
+            try:
+                # From the Hugging Face documentation, "local_files_only(defaults to False) —
+                # Whether or not to only look at local files".
+                # Running `local_files_only=False` requires an internet connection even if the files are downloaded
+                # and cached. We need to first run with `local_files_only=True` just in case the machine
+                # we are running this code has connection issues. If the tokenizer files are not cached,
+                # we attempt to download them from HuggingFace.
+                # From https://huggingface.co/course/chapter6/3, "slow tokenizers are those written in Python inside
+                # the Hugging Face Transformers library, while the fast versions are the ones provided by Hugging Face
+                # Tokenizers, which are written in Rust." So, use the "fast" version of the tokenizers if available.
+                return AutoTokenizer.from_pretrained(
+                    pretrained_model_name_or_path, local_files_only=True, use_fast=True, **kwargs
+                )
+            except OSError:
+                hlog(
+                    f"Local files do not exist for HuggingFace tokenizer: "
+                    f"{pretrained_model_name_or_path}. Downloading..."
+                )
+                return AutoTokenizer.from_pretrained(
+                    pretrained_model_name_or_path, local_files_only=False, use_fast=True, **kwargs
+                )
 
     @staticmethod
     def get_tokenizer(