Skip to content

Commit

Permalink
Fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
yifanmai committed Dec 2, 2023
1 parent 00fdb00 commit be458e3
Showing 1 changed file with 22 additions and 18 deletions.
40 changes: 22 additions & 18 deletions src/helm/proxy/tokenizers/huggingface_tokenizer_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,24 +20,28 @@ def create_tokenizer(pretrained_model_name_or_path: str, **kwargs) -> PreTrained
# TODO: Figure out if we actually need this.
os.environ["TOKENIZERS_PARALLELISM"] = "False"

try:
# From the Hugging Face documentation, "local_files_only(defaults to False) —
# Whether or not to only look at local files".
# Running `local_files_only=False` requires an internet connection even if the files are downloaded
# and cached. We need to first run with `local_files_only=True` just in case the machine
# we are running this code on has connection issues. If the tokenizer files are not cached,
# we attempt to download them from HuggingFace.
# From https://huggingface.co/course/chapter6/3, "slow tokenizers are those written in Python inside
# the Hugging Face Transformers library, while the fast versions are the ones provided by Hugging Face
# Tokenizers, which are written in Rust." So, use the "fast" version of the tokenizers if available.
return AutoTokenizer.from_pretrained(
pretrained_model_name_or_path, local_files_only=True, use_fast=True, **kwargs
)
except OSError:
hlog(f"Local files do not exist for HuggingFace tokenizer: {pretrained_model_name_or_path}. Downloading...")
return AutoTokenizer.from_pretrained(
pretrained_model_name_or_path, local_files_only=False, use_fast=True, **kwargs
)
with HuggingFacePreTrainedTokenizerFactory._tokenizers_lock:
try:
# From the Hugging Face documentation, "local_files_only(defaults to False) —
# Whether or not to only look at local files".
# Running `local_files_only=False` requires an internet connection even if the files are downloaded
# and cached. We need to first run with `local_files_only=True` just in case the machine
# we are running this code on has connection issues. If the tokenizer files are not cached,
# we attempt to download them from HuggingFace.
# From https://huggingface.co/course/chapter6/3, "slow tokenizers are those written in Python inside
# the Hugging Face Transformers library, while the fast versions are the ones provided by Hugging Face
# Tokenizers, which are written in Rust." So, use the "fast" version of the tokenizers if available.
return AutoTokenizer.from_pretrained(
pretrained_model_name_or_path, local_files_only=True, use_fast=True, **kwargs
)
except OSError:
hlog(
f"Local files do not exist for HuggingFace tokenizer: "
f"{pretrained_model_name_or_path}. Downloading..."
)
return AutoTokenizer.from_pretrained(
pretrained_model_name_or_path, local_files_only=False, use_fast=True, **kwargs
)

@staticmethod
def get_tokenizer(
Expand Down

0 comments on commit be458e3

Please sign in to comment.