From be458e3ededf4bed0163f50ac6a83dc5d7dfbea8 Mon Sep 17 00:00:00 2001
From: Yifan Mai
Date: Fri, 1 Dec 2023 17:45:44 -0800
Subject: [PATCH] Fixes

---
 .../tokenizers/huggingface_tokenizer_utils.py | 40 ++++++++++---------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/src/helm/proxy/tokenizers/huggingface_tokenizer_utils.py b/src/helm/proxy/tokenizers/huggingface_tokenizer_utils.py
index a082db7a2d8..4453c1db96b 100644
--- a/src/helm/proxy/tokenizers/huggingface_tokenizer_utils.py
+++ b/src/helm/proxy/tokenizers/huggingface_tokenizer_utils.py
@@ -20,24 +20,28 @@ def create_tokenizer(pretrained_model_name_or_path: str, **kwargs) -> PreTrained
         # TODO: Figure out if we actually need this.
         os.environ["TOKENIZERS_PARALLELISM"] = "False"
 
-        try:
-            # From the Hugging Face documentation, "local_files_only(defaults to False) —
-            # Whether or not to only look at local files".
-            # Running `local_files_only=False` requires an internet connection even if the files are downloaded
-            # and cached. We need to first run with `local_files_only=True` just in case the machine
-            # we are running this code has connection issues. If the tokenizer files are not cached,
-            # we attempt to download them from HuggingFace.
-            # From https://huggingface.co/course/chapter6/3, "slow tokenizers are those written in Python inside
-            # the Hugging Face Transformers library, while the fast versions are the ones provided by Hugging Face
-            # Tokenizers, which are written in Rust." So, use the "fast" version of the tokenizers if available.
-            return AutoTokenizer.from_pretrained(
-                pretrained_model_name_or_path, local_files_only=True, use_fast=True, **kwargs
-            )
-        except OSError:
-            hlog(f"Local files do not exist for HuggingFace tokenizer: {pretrained_model_name_or_path}. Downloading...")
-            return AutoTokenizer.from_pretrained(
-                pretrained_model_name_or_path, local_files_only=False, use_fast=True, **kwargs
-            )
+        with HuggingFacePreTrainedTokenizerFactory._tokenizers_lock:
+            try:
+                # From the Hugging Face documentation, "local_files_only(defaults to False) —
+                # Whether or not to only look at local files".
+                # Running `local_files_only=False` requires an internet connection even if the files are downloaded
+                # and cached. We need to first run with `local_files_only=True` just in case the machine
+                # we are running this code has connection issues. If the tokenizer files are not cached,
+                # we attempt to download them from HuggingFace.
+                # From https://huggingface.co/course/chapter6/3, "slow tokenizers are those written in Python inside
+                # the Hugging Face Transformers library, while the fast versions are the ones provided by Hugging Face
+                # Tokenizers, which are written in Rust." So, use the "fast" version of the tokenizers if available.
+                return AutoTokenizer.from_pretrained(
+                    pretrained_model_name_or_path, local_files_only=True, use_fast=True, **kwargs
+                )
+            except OSError:
+                hlog(
+                    f"Local files do not exist for HuggingFace tokenizer: "
+                    f"{pretrained_model_name_or_path}. Downloading..."
+                )
+                return AutoTokenizer.from_pretrained(
+                    pretrained_model_name_or_path, local_files_only=False, use_fast=True, **kwargs
+                )
 
     @staticmethod
     def get_tokenizer(