
Commit

Add multi-GPU support to HuggingFaceClient (#2762)
yifanmai authored Jun 21, 2024
1 parent d25e4c8 commit 3d1a817
Showing 4 changed files with 59 additions and 36 deletions.
setup.cfg: 8 changes (7 additions & 1 deletion)
@@ -101,7 +101,7 @@ cleva =
langdetect==1.0.9

images =
accelerate~=0.25.0 # For the newer versions of Transformers
crfm-helm[accelerate]
pillow~=10.2

mongo =
@@ -116,9 +116,14 @@ bhasa =
sacrebleu~=2.2.1

# Model extras

accelerate =
accelerate~=0.25

aleph-alpha =
aleph-alpha-client~=2.14.0
tokenizers>=0.13.3

openvino =
optimum[openvino]~=1.19

@@ -158,6 +163,7 @@ yandex =
sentencepiece~=0.1.97

models =
crfm-helm[accelerate]
crfm-helm[aleph-alpha]
crfm-helm[allenai]
crfm-helm[amazon]
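
The dependency change above introduces a dedicated accelerate extra and makes the images and models extras pull it in via crfm-helm[accelerate], since multi-GPU placement with device_map requires the accelerate package. The client code below guards that requirement with a try/except import; a minimal standalone sketch of the same guard is shown here (the require_accelerate helper and the pip command are illustrative, not part of this commit):

def require_accelerate() -> None:
    # Fail fast with an actionable error when the optional dependency is missing.
    try:
        import accelerate  # noqa: F401
    except ModuleNotFoundError as e:
        # helm.common.optional_dependencies.handle_module_not_found_error plays this role in HELM.
        raise ModuleNotFoundError(
            'The "accelerate" package is required for device_map support; '
            'it is expected to be installable via the new extra, e.g. pip install "crfm-helm[accelerate]".'
        ) from e
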
src/helm/clients/huggingface_client.py: 70 changes (36 additions & 34 deletions)
@@ -9,6 +9,7 @@

from helm.common.cache import CacheConfig
from helm.common.hierarchical_logger import htrack_block, hlog
from helm.common.optional_dependencies import handle_module_not_found_error
from helm.common.request import (
wrap_request_time,
EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
@@ -58,60 +59,61 @@ def __init__(
self,
pretrained_model_name_or_path: str,
wrapped_tokenizer: WrappedPreTrainedTokenizer,
openvino=False,
openvino: bool = False,
**kwargs,
):
if torch.cuda.is_available():
hlog("CUDA is available, initializing with a GPU...")
self.device: str = "cuda:0"
self.device: Optional[str]
if "device_map" in kwargs:
try:
import accelerate # noqa: F401
except ModuleNotFoundError as e:
handle_module_not_found_error(e, ["accelerate"])
hlog(f'Hugging Face device_map set to "{kwargs["device_map"]}".')
self.device = None
elif torch.cuda.is_available():
hlog('Hugging Face device set to "cuda:0" because CUDA is available.')
self.device = "cuda:0"
else:
hlog('Hugging Face device set to "cpu" because CUDA is unavailable.')
self.device = "cpu"

# Security issue: currently we trust remote code by default.
# We retain this temporarily to maintain reverse compatibility.
# TODO: Delete if-else and don't set trust_remote_code=True
if "trust_remote_code" not in kwargs:
kwargs["trust_remote_code"] = True

with htrack_block(f"Loading Hugging Face model {pretrained_model_name_or_path}"):
# WARNING this may fail if your GPU does not have enough memory
if openvino:
"""
Optimum Intel provides a simple interface to optimize Transformer models and convert them to \
OpenVINO™ Intermediate Representation (IR) format to accelerate end-to-end pipelines on \
Intel® architectures using OpenVINO™ runtime.
"""
from helm.common.optional_dependencies import handle_module_not_found_error

# Optimum Intel provides a simple interface to optimize Transformer models and convert them to \
# OpenVINO™ Intermediate Representation (IR) format to accelerate end-to-end pipelines on \
# Intel® architectures using OpenVINO™ runtime.
try:
from optimum.intel.openvino import OVModelForCausalLM
except ModuleNotFoundError as e:
handle_module_not_found_error(e, ["openvino"])

self.device = "cpu"
# Security issue: currently we trust remote code by default.
# We retain this temporarily to maintain reverse compatibility.
# TODO: Delete if-else and don't set trust_remote_code=True
if "trust_remote_code" in kwargs:
self.model = OVModelForCausalLM.from_pretrained(
pretrained_model_name_or_path, export=True, **kwargs
).to(self.device)
else:
self.model = OVModelForCausalLM.from_pretrained(
pretrained_model_name_or_path, export=True, trust_remote_code=True, **kwargs
).to(self.device)
self.model = OVModelForCausalLM.from_pretrained(
pretrained_model_name_or_path, export=True, **kwargs
).to(self.device)
elif self.device is None:
# kwargs contains device_map=auto
# Do not call to() because accelerate will take care of model device placement.
self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **kwargs)
else:
# Security issue: currently we trust remote code by default.
# We retain this temporarily to maintain reverse compatibility.
# TODO: Delete if-else and don't set trust_remote_code=True
if "trust_remote_code" in kwargs:
self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **kwargs).to(
self.device
)
else:
self.model = AutoModelForCausalLM.from_pretrained(
pretrained_model_name_or_path, trust_remote_code=True, **kwargs
).to(self.device)
self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **kwargs).to(
self.device
)
self.wrapped_tokenizer = wrapped_tokenizer

def serve_request(self, raw_request: HuggingFaceRequest) -> Dict:
with self.wrapped_tokenizer as tokenizer:
encoded_input = tokenizer(raw_request["prompt"], return_tensors="pt", return_token_type_ids=False).to(
self.device
0 if self.device is None else self.device
)

stopping_criteria: Optional[StoppingCriteriaList] = None
optional_args = {}
if len(raw_request["stop_sequences"]) > 0:
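
In summary, __init__ now distinguishes three cases: if the caller passes device_map (for example device_map="auto"), accelerate must be importable, self.device is left as None, and the model is loaded without a trailing .to() call so accelerate can spread the weights across the available GPUs; otherwise the previous single-device behaviour is kept (cuda:0 when CUDA is available, else cpu). serve_request then places the encoded inputs on device 0 (the first GPU) when device_map is in use. A minimal standalone sketch of the same pattern with plain transformers follows (the model name is only an example, and running the device_map="auto" branch assumes accelerate is installed):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # example model; any causal LM works
use_device_map = torch.cuda.device_count() > 1  # e.g. shard across GPUs when more than one is present

if use_device_map:
    # accelerate decides the placement of each module; do not call .to() afterwards.
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
    input_device = 0  # inputs go on the first device, as in serve_request above
else:
    input_device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = AutoModelForCausalLM.from_pretrained(model_name).to(input_device)

tokenizer = AutoTokenizer.from_pretrained(model_name)
encoded = tokenizer("Hello, world", return_tensors="pt").to(input_device)
output_ids = model.generate(**encoded, max_new_tokens=5)
print(tokenizer.decode(output_ids[0]))
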
src/helm/config/model_deployments.yaml: 16 changes (16 additions & 0 deletions)
@@ -1013,13 +1013,17 @@ model_deployments:
max_sequence_length: 4096
client_spec:
class_name: "helm.clients.huggingface_client.HuggingFaceClient"
args:
device_map: auto

- name: huggingface/openthaigpt-1.0.0-70b-chat
model_name: openthaigpt/openthaigpt-1.0.0-70b-chat
tokenizer_name: huggingface/openthaigpt-1.0.0-7b-chat
max_sequence_length: 4096
client_spec:
class_name: "helm.clients.huggingface_client.HuggingFaceClient"
args:
device_map: auto

## SAIL (SEA AI Lab)
- name: sail/sailor-7b
@@ -1042,13 +1046,17 @@ model_deployments:
max_sequence_length: 32768
client_spec:
class_name: "helm.clients.huggingface_client.HuggingFaceClient"
args:
device_map: auto

- name: sail/sailor-14b-chat
model_name: sail/sailor-14b-chat
tokenizer_name: qwen/qwen1.5-7b
max_sequence_length: 32768
client_spec:
class_name: "helm.clients.huggingface_client.HuggingFaceClient"
args:
device_map: auto

# SambaNova
- name: huggingface/sambalingo-thai-base
@@ -1077,6 +1085,8 @@
class_name: "helm.clients.huggingface_client.HuggingFaceClient"
args:
pretrained_model_name_or_path: sambanovasystems/SambaLingo-Thai-Base-70B
args:
device_map: auto

- name: huggingface/sambalingo-thai-chat-70b
model_name: sambanova/sambalingo-thai-chat-70b
@@ -1086,6 +1096,8 @@
class_name: "helm.clients.huggingface_client.HuggingFaceClient"
args:
pretrained_model_name_or_path: sambanovasystems/SambaLingo-Thai-Base-70B
args:
device_map: auto

## SCB10X
- name: huggingface/typhoon-7b
@@ -1115,13 +1127,17 @@
max_sequence_length: 32768
client_spec:
class_name: "helm.clients.huggingface_client.HuggingFaceClient"
args:
device_map: auto

- name: huggingface/typhoon-v1.5-72b-instruct
model_name: scb10x/typhoon-v1.5-72b-instruct
tokenizer_name: qwen/qwen1.5-7b
max_sequence_length: 32768
client_spec:
class_name: "helm.clients.huggingface_client.HuggingFaceClient"
args:
device_map: auto

# Alibaba DAMO Academy
- name: huggingface/seallm-7b-v2
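
Each of these deployment entries adds device_map: auto under the client's args, which are presumably forwarded as keyword arguments into the Hugging Face loading path shown above, so large checkpoints such as the 70B SambaLingo models can be sharded across several GPUs instead of being forced onto cuda:0. A hedged sketch of what that amounts to (this is not HELM's actual plumbing; the helper name is illustrative):

from typing import Any
from transformers import AutoModelForCausalLM

def load_for_deployment(pretrained_model_name_or_path: str, **kwargs: Any):
    # kwargs mirrors the deployment's args mapping, e.g. {"device_map": "auto"}.
    # When device_map is present, accelerate handles placement, so no .to() call follows.
    return AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **kwargs)

# Roughly what the sambalingo-thai-base-70b entry above configures:
# model = load_for_deployment("sambanovasystems/SambaLingo-Thai-Base-70B", device_map="auto")
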
src/helm/tokenizers/huggingface_tokenizer.py: 1 change (0 additions & 1 deletion)
@@ -53,7 +53,6 @@ def create_tokenizer(pretrained_model_name_or_path: str, **kwargs) -> WrappedPreTrainedTokenizer:
# If unspecified, set `use_fast=True` by default.
if "use_fast" not in from_pretrained_kwargs:
from_pretrained_kwargs["use_fast"] = True
print(from_pretrained_kwargs)
try:
# From the Hugging Face documentation, "local_files_only(defaults to False) —
# Whether or not to only look at local files".
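
The only change in this file removes a leftover debug print(from_pretrained_kwargs). For context, the surrounding code defaults use_fast=True when the caller does not specify it; a minimal sketch of that defaulting pattern follows (the function name is illustrative, not HELM's API):

from transformers import AutoTokenizer

def create_tokenizer_sketch(pretrained_model_name_or_path: str, **kwargs):
    from_pretrained_kwargs = dict(kwargs)
    # Prefer the fast (Rust-backed) tokenizer unless the caller opted out.
    if "use_fast" not in from_pretrained_kwargs:
        from_pretrained_kwargs["use_fast"] = True
    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **from_pretrained_kwargs)
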
