
Commit

Add multi-GPU support to HuggingFaceClient (#2762)
yifanmai authored Jun 21, 2024
1 parent d25e4c8 commit 3d1a817
Showing 4 changed files with 59 additions and 36 deletions.
setup.cfg: 8 changes (7 additions & 1 deletion)
@@ -101,7 +101,7 @@ cleva =
langdetect==1.0.9

images =
accelerate~=0.25.0 # For the newer versions of Transformers
crfm-helm[accelerate]
pillow~=10.2

mongo =
@@ -116,9 +116,14 @@ bhasa =
sacrebleu~=2.2.1

# Model extras

accelerate =
accelerate~=0.25

aleph-alpha =
aleph-alpha-client~=2.14.0
tokenizers>=0.13.3

openvino =
optimum[openvino]~=1.19

@@ -158,6 +163,7 @@ yandex =
sentencepiece~=0.1.97

models =
crfm-helm[accelerate]
crfm-helm[aleph-alpha]
crfm-helm[allenai]
crfm-helm[amazon]
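
The dependency change above introduces a dedicated accelerate extra and makes the images and models extras pull it in via crfm-helm[accelerate], since multi-GPU placement with device_map requires the accelerate package. The client code below guards that requirement with a try/except import; a minimal standalone sketch of the same guard is shown here (the require_accelerate helper and the pip command are illustrative, not part of this commit):

def require_accelerate() -> None:
    # Fail fast with an actionable error when the optional dependency is missing.
    try:
        import accelerate  # noqa: F401
    except ModuleNotFoundError as e:
        # helm.common.optional_dependencies.handle_module_not_found_error plays this role in HELM.
        raise ModuleNotFoundError(
            'The "accelerate" package is required for device_map support; '
            'it is expected to be installable via the new extra, e.g. pip install "crfm-helm[accelerate]".'
        ) from e
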
src/helm/clients/huggingface_client.py: 70 changes (36 additions & 34 deletions)
@@ -9,6 +9,7 @@

from helm.common.cache import CacheConfig
from helm.common.hierarchical_logger import htrack_block, hlog
from helm.common.optional_dependencies import handle_module_not_found_error
from helm.common.request import (
wrap_request_time,
EMBEDDING_UNAVAILABLE_REQUEST_RESULT,
@@ -58,60 +59,61 @@ def __init__(
self,
pretrained_model_name_or_path: str,
wrapped_tokenizer: WrappedPreTrainedTokenizer,
openvino=False,
openvino: bool = False,
**kwargs,
):
if torch.cuda.is_available():
hlog("CUDA is available, initializing with a GPU...")
self.device: str = "cuda:0"
self.device: Optional[str]
if "device_map" in kwargs:
try:
import accelerate # noqa: F401
except ModuleNotFoundError as e:
handle_module_not_found_error(e, ["accelerate"])
hlog(f'Hugging Face device_map set to "{kwargs["device_map"]}".')
self.device = None
elif torch.cuda.is_available():
hlog('Hugging Face device set to "cuda:0" because CUDA is available.')
self.device = "cuda:0"
else:
hlog('Hugging Face device set to "cpu" because CUDA is unavailable.')
self.device = "cpu"

# Security issue: currently we trust remote code by default.
# We retain this temporarily to maintain reverse compatibility.
# TODO: Delete if-else and don't set trust_remote_code=True
if "trust_remote_code" not in kwargs:
kwargs["trust_remote_code"] = True

with htrack_block(f"Loading Hugging Face model {pretrained_model_name_or_path}"):
# WARNING this may fail if your GPU does not have enough memory
if openvino:
"""
Optimum Intel provides a simple interface to optimize Transformer models and convert them to \
OpenVINO™ Intermediate Representation (IR) format to accelerate end-to-end pipelines on \
Intel® architectures using OpenVINO™ runtime.
"""
from helm.common.optional_dependencies import handle_module_not_found_error

# Optimum Intel provides a simple interface to optimize Transformer models and convert them to \
# OpenVINO™ Intermediate Representation (IR) format to accelerate end-to-end pipelines on \
# Intel® architectures using OpenVINO™ runtime.
try:
from optimum.intel.openvino import OVModelForCausalLM
except ModuleNotFoundError as e:
handle_module_not_found_error(e, ["openvino"])

self.device = "cpu"
# Security issue: currently we trust remote code by default.
# We retain this temporarily to maintain reverse compatibility.
# TODO: Delete if-else and don't set trust_remote_code=True
if "trust_remote_code" in kwargs:
self.model = OVModelForCausalLM.from_pretrained(
pretrained_model_name_or_path, export=True, **kwargs
).to(self.device)
else:
self.model = OVModelForCausalLM.from_pretrained(
pretrained_model_name_or_path, export=True, trust_remote_code=True, **kwargs
).to(self.device)
self.model = OVModelForCausalLM.from_pretrained(
pretrained_model_name_or_path, export=True, **kwargs
).to(self.device)
elif self.device is None:
# kwargs contains device_map=auto
# Do not call to() because accelerate will take care of model device placement.
self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **kwargs)
else:
# Security issue: currently we trust remote code by default.
# We retain this temporarily to maintain reverse compatibility.
# TODO: Delete if-else and don't set trust_remote_code=True
if "trust_remote_code" in kwargs:
self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **kwargs).to(
self.device
)
else:
self.model = AutoModelForCausalLM.from_pretrained(
pretrained_model_name_or_path, trust_remote_code=True, **kwargs
).to(self.device)
self.model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **kwargs).to(
self.device
)
self.wrapped_tokenizer = wrapped_tokenizer

def serve_request(self, raw_request: HuggingFaceRequest) -> Dict:
with self.wrapped_tokenizer as tokenizer:
encoded_input = tokenizer(raw_request["prompt"], return_tensors="pt", return_token_type_ids=False).to(
self.device
0 if self.device is None else self.device
)

stopping_criteria: Optional[StoppingCriteriaList] = None
optional_args = {}
if len(raw_request["stop_sequences"]) > 0:
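
In summary, __init__ now distinguishes three cases: if the caller passes device_map (for example device_map="auto"), accelerate must be importable, self.device is left as None, and the model is loaded without a trailing .to() call so accelerate can spread the weights across the available GPUs; otherwise the previous single-device behaviour is kept (cuda:0 when CUDA is available, else cpu). serve_request then places the encoded inputs on device 0 (the first GPU) when device_map is in use. A minimal standalone sketch of the same pattern with plain transformers follows (the model name is only an example, and running the device_map="auto" branch assumes accelerate is installed):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # example model; any causal LM works
use_device_map = torch.cuda.device_count() > 1  # e.g. shard across GPUs when more than one is present

if use_device_map:
    # accelerate decides the placement of each module; do not call .to() afterwards.
    model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
    input_device = 0  # inputs go on the first device, as in serve_request above
else:
    input_device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model = AutoModelForCausalLM.from_pretrained(model_name).to(input_device)

tokenizer = AutoTokenizer.from_pretrained(model_name)
encoded = tokenizer("Hello, world", return_tensors="pt").to(input_device)
output_ids = model.generate(**encoded, max_new_tokens=5)
print(tokenizer.decode(output_ids[0]))
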
src/helm/config/model_deployments.yaml: 16 changes (16 additions & 0 deletions)
@@ -1013,13 +1013,17 @@ model_deployments:
max_sequence_length: 4096
client_spec:
class_name: "helm.clients.huggingface_client.HuggingFaceClient"
args:
device_map: auto

- name: huggingface/openthaigpt-1.0.0-70b-chat
model_name: openthaigpt/openthaigpt-1.0.0-70b-chat
tokenizer_name: huggingface/openthaigpt-1.0.0-7b-chat
max_sequence_length: 4096
client_spec:
class_name: "helm.clients.huggingface_client.HuggingFaceClient"
args:
device_map: auto

## SAIL (SEA AI Lab)
- name: sail/sailor-7b
@@ -1042,13 +1046,17 @@ model_deployments:
max_sequence_length: 32768
client_spec:
class_name: "helm.clients.huggingface_client.HuggingFaceClient"
args:
device_map: auto

- name: sail/sailor-14b-chat
model_name: sail/sailor-14b-chat
tokenizer_name: qwen/qwen1.5-7b
max_sequence_length: 32768
client_spec:
class_name: "helm.clients.huggingface_client.HuggingFaceClient"
args:
device_map: auto

# SambaNova
- name: huggingface/sambalingo-thai-base
@@ -1077,6 +1085,8 @@
class_name: "helm.clients.huggingface_client.HuggingFaceClient"
args:
pretrained_model_name_or_path: sambanovasystems/SambaLingo-Thai-Base-70B
args:
device_map: auto

- name: huggingface/sambalingo-thai-chat-70b
model_name: sambanova/sambalingo-thai-chat-70b
@@ -1086,6 +1096,8 @@
class_name: "helm.clients.huggingface_client.HuggingFaceClient"
args:
pretrained_model_name_or_path: sambanovasystems/SambaLingo-Thai-Base-70B
args:
device_map: auto

## SCB10X
- name: huggingface/typhoon-7b
@@ -1115,13 +1127,17 @@
max_sequence_length: 32768
client_spec:
class_name: "helm.clients.huggingface_client.HuggingFaceClient"
args:
device_map: auto

- name: huggingface/typhoon-v1.5-72b-instruct
model_name: scb10x/typhoon-v1.5-72b-instruct
tokenizer_name: qwen/qwen1.5-7b
max_sequence_length: 32768
client_spec:
class_name: "helm.clients.huggingface_client.HuggingFaceClient"
args:
device_map: auto

# Alibaba DAMO Academy
- name: huggingface/seallm-7b-v2
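
Each of these deployment entries adds device_map: auto under the client's args, which are presumably forwarded as keyword arguments into the Hugging Face loading path shown above, so large checkpoints such as the 70B SambaLingo models can be sharded across several GPUs instead of being forced onto cuda:0. A hedged sketch of what that amounts to (this is not HELM's actual plumbing; the helper name is illustrative):

from typing import Any
from transformers import AutoModelForCausalLM

def load_for_deployment(pretrained_model_name_or_path: str, **kwargs: Any):
    # kwargs mirrors the deployment's args mapping, e.g. {"device_map": "auto"}.
    # When device_map is present, accelerate handles placement, so no .to() call follows.
    return AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **kwargs)

# Roughly what the sambalingo-thai-base-70b entry above configures:
# model = load_for_deployment("sambanovasystems/SambaLingo-Thai-Base-70B", device_map="auto")
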
src/helm/tokenizers/huggingface_tokenizer.py: 1 change (0 additions & 1 deletion)
@@ -53,7 +53,6 @@ def create_tokenizer(pretrained_model_name_or_path: str, **kwargs) -> WrappedPreTrainedTokenizer:
# If unspecified, set `use_fast=True` by default.
if "use_fast" not in from_pretrained_kwargs:
from_pretrained_kwargs["use_fast"] = True
print(from_pretrained_kwargs)
try:
# From the Hugging Face documentation, "local_files_only(defaults to False) —
# Whether or not to only look at local files".
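
The only change in this file removes a leftover debug print(from_pretrained_kwargs). For context, the surrounding code defaults use_fast=True when the caller does not specify it; a minimal sketch of that defaulting pattern follows (the function name is illustrative, not HELM's API):

from transformers import AutoTokenizer

def create_tokenizer_sketch(pretrained_model_name_or_path: str, **kwargs):
    from_pretrained_kwargs = dict(kwargs)
    # Prefer the fast (Rust-backed) tokenizer unless the caller opted out.
    if "use_fast" not in from_pretrained_kwargs:
        from_pretrained_kwargs["use_fast"] = True
    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **from_pretrained_kwargs)
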
