Merge pull request #120 from aws/inf2
Add Inferentia2 and Optimum Neuron Support
philschmid authored May 8, 2024
2 parents c2440c1 + 8c9fc10 commit 4164089
Showing 9 changed files with 95 additions and 70 deletions.
38 changes: 38 additions & 0 deletions README.md
@@ -157,7 +157,14 @@ The custom module can override the following methods:
* `output_fn(prediction, accept)`: overrides the default method for postprocessing; the return value `result` will be the response of your request (e.g. `JSON`). The inputs are `prediction`, the result of the `predict()` method, and `accept`, the accept type from the HTTP request, e.g. `application/json`.
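For example, a minimal `output_fn` that returns JSON could look like the following sketch (an illustration only, assuming the prediction is JSON-serializable):

```python
import json


def output_fn(prediction, accept):
    # Serialize the result of predict() into the format requested by the client.
    if accept == "application/json":
        return json.dumps(prediction)
    raise ValueError(f"Unsupported accept type: {accept}")
```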


## 🏎️ Deploy Models on AWS Inferentia2

The SageMaker Hugging Face Inference Toolkit supports deploying Hugging Face models on AWS Inferentia2. To deploy a model on Inferentia2 you have three options:
* Provide `HF_MODEL_ID`, the repository id of a model on huggingface.co that already contains the compiled model in `.neuron` format, e.g. `optimum/bge-base-en-v1.5-neuronx`.
* Provide the `HF_OPTIMUM_BATCH_SIZE` and `HF_OPTIMUM_SEQUENCE_LENGTH` environment variables to compile the model on the fly, e.g. `HF_OPTIMUM_BATCH_SIZE=1 HF_OPTIMUM_SEQUENCE_LENGTH=128`.
* Include a `neuron` dictionary in the [config.json](https://huggingface.co/optimum/tiny_random_bert_neuron/blob/main/config.json) file of the model archive, e.g. `"neuron": {"static_batch_size": 1, "static_sequence_length": 128}`.

The currently supported tasks can be found [here](https://huggingface.co/docs/optimum-neuron/en/package_reference/supported_models). If you plan to deploy an LLM, we recommend taking a look at [Neuronx TGI](https://huggingface.co/blog/text-generation-inference-on-inferentia2), which is purpose-built for LLMs.
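As an illustration, deploying with the environment-variable option from the SageMaker Python SDK could look like the following sketch. The image URI and IAM role are placeholders, not values from this repository; use your own execution role and a Hugging Face Inference Neuronx DLC available in your region.

```python
from sagemaker.huggingface import HuggingFaceModel

# Placeholders below: substitute your own DLC image URI and execution role ARN.
model = HuggingFaceModel(
    image_uri="<huggingface-pytorch-inference-neuronx-image-uri>",
    role="<sagemaker-execution-role-arn>",
    env={
        "HF_MODEL_ID": "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
        "HF_TASK": "text-classification",
        "HF_OPTIMUM_BATCH_SIZE": "1",
        "HF_OPTIMUM_SEQUENCE_LENGTH": "128",
    },
)

# Deploy on an Inferentia2 instance and send a test request.
predictor = model.deploy(initial_instance_count=1, instance_type="ml.inf2.xlarge")
print(predictor.predict({"inputs": "I like you."}))
```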

---
## 🤝 Contributing
@@ -201,4 +208,35 @@ curl --request POST \
--header 'Content-Type: application/json' \
--data "{\"inputs\": \"Camera\"}" \
--output image.png
```


## Run Inferentia2 Model Locally

_Note: You need to run this on an Inferentia2 instance._

1. Manually change `MMS_CONFIG_FILE` to point to the downloaded SageMaker MMS properties file:
```
wget -O sagemaker-mms.properties https://raw.githubusercontent.com/aws/deep-learning-containers/master/huggingface/build_artifacts/inference/config.properties
```

2. Adjust `handler_service.py` and comment out `if content_type in content_types.UTF8_TYPES:`; that check is needed for SageMaker but cannot be used locally.

3. Run the container:

- transformers `text-classification` with `HF_OPTIMUM_BATCH_SIZE` and `HF_OPTIMUM_SEQUENCE_LENGTH`
```
HF_MODEL_ID="distilbert/distilbert-base-uncased-finetuned-sst-2-english" HF_TASK="text-classification" HF_OPTIMUM_BATCH_SIZE=1 HF_OPTIMUM_SEQUENCE_LENGTH=128 python src/sagemaker_huggingface_inference_toolkit/serving.py
```
- sentence transformers `feature-extraction` with `HF_OPTIMUM_BATCH_SIZE` and `HF_OPTIMUM_SEQUENCE_LENGTH`
```
HF_MODEL_ID="sentence-transformers/all-MiniLM-L6-v2" HF_TASK="feature-extraction" HF_OPTIMUM_BATCH_SIZE=1 HF_OPTIMUM_SEQUENCE_LENGTH=128 python src/sagemaker_huggingface_inference_toolkit/serving.py
```

4. Send a request
```
curl --request POST \
--url http://localhost:8080/invocations \
--header 'Content-Type: application/json' \
--data "{\"inputs\": \"I like you.\"}"
```
2 changes: 1 addition & 1 deletion makefile
@@ -5,7 +5,7 @@ check_dirs := src tests
# run tests

unit-test:
python -m pytest -n auto --dist loadfile -s -v ./tests/unit/
python -m pytest -v -s ./tests/unit/

integ-test:
python -m pytest -n 2 -s -v ./tests/integ/
2 changes: 1 addition & 1 deletion setup.py
@@ -68,7 +68,7 @@


extras["test"] = [
"pytest",
"pytest<8",
"pytest-xdist",
"parameterized",
"psutil",
16 changes: 10 additions & 6 deletions src/sagemaker_huggingface_inference_toolkit/mms_model_server.py
@@ -33,11 +33,11 @@
)

from sagemaker_huggingface_inference_toolkit import handler_service
from sagemaker_huggingface_inference_toolkit.optimum_utils import is_optimum_neuron_available
from sagemaker_huggingface_inference_toolkit.transformers_utils import (
HF_API_TOKEN,
HF_MODEL_REVISION,
_load_model_from_hub,
is_aws_neuron_available,
)


@@ -73,11 +73,6 @@ def start_model_server(handler_service=DEFAULT_HANDLER_SERVICE):
elif use_hf_hub:
# Use different model store directory
model_store = DEFAULT_HF_HUB_MODEL_EXPORT_DIRECTORY
if is_aws_neuron_available():
raise ValueError(
"Hugging Face Hub deployments are currently not supported with AWS Neuron and Inferentia."
"You need to create a `inference.py` script to run your model using AWS Neuron"
)
storage_dir = _load_model_from_hub(
model_id=os.environ["HF_MODEL_ID"],
model_dir=DEFAULT_HF_HUB_MODEL_EXPORT_DIRECTORY,
@@ -90,6 +85,15 @@ def start_model_server(handler_service=DEFAULT_HANDLER_SERVICE):

env = environment.Environment()

# Set the number of workers to available number if optimum neuron is available and not already set
if is_optimum_neuron_available() and os.environ.get("SAGEMAKER_MODEL_SERVER_WORKERS", None) is None:
from optimum.neuron.utils.cache_utils import get_num_neuron_cores

try:
env._model_server_workers = str(get_num_neuron_cores())
except Exception:
env._model_server_workers = "1"

# Note: multi-model default config already sets default_service_handler
handler_service_for_config = None if ENABLE_MULTI_MODEL else handler_service
_create_model_server_config_file(env, handler_service_for_config)
42 changes: 26 additions & 16 deletions src/sagemaker_huggingface_inference_toolkit/optimum_utils.py
@@ -38,21 +38,25 @@ def get_input_shapes(model_dir):
# try to get input shapes from config file
try:
config = AutoConfig.from_pretrained(model_dir)
if hasattr(config, "neuron_batch_size") and hasattr(config, "neuron_sequence_length"):
input_shapes["batch_size"] = config.neuron_batch_size
input_shapes["sequence_length"] = config.neuron_sequence_length
input_shapes_available = True
logger.info(
f"Input shapes found in config file. Using input shapes from config with batch size {input_shapes['batch_size']} and sequence length {input_shapes['sequence_length']}"
)
if os.environ.get("HF_OPTIMUM_BATCH_SIZE", None) is not None:
logger.warning(
"HF_OPTIMUM_BATCH_SIZE environment variable is set. Environment variable will be ignored and input shapes from config file will be used."
)
if os.environ.get("HF_OPTIMUM_SEQUENCE_LENGTH", None) is not None:
logger.warning(
"HF_OPTIMUM_SEQUENCE_LENGTH environment variable is set. Environment variable will be ignored and input shapes from config file will be used."
if hasattr(config, "neuron"):
# check if static batch size and sequence length are available
if config.neuron.get("static_batch_size", None) and config.neuron.get("static_sequence_length", None):
input_shapes["batch_size"] = config.neuron["static_batch_size"]
input_shapes["sequence_length"] = config.neuron["static_sequence_length"]
input_shapes_available = True
logger.info(
f"Input shapes found in config file. Using input shapes from config with batch size {input_shapes['batch_size']} and sequence length {input_shapes['sequence_length']}"
)
else:
# Add warning if environment variables are set but will be ignored
if os.environ.get("HF_OPTIMUM_BATCH_SIZE", None) is not None:
logger.warning(
"HF_OPTIMUM_BATCH_SIZE environment variable is set. Environment variable will be ignored and input shapes from config file will be used."
)
if os.environ.get("HF_OPTIMUM_SEQUENCE_LENGTH", None) is not None:
logger.warning(
"HF_OPTIMUM_SEQUENCE_LENGTH environment variable is set. Environment variable will be ignored and input shapes from config file will be used."
)
except Exception:
input_shapes_available = False

@@ -62,6 +66,11 @@

# extract input shapes from environment variables
sequence_length = os.environ.get("HF_OPTIMUM_SEQUENCE_LENGTH", None)
if sequence_length is None:
raise ValueError(
"HF_OPTIMUM_SEQUENCE_LENGTH environment variable is not set. Please set HF_OPTIMUM_SEQUENCE_LENGTH to a positive integer."
)

if not int(sequence_length) > 0:
raise ValueError(
f"HF_OPTIMUM_SEQUENCE_LENGTH must be set to a positive integer. Current value is {sequence_length}"
@@ -73,10 +82,9 @@
return {"batch_size": int(batch_size), "sequence_length": int(sequence_length)}


# TODO: not used yet, need to sync on how to determine if we are running on inf2 instance
def get_optimum_neuron_pipeline(task, model_dir):
"""Method to get optimum neuron pipeline for a given task. Method checks if task is supported by optimum neuron and if required environment variables are set, in case model is not converted. If all checks pass, optimum neuron pipeline is returned. If checks fail, an error is raised."""
from optimum.neuron.pipelines import NEURONX_SUPPORTED_TASKS, pipeline
from optimum.neuron.pipelines.transformers.base import NEURONX_SUPPORTED_TASKS, pipeline
from optimum.neuron.utils import NEURON_FILE_NAME

# check task support
Expand All @@ -94,6 +102,8 @@ def get_optimum_neuron_pipeline(task, model_dir):

# get static input shapes to run inference
input_shapes = get_input_shapes(model_dir)
# set NEURON_RT_NUM_CORES to 1 to avoid conflicts with multiple HTTP workers
os.environ["NEURON_RT_NUM_CORES"] = "1"
# get optimum neuron pipeline
neuron_pipe = pipeline(task, model=model_dir, export=export, input_shapes=input_shapes)

15 changes: 12 additions & 3 deletions src/sagemaker_huggingface_inference_toolkit/transformers_utils.py
@@ -24,6 +24,10 @@
from transformers.pipelines import Conversation, Pipeline

from sagemaker_huggingface_inference_toolkit.diffusers_utils import get_diffusers_pipeline, is_diffusers_available
from sagemaker_huggingface_inference_toolkit.optimum_utils import (
get_optimum_neuron_pipeline,
is_optimum_neuron_available,
)


if is_tf_available():
@@ -71,6 +75,7 @@ def strtobool(val):
"savedmodel": "*tar.gz",
"openvino": "*openvino*",
"ckpt": "*ckpt",
"neuronx": "*neuron",
}


@@ -202,7 +207,9 @@ def _load_model_from_hub(
# check if safetensors weights are available
if framework == "pytorch":
files = HfApi().model_info(model_id).siblings
if any(f.rfilename.endswith("safetensors") for f in files):
if is_optimum_neuron_available() and any(f.rfilename.endswith("neuron") for f in files):
framework = "neuronx"
elif any(f.rfilename.endswith("safetensors") for f in files):
framework = "safetensors"

# create regex to only include the framework specific weights
@@ -282,8 +289,10 @@ def get_pipeline(task: str, device: int, model_dir: Path, **kwargs) -> Pipeline:
kwargs["feature_extractor"] = model_dir
else:
kwargs["tokenizer"] = model_dir

if TRUST_REMOTE_CODE and os.environ.get("HF_MODEL_ID", None) is not None and device == 0:
# check if optimum neuron is available and tries to load it
if is_optimum_neuron_available():
hf_pipeline = get_optimum_neuron_pipeline(task=task, model_dir=model_dir)
elif TRUST_REMOTE_CODE and os.environ.get("HF_MODEL_ID", None) is not None and device == 0:
tokenizer = AutoTokenizer.from_pretrained(os.environ["HF_MODEL_ID"])

hf_pipeline = pipeline(
15 changes: 6 additions & 9 deletions tests/unit/test_handler_service_without_context.py
@@ -77,9 +77,7 @@ def test_handle(inference_handler):
inference_handler.initialize(CONTEXT)
json_data = json.dumps(INPUT)
prediction = inference_handler.handle([{"body": json_data.encode()}], CONTEXT)
loaded_response = json.loads(prediction[0])
assert "entity" in loaded_response[0]
assert "score" in loaded_response[0]
assert "output" in prediction[0]


@require_torch
@@ -90,13 +88,15 @@ def test_load(inference_handler):
model_dir=tmpdirname,
)
# test with automatic infer
if "HF_TASK" in os.environ:
del os.environ["HF_TASK"]
hf_pipeline_without_task = inference_handler.load(storage_folder)
assert hf_pipeline_without_task.task == "token-classification"

# test with explicit task
os.environ["HF_TASK"] = TASK
os.environ["HF_TASK"] = "text-classification"
hf_pipeline_with_task = inference_handler.load(storage_folder)
assert hf_pipeline_with_task.task == TASK
assert hf_pipeline_with_task.task == "text-classification"


def test_preprocess(inference_handler):
@@ -139,10 +139,7 @@ def test_validate_and_initialize_user_module(inference_handler):
prediction = inference_handler.handle([{"body": b""}], CONTEXT)
assert "output" in prediction[0]

assert inference_handler.load({}) == "model"
assert inference_handler.preprocess({}, "") == "data"
assert inference_handler.predict({}, "model") == "output"
assert inference_handler.postprocess("output", "") == "output"
assert inference_handler.load({}) == "Loading inference_tranform_fn.py"


def test_validate_and_initialize_user_module_transform_fn():
33 changes: 0 additions & 33 deletions tests/unit/test_mms_model_server.py
@@ -13,7 +13,6 @@
# limitations under the License.import os
import os

import pytest
from sagemaker_inference.environment import model_dir

from mock import patch
@@ -186,35 +185,3 @@ def test_start_mms_with_model_from_hub(
subprocess_popen.assert_called_once_with(multi_model_server_cmd)
sigterm.assert_called_once_with(retrieve.return_value)
os.remove(mms_model_server.DEFAULT_HF_HUB_MODEL_EXPORT_DIRECTORY)


@patch("sagemaker_huggingface_inference_toolkit.transformers_utils._aws_neuron_available", return_value=True)
@patch("subprocess.call")
@patch("subprocess.Popen")
@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._retry_retrieve_mms_server_process")
@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._load_model_from_hub")
@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._add_sigterm_handler")
@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._install_requirements")
@patch("os.makedirs", return_value=True)
@patch("os.remove", return_value=True)
@patch("os.path.exists", return_value=True)
@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._create_model_server_config_file")
@patch("sagemaker_huggingface_inference_toolkit.mms_model_server._adapt_to_mms_format")
def test_start_mms_neuron_and_model_from_hub(
adapt,
create_config,
exists,
remove,
dir,
install_requirements,
sigterm,
load_model_from_hub,
retrieve,
subprocess_popen,
subprocess_call,
_aws_neuron_available,
):
with pytest.raises(ValueError):
os.environ["HF_MODEL_ID"] = "lysandre/tiny-bert-random"

mms_model_server.start_model_server()
2 changes: 1 addition & 1 deletion tests/unit/test_optimum_utils.py
@@ -54,7 +54,7 @@ def test_get_input_shapes_from_file():
)
input_shapes = get_input_shapes(model_dir=storage_folder)
assert input_shapes["batch_size"] == 1
assert input_shapes["sequence_length"] == 16
assert input_shapes["sequence_length"] == 32


@require_torch
Expand Down
