
TGI improvements (#522)
* examples(generation): log model loading time

* chore: bump dev version

* fix(tgi): do not override generation parameters

When a generation parameter is not specified in a TGI request, its value comes through as zero and
should not override the corresponding default generation parameter (a minimal sketch of this guard follows the list below).

* fix(tgi): avoid loading model twice when exporting

* fix(tgi): make sure MAX_BATCH_SIZE is set on SageMaker

* fix(tgi): update error message

* fix(tgi): clear pending requests after warmup
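
The parameter-override fix amounts to treating zero as "unset". Below is a minimal, hedged sketch of that pattern, not the exact server code; `params` stands in for the `request.parameters` protobuf message used by TGI (an assumption for illustration).

# Sketch of the "zero means unset" guard described above.
# `params` is assumed to expose the TGI request parameter fields.
from copy import deepcopy

from transformers.generation import GenerationConfig


def merge_request_params(defaults: GenerationConfig, params) -> GenerationConfig:
    """Return a copy of `defaults` where only non-zero request values are applied."""
    config = deepcopy(defaults)
    config.do_sample = params.do_sample
    if config.do_sample:
        # A zero value means the client did not set the parameter,
        # so the model's default generation parameter is kept.
        if params.temperature != 0:
            config.temperature = params.temperature
        if params.top_k != 0:
            config.top_k = params.top_k
        if params.top_p != 0:
            config.top_p = params.top_p
        if params.typical_p != 0:
            config.typical_p = params.typical_p
    if params.repetition_penalty != 0:
        config.repetition_penalty = params.repetition_penalty
    return config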
dacorvo authored Mar 21, 2024
1 parent 4a7df1a commit 59b5c16
Showing 7 changed files with 66 additions and 57 deletions.
3 changes: 3 additions & 0 deletions examples/text-generation/generation.py
@@ -82,7 +82,10 @@ def generate(model, tokenizer, prompts, length, temperature):
else:
if args.seed is not None:
set_seed(args.seed)
start = time.time()
model = NeuronModelForCausalLM.from_pretrained(args.model, export=False, low_cpu_mem_usage=True)
end = time.time()
print(f"Neuron model loaded in {end - start:.2f} s.")
batch_size = model.config.neuron["batch_size"]
prompts = args.prompts.split("|")
if len(prompts) < batch_size:
2 changes: 1 addition & 1 deletion optimum/neuron/version.py
@@ -12,6 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.

__version__ = "0.0.20.dev0"
__version__ = "0.0.21.dev0"

__sdk_version__ = "2.17.0"
5 changes: 5 additions & 0 deletions text-generation-inference/sagemaker-entrypoint.sh
@@ -14,4 +14,9 @@ if [[ -n "${HF_MODEL_TRUST_REMOTE_CODE}" ]]; then
export TRUST_REMOTE_CODE="${HF_MODEL_TRUST_REMOTE_CODE}"
fi

if [[ -z "${MAX_BATCH_SIZE}" ]]; then
echo "MAX_BATCH_SIZE must be set to the model static batch size"
exit 1
fi

text-generation-launcher --port 8080
@@ -55,11 +55,9 @@ def serve(
logger.warning("'trust_remote_code' argument is not supported and will be ignored.")

# Import here after the logger is added to log potential import exceptions
from .model import fetch_model
from .server import serve

model_path = fetch_model(model_id, revision)
serve(model_path, uds_path)
serve(model_id, revision, uds_path)


@app.command()
@@ -7,12 +7,13 @@

import torch
from loguru import logger
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase
from transformers.generation import GenerationConfig

from optimum.neuron import NeuronModelForCausalLM
from optimum.neuron.generation import TokenSelector

from .model import get_export_kwargs_from_env
from .pb.generate_pb2 import (
Batch,
CachedBatch,
@@ -152,13 +153,19 @@ def assign(self, request: Request, generation_config: GenerationConfig):
self._request_id = request.id
self._inputs = request.inputs
self._generation_config = copy.deepcopy(generation_config)
# Update generation config with token chooser parameters
self._generation_config.temperature = request.parameters.temperature
self._generation_config.top_k = request.parameters.top_k
self._generation_config.top_p = request.parameters.top_p
self._generation_config.typical_p = request.parameters.typical_p
# Update generation config with request parameters
self._generation_config.do_sample = request.parameters.do_sample
self._generation_config.repetition_penalty = request.parameters.repetition_penalty
if self._generation_config.do_sample:
if request.parameters.temperature != 0:
self._generation_config.temperature = request.parameters.temperature
if request.parameters.top_k != 0:
self._generation_config.top_k = request.parameters.top_k
if request.parameters.top_p != 0:
self._generation_config.top_p = request.parameters.top_p
if request.parameters.typical_p != 0:
self._generation_config.typical_p = request.parameters.typical_p
if request.parameters.repetition_penalty != 0:
self._generation_config.repetition_penalty = request.parameters.repetition_penalty
self.seed = request.parameters.seed
# TODO: watermark
self._generation_config.max_new_tokens = request.stopping_parameters.max_new_tokens
@@ -327,6 +334,7 @@ def warmup(self, batch: Batch) -> int:
f"Inconsistent server configuration: please make sure max-prefill-tokens does not exceed {batch_size} x max-input-length."
)
self.prefill(batch)
self.clear()
return self.model.batch_size * self.model.max_length

def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
@@ -347,7 +355,7 @@ def prefill(self, batch: Batch) -> Tuple[List[Generation], CachedBatch]:
if len(empty_slots) < len(batch.requests):
raise ValueError(
f"Cannot prefill {len(batch.requests)} new request(s) with only {len(empty_slots)} empty slots."
f"Please align the number of concurrent requests with the static batch size: {self.model.batch_size}."
f" Please align max_batch_size with the static batch size: {self.model.batch_size}."
)
# Assign each request to an empty slot
logger.debug(f"Prefilling {len(batch.requests)} new request(s) with {len(empty_slots)} empty slot(s)")
@@ -538,23 +546,29 @@ def _clear(self, request_ids: List):
slot.clear()

@classmethod
def from_pretrained(
cls,
model_path: str,
):
def from_pretrained(cls, model_id: str, revision: str = None):
"""Instantiate a NeuronGenerator.
Args:
model_path (`str`):
The path to a local neuron model. This path must also contain a Tokenizer.
model_id (`str`):
A hub model id or the path to a local model. This path must also contain a Tokenizer.
revision (`Optional[str]`, defaults to `None`):
The revision of the model on the HuggingFace hub.
Returns:
A NeuronGenerator.
"""
logger.info("Loading model on neuron devices (this can take a few minutes).")
config = AutoConfig.from_pretrained(model_id)
neuron_config = getattr(config, "neuron", None)
start = time.time()
model = NeuronModelForCausalLM.from_pretrained(model_path)
if neuron_config is None:
export_kwargs = get_export_kwargs_from_env()
logger.info(f"Exporting model to neuron with config: {export_kwargs}.")
model = NeuronModelForCausalLM.from_pretrained(model_id, revision=revision, export=True, **export_kwargs)
else:
logger.info("Loading model on neuron devices (this can take a few minutes).")
model = NeuronModelForCausalLM.from_pretrained(model_id, revision=revision)
end = time.time()
logger.info(f"Model successfully loaded in {end - start:.2f} s.")
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
return cls(model, tokenizer)
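
For reference, a hedged usage sketch of the updated entry point; the module path and the model id below are assumptions for illustration, not part of this commit.

# Usage sketch (assumptions: `generator` module name, placeholder model id).
from text_generation_server.generator import NeuronGenerator

# The generator now takes a hub model id (or local path) plus an optional revision.
# If the model config has no `neuron` section it is exported on the fly using the
# export kwargs read from the environment; otherwise it is loaded directly.
generator = NeuronGenerator.from_pretrained("my-org/my-neuron-llm", revision=None)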
52 changes: 20 additions & 32 deletions text-generation-inference/server/text_generation_server/model.py
@@ -6,10 +6,10 @@
from huggingface_hub import snapshot_download
from huggingface_hub.constants import HF_HUB_CACHE
from loguru import logger
from transformers import AutoConfig, AutoTokenizer, GenerationConfig
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig

from optimum.neuron import NeuronModelForCausalLM
from optimum.neuron.utils import ModelCacheEntry, get_hub_cached_entries
from optimum.neuron.utils import get_hub_cached_entries


def get_export_kwargs_from_env():
@@ -74,59 +74,47 @@ def fetch_model(
The revision of the model on the HuggingFace hub.
Returns:
Local folder path (string) of the model.
A string corresponding to the model_id or path.
"""
if not os.path.isdir("/sys/class/neuron_device/"):
raise SystemError("No neuron cores detected on the host.")
if os.path.isdir(model_id):
if revision is not None:
logger.warning("Revision {} ignored for local model at {}".format(revision, model_id))
return model_id
if os.path.isdir(model_id) and revision is not None:
logger.warning("Revision {} ignored for local model at {}".format(revision, model_id))
revision = None
# Download the model from the Hub (HUGGING_FACE_HUB_TOKEN must be set for a private or gated model)
# Note that the model may already be present in the cache.
config = AutoConfig.from_pretrained(model_id, revision=revision)
neuron_config = getattr(config, "neuron", None)
log_cache_size()
if neuron_config is not None:
if os.path.isdir(model_id):
return model_id
# Prefetch the neuron model from the Hub
logger.info(f"Fetching revision [{revision}] for neuron model {model_id} under {HF_HUB_CACHE}")
log_cache_size()
return snapshot_download(model_id, revision=revision)
# Not a neuron model: evaluate the export config and check if it has been exported locally
# Model needs to be exported: look for compatible cached entries on the hub
export_kwargs = get_export_kwargs_from_env()
export_config = NeuronModelForCausalLM.get_export_config(model_id, config, revision=revision, **export_kwargs)
entry = ModelCacheEntry(model_id, export_config)
export_path = f"{HF_HUB_CACHE}/{entry.hash}"
if os.path.exists(export_path):
# The model has already been exported for that configuration
logger.info(f"Neuron model for {model_id} with {export_config.neuron} found under {export_path}.")
return export_path
# Look for compatible cached entries on the hub
neuron_config = export_config.neuron
if not is_cached(model_id, neuron_config):
error_msg = (
f"No cached version found for {model_id} with {neuron_config}."
"You can start a discussion to request it on https://huggingface.co/aws-neuron/optimum-neuron-cache."
)
raise ValueError(error_msg)
# Export the model
logger.warning(f"{model_id} is not a neuron model: it will be exported using cached artifacts.")
start = time.time()
logger.info(f"Exporting model to neuron with config {neuron_config}.")
# Prefetch weights, tokenizer and generation config so that they are in cache
log_cache_size()
start = time.time()
model = NeuronModelForCausalLM.from_pretrained(model_id, export=True, **export_kwargs)
AutoModelForCausalLM.from_pretrained(model_id, revision=revision)
mid = time.time()
logger.info(f"Model weights fetched in {mid - start:.2f} s.")
AutoTokenizer.from_pretrained(model_id, revision=revision)
end = time.time()
logger.info(f"Model successfully exported in {end - start:.2f} s.")
logger.info(f"Saving exported model to local storage under {export_path}.")
log_cache_size()
model.save_pretrained(export_path)
logger.info(f"Saving model tokenizer under {export_path}.")
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
tokenizer.save_pretrained(export_path)
logger.info(f"Tokenizer fetched in {end - mid:.2f} s.")
try:
config = GenerationConfig.from_pretrained(model_id, revision=revision)
config.save_pretrained(export_path)
logger.info(f"Saved model default generation config under {export_path}.")
GenerationConfig.from_pretrained(model_id, revision=revision)
except Exception:
logger.warning(f"No default generation config found for {model_id}.")
logger.info(f"Model successfully exported in {end - start:.2f} s under {export_path}.")
return export_path
log_cache_size()
return model_id
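
A short, hedged illustration of the new fetch_model contract: it validates the host and prefetches artifacts, but no longer performs the export itself. The placeholder model id is an assumption.

# Sketch of the return-value change: fetch_model now hands back a model id or
# local snapshot path instead of an export directory.
from text_generation_server.model import fetch_model

# Neuron models resolve to a local snapshot path; non-neuron models are checked
# against the hub cache registry and returned as-is, with the actual export
# deferred to NeuronGenerator.from_pretrained.
model_id_or_path = fetch_model("my-org/my-neuron-llm", revision=None)
print(model_id_or_path)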
@@ -49,16 +49,17 @@ async def Decode(self, request, context):


def serve(
model_path: str,
model_id: str,
revision: str,
uds_path: Path,
):
async def serve_inner(model_path: str):
async def serve_inner(model_id: str, revision: str):
unix_socket_template = "unix://{}-{}"
local_url = unix_socket_template.format(uds_path, 0)
server_urls = [local_url]

try:
generator = NeuronGenerator.from_pretrained(model_path)
generator = NeuronGenerator.from_pretrained(model_id, revision)
except Exception:
logger.exception("Error when initializing model")
raise
@@ -84,4 +85,4 @@ async def serve_inner(model_path: str):
logger.info("Signal received. Shutting down")
await server.stop(0)

asyncio.run(serve_inner(model_path))
asyncio.run(serve_inner(model_id, revision))
