diff --git a/examples/offline_inference_neuron.py b/examples/offline_inference_neuron.py
index 9392558a6fc00..c1009d50a778e 100644
--- a/examples/offline_inference_neuron.py
+++ b/examples/offline_inference_neuron.py
@@ -1,10 +1,14 @@
 import os
+
 from vllm import LLM, SamplingParams
 
+# Builds the cache for the neuron compiled model.
 os.environ['NEURONX_DUMP_TO'] = "./Cache"
-
+# creates XLA hlo graphs for all the context length buckets.
 os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "512,1024,2048"
+# creates XLA hlo graphs for all the token gen buckets.
 os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "512,1024,2048"
+
 # Sample prompts.
 prompts = [
     "Hello, my name is",