[WIP] Enable generation of AOT compiled artifacts for llama2 on inf2 example #2733
Changes from all commits: 7cf2937, 6495fc3, 9a53fb4, 4a240ff, aeae4f6
Example README:
@@ -10,9 +10,9 @@ The batch size and micro batch size configurations are present in [model-config.
The batch size is chosen to be a relatively large value, say 16, since micro batching enables running the preprocess (tokenization) and inference steps in parallel on the micro batches. The micro batch size is the batch size used for the Inf2 model compilation.
Since the compilation batch size can influence compile time and is also constrained by the Inf2 instance type, it is chosen to be a relatively smaller value, say 4.
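For illustration only, here is a minimal Python sketch (not TorchServe's micro-batching implementation) of the idea described above: a frontend batch of 16 is split into micro batches of 4, and tokenization of later micro batches can overlap with inference on earlier ones. The `tokenize` and `infer` functions are placeholders.

```python
from concurrent.futures import ThreadPoolExecutor

BATCH_SIZE = 16        # frontend batch size
MICRO_BATCH_SIZE = 4   # compilation batch size for the Inf2 model


def tokenize(micro_batch):
    # Placeholder for the preprocess (tokenization) step.
    return [f"tokens({item})" for item in micro_batch]


def infer(token_batch):
    # Placeholder for the Inf2 inference step.
    return [f"generated({tokens})" for tokens in token_batch]


def micro_batched_inference(batch):
    micro_batches = [
        batch[i : i + MICRO_BATCH_SIZE] for i in range(0, len(batch), MICRO_BATCH_SIZE)
    ]
    results = []
    with ThreadPoolExecutor(max_workers=2) as pool:
        # Tokenization of later micro batches runs in the pool while inference on
        # earlier micro batches proceeds on the main thread.
        token_futures = [pool.submit(tokenize, mb) for mb in micro_batches]
        for future in token_futures:
            results.extend(infer(future.result()))
    return results


print(micro_batched_inference([f"prompt-{i}" for i in range(BATCH_SIZE)]))
```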
-This example also demonstrates the utilization of neuronx cache to store inf2 model compilation artifacts using the `NEURONX_CACHE` and `NEURONX_DUMP_TO` environment variables in the custom handler.
-When the model is loaded for the first time, the model is compiled for the configured micro batch size and the compilation artifacts are saved to the neuronx cache.
-On subsequent model load, the compilation artifacts in the neuronx cache serves as `Ahead of Time(AOT)` compilation artifacts and significantly reduces the model load time.
+This example also demonstrates the utilization of [Neuron Persistent Cache](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-features/neuron-caching.html) for inf2 model compilation artifacts using the `NEURONX_CACHE` and `NEURONX_DUMP_TO` environment variables.
+When the model is loaded for the first time, the model is compiled for the configured micro batch size and the compilation artifacts are saved to the Neuron persistent cache.
+On subsequent model loads, the compilation artifacts in the Neuron persistent cache serve as `Ahead of Time (AOT)` compilation artifacts and significantly reduce the model load time.
For convenience, the compiled model artifacts for this example are made available in the TorchServe model zoo: `s3://torchserve/mar_files/llama-2-13b-neuronx-b4`\
Instructions on how to use the AOT compiled model artifacts are shown below.
@@ -78,7 +78,7 @@ huggingface-cli login
Run the `inf2_save_split_checkpoints.py` script
```bash
-python ../util/inf2_save_split_checkpoints.py --model_name meta-llama/Llama-2-13b-hf --save_path './llama-2-13b-split'
+python ../util/inf2_save_split_checkpoints.py --model_name meta-llama/Llama-2-13b-hf --save_path './llama-2-13b-split' generate_neuron_cache --neuron_cache_dir './neuron_cache' --batch_size 4 --amp 'bf16' --tp_degree 6
```
Comment: nit: can you link to some official docs describing what the tp degree means?

Reply: This section of the Neuron documentation has a description of what `tp_degree` means.
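As background on that parameter, here is a small illustrative Python sketch, not part of the PR and based on the Neuron documentation referenced in the reply above: `tp_degree` is the tensor-parallelism degree, i.e. the number of NeuronCores the model weights are sharded across, and it cannot exceed the NeuronCores available on the instance. The instance figures below are assumptions about inf2.24xlarge.

```python
# tp_degree relative to available NeuronCores (illustrative numbers).
TP_DEGREE = 6               # value passed via --tp_degree in the command above
NEURON_CORES_PER_CHIP = 2   # each Inferentia2 chip exposes two NeuronCore-v2 cores
INFERENTIA2_CHIPS = 6       # assumed: an inf2.24xlarge carries 6 Inferentia2 chips

available_cores = NEURON_CORES_PER_CHIP * INFERENTIA2_CHIPS
assert TP_DEGREE <= available_cores, "tp_degree cannot exceed the available NeuronCores"
print(f"Sharding model weights across {TP_DEGREE} of {available_cores} NeuronCores")
```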
@@ -87,6 +87,7 @@ python ../util/inf2_save_split_checkpoints.py --model_name meta-llama/Llama-2-13
```bash
torch-model-archiver --model-name llama-2-13b --version 1.0 --handler inf2_handler.py -r requirements.txt --config-file model-config.yaml --archive-format no-archive
mv llama-2-13b-split llama-2-13b
+mv neuron_cache llama-2-13b
```
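For clarity, a hypothetical sanity check (not part of the example) of the layout those `mv` commands produce: both the split checkpoint and the neuron cache are expected to end up inside the `llama-2-13b` directory created by `torch-model-archiver` with `--archive-format no-archive`.

```python
import os

# Expected directory layout after the archiver and mv steps above.
for path in ("llama-2-13b/llama-2-13b-split", "llama-2-13b/neuron_cache"):
    print(path, "found" if os.path.isdir(path) else "missing")
```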
### Step 5: Add the model artifacts to model store
`inf2_save_split_checkpoints.py`:
@@ -4,6 +4,7 @@
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from transformers.models.opt import OPTForCausalLM
+from transformers_neuronx.llama.model import LlamaForSampling
from transformers_neuronx.module import save_pretrained_split

os.environ["NEURON_CC_FLAGS"] = "--model-type=transformer-inference"
@@ -40,6 +41,26 @@ def opt_amp_callback(model: OPTForCausalLM, dtype: torch.dtype) -> None:
    default="./model-splits",
    help="Output directory for downloaded model files",
)
+subparsers = parser.add_subparsers(dest="subparser")
+parser_neuron_cache = subparsers.add_parser("generate_neuron_cache")
Comment: these don't feel like they should be required?

Reply: I've included the
+parser_neuron_cache.add_argument(
+    "--neuron_cache_dir",
+    type=str,
+    required=True,
+    help="Target directory to store neuronx-cc compiled model",
+)
+parser_neuron_cache.add_argument(
+    "--batch_size", type=int, required=True, help="Batch size for the compiled model"
+)
+parser_neuron_cache.add_argument(
+    "--amp", type=str, required=True, help="Automatic mixed precision"
+)
+parser_neuron_cache.add_argument(
+    "--tp_degree",
+    type=int,
+    required=True,
+    help="Tensor parallelism degree for the compiled model",
+)
args = parser.parse_args()

save_path = create_directory_if_not_exists(args.save_path)
@@ -62,3 +83,26 @@ def opt_amp_callback(model: OPTForCausalLM, dtype: torch.dtype) -> None:
tokenizer.save_pretrained(args.save_path)

print(f"Files for '{args.model_name}' have been downloaded to '{args.save_path}'.")

+if args.subparser == "generate_neuron_cache":
+    os.environ["NEURONX_CACHE"] = "on"
+    os.environ["NEURONX_DUMP_TO"] = create_directory_if_not_exists(
+        args.neuron_cache_dir
+    )
Comment on lines +88 to +91:

Comment: According to Mike's update, NEURON_COMPILE_CACHE_URL is the official setting. Check https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-features/neuron-caching.html?highlight=NEURON_COMPILE_CACHE_URL#neuron-persistent-cache

Reply: I tested this and it seems to work with the latest SDK versions

Reply: Also tested loading. So, will retain the example to use SDK

Reply: The root cause of this error is a bug with the 2.14 compiler. "NEURON_COMPILE_CACHE_URL" is still recommended by the inf2 team since the old way generates too much debug log. Please keep this PR open until the bug fix is verified with the Neuron SDK 2.15 compiler.
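For reference, a minimal sketch of the alternative raised in the thread above, not what this PR implements: newer Neuron SDK releases configure the persistent cache location through the `NEURON_COMPILE_CACHE_URL` variable described in the Neuron caching docs linked above. The cache path below is illustrative.

```python
import os

# Point the Neuron persistent cache at a local directory (illustrative path).
os.environ["NEURON_COMPILE_CACHE_URL"] = "./neuron_cache"
# With this set, neuronx-cc writes compiled artifacts to and reads them from ./neuron_cache,
# replacing the NEURONX_CACHE / NEURONX_DUMP_TO pair used in this example.
```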
+    os.environ["NEURON_CC_FLAGS"] = "--model-type=transformer-inference"
+
+    if hf_model_config.model_type == "llama":
+        model = LlamaForSampling.from_pretrained(
+            args.save_path,
+            batch_size=args.batch_size,
+            amp=args.amp,
+            tp_degree=args.tp_degree,
+        )
+    else:
+        raise RuntimeError(
+            f"Neuron cache generation for model {args.model_name} not supported"
+        )
+
+    print(f"Compiling '{args.model_name}'")
+    model.to_neuron()
+    print(f"Neuron cache for '{args.model_name}' saved to {args.neuron_cache_dir}")
Comment: Some docs feel missing, so presumably Neuron is a JIT compiler and you're warming up a cache? Or is it an AOT compiler and you are saving the compiled artifacts, in which case it's not really a cache but a serialized compiled model?

Reply: Here we actually have both. The `neuronx-cc` JIT cache is used if the compilation artifacts are present there, and if not, the Neuron persistent cache is checked to see if the compiled artifacts are present. The contents of the Neuron persistent cache are what we are generating here to speed up the first model load. More documentation is available here: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-features/neuron-caching.html Will include these details in the Readme as well.
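To round out that explanation, here is a rough sketch (not part of the PR) of the consumer side of the persistent cache, reusing the same calls and values shown in this example (`batch_size 4`, `amp 'bf16'`, `tp_degree 6`, checkpoint at `./llama-2-13b-split`). With the cache generated by `inf2_save_split_checkpoints.py` in place, `to_neuron()` should pick up the precompiled artifacts instead of recompiling, which is where the AOT load-time saving comes from. It requires an Inf2 host with the Neuron SDK installed.

```python
import os
import time

from transformers_neuronx.llama.model import LlamaForSampling

os.environ["NEURONX_CACHE"] = "on"
os.environ["NEURONX_DUMP_TO"] = "./neuron_cache"  # cache produced ahead of time

start = time.time()
model = LlamaForSampling.from_pretrained(
    "./llama-2-13b-split", batch_size=4, amp="bf16", tp_degree=6
)
model.to_neuron()  # loads cached compilation artifacts when present
print(f"Model load took {time.time() - start:.1f}s")
```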