diff --git a/doc/source/custom_directives.py b/doc/source/custom_directives.py index 6e81d401c833e..2683160332d78 100644 --- a/doc/source/custom_directives.py +++ b/doc/source/custom_directives.py @@ -481,6 +481,7 @@ def key(cls: type) -> str: class Framework(ExampleEnum): """Framework type for example metadata.""" + AWSNEURON = "AWS Neuron" PYTORCH = "PyTorch" LIGHTNING = "Lightning" TRANSFORMERS = "Transformers" diff --git a/doc/source/train/examples.yml b/doc/source/train/examples.yml index 8b4f1c7cf8f24..0e7f6725e1005 100644 --- a/doc/source/train/examples.yml +++ b/doc/source/train/examples.yml @@ -119,7 +119,17 @@ examples: contributor: community link: examples/intel_gaudi/llama_pretrain - - title: Fine-tune a Llama-2 text generation models with DeepSpeed and Hugging Face Accelerate + - title: Fine-tune Llama3.1 with AWS Trainium + frameworks: + - pytorch + - aws neuron + skill_level: advanced + use_cases: + - natural language processing + - large language models + contributor: community + link: examples/aws-trainium/llama3 + - title: Fine-tune a Llama-2 text generation model with DeepSpeed and Hugging Face Accelerate frameworks: - accelerate - deepspeed diff --git a/doc/source/train/examples/aws-trainium/llama3.rst b/doc/source/train/examples/aws-trainium/llama3.rst new file mode 100644 index 0000000000000..ee7b89faf39ee --- /dev/null +++ b/doc/source/train/examples/aws-trainium/llama3.rst @@ -0,0 +1,103 @@ +:orphan: + +Distributed fine-tuning of Llama 3.1 8B on AWS Trainium with Ray and PyTorch Lightning +====================================================================================== + + +This example demonstrates how to fine-tune the `Llama 3.1 8B <https://huggingface.co/meta-llama/Llama-3.1-8B>`__ model on `AWS +Trainium <https://aws.amazon.com/ai/machine-learning/trainium/>`__ instances using Ray Train, PyTorch Lightning, and the AWS Neuron SDK. + +AWS Trainium is the machine learning (ML) chip that AWS built for deep +learning (DL) training of 100B+ parameter models. The `AWS Neuron +SDK <https://awsdocs-neuron.readthedocs-hosted.com/en/latest/>`__ helps developers train models on Trainium accelerators.
+ +Prepare the environment +----------------------- + +See `Setup EKS cluster and tools <https://github.com/aws-neuron/aws-neuron-eks-samples/tree/master/llama3.1_8B_finetune_ray_ptl_neuron>`__ for setting up an Amazon EKS cluster leveraging AWS Trainium instances. + +Create a Docker image +--------------------- +When the EKS cluster is ready, create an Amazon ECR repository for building and uploading the Docker image containing artifacts for fine-tuning a Llama3.1 8B model: + +1. Clone the repo. + +:: + + git clone https://github.com/aws-neuron/aws-neuron-eks-samples.git + +2. Go to the ``llama3.1_8B_finetune_ray_ptl_neuron`` directory. + +:: + + cd aws-neuron-eks-samples/llama3.1_8B_finetune_ray_ptl_neuron + +3. Trigger the script. + +:: + + chmod +x 0-kuberay-trn1-llama3-finetune-build-image.sh + ./0-kuberay-trn1-llama3-finetune-build-image.sh + +4. Enter the zone your cluster is running in, for example: us-east-2. + +5. Verify in the AWS console that the Amazon ECR service has the newly + created ``kuberay_trn1_llama3.1_pytorch2`` repository. + +6. Update the ECR image ARN in the manifest file used for creating the Ray cluster. + +Replace the ``<AWS_ACCOUNT_ID>`` and ``<REGION>`` placeholders with actual values in the ``1-llama3-finetune-trn1-create-raycluster.yaml`` file using commands below to reflect the ECR image ARN created above: + +:: + + export AWS_ACCOUNT_ID=<your-account-id> # for ex: 111222333444 + export REGION=<your-region> # for ex: us-east-2 + sed -i "s/<AWS_ACCOUNT_ID>/$AWS_ACCOUNT_ID/g" 1-llama3-finetune-trn1-create-raycluster.yaml + sed -i "s/<REGION>/$REGION/g" 1-llama3-finetune-trn1-create-raycluster.yaml + +Configuring Ray Cluster +----------------------- + +The ``llama3.1_8B_finetune_ray_ptl_neuron`` directory in the AWS Neuron samples repository simplifies the +Ray configuration. KubeRay provides a manifest that you can apply +to the cluster to set up the head and worker pods.
+ +Run the following command to set up the Ray cluster: + +:: + + kubectl apply -f 1-llama3-finetune-trn1-create-raycluster.yaml + + +Accessing Ray Dashboard +----------------------- +Port forward from the cluster to see the state of the Ray dashboard and +then view it on `http://localhost:8265 <http://localhost:8265>`__. +Run it in the background with the following command: + +:: + + kubectl port-forward service/kuberay-trn1-head-svc 8265:8265 & + +Launching Ray Jobs +------------------ + +The Ray cluster is now ready to handle workloads. Initiate the data preparation and fine-tuning Ray jobs: + +1. Launch the Ray job for downloading the dolly-15k dataset and the Llama3.1 8B model artifacts: + +:: + + kubectl apply -f 2-llama3-finetune-trn1-rayjob-create-data.yaml + +2. When the job has executed successfully, run the following fine-tuning job: + +:: + + kubectl apply -f 3-llama3-finetune-trn1-rayjob-submit-finetuning-job.yaml + +3. Monitor the jobs via the Ray Dashboard. + + +For detailed information on each of the steps above, see the `AWS Neuron samples documentation <https://github.com/aws-neuron/aws-neuron-eks-samples/tree/master/llama3.1_8B_finetune_ray_ptl_neuron>`__. \ No newline at end of file