Commit b87e34c
Merge remote-tracking branch 'upstream/main'
seungduk-yanolja committed May 28, 2024
2 parents ae4253a + 8a20a7b
Showing 110 changed files with 1,799 additions and 454 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/base.yml
@@ -30,7 +30,12 @@ jobs:
- cuda: "121"
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.1
pytorch: 2.2.2
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
- cuda: "121"
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.3.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
steps:
- name: Checkout
56 changes: 54 additions & 2 deletions .github/workflows/main.yml
@@ -28,7 +28,12 @@ jobs:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.1
pytorch: 2.2.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.3.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
@@ -84,7 +89,12 @@ jobs:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.1
pytorch: 2.2.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.3.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
@@ -115,3 +125,45 @@ jobs:
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
labels: ${{ steps.metadata.outputs.labels }}

build-axolotl-cloud-no-tmux:
needs: build-axolotl
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
# this job needs to be run on self-hosted GPU runners...
strategy:
matrix:
include:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.3.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Docker metadata
id: metadata
uses: docker/metadata-action@v5
with:
images: winglian/axolotl-cloud-term
- name: Login to Docker Hub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
- name: Build
uses: docker/build-push-action@v5
with:
context: .
build-args: |
BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
CUDA=${{ matrix.cuda }}
file: ./docker/Dockerfile-cloud-no-tmux
push: ${{ github.event_name != 'pull_request' }}
tags: |
${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
labels: ${{ steps.metadata.outputs.labels }}
14 changes: 12 additions & 2 deletions .github/workflows/nightlies.yml
@@ -27,7 +27,12 @@ jobs:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.1
pytorch: 2.2.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.3.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
@@ -84,7 +89,12 @@ jobs:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.1
pytorch: 2.2.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.3.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
2 changes: 1 addition & 1 deletion .github/workflows/tests.yml
@@ -82,7 +82,7 @@ jobs:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.2.1
pytorch: 2.2.2
num_gpus: 1
steps:
- name: Checkout
37 changes: 37 additions & 0 deletions README.md
@@ -34,6 +34,7 @@ Features:
- [Mac](#mac)
- [Google Colab](#google-colab)
- [Launching on public clouds via SkyPilot](#launching-on-public-clouds-via-skypilot)
- [Launching on public clouds via dstack](#launching-on-public-clouds-via-dstack)
- [Dataset](#dataset)
- [Config](#config)
- [Train](#train)
@@ -292,6 +293,42 @@ HF_TOKEN=xx sky launch axolotl.yaml --env HF_TOKEN
HF_TOKEN=xx BUCKET=<unique-name> sky spot launch axolotl-spot.yaml --env HF_TOKEN --env BUCKET
```
#### Launching on public clouds via dstack
To launch on a GPU instance (on-demand or spot) on public clouds (GCP, AWS, Azure, Lambda Labs, TensorDock, Vast.ai, and CUDO), you can use [dstack](https://dstack.ai/).
Write a job description in YAML as below:
```yaml
# dstack.yaml
type: task
image: winglian/axolotl-cloud:main-20240429-py3.11-cu121-2.2.2
env:
- HUGGING_FACE_HUB_TOKEN
- WANDB_API_KEY
commands:
- accelerate launch -m axolotl.cli.train config.yaml
ports:
- 6006
resources:
gpu:
memory: 24GB..
count: 2
```
Then, simply run the job with the `dstack run` command. Append the `--spot` option if you want a spot instance. The `dstack run` command shows you the cheapest instance across multiple cloud services:
```bash
pip install dstack
HUGGING_FACE_HUB_TOKEN=xxx WANDB_API_KEY=xxx dstack run . -f dstack.yaml # --spot
```
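The `config.yaml` referenced in `commands` is an ordinary axolotl training config. A minimal sketch is shown below; the model, dataset, and hyperparameters are illustrative placeholders adapted from the example configs elsewhere in this commit, not a prescribed setup:
```yaml
# config.yaml — hypothetical minimal QLoRA example; any config under examples/ also works
base_model: openlm-research/open_llama_3b_v2
load_in_4bit: true
datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
val_set_size: 0.05
output_dir: ./outputs/qlora-out
adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
sequence_len: 2048
micro_batch_size: 2
num_epochs: 1
optimizer: paged_adamw_32bit
lr_scheduler: cosine
learning_rate: 0.0002
gradient_checkpointing: true
```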
For more advanced and fine-grained use cases, please refer to the official [dstack documentation](https://dstack.ai/docs/) and the detailed description of the [axolotl example](https://github.com/dstackai/dstack/tree/master/examples/fine-tuning/axolotl) in the official repository.
### Dataset
Axolotl supports a variety of dataset formats. It is recommended to use a JSONL file. The schema of the JSONL depends on the task and the prompt template you wish to use. Instead of a JSONL file, you can also use a HuggingFace dataset with columns for each JSONL field.
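For example, with the `alpaca` prompt template each JSONL line is a single JSON object with `instruction`, `input`, and `output` fields; a hypothetical two-record file might look like this (field values are illustrative):
```jsonl
{"instruction": "Summarize the following text.", "input": "Axolotl is a tool for fine-tuning language models.", "output": "Axolotl fine-tunes language models."}
{"instruction": "Translate to French.", "input": "Hello", "output": "Bonjour"}
```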
2 changes: 1 addition & 1 deletion docker/Dockerfile
@@ -11,7 +11,7 @@ ARG PYTORCH_VERSION="2.1.2"
ENV PYTORCH_VERSION=$PYTORCH_VERSION

RUN apt-get update && \
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev apt-transport-https ca-certificates gnupg
apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs apt-transport-https ca-certificates gnupg

RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg && \
27 changes: 27 additions & 0 deletions docker/Dockerfile-cloud-no-tmux
@@ -0,0 +1,27 @@
ARG BASE_TAG=main
FROM winglian/axolotl:$BASE_TAG

ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
ENV HF_HUB_ENABLE_HF_TRANSFER="1"

EXPOSE 8888
EXPOSE 22

COPY scripts/cloud-entrypoint-term.sh /root/cloud-entrypoint.sh
COPY scripts/motd /etc/motd

RUN pip install jupyterlab notebook ipywidgets && \
jupyter lab clean
RUN apt install --yes --no-install-recommends openssh-server tmux sudo && \
pip3 install -U --no-cache-dir grpcio ray[default]==2.9.3 && \
mkdir -p ~/.ssh && \
chmod 700 ~/.ssh && \
printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \
chmod +x /root/cloud-entrypoint.sh

ENTRYPOINT ["/root/cloud-entrypoint.sh"]
CMD ["sleep", "infinity"]
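To build this image locally, a sketch mirroring the build-args that the `build-axolotl-cloud-no-tmux` workflow job above passes should work, assuming `BASE_TAG` names a published `winglian/axolotl` tag (the tag below is illustrative):
```bash
# Hypothetical local build; BASE_TAG must match an existing winglian/axolotl image tag
docker build \
  --build-arg BASE_TAG=main-py3.11-cu121-2.3.0 \
  -f docker/Dockerfile-cloud-no-tmux \
  -t axolotl-cloud-no-tmux:local .
```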
13 changes: 12 additions & 1 deletion docs/config.qmd
@@ -186,6 +186,11 @@ eval_sample_packing:
# The trainer will provide recommended values for these values.
sample_packing_eff_est:
total_num_tokens:
# Increasing the following values helps with packing, but usually only slightly (<1%).
# The number of samples packed at a time.
sample_packing_group_size: 100000
# The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples.
sample_packing_bin_size: 200

# Passed through to transformers when loading the model when launched without accelerate
# Use `sequential` when training w/ model parallelism to limit memory
@@ -227,6 +232,12 @@ lora_modules_to_save:

lora_fan_in_fan_out: false

# LoRA+ hyperparameters
# For more details about the following options, see:
# https://arxiv.org/abs/2402.12354 and `src/axolotl/core/trainer_builder.py`
loraplus_lr_ratio: # loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.
loraplus_lr_embedding: # loraplus learning rate for lora embedding layers. Default value is 1e-6.

peft:
# Configuration options for loftq initialization for LoRA
# https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization
@@ -279,7 +290,7 @@ lr_quadratic_warmup:
logging_steps:
eval_steps: # Leave empty to eval at each epoch, an integer for every N steps, or a decimal for a fraction of total steps
evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
save_strategy: # Set to `no` to skip checkpoint saves
save_strategy: # Set to `"no"` to skip checkpoint saves
save_steps: # Leave empty to save at each epoch
saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
save_total_limit: # Checkpoints saved at a time
2 changes: 1 addition & 1 deletion examples/cerebras/btlm-ft.yml
@@ -38,7 +38,7 @@ wandb_watch:
wandb_name:
wandb_log_model:

output_dir: btlm-out
output_dir: ./outputs/btlm-out
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
2 changes: 1 addition & 1 deletion examples/cerebras/qlora.yml
@@ -25,7 +25,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./qlora-out
output_dir: ./outputs/qlora-out
batch_size: 4
micro_batch_size: 4
num_epochs: 2
2 changes: 1 addition & 1 deletion examples/code-llama/13b/lora.yml
@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./lora-out
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true
2 changes: 1 addition & 1 deletion examples/code-llama/13b/qlora.yml
@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./qlora-out
output_dir: ./outputs/qlora-out

adapter: qlora
lora_model_dir:
2 changes: 1 addition & 1 deletion examples/code-llama/34b/lora.yml
@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./lora-out
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true
2 changes: 1 addition & 1 deletion examples/code-llama/34b/qlora.yml
@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./qlora-out
output_dir: ./outputs/qlora-out

adapter: qlora
lora_model_dir:
2 changes: 1 addition & 1 deletion examples/code-llama/7b/lora.yml
@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./lora-out
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true
2 changes: 1 addition & 1 deletion examples/code-llama/7b/qlora.yml
@@ -11,7 +11,7 @@ datasets:
type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./qlora-out
output_dir: ./outputs/qlora-out

adapter: qlora
lora_model_dir:
2 changes: 1 addition & 1 deletion examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -84,7 +84,7 @@
" type: alpaca\n",
"dataset_prepared_path:\n",
"val_set_size: 0.05\n",
"output_dir: ./qlora-out\n",
"output_dir: ./outputs/qlora-out\n",
"\n",
"adapter: qlora\n",
"lora_model_dir:\n",
2 changes: 1 addition & 1 deletion examples/dbrx/16bit-lora.yaml
@@ -10,7 +10,7 @@ datasets:
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./out
output_dir: ./outputs/out

sequence_len: 512
sample_packing: false
2 changes: 1 addition & 1 deletion examples/dbrx/8bit-lora.yaml
@@ -10,7 +10,7 @@ datasets:
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./out
output_dir: ./outputs/out

sequence_len: 512
sample_packing: false
2 changes: 1 addition & 1 deletion examples/dbrx/fft-ds-zero3.yaml
@@ -10,7 +10,7 @@ datasets:
type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./out
output_dir: ./outputs/out

sequence_len: 512
sample_packing: false
2 changes: 1 addition & 1 deletion examples/falcon/config-7b-lora.yml
@@ -28,7 +28,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./falcon-7b
output_dir: ./outputs/falcon-7b
batch_size: 2
micro_batch_size: 1
num_epochs: 4
2 changes: 1 addition & 1 deletion examples/falcon/config-7b-qlora.yml
@@ -42,7 +42,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./qlora-out
output_dir: ./outputs/qlora-out

# QLoRA paper Table 9
# - 16 for 7b & 13b
2 changes: 1 addition & 1 deletion examples/falcon/config-7b.yml
@@ -28,7 +28,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./falcon-7b
output_dir: ./outputs/falcon-7b
batch_size: 2
micro_batch_size: 1
num_epochs: 4
2 changes: 1 addition & 1 deletion examples/gemma/qlora.yml
@@ -12,7 +12,7 @@ datasets:
- path: mhenrichsen/alpaca_2k_test
type: alpaca
val_set_size: 0.1
output_dir: ./out
output_dir: ./outputs/out

adapter: qlora
lora_r: 32
2 changes: 1 addition & 1 deletion examples/gptj/qlora.yml
@@ -23,7 +23,7 @@ wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./qlora-out
output_dir: ./outputs/qlora-out
gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 2