Update axolotl (#16)

* WIP: Update to latest axolotl * Update snowflake connector * Remove monkey patching, use plugins system * Fix for sample packing patching * Update deepspeed * bump axolotl commit version * Disable Liger till linkedin/Liger-Kernel#322 (comment) is taken care of * Update reqs * Enhance sequence lens validation * Fix notebook * Update notebook * Fix error handling * Update base image * Fix cuda repo and allow changing held packages for libnccl * Fix build
truefoundry · Nov 4, 2024 · 9f2b630 · 9f2b630
1 parent 68a0c90
commit 9f2b630
Show file tree

Hide file tree

Showing 15 changed files with 422 additions and 382 deletions.
diff --git a/.gitignore b/.gitignore
@@ -16,3 +16,4 @@ tensorboard_logs/
 mlruns/
 output/
 outputs/
+*.egg-info/
diff --git a/Dockerfile b/Dockerfile
@@ -1,18 +1,22 @@
-# https://hub.docker.com/layers/winglian/axolotl/main-20240819-py3.11-cu121-2.3.1/images/sha256-eb331da0d83e0e55301c542852ee4939d36fa02810f57d99b15f56e4dc0e200d?context=explore
-FROM winglian/axolotl@sha256:e70e7ea55ab3ae3c212066bf45271f49198445ab646b9d470d9e0f41050ac8c9
+# https://hub.docker.com/layers/winglian/axolotl/main-20241104-py3.11-cu121-2.3.1/images/sha256-790297fa1d71f8f1590c73ca4505ca39fe7dfa2886b2b6862199a6df679bf8e4?context=explore
+FROM winglian/axolotl@sha256:cffbcc4993e80301a8918062f8136a6ac402877fd6c29f1168be563e543aee4d
 USER root
 COPY requirements.txt /tmp/
 RUN pip install -U pip wheel setuptools && \
-    pip uninstall -y mlflow axolotl && \
+    pip uninstall -y axolotl && \
     pip install --no-cache-dir -U -r /tmp/requirements.txt
 RUN mkdir -p /packages && \
     cd /packages && \
     git clone https://github.com/truefoundry/axolotl && \
     cd axolotl/ && \
-    git checkout 294e9097e2c4ea642198aea5ad0561d3b647e572
+    git checkout e16f637d079ef5d56321a240ef0547a50c37b708
 RUN cd /packages/axolotl/ && \
-    MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib,optimizers] && \
+    MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation --no-cache-dir -e .[flash-attn,mamba-ssm,fused-dense-lib,optimizers,lion-pytorch,galore] && \
     pip install --no-cache-dir -U -r /tmp/requirements.txt && \
     rm -rf /root/.cache/pip
+COPY plugins/axolotl_truefoundry /packages/axolotl_truefoundry
+RUN cd /packages/axolotl_truefoundry/ && \
+    pip install --no-cache-dir -e . && \
+    rm -rf /root/.cache/pip
 WORKDIR /app
 COPY . /app
diff --git a/Dockerfile-notebook b/Dockerfile-notebook
@@ -1,18 +1,17 @@
-FROM truefoundrycloud/jupyter:0.2.20-sudo
+FROM tfy.jfrog.io/tfy-images/jupyter:0.3.4-cu121-py3.11.10-sudo
 ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
 ENV DEBIAN_FRONTEND=noninteractive
 USER root
-RUN add-apt-repository ppa:flexiondotorg/nvtop -y && \
-    apt update && \
-    apt install -y --no-install-recommends git curl wget htop nvtop && \
-    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb -O /tmp/cuda-keyring_1.1-1_all.deb && \
+RUN apt update && \
+    apt install -y --no-install-recommends git curl wget && \
+    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb -O /tmp/cuda-keyring_1.1-1_all.deb && \
     dpkg -i /tmp/cuda-keyring_1.1-1_all.deb && \
     apt update && \
-    apt install -y --no-install-recommends cuda-toolkit-12-1 libcudnn8=8.9.7.29-1+cuda12.2 libcudnn8-dev=8.9.7.29-1+cuda12.2 libnccl2 libnccl-dev
+    apt install -y --no-install-recommends --allow-change-held-packages libnccl2 libnccl-dev
 USER jovyan
 COPY requirements.txt notebook-requirements.txt /tmp/llm-finetune/
 RUN pip install -U pip wheel setuptools && \
-    pip uninstall -y mlflow axolotl && \
+    pip uninstall -y axolotl && \
     pip install --no-cache-dir -U -r /tmp/llm-finetune/notebook-requirements.txt
 USER root
 RUN mkdir -p /packages && \
@@ -21,9 +20,13 @@ USER jovyan
 RUN cd /packages && \
     git clone https://github.com/truefoundry/axolotl && \
     cd axolotl/ && \
-    git checkout 294e9097e2c4ea642198aea5ad0561d3b647e572
+    git checkout e16f637d079ef5d56321a240ef0547a50c37b708
 RUN cd /packages/axolotl/ && \
-    MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib,optimizers] && \
-    pip install --no-cache-dir -U -r /tmp/llm-finetune/notebook-requirements.txt
+    MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation --no-cache-dir -e .[flash-attn,mamba-ssm,fused-dense-lib,optimizers,lion-pytorch,galore] && \
+    pip install --no-cache-dir -U -r /tmp/llm-finetune/notebook-requirements.txt && \
+    rm -rf /root/.cache/pip
+COPY plugins/axolotl_truefoundry /packages/axolotl_truefoundry
+RUN cd /packages/axolotl_truefoundry/ && \
+    pip install --no-cache-dir -e .
 COPY --chown=jovyan:users . /tmp_home/jovyan/llm-finetune/
 ENV JUPYTER_APP_LAUNCHER_PATH=/home/jovyan/llm-finetune/.jp_app_launcher/
diff --git a/config-base.yaml b/config-base.yaml
@@ -6,17 +6,10 @@ micro_batch_size: 1
 revision_of_model: null
 sequence_len: 2048
 val_set_size: 0.1
-## Added by TrueFoundry, not native to Axolotl
-train_data_uri: null
-val_data_uri: null
-dataset_type: completion  # Can be completion | chat
-mlfoundry_enable_reporting: False
-mlfoundry_ml_repo: null
 
 # ---------------------
 # Auto computed and set by script based on environment and external state
 # Only edit them if you know what you are doing
-chat_template: auto # type: string
 data_dir: auto # type: string
 datasets: auto # type: list
 test_datasets: auto # type: list
@@ -40,13 +33,11 @@ unsloth_lora_o: auto # type: bool
 unsloth_rms_norm: auto # type: bool
 unsloth_rope: auto # type: bool
 tf32: auto # type: bool
-## Added by TrueFoundry, not native to Axolotl
-mlfoundry_run_name: auto # type: string
-mlfoundry_checkpoint_artifact_name: auto # type: string
 
 
 # ---------------------
 # Defaults
+auto_find_batch_size: False
 bnb_config_kwargs:
   bnb_4bit_quant_type: nf4
   bnb_4bit_use_double_quant: True
@@ -59,6 +50,7 @@ base_model_ignore_patterns:
   - '*.ot'
   - '*.tflite'
   - '*.msgpack'
+chat_template: tokenizer_default_fallback_chatml
 dataset_prepared_path: ./outputs/data/last_run_prepared
 dataset_processes: 1
 ddp_timeout: 21600
@@ -71,7 +63,7 @@ eval_steps: 0.1
 eval_strategy: steps
 fix_untrained_tokens: true
 gradient_accumulation_steps: 4
-gradient_checkpointing: unsloth
+gradient_checkpointing: True
 gradient_checkpointing_kwargs:
   use_reentrant: True
 learning_rate: 0.00001
@@ -87,9 +79,15 @@ lora_target_modules: null
 low_cpu_mem_usage: True
 lr_scheduler: cosine
 max_grad_norm: 1.0
+mean_resizing_embeddings: True
+multipack_real_batches: False
 num_epochs: 10
 optimizer: adamw_torch_fused
 output_dir: ./outputs
+plugins:
+  - axolotl_truefoundry.TrueFoundryMLPlugin
+  # Liger is disabled till it is updated with Gradient Accumulation Loss fixes
+  # - axolotl.integrations.liger.LigerPlugin
 pad_to_sequence_len: True
 remove_unused_columns: True
 report_to: tensorboard
@@ -107,12 +105,28 @@ trust_remote_code: True
 type_of_model: AutoModelForCausalLM
 warmup_ratio: 0.1
 weight_decay: 0.01
-## Added by TrueFoundry, not native to Axolotl
+use_mflow: False
+use_wandb: False
+use_tensorboard: True
+
+# ---------------------
+# Plugin specific configs
+
+## TrueFoundry
 cleanup_output_dir_on_start: False
+dataset_type: chat  # Can be completion | chat
 drop_long_sequences: False
 logging_dir: ./tensorboard_logs
-mlfoundry_log_checkpoints: True
+truefoundry_ml_run_name: auto # type: string
 save_model_on_interrupt: False
-use_mflow: False
-use_wandb: False
-use_tensorboard: True
+train_data_uri: null
+truefoundry_ml_checkpoint_artifact_name: auto # type: string
+truefoundry_ml_enable_reporting: False
+truefoundry_ml_log_checkpoints: True
+truefoundry_ml_repo: null
+val_data_uri: null
+
+## Liger
+liger_rms_norm: True
+liger_swiglu: True
+liger_fused_linear_cross_entropy: True
diff --git a/data_utils.py b/data_utils.py
@@ -61,7 +61,12 @@ def _make_dataset_file_source(
             "field_messages": "messages",
             "message_field_role": "role",
             "message_field_content": "content",
-            "roles": {"system": ["system"], "user": ["user", "human"], "assistant": ["assistant"], "tool": ["tool"]},
+            "roles": {
+                "system": ["system"],
+                "user": ["user", "human"],
+                "assistant": ["assistant"],
+                "tool": ["tool"],
+            },
             "split": split,
             "roles_to_train": ["gpt", "assistant", "ipython"],
             "train_on_eos": "last",
@@ -99,27 +104,3 @@ def dataset_uri_to_axolotl_datasources(
         return datasources
     else:
         raise ValueError(f"Unsupported data uri or path does not exist: {uri}")
-
-
-# --- Reference Notes ---
-
-
-# Other axolotl strategies:
-#     # Llama 2 chat template with data that looks like
-#     {"conversations": [{"role": "system", "content": "You are a helpful assistant"}, {"role": "user", "content": "hi"}, {"role": "assistant", "content": "hello"}]}
-
-#     Config:
-#     {
-#         "type": "sharegpt",
-#         "conversation": "llama-2",
-#         "field_human": "user",
-#         "field_model": "assistant",
-#     }
-
-#     # HF Tokenizers Chat Template
-#     {"conversations": [{"role": "system", "content": "You are a helpful assistant"}, {"role": "user", "content": "hi"}, {"role": "assistant", "content": "hello"}]}
-
-#     {
-#         "type": "chat_template",
-#         "chat_template": "chatml",
-#     }
diff --git a/finetune.ipynb b/finetune.ipynb
@@ -255,7 +255,10 @@
     "max_sequence_length = launch_parameters.max_length\n",
     "\n",
     "# If to drop sequences that are longer than max_sequence_length\n",
-    "drop_long_sequences = False\n",
+    "# error -> will raise an error for sequences that are longer than max_sequence_length\n",
+    "# truncate -> will truncate sequences that are longer than max_sequence_length\n",
+    "# drop -> will drop sequences that are longer than max_sequence_length\n",
+    "long_sequences_strategy = \"error\"\n",
     "\n",
     "# Batch size per GPU. \n",
     "# Increasing this will increase GPU memory requirement and training time\n",
@@ -291,30 +294,29 @@
     "from mlfoundry_utils import generate_run_name, get_or_create_run\n",
     "\n",
     "# Enable reporting metrics to mlfoundry\n",
-    "mlfoundry_enable_reporting = True\n",
+    "truefoundry_ml_enable_reporting = True\n",
     "\n",
     "# Which ML Repo to log metrics and checkpoints to. \n",
     "# You can create new ML Repos from the https://<your-org>.truefoundry.cloud/mlfoundry page\n",
     "# Docs: https://docs.truefoundry.com/docs/key-concepts#creating-a-ml-repo\n",
-    "mlfoundry_ml_repo = \"llm-finetuning\"\n",
+    "truefoundry_ml_repo = \"llm-finetuning\"\n",
     "\n",
     "# If to upload checkpoints to ML Repo when they are saved\n",
-    "mlfoundry_log_checkpoints = True\n",
+    "truefoundry_ml_log_checkpoints = True\n",
     "\n",
     "# Run to which metrics and checkpoints will be logged\n",
-    "mlfoundry_run_name = generate_run_name(model_id, seed=os.getpid())\n",
+    "truefoundry_ml_run_name = generate_run_name(model_id, seed=os.getpid())\n",
     "\n",
     "# If to upload checkpoints to ML Repo when they are saved\n",
-    "mlfoundry_checkpoint_artifact_name = f\"ckpt-{mlfoundry_run_name}\"\n",
+    "truefoundry_ml_checkpoint_artifact_name = f\"ckpt-{truefoundry_ml_run_name}\"\n",
     "\n",
     "\n",
-    "if mlfoundry_enable_reporting:\n",
-    "    print(f\"Checkpoints will be logged with name {mlfoundry_checkpoint_artifact_name}\")\n",
+    "if truefoundry_ml_enable_reporting:\n",
+    "    print(f\"Checkpoints will be logged with name {truefoundry_ml_checkpoint_artifact_name}\")\n",
     "    get_or_create_run(\n",
-    "        ml_repo=mlfoundry_ml_repo,\n",
-    "        run_name=mlfoundry_run_name,\n",
+    "        ml_repo=truefoundry_ml_repo,\n",
+    "        run_name=truefoundry_ml_run_name,\n",
     "        auto_end=False,\n",
-    "        create_ml_repo=True\n",
     "    )\n",
     "    print(\"You can click on the above link to track metrics and checkpoints\")"
    ]
@@ -335,7 +337,7 @@
     "    os.environ[\"TENSORBOARD_PROXY_URL\"] = urljoin(os.getenv(\"NB_PREFIX\", \"/\"), \"proxy/%PORT%/\")\n",
     "    notebook.start(f\"--logdir {tb_logs} --reload_interval 30.0 --reload_multifile True\")\n",
     "\n",
-    "if not mlfoundry_enable_reporting:\n",
+    "if not truefoundry_ml_enable_reporting:\n",
     "    _launch_tensorboard()"
    ]
   },
@@ -375,7 +377,7 @@
     "--val_set_size {eval_size} \\\n",
     "--max_steps {max_steps} \\\n",
     "--sequence_len {max_sequence_length} \\\n",
-    "--drop_long_sequences {drop_long_sequences} \\\n",
+    "--long_sequences_strategy {long_sequences_strategy} \\\n",
     "--train_on_inputs False \\\n",
     "--sample_packing {sample_packing} \\\n",
     "--pad_to_sequence_len True \\\n",
@@ -397,11 +399,11 @@
     "--save_strategy steps \\\n",
     "--save_steps {save_steps} \\\n",
     "--seed 42 \\\n",
-    "--mlfoundry_enable_reporting {mlfoundry_enable_reporting} \\\n",
-    "--mlfoundry_ml_repo {mlfoundry_ml_repo} \\\n",
-    "--mlfoundry_run_name {mlfoundry_run_name} \\\n",
-    "--mlfoundry_checkpoint_artifact_name {mlfoundry_checkpoint_artifact_name} \\\n",
-    "--mlfoundry_log_checkpoints {mlfoundry_log_checkpoints} \\\n",
+    "--truefoundry_ml_enable_reporting {truefoundry_ml_enable_reporting} \\\n",
+    "--truefoundry_ml_repo {truefoundry_ml_repo} \\\n",
+    "--truefoundry_ml_run_name {truefoundry_ml_run_name} \\\n",
+    "--truefoundry_ml_checkpoint_artifact_name {truefoundry_ml_checkpoint_artifact_name} \\\n",
+    "--truefoundry_ml_log_checkpoints {truefoundry_ml_log_checkpoints} \\\n",
     "--cleanup_output_dir_on_start {cleanup_output_dir_on_start} \\\n",
     "--resume_from_checkpoint True \\\n",
     "| tee train.log\n",
@@ -425,7 +427,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "ft",
+   "display_name": "jupyter-base",
    "language": "python",
    "name": "python3"
   },
@@ -439,7 +441,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.11.10"
   }
  },
  "nbformat": 4,
-Original file line number
+Diff line change
@@ Expand Up / @@ -16,3 +16,4 @@ tensorboard_logs/ @@
     mlruns/
     output/
     outputs/
+    *.egg-info/