Skip to content

Commit

Permalink
Update axolotl (#16)
Browse files Browse the repository at this point in the history
* WIP: Update to latest axolotl

* Update snowflake connector

* Remove monkey patching, use plugins system

* Fix for sample packing patching

* Update deepspeed

* bump axolotl commit version

* Disable Liger till linkedin/Liger-Kernel#322 (comment) is taken care of

* Update reqs

* Enhance sequence lens validation

* Fix notebook

* Update notebook

* Fix error handling

* Update base image

* Fix cuda repo and allow changing held packages for libnccl

* Fix build
  • Loading branch information
chiragjn authored Nov 4, 2024
1 parent 68a0c90 commit 9f2b630
Show file tree
Hide file tree
Showing 15 changed files with 422 additions and 382 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@ tensorboard_logs/
mlruns/
output/
outputs/
*.egg-info/
14 changes: 9 additions & 5 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
# https://hub.docker.com/layers/winglian/axolotl/main-20240819-py3.11-cu121-2.3.1/images/sha256-eb331da0d83e0e55301c542852ee4939d36fa02810f57d99b15f56e4dc0e200d?context=explore
FROM winglian/axolotl@sha256:e70e7ea55ab3ae3c212066bf45271f49198445ab646b9d470d9e0f41050ac8c9
# https://hub.docker.com/layers/winglian/axolotl/main-20241104-py3.11-cu121-2.3.1/images/sha256-790297fa1d71f8f1590c73ca4505ca39fe7dfa2886b2b6862199a6df679bf8e4?context=explore
FROM winglian/axolotl@sha256:cffbcc4993e80301a8918062f8136a6ac402877fd6c29f1168be563e543aee4d
USER root
COPY requirements.txt /tmp/
RUN pip install -U pip wheel setuptools && \
pip uninstall -y mlflow axolotl && \
pip uninstall -y axolotl && \
pip install --no-cache-dir -U -r /tmp/requirements.txt
RUN mkdir -p /packages && \
cd /packages && \
git clone https://github.com/truefoundry/axolotl && \
cd axolotl/ && \
git checkout 294e9097e2c4ea642198aea5ad0561d3b647e572
git checkout e16f637d079ef5d56321a240ef0547a50c37b708
RUN cd /packages/axolotl/ && \
MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib,optimizers] && \
MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation --no-cache-dir -e .[flash-attn,mamba-ssm,fused-dense-lib,optimizers,lion-pytorch,galore] && \
pip install --no-cache-dir -U -r /tmp/requirements.txt && \
rm -rf /root/.cache/pip
COPY plugins/axolotl_truefoundry /packages/axolotl_truefoundry
RUN cd /packages/axolotl_truefoundry/ && \
pip install --no-cache-dir -e . && \
rm -rf /root/.cache/pip
WORKDIR /app
COPY . /app
23 changes: 13 additions & 10 deletions Dockerfile-notebook
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
FROM truefoundrycloud/jupyter:0.2.20-sudo
FROM tfy.jfrog.io/tfy-images/jupyter:0.3.4-cu121-py3.11.10-sudo
ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
ENV DEBIAN_FRONTEND=noninteractive
USER root
RUN add-apt-repository ppa:flexiondotorg/nvtop -y && \
apt update && \
apt install -y --no-install-recommends git curl wget htop nvtop && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb -O /tmp/cuda-keyring_1.1-1_all.deb && \
RUN apt update && \
apt install -y --no-install-recommends git curl wget && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb -O /tmp/cuda-keyring_1.1-1_all.deb && \
dpkg -i /tmp/cuda-keyring_1.1-1_all.deb && \
apt update && \
apt install -y --no-install-recommends cuda-toolkit-12-1 libcudnn8=8.9.7.29-1+cuda12.2 libcudnn8-dev=8.9.7.29-1+cuda12.2 libnccl2 libnccl-dev
apt install -y --no-install-recommends --allow-change-held-packages libnccl2 libnccl-dev
USER jovyan
COPY requirements.txt notebook-requirements.txt /tmp/llm-finetune/
RUN pip install -U pip wheel setuptools && \
pip uninstall -y mlflow axolotl && \
pip uninstall -y axolotl && \
pip install --no-cache-dir -U -r /tmp/llm-finetune/notebook-requirements.txt
USER root
RUN mkdir -p /packages && \
Expand All @@ -21,9 +20,13 @@ USER jovyan
RUN cd /packages && \
git clone https://github.com/truefoundry/axolotl && \
cd axolotl/ && \
git checkout 294e9097e2c4ea642198aea5ad0561d3b647e572
git checkout e16f637d079ef5d56321a240ef0547a50c37b708
RUN cd /packages/axolotl/ && \
MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib,optimizers] && \
pip install --no-cache-dir -U -r /tmp/llm-finetune/notebook-requirements.txt
MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation --no-cache-dir -e .[flash-attn,mamba-ssm,fused-dense-lib,optimizers,lion-pytorch,galore] && \
pip install --no-cache-dir -U -r /tmp/llm-finetune/notebook-requirements.txt && \
rm -rf /root/.cache/pip
COPY plugins/axolotl_truefoundry /packages/axolotl_truefoundry
RUN cd /packages/axolotl_truefoundry/ && \
pip install --no-cache-dir -e .
COPY --chown=jovyan:users . /tmp_home/jovyan/llm-finetune/
ENV JUPYTER_APP_LAUNCHER_PATH=/home/jovyan/llm-finetune/.jp_app_launcher/
46 changes: 30 additions & 16 deletions config-base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,10 @@ micro_batch_size: 1
revision_of_model: null
sequence_len: 2048
val_set_size: 0.1
## Added by TrueFoundry, not native to Axolotl
train_data_uri: null
val_data_uri: null
dataset_type: completion # Can be completion | chat
mlfoundry_enable_reporting: False
mlfoundry_ml_repo: null

# ---------------------
# Auto computed and set by script based on environment and external state
# Only edit them if you know what you are doing
chat_template: auto # type: string
data_dir: auto # type: string
datasets: auto # type: list
test_datasets: auto # type: list
Expand All @@ -40,13 +33,11 @@ unsloth_lora_o: auto # type: bool
unsloth_rms_norm: auto # type: bool
unsloth_rope: auto # type: bool
tf32: auto # type: bool
## Added by TrueFoundry, not native to Axolotl
mlfoundry_run_name: auto # type: string
mlfoundry_checkpoint_artifact_name: auto # type: string


# ---------------------
# Defaults
auto_find_batch_size: False
bnb_config_kwargs:
bnb_4bit_quant_type: nf4
bnb_4bit_use_double_quant: True
Expand All @@ -59,6 +50,7 @@ base_model_ignore_patterns:
- '*.ot'
- '*.tflite'
- '*.msgpack'
chat_template: tokenizer_default_fallback_chatml
dataset_prepared_path: ./outputs/data/last_run_prepared
dataset_processes: 1
ddp_timeout: 21600
Expand All @@ -71,7 +63,7 @@ eval_steps: 0.1
eval_strategy: steps
fix_untrained_tokens: true
gradient_accumulation_steps: 4
gradient_checkpointing: unsloth
gradient_checkpointing: True
gradient_checkpointing_kwargs:
use_reentrant: True
learning_rate: 0.00001
Expand All @@ -87,9 +79,15 @@ lora_target_modules: null
low_cpu_mem_usage: True
lr_scheduler: cosine
max_grad_norm: 1.0
mean_resizing_embeddings: True
multipack_real_batches: False
num_epochs: 10
optimizer: adamw_torch_fused
output_dir: ./outputs
plugins:
- axolotl_truefoundry.TrueFoundryMLPlugin
# Liger is disabled till it is updated with Gradient Accumulation Loss fixes
# - axolotl.integrations.liger.LigerPlugin
pad_to_sequence_len: True
remove_unused_columns: True
report_to: tensorboard
Expand All @@ -107,12 +105,28 @@ trust_remote_code: True
type_of_model: AutoModelForCausalLM
warmup_ratio: 0.1
weight_decay: 0.01
## Added by TrueFoundry, not native to Axolotl
use_mflow: False
use_wandb: False
use_tensorboard: True

# ---------------------
# Plugin specific configs

## TrueFoundry
cleanup_output_dir_on_start: False
dataset_type: chat # Can be completion | chat
drop_long_sequences: False
logging_dir: ./tensorboard_logs
mlfoundry_log_checkpoints: True
truefoundry_ml_run_name: auto # type: string
save_model_on_interrupt: False
use_mflow: False
use_wandb: False
use_tensorboard: True
train_data_uri: null
truefoundry_ml_checkpoint_artifact_name: auto # type: string
truefoundry_ml_enable_reporting: False
truefoundry_ml_log_checkpoints: True
truefoundry_ml_repo: null
val_data_uri: null

## Liger
liger_rms_norm: True
liger_swiglu: True
liger_fused_linear_cross_entropy: True
31 changes: 6 additions & 25 deletions data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,12 @@ def _make_dataset_file_source(
"field_messages": "messages",
"message_field_role": "role",
"message_field_content": "content",
"roles": {"system": ["system"], "user": ["user", "human"], "assistant": ["assistant"], "tool": ["tool"]},
"roles": {
"system": ["system"],
"user": ["user", "human"],
"assistant": ["assistant"],
"tool": ["tool"],
},
"split": split,
"roles_to_train": ["gpt", "assistant", "ipython"],
"train_on_eos": "last",
Expand Down Expand Up @@ -99,27 +104,3 @@ def dataset_uri_to_axolotl_datasources(
return datasources
else:
raise ValueError(f"Unsupported data uri or path does not exist: {uri}")


# --- Reference Notes ---


# Other axolotl strategies:
# # Llama 2 chat template with data that looks like
# {"conversations": [{"role": "system", "content": "You are a helpful assistant"}, {"role": "user", "content": "hi"}, {"role": "assistant", "content": "hello"}]}

# Config:
# {
# "type": "sharegpt",
# "conversation": "llama-2",
# "field_human": "user",
# "field_model": "assistant",
# }

# # HF Tokenizers Chat Template
# {"conversations": [{"role": "system", "content": "You are a helpful assistant"}, {"role": "user", "content": "hi"}, {"role": "assistant", "content": "hello"}]}

# {
# "type": "chat_template",
# "chat_template": "chatml",
# }
42 changes: 22 additions & 20 deletions finetune.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,10 @@
"max_sequence_length = launch_parameters.max_length\n",
"\n",
"# How to handle sequences that are longer than max_sequence_length\n",
"drop_long_sequences = False\n",
"# error -> will raise an error for sequences that are longer than max_sequence_length\n",
"# truncate -> will truncate sequences that are longer than max_sequence_length\n",
"# drop -> will drop sequences that are longer than max_sequence_length\n",
"long_sequences_strategy = \"error\"\n",
"\n",
"# Batch size per GPU. \n",
"# Increasing this will increase GPU memory requirement and training time\n",
Expand Down Expand Up @@ -291,30 +294,29 @@
"from mlfoundry_utils import generate_run_name, get_or_create_run\n",
"\n",
"# Enable reporting metrics to mlfoundry\n",
"mlfoundry_enable_reporting = True\n",
"truefoundry_ml_enable_reporting = True\n",
"\n",
"# Which ML Repo to log metrics and checkpoints to. \n",
"# You can create new ML Repos from the https://<your-org>.truefoundry.cloud/mlfoundry page\n",
"# Docs: https://docs.truefoundry.com/docs/key-concepts#creating-a-ml-repo\n",
"mlfoundry_ml_repo = \"llm-finetuning\"\n",
"truefoundry_ml_repo = \"llm-finetuning\"\n",
"\n",
"# Whether to upload checkpoints to ML Repo when they are saved\n",
"mlfoundry_log_checkpoints = True\n",
"truefoundry_ml_log_checkpoints = True\n",
"\n",
"# Run to which metrics and checkpoints will be logged\n",
"mlfoundry_run_name = generate_run_name(model_id, seed=os.getpid())\n",
"truefoundry_ml_run_name = generate_run_name(model_id, seed=os.getpid())\n",
"\n",
"# Name of the artifact under which checkpoints will be logged\n",
"mlfoundry_checkpoint_artifact_name = f\"ckpt-{mlfoundry_run_name}\"\n",
"truefoundry_ml_checkpoint_artifact_name = f\"ckpt-{truefoundry_ml_run_name}\"\n",
"\n",
"\n",
"if mlfoundry_enable_reporting:\n",
" print(f\"Checkpoints will be logged with name {mlfoundry_checkpoint_artifact_name}\")\n",
"if truefoundry_ml_enable_reporting:\n",
" print(f\"Checkpoints will be logged with name {truefoundry_ml_checkpoint_artifact_name}\")\n",
" get_or_create_run(\n",
" ml_repo=mlfoundry_ml_repo,\n",
" run_name=mlfoundry_run_name,\n",
" ml_repo=truefoundry_ml_repo,\n",
" run_name=truefoundry_ml_run_name,\n",
" auto_end=False,\n",
" create_ml_repo=True\n",
" )\n",
" print(\"You can click on the above link to track metrics and checkpoints\")"
]
Expand All @@ -335,7 +337,7 @@
" os.environ[\"TENSORBOARD_PROXY_URL\"] = urljoin(os.getenv(\"NB_PREFIX\", \"/\"), \"proxy/%PORT%/\")\n",
" notebook.start(f\"--logdir {tb_logs} --reload_interval 30.0 --reload_multifile True\")\n",
"\n",
"if not mlfoundry_enable_reporting:\n",
"if not truefoundry_ml_enable_reporting:\n",
" _launch_tensorboard()"
]
},
Expand Down Expand Up @@ -375,7 +377,7 @@
"--val_set_size {eval_size} \\\n",
"--max_steps {max_steps} \\\n",
"--sequence_len {max_sequence_length} \\\n",
"--drop_long_sequences {drop_long_sequences} \\\n",
"--long_sequences_strategy {long_sequences_strategy} \\\n",
"--train_on_inputs False \\\n",
"--sample_packing {sample_packing} \\\n",
"--pad_to_sequence_len True \\\n",
Expand All @@ -397,11 +399,11 @@
"--save_strategy steps \\\n",
"--save_steps {save_steps} \\\n",
"--seed 42 \\\n",
"--mlfoundry_enable_reporting {mlfoundry_enable_reporting} \\\n",
"--mlfoundry_ml_repo {mlfoundry_ml_repo} \\\n",
"--mlfoundry_run_name {mlfoundry_run_name} \\\n",
"--mlfoundry_checkpoint_artifact_name {mlfoundry_checkpoint_artifact_name} \\\n",
"--mlfoundry_log_checkpoints {mlfoundry_log_checkpoints} \\\n",
"--truefoundry_ml_enable_reporting {truefoundry_ml_enable_reporting} \\\n",
"--truefoundry_ml_repo {truefoundry_ml_repo} \\\n",
"--truefoundry_ml_run_name {truefoundry_ml_run_name} \\\n",
"--truefoundry_ml_checkpoint_artifact_name {truefoundry_ml_checkpoint_artifact_name} \\\n",
"--truefoundry_ml_log_checkpoints {truefoundry_ml_log_checkpoints} \\\n",
"--cleanup_output_dir_on_start {cleanup_output_dir_on_start} \\\n",
"--resume_from_checkpoint True \\\n",
"| tee train.log\n",
Expand All @@ -425,7 +427,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "ft",
"display_name": "jupyter-base",
"language": "python",
"name": "python3"
},
Expand All @@ -439,7 +441,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.11.10"
}
},
"nbformat": 4,
Expand Down
Loading

0 comments on commit 9f2b630

Please sign in to comment.