
Commit

Revert "merge upstream changes before adding support for torchbench (#8
Browse files Browse the repository at this point in the history
…)" (#10)

This reverts commit 6863c59.
tianyu-l authored Aug 13, 2024
1 parent 6863c59 commit ffa82fc
Showing 48 changed files with 45,945 additions and 3,378 deletions.
1 change: 0 additions & 1 deletion .ci/docker/requirements.txt
@@ -1,5 +1,4 @@
torch >= 2.3.0
torchdata >= 0.8.0
datasets >= 2.19.0
tomli >= 1.1.0 ; python_version < "3.11"
tensorboard
2 changes: 1 addition & 1 deletion .github/workflows/integration_test_4gpu.yaml
@@ -38,6 +38,6 @@ jobs:
pip config --user set global.progress_bar off
python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
USE_CPP=0 python -m pip install git+https://github.com/pytorch/ao.git
python -m pip install --pre torchdata --index-url https://download.pytorch.org/whl/nightly/
mkdir artifacts-to-be-uploaded
python ./test_runner.py artifacts-to-be-uploaded --ngpu 4
1 change: 1 addition & 0 deletions .github/workflows/integration_test_8gpu.yaml
@@ -37,5 +37,6 @@ jobs:
pip config --user set global.progress_bar off
python -m pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
python -m pip install --pre torchdata --index-url https://download.pytorch.org/whl/nightly/
mkdir artifacts-to-be-uploaded
python ./test_runner.py artifacts-to-be-uploaded --ngpu 8
1 change: 1 addition & 0 deletions .github/workflows/unit_test_cpu.yaml
@@ -25,4 +25,5 @@ jobs:
pip config --user set global.progress_bar off
pip install --force-reinstall --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
pip install --pre torchdata --index-url https://download.pytorch.org/whl/nightly
pytest test --cov=. --cov-report=xml --durations=20 -vv
23 changes: 7 additions & 16 deletions README.md
@@ -18,16 +18,6 @@ Our guiding principles when building `torchtitan`:

[![Welcome to torchtitan!](assets/images/titan_play_video.png)](https://youtu.be/ee5DOEqD35I?si=_B94PbVv0V5ZnNKE "Welcome to torchtitan!")

### Dive into the code

You may want to see how the model is defined or how parallelism techniques are applied. For a guided tour, see these files first:
* [train.py](https://github.com/pytorch/torchtitan/blob/main/train.py) - the main training loop and high-level setup code
* [torchtitan/parallelisms/parallelize_llama.py](https://github.com/pytorch/torchtitan/blob/main/torchtitan/parallelisms/parallelize_llama.py) - helpers for applying Data Parallel, Tensor Parallel, activation checkpointing, and `torch.compile` to the model
* [torchtitan/parallelisms/pipeline_llama.py](https://github.com/pytorch/torchtitan/blob/main/torchtitan/parallelisms/pipeline_llama.py) - helpers for applying Pipeline Parallel to the model
* [torchtitan/checkpoint.py](https://github.com/pytorch/torchtitan/blob/main/torchtitan/checkpoint.py) - utils for saving/loading distributed checkpoints
* [torchtitan/float8.py](https://github.com/pytorch/torchtitan/blob/main/torchtitan/float8.py) - utils for applying Float8 techniques
* [torchtitan/models/llama/model.py](https://github.com/pytorch/torchtitan/blob/main/torchtitan/models/llama/model.py) - the Llama model definition (shared for Llama2 and Llama3 variants)

## Pre-Release Updates:
#### (4/25/2024): `torchtitan` is now public but in a pre-release state and under development.
Currently we showcase pre-training **Llama 3 and Llama 2** LLMs of various sizes from scratch. `torchtitan` is tested and verified with the PyTorch nightly version `torch-2.4.0.dev20240412`. (We recommend latest PyTorch nightly).
@@ -43,18 +33,18 @@ Currently we showcase pre-training **Llama 3 and Llama 2** LLMs of various sizes
6. Learning rate scheduler, meta init, Optional Fused RMSNorm
7. All options easily configured via [toml files](train_configs/)
8. [Interoperable checkpoints](docs/checkpoint.md) which can be loaded directly into [`torchtune`](https://github.com/pytorch/torchtune) for fine tuning
9. [Float8 support](docs/float8.md)

We report our [Performance](docs/performance.md) verified on 64 A100 GPUs


### Coming soon

1. Async checkpointing
2. Context Parallel
3. 3D Pipeline Parallel
4. `torch.compile` support
5. Scalable data loading solution
2. FP8 support
3. Context Parallel
4. 3D Pipeline Parallel
5. `torch.compile` support
6. Scalable data loading solution


## Installation
@@ -64,6 +54,7 @@ git clone https://github.com/pytorch/torchtitan
cd torchtitan
pip install -r requirements.txt
pip3 install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121 # or cu118
pip3 install --pre torchdata --index-url https://download.pytorch.org/whl/nightly
```

### Downloading a tokenizer
@@ -75,7 +66,7 @@ Once you have confirmed access, you can run the following command to download th
```bash
# Get your HF token from https://huggingface.co/settings/tokens

# llama3 or 3.1 tokenizer.model
# llama3 tokenizer.model
python torchtitan/datasets/download_tokenizer.py --repo_id meta-llama/Meta-Llama-3-8B --tokenizer_path "original" --hf_token=...

# llama2 tokenizer.model
2 changes: 2 additions & 0 deletions create_seed_checkpoint.sh
@@ -18,6 +18,8 @@

set -ex

export USE_LIBUV=1
TRAINER_DIR=${1:-/home/$USER/local/torchtitan}
NGPU=1
LOG_RANK=0
CONFIG_FILE=${CONFIG_FILE:-"./train_configs/debug_model.toml"}
23 changes: 0 additions & 23 deletions docs/composability.md

This file was deleted.

18 changes: 0 additions & 18 deletions docs/float8.md

This file was deleted.

78 changes: 40 additions & 38 deletions estimation.py
@@ -9,19 +9,22 @@
import os

import torch
import torch.nn.functional as F
from torch._guards import active_fake_mode
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.distributed import destroy_process_group
from torch.distributed._tools.fsdp2_mem_tracker import FSDPMemTracker
from torch.distributed.tensor.parallel import loss_parallel
from torch.testing._internal.distributed.fake_pg import FakeStore

from torchtitan.config_manager import JobConfig
from torchtitan.datasets import build_tokenizer
from torchtitan.float8 import Float8Handler
from torchtitan.logging import init_logger, logger
from torchtitan.datasets import create_tokenizer
from torchtitan.float8_linear import build_fp8_linear
from torchtitan.logging_utils import init_logger, logger
from torchtitan.lr_scheduling import get_lr_schedulers
from torchtitan.models import model_name_to_cls, model_name_to_tokenizer, models_config
from torchtitan.optimizer import build_lr_schedulers, build_optimizers
from torchtitan.parallelisms import models_parallelize_fns, ParallelDims
from train import get_train_context
from train import build_optimizers


def estimate_memory(job_config: JobConfig):
@@ -58,18 +61,16 @@ def estimate_memory(job_config: JobConfig):
logger.info("Compiled RMSNorm is not supported yet. Switching to RMSNorm.")
job_config.model.norm_type = "rmsnorm"

if job_config.training.compile or job_config.experimental.enable_compiled_autograd:
if job_config.training.compile:
logger.info("Compile mode is not supported yet. Switching to eager mode.")
job_config.training.compile = False
job_config.experimental.enable_compiled_autograd = False

parallel_dims = ParallelDims(
dp=job_config.training.data_parallel_degree,
tp=job_config.training.tensor_parallel_degree,
pp=job_config.experimental.pipeline_parallel_degree,
world_size=world_size,
enable_loss_parallel=job_config.training.enable_loss_parallel,
dp_type=job_config.training.data_parallel_type,
)

device = torch.device(f"cuda:{int(os.environ['LOCAL_RANK'])}")
@@ -92,18 +93,16 @@ def estimate_memory(job_config: JobConfig):

# build tokenizer
tokenizer_type = model_name_to_tokenizer[model_name]
tokenizer = build_tokenizer(tokenizer_type, job_config.model.tokenizer_path)
tokenizer = create_tokenizer(tokenizer_type, job_config.model.tokenizer_path)

train_context = get_train_context(
parallel_dims.loss_parallel_enabled,
job_config.experimental.enable_compiled_autograd,
# loss_parallel enables dispatching to efficient loss operators
loss_parallel_ctx = (
loss_parallel if parallel_dims.loss_parallel_enabled else contextlib.nullcontext
)

# loss fn can be shared by pipeline-parallel or non-pp execution
def loss_fn(pred, labels):
return torch.nn.functional.cross_entropy(
pred.flatten(0, 1), labels.flatten(0, 1)
)
return F.cross_entropy(pred.flatten(0, 1), labels.flatten(0, 1))

# build model (using meta init)
model_cls = model_name_to_cls[model_name]
@@ -122,25 +121,32 @@ def loss_fn(pred, labels):
f"Building {model_name} {job_config.model.flavor} with {model_config}"
)
with torch.device("meta"):
model = model_cls.from_model_args(model_config)
whole_model = model_cls.from_model_args(model_config)

# a no-op handler if float8 is not enabled
float8_handler = Float8Handler(job_config, parallel_dims)
# swap to Float8Linear based on float8 configs
float8_handler.convert_to_float8_training(model)
# apply fp8 linear module swap
if job_config.training.fp8_linear:
build_fp8_linear(whole_model, job_config)

# apply PT-D DP/TP parallelisms and activation checkpointing
models_parallelize_fns[model_name](model, world_mesh, parallel_dims, job_config)
model_parts = [whole_model]
model_parts = [
models_parallelize_fns[model_name](m, world_mesh, parallel_dims, job_config)
for m in model_parts
]

init_device = "cuda"
for model in model_parts:
model.to_empty(device=init_device)

model.to_empty(device="cuda")
if not active_fake_mode():
model.init_weights()
model.train()
whole_model.init_weights()

# build optimizer after applying parallelisms to the model
optimizers = build_optimizers([model], job_config)
lr_schedulers = build_lr_schedulers(optimizers.optimizers, job_config)
optimizers = build_optimizers(model_parts, job_config)
lr_schedulers = get_lr_schedulers(optimizers.optimizers, job_config)

for model in model_parts:
model.train()
logger.info(f"Vocab size: {model_config.vocab_size}")
# Create a dummy batch instead of loading from a dataset
batch = (
@@ -157,31 +163,27 @@ def loss_fn(pred, labels):
device="cuda",
),
)
fsdp_memtracker = FSDPMemTracker(mod=model, optm=optimizers.optimizers[0])
fsdp_memtracker = FSDPMemTracker(mod=whole_model, optm=optimizers.optimizers[0])
fsdp_memtracker.track_inputs(batch)

with fsdp_memtracker:
for iter_idx in range(2):
input_ids, labels = batch
# train step
with train_context():
pred = model(input_ids)
with loss_parallel_ctx():
pred = whole_model(input_ids)
loss = loss_fn(pred, labels)
del pred
loss.backward()

# clip gradients
torch.nn.utils.clip_grad_norm_(
model.parameters(), job_config.training.max_norm, foreach=True
)
# sync float8 amaxes and scales
float8_handler.sync_float8_amax_and_scale_history(model)
for model in model_parts:
torch.nn.utils.clip_grad_norm_(
model.parameters(), job_config.training.max_norm, foreach=True
)
# optimizer step
optimizers.step()
lr_schedulers.step()
# calculate float8 dynamic amax/scale for all-parameter for FSDP2
# it issues a single all-reduce for all parameters at once for better performance
float8_handler.precompute_float8_dynamic_scale_for_fsdp(model)
optimizers.zero_grad()
print(f"Peak Memory at iter: {iter_idx}")
fsdp_memtracker.display_snapshot("peak", units="MiB", tabulate=True)
@@ -215,4 +217,4 @@ def loss_fn(pred, labels):
try:
estimate_memory(config)
finally:
torch.distributed.destroy_process_group()
destroy_process_group()
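
For orientation, here is a hypothetical direct invocation of the restored `estimation.py`; the environment variables mirror what `run_llama_train.sh` (later in this diff) exports before calling it, and the config path is illustrative only.

```bash
# Sketch only: run the restored estimation.py by hand.
# WORLD_SIZE and LOCAL_RANK are normally exported by run_llama_train.sh;
# the script builds a fake process group, so a single process is enough.
export WORLD_SIZE=8      # NGPU * NNODES
export LOCAL_RANK=0
python estimation.py --job.config_file ./train_configs/debug_model.toml
```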
1 change: 1 addition & 0 deletions multinode_trainer.slurm
@@ -53,6 +53,7 @@ export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"
export NCCL_BUFFSIZE=2097152
#export TORCH_DIST_INIT_BARRIER=1
export FI_EFA_SET_CUDA_SYNC_MEMOPS=0
#export USE_LIBUV=1
CONFIG_FILE=${CONFIG_FILE:-"./train_configs/llama2_13b.toml"}

dcgmi profile --pause
28 changes: 25 additions & 3 deletions run_llama_train.sh
@@ -7,18 +7,40 @@

set -ex

# libUV is a scalable backend for TCPStore which is used in processGroup
# rendezvous. This is the recommended backend for distributed training.
export USE_LIBUV=1
TRAINER_DIR=${TRAINER_DIR:-/home/$USER/local/torchtitan}

# use envs as local overrides for convenience
# e.g.
# LOG_RANK=0,1 NGPU=4 ./run_llama_train.sh

NGPU=${NGPU:-"8"}
NNODES=${NNODES:-"1"}

# by default log just rank 0 output,
LOG_RANK=${LOG_RANK:-0}


CONFIG_FILE=${CONFIG_FILE:-"./train_configs/debug_model.toml"}

overrides=""
if [ $# -ne 0 ]; then
overrides="$*"
fi

torchrun --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \
--local-ranks-filter ${LOG_RANK} --role rank --tee 3 \
train.py --job.config_file ${CONFIG_FILE} $overrides
# Check if --memory_estimation.enabled is in the arguments
if echo "$overrides" | grep -q -- "--memory_estimation.enabled"; then
# Calculate WORLD_SIZE as the product of NGPU and NNODES
# Export WORLD_SIZE and LOCAL_RANK
export WORLD_SIZE=$((NGPU * NNODES))
export LOCAL_RANK=0
python estimation.py --job.config_file ${CONFIG_FILE} $overrides
else
# Call train.py if not in estimation mode
# TORCH_TRACE="outputs/compile_trace" \
torchrun --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \
--local-ranks-filter ${LOG_RANK} --role rank --tee 3 \
train.py --job.config_file ${CONFIG_FILE} $overrides
fi
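
For reference, a hedged usage sketch of the updated launcher: the grep above routes any run whose overrides include `--memory_estimation.enabled` to `estimation.py`, and every other run to `torchrun`. The values below are illustrative, taken from the script's own defaults and comments.

```bash
# Normal training run on 4 GPUs, logging only rank 0 (values are examples)
LOG_RANK=0 NGPU=4 CONFIG_FILE=./train_configs/debug_model.toml ./run_llama_train.sh

# Memory-estimation mode: the override makes the script call estimation.py
# in a single process instead of launching torchrun
./run_llama_train.sh --memory_estimation.enabled
```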