
Commit

Update
[ghstack-poisoned]
lw committed Dec 4, 2024
2 parents b9672f5 + 462b0e6 commit 97c9983
Showing 157 changed files with 9,776 additions and 3,886 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/build_wheels_linux.yml
@@ -43,6 +43,8 @@ jobs:
# triggered daily from main with a schedule
repository: pytorch/ao
ref: ""
test-infra-repository: pytorch/test-infra
test-infra-ref: main
build-matrix: ${{ needs.generate-matrix.outputs.matrix }}
env-var-script: packaging/env_var_script_linux.sh
pre-script: packaging/pre_build_script.sh
4 changes: 1 addition & 3 deletions .github/workflows/float8_test.yml
@@ -29,7 +29,7 @@ jobs:
gpu-arch-type: "cuda"
gpu-arch-version: "12.1"

uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
timeout: 60
runner: ${{ matrix.runs-on }}
@@ -38,8 +38,6 @@ jobs:
script: |
conda create -n venv python=3.9 -y
conda activate venv
echo "::group::Install newer objcopy that supports --set-section-alignment"
yum install -y devtoolset-10-binutils
export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH
python -m pip install --upgrade pip
pip install ${{ matrix.torch-spec }}
2 changes: 1 addition & 1 deletion .github/workflows/nightly_smoke_test.yml
@@ -26,7 +26,7 @@ jobs:
gpu-arch-version: "12.1"


uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: ${{ matrix.runs-on }}
gpu-arch-type: ${{ matrix.gpu-arch-type }}
42 changes: 32 additions & 10 deletions .github/workflows/regression_test.yml
@@ -18,6 +18,38 @@ env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}

jobs:
test-nightly:
strategy:
fail-fast: false
matrix:
include:
- name: CUDA Nightly
runs-on: linux.g5.12xlarge.nvidia.gpu
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu124'
gpu-arch-type: "cuda"
gpu-arch-version: "12.4"
- name: CPU Nightly
runs-on: linux.4xlarge
torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cpu'
gpu-arch-type: "cpu"
gpu-arch-version: ""

uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
timeout: 120
runner: ${{ matrix.runs-on }}
gpu-arch-type: ${{ matrix.gpu-arch-type }}
gpu-arch-version: ${{ matrix.gpu-arch-version }}
script: |
conda create -n venv python=3.9 -y
conda activate venv
python -m pip install --upgrade pip
pip install ${{ matrix.torch-spec }}
pip install -r dev-requirements.txt
pip install .
export CONDA=$(dirname $(dirname $(which conda)))
export LD_LIBRARY_PATH=$CONDA/lib/:$LD_LIBRARY_PATH
pytest test --verbose -s
test:
strategy:
fail-fast: false
@@ -38,11 +70,6 @@ jobs:
torch-spec: 'torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121'
gpu-arch-type: "cuda"
gpu-arch-version: "12.1"
- name: CUDA Nightly
runs-on: linux.g5.12xlarge.nvidia.gpu
torch-spec: '--pre torch==2.6.0.dev20241101 --index-url https://download.pytorch.org/whl/nightly/cu121'
gpu-arch-type: "cuda"
gpu-arch-version: "12.1"

- name: CPU 2.3
runs-on: linux.4xlarge
@@ -59,11 +86,6 @@
torch-spec: 'torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu'
gpu-arch-type: "cpu"
gpu-arch-version: ""
- name: CPU Nightly
runs-on: linux.4xlarge
torch-spec: '--pre torch==2.6.0.dev20241101 --index-url https://download.pytorch.org/whl/nightly/cpu'
gpu-arch-type: "cpu"
gpu-arch-version: ""

uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
5 changes: 4 additions & 1 deletion .gitignore
@@ -371,4 +371,7 @@ venv/
sweep/

# Model checkpoints
checkpoints/
checkpoints/

# Experimental
torchao/experimental/cmake-out
7 changes: 4 additions & 3 deletions README.md
@@ -177,8 +177,8 @@ We're also fortunate to be integrated into some of the leading open-source libraries
2. Hugging Face diffusers best practices with torch.compile and torchao in a standalone repo [diffusers-torchao](https://github.com/sayakpaul/diffusers-torchao)
3. Mobius HQQ backend leveraged our int4 kernels to get [195 tok/s on a 4090](https://github.com/mobiusml/hqq#faster-inference)
4. [TorchTune](https://github.com/pytorch/torchtune) for our QLoRA and QAT recipes
5. [torchchat](https://github.com/pytorch/torchtune) for post training quantization
6. [SGLang](https://github.com/sgl-project/sglang/pull/1341) for LLM inference quantization
5. [torchchat](https://github.com/pytorch/torchchat) for post training quantization
6. SGLang for LLM serving: [usage](https://github.com/sgl-project/sglang/blob/4f2ee48ed1c66ee0e189daa4120581de324ee814/docs/backend/backend.md?plain=1#L83) and the major [PR](https://github.com/sgl-project/sglang/pull/1341).

## Videos
* [Keynote talk at GPU MODE IRL](https://youtu.be/FH5wiwOyPX4?si=VZK22hHz25GRzBG1&t=1009)
@@ -201,8 +201,9 @@ If you find the torchao library useful, please cite it in your work as below.
@software{torchao,
title = {torchao: PyTorch native quantization and sparsity for training and inference},
author = {torchao maintainers and contributors},
url = {https//github.com/pytorch/torchao},
url = {https://github.com/pytorch/torchao},
license = {BSD-3-Clause},
month = oct,
year = {2024}
}
```
88 changes: 63 additions & 25 deletions benchmarks/benchmark_low_bit_adam.py
@@ -4,7 +4,7 @@
# - lpmm (4-bit optim): pip install yacs git+https://github.com/thu-ml/low-bit-optimizers.git
# - DeepSpeed (ZeRO-Offload):
# sudo apt install libopenmpi-dev
# LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu pip install mpi4p
# LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu pip install mpi4py
# DS_BUILD_CPU_ADAM=1 pip install deepspeed --no-cache-dir
#
# To fine-tune a pre-trained ViT-Base on resisc45 dataset with BF16 AMP, using default AdamW optimizer from PyTorch core
@@ -31,11 +31,15 @@
import torch.nn.functional as F
import wandb
from torch.utils.data import DataLoader
from torchao.utils import get_available_devices
from torchvision.transforms import v2
from tqdm import tqdm

from torchao.prototype import low_bit_optim

_DEVICE = get_available_devices()[-1]
assert _DEVICE in ["cuda", "xpu"], "Benchmark currently only supports CUDA & XPU(BF16)"

OPTIM_MAP = dict(
AdamW=partial(torch.optim.AdamW, fused=True),
AdamW8bitBnb=bnb.optim.AdamW8bit,
@@ -49,7 +53,9 @@

OPTIM_MAP.update(
AdamW4bitLpmm=partial(lpmm.optim.AdamW, fused=True),
AdamW4bitRank1Lpmm=partial(lpmm.optim.AdamW, qconfig=argparse.Namespace(scale_type="rank1")),
AdamW4bitRank1Lpmm=partial(
lpmm.optim.AdamW, qconfig=argparse.Namespace(scale_type="rank1")
),
)

except ImportError:
@@ -67,8 +73,12 @@ def get_lr(self, step: int) -> float:
if step < self.warmup_steps:
return self.lr * step / self.warmup_steps
if step < self.total_steps:
progress = (step - self.warmup_steps) / (self.total_steps - self.warmup_steps)
return self.final_lr + 0.5 * (self.lr - self.final_lr) * (1 + math.cos(progress * math.pi))
progress = (step - self.warmup_steps) / (
self.total_steps - self.warmup_steps
)
return self.final_lr + 0.5 * (self.lr - self.final_lr) * (
1 + math.cos(progress * math.pi)
)
return self.final_lr
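
For reference, the reflowed get_lr above implements a linear warmup followed by cosine decay to final_lr. Writing t for the current step, T_w for warmup_steps, and T for total_steps, the schedule the code computes is

$$
\mathrm{lr}(t)=
\begin{cases}
\mathrm{lr}\cdot t/T_w, & t < T_w,\\[4pt]
\mathrm{lr}_{\mathrm{final}}+\tfrac{1}{2}\,(\mathrm{lr}-\mathrm{lr}_{\mathrm{final}})\left(1+\cos\!\left(\pi\,\tfrac{t-T_w}{T-T_w}\right)\right), & T_w \le t < T,\\[4pt]
\mathrm{lr}_{\mathrm{final}}, & t \ge T.
\end{cases}
$$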


@@ -92,7 +102,9 @@ def get_parser():
parser.add_argument("--weight_decay", type=float, default=0)
parser.add_argument("--optim_kwargs", type=json.loads, default=dict())
parser.add_argument("--cosine_lr_scheduler", action="store_true")
parser.add_argument("--optim_cpu_offload", choices=["ao", "ao_offload_grads", "deepspeed"])
parser.add_argument(
"--optim_cpu_offload", choices=["ao", "ao_offload_grads", "deepspeed"]
)

parser.add_argument("--project")
parser.add_argument("--run_name", default="debug")
@@ -110,11 +122,15 @@ def get_dloader(args, training: bool):
transforms.extend([v2.Resize(256), v2.CenterCrop(224)])

transforms.append(v2.ToDtype(torch.float32, scale=True))
transforms.append(v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]))
transforms.append(
v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
)
transforms = v2.Compose(transforms)

# use dataset from HF so download is fast
ds = datasets.load_dataset("timm/resisc45", split="train" if training else "validation")
ds = datasets.load_dataset(
"timm/resisc45", split="train" if training else "validation"
)
ds = ds.select_columns(["image", "label"])
ds.set_transform(lambda x: dict(image=transforms(x["image"]), label=x["label"]))

@@ -128,9 +144,9 @@ def get_dloader(args, training: bool):
)


def get_amp_ctx(amp):
def get_amp_ctx(amp, device):
dtype = dict(bf16=torch.bfloat16, fp16=torch.float16, none=None)[amp]
return torch.autocast("cuda", dtype=dtype, enabled=amp != "none")
return torch.autocast(device, dtype=dtype, enabled=amp != "none")


@torch.no_grad()
@@ -148,8 +164,8 @@ def evaluate_model(model, args):
if args.channels_last:
batch["image"] = batch["image"].to(memory_format=torch.channels_last)

with get_amp_ctx(args.amp):
all_preds.append(model(batch["image"].cuda()).argmax(1).cpu())
with get_amp_ctx(args.amp, _DEVICE):
all_preds.append(model(batch["image"].to(_DEVICE)).argmax(1).cpu())

all_labels = torch.cat(all_labels, dim=0)
all_preds = torch.cat(all_preds, dim=0)
@@ -164,8 +180,12 @@ def evaluate_model(model, args):
if args.full_bf16:
assert args.amp == "none", "When --full_bf16 is set, --amp must be none"
if args.optim_cpu_offload == "deepspeed":
assert args.amp == "none", "When using DeepSpeed ZeRO-Offload, --amp must be none"
assert args.optim == "AdamW", "When using DeepSpeed ZeRO-Offload, --optim must be AdamW"
assert (
args.amp == "none"
), "When using DeepSpeed ZeRO-Offload, --amp must be none"
assert (
args.optim == "AdamW"
), "When using DeepSpeed ZeRO-Offload, --optim must be AdamW"
if args.profile:
args.n_epochs = 1
if args.seed is not None:
@@ -185,14 +205,16 @@ def evaluate_model(model, args):
dloader = get_dloader(args, True)
print(f"Train dataset: {len(dloader.dataset):,} images")

model = timm.create_model(args.model, pretrained=True, num_classes=45, **args.model_kwargs)
model = timm.create_model(
args.model, pretrained=True, num_classes=45, **args.model_kwargs
)
if args.checkpoint_activations:
model.set_grad_checkpointing()
if args.full_bf16:
model.bfloat16()
if args.channels_last:
model.to(memory_format=torch.channels_last)
model.cuda() # move model to CUDA after optionally convert it to BF16
model.to(_DEVICE) # move model to DEVICE after optionally convert it to BF16
if args.compile:
model.compile(fullgraph=True)
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
@@ -227,9 +249,15 @@ def evaluate_model(model, args):
optim_cls = OPTIM_MAP[args.optim]

if args.optim_cpu_offload == "ao":
optim_cls = partial(low_bit_optim.CPUOffloadOptimizer, optimizer_class=optim_cls)
optim_cls = partial(
low_bit_optim.CPUOffloadOptimizer, optimizer_class=optim_cls
)
elif args.optim_cpu_offload == "ao_offload_grads":
optim_cls = partial(low_bit_optim.CPUOffloadOptimizer, optimizer_class=optim_cls, offload_gradients=True)
optim_cls = partial(
low_bit_optim.CPUOffloadOptimizer,
optimizer_class=optim_cls,
offload_gradients=True,
)

optim = optim_cls(
model.parameters(),
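
The two partial(...) reflows in the hunk above are formatting-only; the wrapping behaviour is unchanged. A minimal usage sketch of torchao's CPUOffloadOptimizer, not part of this diff, assuming a CUDA device and the constructor shape used in this benchmark (extra kwargs forwarded to the wrapped optimizer), looks like:

```python
# Minimal sketch, assuming a CUDA device is available. optimizer_class and
# offload_gradients are the arguments shown in the hunk above; lr/weight_decay
# are forwarded to the wrapped torch.optim.AdamW instance whose states live on CPU.
import torch
from torchao.prototype import low_bit_optim

model = torch.nn.Linear(128, 128).cuda()
optim = low_bit_optim.CPUOffloadOptimizer(
    model.parameters(),
    optimizer_class=torch.optim.AdamW,
    offload_gradients=True,  # also move gradients off the GPU after backward
    lr=1e-4,
    weight_decay=0.0,
)

loss = model(torch.randn(4, 128, device="cuda")).sum()
loss.backward()
optim.step()
optim.zero_grad()
```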
@@ -239,24 +267,30 @@
)

lr_schedule = CosineSchedule(args.lr, len(dloader) * args.n_epochs)
grad_scaler = torch.amp.GradScaler("cuda", enabled=args.amp == "fp16")
grad_scaler = torch.amp.GradScaler(_DEVICE, enabled=args.amp == "fp16")
log_interval = 10
t0 = time.perf_counter()

step = 0
for epoch_idx in range(args.n_epochs):
model.train()
pbar = tqdm(dloader, dynamic_ncols=True, desc=f"Epoch {epoch_idx + 1}/{args.n_epochs}")
pbar = tqdm(
dloader, dynamic_ncols=True, desc=f"Epoch {epoch_idx + 1}/{args.n_epochs}"
)

with torch.profiler.profile() if args.profile else nullcontext() as prof:
for batch in pbar:
if args.full_bf16:
batch["image"] = batch["image"].bfloat16()
if args.channels_last:
batch["image"] = batch["image"].to(memory_format=torch.channels_last)
batch["image"] = batch["image"].to(
memory_format=torch.channels_last
)

with get_amp_ctx(args.amp):
loss = F.cross_entropy(model(batch["image"].cuda()), batch["label"].cuda())
with get_amp_ctx(args.amp, _DEVICE):
loss = F.cross_entropy(
model(batch["image"].to(_DEVICE)), batch["label"].to(_DEVICE)
)

if args.optim_cpu_offload == "deepspeed":
model.backward(loss)
@@ -275,7 +309,9 @@ def evaluate_model(model, args):
log_dict = dict(loss=loss.item(), lr=optim.param_groups[0]["lr"])
if step > 0:
t1 = time.perf_counter()
log_dict["imgs_per_second"] = args.batch_size * log_interval / (t1 - t0)
log_dict["imgs_per_second"] = (
args.batch_size * log_interval / (t1 - t0)
)
t0 = t1
logger.log(log_dict, step=step)

@@ -296,9 +332,11 @@

else:
val_acc = evaluate_model(model, args)
print(f"Epoch {epoch_idx + 1}/{args.n_epochs}: val_acc={val_acc.item() * 100:.2f}")
print(
f"Epoch {epoch_idx + 1}/{args.n_epochs}: val_acc={val_acc.item() * 100:.2f}"
)
logger.log(dict(val_acc=val_acc), step=step)

peak_mem = torch.cuda.max_memory_allocated() / 1e9
peak_mem = getattr(torch, _DEVICE).max_memory_allocated() / 1e9
print(f"Max memory used: {peak_mem:.02f} GB")
logger.log(dict(max_memory_allocated=peak_mem))
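
Taken together, the changes to this file replace hard-coded "cuda" usages with a _DEVICE string chosen via torchao.utils.get_available_devices (the assert near the top of the file restricts it to CUDA or XPU). A condensed sketch of that pattern, not part of the diff itself, is:

```python
# Condensed sketch of the device-agnostic pattern adopted in this file;
# assumes get_available_devices() lists the accelerator last (e.g. ["cpu", "cuda"]).
import torch
from torchao.utils import get_available_devices

_DEVICE = get_available_devices()[-1]  # "cuda", "xpu", or "cpu" as a fallback

model = torch.nn.Linear(8, 8).to(_DEVICE)
x = torch.randn(4, 8, device=_DEVICE)

# torch.autocast takes the device string directly, so the same context manager
# covers both CUDA and XPU.
with torch.autocast(_DEVICE, dtype=torch.bfloat16, enabled=_DEVICE != "cpu"):
    y = model(x)

# Per-backend memory stats live under the matching torch submodule.
if _DEVICE in ("cuda", "xpu"):
    peak_gb = getattr(torch, _DEVICE).max_memory_allocated() / 1e9
    print(f"Max memory used: {peak_gb:.02f} GB")
```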

