
Commit

Merge branch 'unpack_int4' of https://github.com/jeromeku/ao into unpack_int4
jeromeku committed Jul 4, 2024
2 parents 75df5f5 + d1bd61b commit e90e280
Showing 78 changed files with 3,853 additions and 1,589 deletions.
82 changes: 30 additions & 52 deletions .github/scripts/trymerge.py
@@ -1163,7 +1163,6 @@ def merge_into(
# Finally, upload the record to Rockset. The list of pending and failed
# checks are at the time of the merge
save_merge_record(
collection=ROCKSET_MERGES_COLLECTION,
comment_id=comment_id,
pr_num=self.pr_num,
owner=self.org,
@@ -1179,10 +1178,8 @@ def merge_into(
merge_base_sha=self.get_merge_base(),
merge_commit_sha=merge_commit_sha,
is_failed=False,
dry_run=dry_run,
skip_mandatory_checks=skip_mandatory_checks,
ignore_current=bool(ignore_current_checks),
workspace=ROCKSET_MERGES_WORKSPACE,
)
else:
print("Missing comment ID or PR number, couldn't upload to Rockset")
@@ -1489,7 +1486,6 @@ def checks_to_markdown_bullets(

@retries_decorator()
def save_merge_record(
collection: str,
comment_id: int,
pr_num: int,
owner: str,
@@ -1505,59 +1501,44 @@ def save_merge_record(
merge_base_sha: str,
merge_commit_sha: str = "",
is_failed: bool = False,
dry_run: bool = False,
skip_mandatory_checks: bool = False,
ignore_current: bool = False,
error: str = "",
workspace: str = "commons",
) -> None:
"""
This saves the merge records into Rockset, so we can query them (for fun and profit)
This saves the merge records as a json, which can later be uploaded to s3
"""
if dry_run:
# Decide not to save the record to Rockset if dry-run is set to not pollute
# the collection
return

try:
import rockset # type: ignore[import]

# Prepare the record to be written into Rockset
data = [
{
"comment_id": comment_id,
"pr_num": pr_num,
"owner": owner,
"project": project,
"author": author,
"pending_checks": pending_checks,
"failed_checks": failed_checks,
"ignore_current_checks": ignore_current_checks,
"broken_trunk_checks": broken_trunk_checks,
"flaky_checks": flaky_checks,
"unstable_checks": unstable_checks,
"last_commit_sha": last_commit_sha,
"merge_base_sha": merge_base_sha,
"merge_commit_sha": merge_commit_sha,
"is_failed": is_failed,
"skip_mandatory_checks": skip_mandatory_checks,
"ignore_current": ignore_current,
"error": error,
}
]

client = rockset.RocksetClient(
host="api.usw2a1.rockset.com", api_key=os.environ["ROCKSET_API_KEY"]
)
client.Documents.add_documents(
collection=collection,
data=data,
workspace=workspace,
)
# Prepare the record to be written into Rockset
data = [
{
"comment_id": comment_id,
"pr_num": pr_num,
"owner": owner,
"project": project,
"author": author,
"pending_checks": pending_checks,
"failed_checks": failed_checks,
"ignore_current_checks": ignore_current_checks,
"broken_trunk_checks": broken_trunk_checks,
"flaky_checks": flaky_checks,
"unstable_checks": unstable_checks,
"last_commit_sha": last_commit_sha,
"merge_base_sha": merge_base_sha,
"merge_commit_sha": merge_commit_sha,
"is_failed": is_failed,
"skip_mandatory_checks": skip_mandatory_checks,
"ignore_current": ignore_current,
"error": error,
# This is a unique identifier for the record for deduping purposes
# in rockset. Any unique string would work
"_id": f"{project}-{pr_num}-{comment_id}-{os.environ.get('GITHUB_RUN_ID')}",
}
]
repo_root = Path(__file__).resolve().parent.parent.parent

except ModuleNotFoundError:
print("Rockset is missing, no record will be saved")
return
with open(repo_root / "merge_record.json", "w") as f:
json.dump(data, f)


@retries_decorator(rc=[])
@@ -2374,7 +2355,6 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None:
# list of pending and failed checks here, but they are not really
# needed at the moment
save_merge_record(
collection=ROCKSET_MERGES_COLLECTION,
comment_id=args.comment_id,
pr_num=args.pr_num,
owner=org,
@@ -2389,11 +2369,9 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None:
last_commit_sha=pr.last_commit().get("oid", ""),
merge_base_sha=pr.get_merge_base(),
is_failed=True,
dry_run=args.dry_run,
skip_mandatory_checks=args.force,
ignore_current=args.ignore_current,
error=str(e),
workspace=ROCKSET_MERGES_WORKSPACE,
)
else:
print("Missing comment ID or PR number, couldn't upload to Rockset")
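
For context on the change above, the record that `save_merge_record` now writes can be inspected with the standard `json` module. This is a minimal sketch based on the file layout shown in the diff; the path handling and the printed fields are illustrative:

```python
import json
from pathlib import Path

# trymerge.py writes merge_record.json at the repository root
# (three levels up from .github/scripts/trymerge.py).
repo_root = Path(".")  # adjust to the checkout root when running outside the script
with open(repo_root / "merge_record.json") as f:
    records = json.load(f)

# Each entry carries an "_id" built from project, PR number, comment ID, and
# GITHUB_RUN_ID, so re-runs of the same merge attempt dedupe downstream.
for record in records:
    print(record["_id"], record["is_failed"], record.get("error", ""))
```
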
2 changes: 2 additions & 0 deletions .github/scripts/validate_binaries.sh
@@ -0,0 +1,2 @@
pip install ${PYTORCH_PIP_PREFIX} torchao --index-url ${PYTORCH_PIP_DOWNLOAD_URL}
python ./test/smoke_tests/smoke_tests.py
1 change: 1 addition & 0 deletions .github/workflows/regression_test.yml
@@ -54,6 +54,7 @@ jobs:

uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
timeout: 60
runner: ${{ matrix.runs-on }}
gpu-arch-type: ${{ matrix.gpu-arch-type }}
gpu-arch-version: ${{ matrix.gpu-arch-version }}
26 changes: 26 additions & 0 deletions .github/workflows/trymerge.yml
@@ -9,6 +9,8 @@ jobs:
name: try_merge_pr_${{ github.event.client_payload.pr_num }}
runs-on: ubuntu-latest
environment: pytorchbot-env
permissions:
id-token: write
env:
GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
steps:
@@ -26,6 +28,8 @@ jobs:
check-latest: false
cache: pip
architecture: x64
# TODO (huydhn): get rid of Rockset
- run: pip install pyyaml==6.0 rockset==1.0.3

- name: Setup committer id
run: |
@@ -36,8 +40,14 @@
env:
GITHUB_TOKEN: ${{ secrets.PYTORCH_MERGEBOT_TOKEN }}
PR_NUM: ${{ github.event.client_payload.pr_num }}
FORCE: ${{ github.event.client_payload.force}}
COMMENT_ID: ${{ github.event.client_payload.comment_id }}
GIT_REMOTE_URL: https://github.com/pytorch/ao
REBASE: ${{ github.event.client_payload.rebase }}
IGNORE_CURRENT: ${{ github.event.client_payload.ignore_current }}
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
DRCI_BOT_KEY: ${{ secrets.DRCI_BOT_KEY }}
GITHUB_RUN_ID: ${{ github.run_id }}
run: |
set -x
if [ -n "${FORCE}" ]; then
@@ -58,6 +68,22 @@ jobs:
python3 .github/scripts/trymerge.py "${PR_NUM}"
fi
- name: configure aws credentials
uses: aws-actions/configure-aws-credentials@v3
continue-on-error: true
with:
role-to-assume: arn:aws:iam::308535385114:role/upload_to_ossci_raw_job_status
aws-region: us-east-1

- name: Upload merge record to s3
if: always()
continue-on-error: true
uses: seemethere/upload-artifact-s3@v5
with:
s3-bucket: ossci-raw-job-status
s3-prefix: merges/${{ github.repository }}/${{ github.event.client_payload.pr_num }}/${{ github.event.client_payload.comment_id }}/${{ github.run_id }}
path: merge_record.json

# We want newer merge commands to supersede old ones
concurrency:
group: try-merge-${{ github.event.client_payload.pr_num }}
49 changes: 49 additions & 0 deletions .github/workflows/validate-binaries.yml
@@ -0,0 +1,49 @@
name: Validate binaries

on:
workflow_call:
inputs:
channel:
description: "Channel to use (nightly, test, release, all)"
required: false
type: string
default: release
ref:
description: "Reference to checkout, defaults to empty"
default: ""
required: false
type: string
workflow_dispatch:
inputs:
channel:
description: "Channel to use (nightly, test, release, all)"
required: true
type: choice
options:
- release
- nightly
- test
- all
ref:
description: "Reference to checkout, defaults to empty"
default: ""
required: false
type: string
pytorch_version:
description: "PyTorch version to validate (ie. 2.0, 2.2.2, etc.) - optional"
default: ""
required: false
type: string
jobs:
validate-binaries:
uses: pytorch/test-infra/.github/workflows/validate-domain-library.yml@main
with:
package_type: "wheel"
version: ${{ inputs.pytorch_version }}
os: "linux"
channel: ${{ inputs.channel }}
repository: "pytorch/ao"
with_cuda: "enable"
with_rocm: "disable"
smoke_test: "source ./.github/scripts/validate_binaries.sh"
install_torch: true
1 change: 1 addition & 0 deletions .gitignore
@@ -127,6 +127,7 @@ env
.circleci/scripts/COMMIT_MSG
scripts/release_notes/*.json
sccache-stats*.json
merge_record.json

# These files get copied over on invoking setup.py
torchgen/packaged/*
77 changes: 61 additions & 16 deletions README.md
@@ -19,8 +19,8 @@ All with no intrusive code changes and minimal accuracy degradation.
Quantizing your models is a one-liner that should work on any model with an `nn.Linear`, including your favorite HuggingFace model. You can find more comprehensive usage instructions [here](torchao/quantization/) and a HuggingFace inference example [here](scripts/hf_eval.py).

```python
from torchao.quantization.quant_api import quantize
m = quantize(m, "int4wo")
from torchao.quantization.quant_api import quantize, int4_weight_only
m = quantize(m, int4_weight_only())
```
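
For a slightly fuller picture, here is a minimal sketch of the new API on a toy module. The toy model, the CUDA/bfloat16 setup, and the `torch.compile` call are illustrative assumptions rather than part of this change:

```python
import torch
from torch import nn
from torchao.quantization.quant_api import quantize, int4_weight_only

# Toy stand-in for any model built from nn.Linear layers (hypothetical example).
model = nn.Sequential(nn.Linear(1024, 1024), nn.ReLU(), nn.Linear(1024, 1024))
model = model.to(device="cuda", dtype=torch.bfloat16)  # int4 weight-only expects bf16 CUDA weights

model = quantize(model, int4_weight_only())        # swap Linear weights for packed int4 tensors
model = torch.compile(model, mode="max-autotune")  # optional: compile for faster kernels

with torch.no_grad():
    out = model(torch.randn(1, 1024, device="cuda", dtype=torch.bfloat16))
```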

Benchmarks are run on a machine with a single A100 GPU using the script in `_models/llama`, which generates text in a latency-optimized way (batch size 1).
@@ -29,15 +29,17 @@ The models used were `meta-llama/Llama-2-7b-chat-hf` and `meta-llama/Meta-Llama-

| Model | Technique | wikitext-perplexity | Tokens/Second | Memory Bandwidth (GB/s) | Peak Memory (GB) | Model Size (GB) |
| ----------- | ------------------ | ------------------- | ------------- | ----------------------- | ---------------- | --------------- |
| Llama-2-7B | Base (bfloat16) | 12.212 | 105.02 | 1387.78 | 13.21 | 13.90 |
| | int8dq | 12.262 | 9.40 | 62.26 | 6.62 | 8.61 |
| | int8wo | 12.204 | 147.03 | 973.54 | 6.62 | 8.95 |
| | int4wo-64 | 12.843 | 199.81 | 746.45 | 3.74 | 4.75 |
| | int4wo-64-GPTQ | 12.489 | 199.81 | 746.45 | 3.74 | 4.75 |
| Llama-3-8B | Base (bfloat16) | | 94.91 | 1424.58 | 15.01 | 16.43 |
| | int8dq | | 8.41 | 63.23 | 7.52 | 9.24 |
| | int8wo | | 136.75 | 1028.38 | 7.52 | 10.42 |
| | int4wo-64 | | 179.41 | 757.45 | 4.22 | 6.88 |
| Llama-2-7B | Base (bfloat16) | 12.212 | 105.14 | 1389.35 | 13.88 | 13.21 |
| | int8dq | 12.262 | 9.20 | 60.93 | 8.33 | 6.62 |
| | int8wo | 12.204 | 150.18 | 994.40 | 8.95 | 6.62 |
| | int4wo-64 | 12.843 | 199.86 | 746.66 | 4.50 | 3.74 |
| | int4wo-64-GPTQ | 12.489 | 199.86 | 746.66 | 4.50 | 3.74 |
| | autoquant | 12.204 | 159.22 | 1069.87 | 8.91 | 6.72 |
| Llama-3-8B | Base (bfloat16) | N/A | 94.97 | 1425.55 | 16.43 | 15.01 |
| | int8dq | N/A | 8.44 | 63.45 | 8.98 | 7.52 |
| | int8wo | N/A | 139.76 | 1051.02 | 10.42 | 7.52 |
| | int4wo-64 | N/A | 179.44 | 757.60 | 6.62 | 4.22 |
| | autoquant | N/A | 137.71 | 1037.74 | 11.08 | 7.54 |

note: int8 dynamic quantization works best on compute-bound as opposed to memory-bound models. A relatable example is [SAM](https://github.com/pytorch-labs/segment-anything-fast), which is compute bound, vs. Llama at batch size 1, which is memory bound.
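
To make the distinction concrete, here is a sketch of how you would pick between the two int8 options in the table. The `int8_weight_only` and `int8_dynamic_activation_int8_weight` configuration names are assumed from the refactored `quant_api` and are not part of this diff:

```python
import torch
from torch import nn
from torchao.quantization.quant_api import (
    quantize,
    int8_weight_only,                     # weight-only int8 (the "int8wo" rows above)
    int8_dynamic_activation_int8_weight,  # dynamic activation + weight int8 (the "int8dq" rows above)
)

model = nn.Sequential(nn.Linear(1024, 1024), nn.ReLU(), nn.Linear(1024, 1024))
model = model.to(device="cuda", dtype=torch.bfloat16)

# Memory-bound decoding (e.g. Llama at batch size 1): weight-only int8 usually wins.
model = quantize(model, int8_weight_only())

# Compute-bound workloads (e.g. a SAM-style encoder at larger batch sizes) tend to
# prefer dynamic quantization of activations as well:
# model = quantize(model, int8_dynamic_activation_int8_weight())
```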

@@ -50,7 +52,20 @@ And a quick crash course on inference quantization to help parse the above table

In some cases we rewrote popular GenAI models to be significantly faster in native PyTorch (no custom C++/CUDA) and achieved what was at the time SOTA inference performance. These rewrites involve more intrusive code changes.

* 8x speedups for Image segmentation models with [sam-fast](https://pytorch.org/blog/accelerating-generative-ai)
* 9.5x speedups for Image segmentation models with [sam-fast](https://pytorch.org/blog/accelerating-generative-ai) compared to vanilla [sam](https://github.com/facebookresearch/segment-anything).
* 1.16x speedup when composing int8 quantization with 2:4 sparsity against the accelerated baseline `bfloat16` dtype and `torch.compile="max_autotune"`.

| Model Type | Technique | img/s | memory (MiB) | mIoU (coco2017 val) | relative speedup | relative accuracy |
|------------|------------------------------------------------------------------------------------------------------|-------|--------------|---------------------|------------------|-------------------|
| ViT-h | sam (float32, eager) | 2.78 | 28806 | 0.58 | baseline | baseline |
| | sam (bfloat16, eager) | 14.85 | 14424 | 0.58 | **5.34x** | **100%** |
| | sam-fast (bfloat16, max-autotune) | 22.75 | 15172 | 0.58 | **8.18x** | **100%** |
| | int8 dynamic quant (attn + mlp) | 24.91 | 15154 | 0.58 | **8.96x** | **100%** |
| | 2:4 sparsity (mlp only) | 24.81 | 15632 | 0.57 | **8.92x** | **98%** |
| | int8 dynamic quant (attn)<br>int8 dynamic quant + 2:4 sparsity (mlp lin1)<br>2:4 sparsity (mlp lin2) | 26.46 | 14865 | 0.57 | **9.52x** | **98%** |

The relative speedup is measured purely across the image encoder (ViT) of the model, where we apply our model optimizations. Benchmarks were run on an NVIDIA A100-80GB with batch_size=32.
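
The mixed rows above come from applying different techniques to different named submodules. Below is a rough sketch of the selective 2:4 swap; the `torchao.sparsity.training` import path and the example module are assumptions (only the `swap_linear_with_semi_sparse_linear` call itself appears elsewhere in this README):

```python
import torch
from torch import nn
# Assumed import path for the swap helper referenced later in this README.
from torchao.sparsity.training import SemiSparseLinear, swap_linear_with_semi_sparse_linear

class Block(nn.Module):
    """Hypothetical stand-in for one transformer MLP block."""
    def __init__(self):
        super().__init__()
        self.seq = nn.Sequential(nn.Linear(1024, 4096), nn.GELU(), nn.Linear(4096, 1024))

    def forward(self, x):
        return self.seq(x)

model = Block().cuda().to(torch.bfloat16)  # the 2:4 sparse kernels expect fp16/bf16 CUDA weights

# Swap only the named Linear layer for its 2:4 semi-structured sparse variant; other
# submodules (e.g. attention projections) can be quantized separately, which is how the
# mixed quant + sparsity rows in the table are produced.
swap_linear_with_semi_sparse_linear(model, {"seq.0": SemiSparseLinear})
```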

* 10x speedups for Language models with [gpt-fast](https://pytorch.org/blog/accelerating-generative-ai-2)
* 3x speedup for Diffusion models with [sd-fast](https://pytorch.org/blog/accelerating-generative-ai-3)

@@ -68,7 +83,7 @@ swap_linear_with_semi_sparse_linear(model, {"seq.0": SemiSparseLinear})

* [MX](torchao/prototype/mx_formats) implementing training and inference support with tensors using the [OCP MX spec](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) data types, which can be described as groupwise scaled float8/float6/float4/int8, with the scales being constrained to powers of two. This work is prototype as the hardware support is not available yet.
* [nf4](torchao/dtypes/nf4tensor.py) which was used to [implement QLoRA](https://github.com/pytorch/torchtune/blob/main/docs/source/tutorials/qlora_finetune.rst) one of the most popular finetuning algorithms without writing custom Triton or CUDA code. Accessible talk [here](https://x.com/HamelHusain/status/1800315287574847701)
* [fp6](torchao/prototype/fp6_llm/) for 2x faster inference over fp16 with an easy to use wrapper api `convert_fp6_llm(model)`
* [fp6](torchao/prototype/quant_llm/) for 2x faster inference over fp16 with an easy to use API `quantize(model, fp6_llm_weight_only())`

## Composability

@@ -79,11 +94,34 @@ A key design principle for us is composability as in any new dtype or layout we


### Installation

`torchao` makes liberal use of several new features in PyTorch, so it's recommended to use it with the current nightly or the latest stable version of PyTorch.

Stable Release
#### Install torch

Install torch stable

```
pip install torch
```

Or torch nightlies

```
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
```

#### Install torchao

Stable release from PyPI, which will default to CUDA 12.1

```Shell
pip install torchao --extra-index-url https://download.pytorch.org/whl/test/cu121 # full options are cpu/cu118/cu121/cu124
pip install torchao
```

Stable Release from the PyTorch index
```Shell
pip install torchao --extra-index-url https://download.pytorch.org/whl/cu121 # full options are cpu/cu118/cu121/cu124
```

Nightly Release
@@ -104,10 +142,17 @@ python setup.py install
* [GaLore](torchao/prototype/galore/) a drop-in replacement for the Adam optimizer that allows you to finetune Llama 7B on a single 4090 card with up to 70% speedups relative to eager PyTorch
* [DoRA](torchao/prototype/dora) a newer replacement for QLoRA with more promising convergence characteristics
* [Fused int4/fp16 Quant Matmul](torchao/prototype/hqq) which is particularly useful for compute bound kernels showing 4x speedups over tinygemm for larger batch sizes such as 512
* [gau-nernst](https://github.com/gau-nernst) fp6 kernels that are 4x faster than fp16 [torchao/prototype/fp6_llm](torchao/prototype/fp6_llm)
* [gau-nernst](https://github.com/gau-nernst) fp6 kernels that are 4x faster than fp16 [torchao/prototype/quant_llm](torchao/prototype/quant_llm)
* [vayuda](https://github.com/vayuda) with generic bitpacking kernels that were code generated using pure PyTorch [prototype/common](torchao/prototype/common)
* [andreaskopf](https://github.com/andreaskoepf) and [melvinebenezer](https://github.com/melvinebenezer) with [1 bit LLMs](torchao/prototype/dtypes) Bitnet 1.58 bitpacked into uint2 and fully code-generated with torch.compile

## Blogs and Videos
* [Accelerating Neural Network Training with Semi-Structured (2:4) Sparsity](https://pytorch.org/blog/accelerating-neural-network-training/)
* [https://mobiusml.github.io/whisper-static-cache-blog/](https://mobiusml.github.io/whisper-static-cache-blog/)
* [Slaying OOMs at the Mastering LLM's course](https://x.com/HamelHusain/status/1800315287574847701)
* [Advanced Quantization at CUDA MODE](https://youtu.be/1u9xUK3G4VM?si=4JcPlw2w8chPXW8J)
* [Chip Huyen's GPU Optimization Workshop](https://www.youtube.com/live/v_q2JTIqE20?si=mf7HeZ63rS-uYpS6)

## How to contribute

This repository is currently under heavy development