
Commit

Merge branch 'unpack_int4' of https://github.com/jeromeku/ao into unpack_int4
jeromeku committed Jul 4, 2024
2 parents 75df5f5 + d1bd61b commit e90e280
Showing 78 changed files with 3,853 additions and 1,589 deletions.
82 changes: 30 additions & 52 deletions .github/scripts/trymerge.py
@@ -1163,7 +1163,6 @@ def merge_into(
# Finally, upload the record to Rockset. The list of pending and failed
# checks are at the time of the merge
save_merge_record(
collection=ROCKSET_MERGES_COLLECTION,
comment_id=comment_id,
pr_num=self.pr_num,
owner=self.org,
@@ -1179,10 +1178,8 @@ def merge_into(
merge_base_sha=self.get_merge_base(),
merge_commit_sha=merge_commit_sha,
is_failed=False,
dry_run=dry_run,
skip_mandatory_checks=skip_mandatory_checks,
ignore_current=bool(ignore_current_checks),
workspace=ROCKSET_MERGES_WORKSPACE,
)
else:
print("Missing comment ID or PR number, couldn't upload to Rockset")
@@ -1489,7 +1486,6 @@ def checks_to_markdown_bullets(

@retries_decorator()
def save_merge_record(
collection: str,
comment_id: int,
pr_num: int,
owner: str,
@@ -1505,59 +1501,44 @@ def save_merge_record(
merge_base_sha: str,
merge_commit_sha: str = "",
is_failed: bool = False,
dry_run: bool = False,
skip_mandatory_checks: bool = False,
ignore_current: bool = False,
error: str = "",
workspace: str = "commons",
) -> None:
"""
This saves the merge records into Rockset, so we can query them (for fun and profit)
This saves the merge records as a json, which can later be uploaded to s3
"""
if dry_run:
# Decide not to save the record to Rockset if dry-run is set to not pollute
# the collection
return

try:
import rockset # type: ignore[import]

# Prepare the record to be written into Rockset
data = [
{
"comment_id": comment_id,
"pr_num": pr_num,
"owner": owner,
"project": project,
"author": author,
"pending_checks": pending_checks,
"failed_checks": failed_checks,
"ignore_current_checks": ignore_current_checks,
"broken_trunk_checks": broken_trunk_checks,
"flaky_checks": flaky_checks,
"unstable_checks": unstable_checks,
"last_commit_sha": last_commit_sha,
"merge_base_sha": merge_base_sha,
"merge_commit_sha": merge_commit_sha,
"is_failed": is_failed,
"skip_mandatory_checks": skip_mandatory_checks,
"ignore_current": ignore_current,
"error": error,
}
]

client = rockset.RocksetClient(
host="api.usw2a1.rockset.com", api_key=os.environ["ROCKSET_API_KEY"]
)
client.Documents.add_documents(
collection=collection,
data=data,
workspace=workspace,
)
# Prepare the record to be written into Rockset
data = [
{
"comment_id": comment_id,
"pr_num": pr_num,
"owner": owner,
"project": project,
"author": author,
"pending_checks": pending_checks,
"failed_checks": failed_checks,
"ignore_current_checks": ignore_current_checks,
"broken_trunk_checks": broken_trunk_checks,
"flaky_checks": flaky_checks,
"unstable_checks": unstable_checks,
"last_commit_sha": last_commit_sha,
"merge_base_sha": merge_base_sha,
"merge_commit_sha": merge_commit_sha,
"is_failed": is_failed,
"skip_mandatory_checks": skip_mandatory_checks,
"ignore_current": ignore_current,
"error": error,
# This is a unique identifier for the record for deduping purposes
# in rockset. Any unique string would work
"_id": f"{project}-{pr_num}-{comment_id}-{os.environ.get('GITHUB_RUN_ID')}",
}
]
repo_root = Path(__file__).resolve().parent.parent.parent

except ModuleNotFoundError:
print("Rockset is missing, no record will be saved")
return
with open(repo_root / "merge_record.json", "w") as f:
json.dump(data, f)


@retries_decorator(rc=[])
@@ -2374,7 +2355,6 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None:
# list of pending and failed checks here, but they are not really
# needed at the moment
save_merge_record(
collection=ROCKSET_MERGES_COLLECTION,
comment_id=args.comment_id,
pr_num=args.pr_num,
owner=org,
@@ -2389,11 +2369,9 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None:
last_commit_sha=pr.last_commit().get("oid", ""),
merge_base_sha=pr.get_merge_base(),
is_failed=True,
dry_run=args.dry_run,
skip_mandatory_checks=args.force,
ignore_current=args.ignore_current,
error=str(e),
workspace=ROCKSET_MERGES_WORKSPACE,
)
else:
print("Missing comment ID or PR number, couldn't upload to Rockset")
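
For context on the change above, the record that `save_merge_record` now writes can be inspected with the standard `json` module. This is a minimal sketch based on the file layout shown in the diff; the path handling and the printed fields are illustrative:

```python
import json
from pathlib import Path

# trymerge.py writes merge_record.json at the repository root
# (three levels up from .github/scripts/trymerge.py).
repo_root = Path(".")  # adjust to the checkout root when running outside the script
with open(repo_root / "merge_record.json") as f:
    records = json.load(f)

# Each entry carries an "_id" built from project, PR number, comment ID, and
# GITHUB_RUN_ID, so re-runs of the same merge attempt dedupe downstream.
for record in records:
    print(record["_id"], record["is_failed"], record.get("error", ""))
```
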
2 changes: 2 additions & 0 deletions .github/scripts/validate_binaries.sh
@@ -0,0 +1,2 @@
pip install ${PYTORCH_PIP_PREFIX} torchao --index-url ${PYTORCH_PIP_DOWNLOAD_URL}
python ./test/smoke_tests/smoke_tests.py
1 change: 1 addition & 0 deletions .github/workflows/regression_test.yml
@@ -54,6 +54,7 @@ jobs:

uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
timeout: 60
runner: ${{ matrix.runs-on }}
gpu-arch-type: ${{ matrix.gpu-arch-type }}
gpu-arch-version: ${{ matrix.gpu-arch-version }}
26 changes: 26 additions & 0 deletions .github/workflows/trymerge.yml
@@ -9,6 +9,8 @@ jobs:
name: try_merge_pr_${{ github.event.client_payload.pr_num }}
runs-on: ubuntu-latest
environment: pytorchbot-env
permissions:
id-token: write
env:
GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
steps:
@@ -26,6 +28,8 @@ jobs:
check-latest: false
cache: pip
architecture: x64
# TODO (huydhn): get rid of Rockset
- run: pip install pyyaml==6.0 rockset==1.0.3

- name: Setup committer id
run: |
@@ -36,8 +40,14 @@
env:
GITHUB_TOKEN: ${{ secrets.PYTORCH_MERGEBOT_TOKEN }}
PR_NUM: ${{ github.event.client_payload.pr_num }}
FORCE: ${{ github.event.client_payload.force}}
COMMENT_ID: ${{ github.event.client_payload.comment_id }}
GIT_REMOTE_URL: https://github.com/pytorch/ao
REBASE: ${{ github.event.client_payload.rebase }}
IGNORE_CURRENT: ${{ github.event.client_payload.ignore_current }}
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
DRCI_BOT_KEY: ${{ secrets.DRCI_BOT_KEY }}
GITHUB_RUN_ID: ${{ github.run_id }}
run: |
set -x
if [ -n "${FORCE}" ]; then
@@ -58,6 +68,22 @@ jobs:
python3 .github/scripts/trymerge.py "${PR_NUM}"
fi
- name: configure aws credentials
uses: aws-actions/configure-aws-credentials@v3
continue-on-error: true
with:
role-to-assume: arn:aws:iam::308535385114:role/upload_to_ossci_raw_job_status
aws-region: us-east-1

- name: Upload merge record to s3
if: always()
continue-on-error: true
uses: seemethere/upload-artifact-s3@v5
with:
s3-bucket: ossci-raw-job-status
s3-prefix: merges/${{ github.repository }}/${{ github.event.client_payload.pr_num }}/${{ github.event.client_payload.comment_id }}/${{ github.run_id }}
path: merge_record.json

# We want newer merge commands to supersede old ones
concurrency:
group: try-merge-${{ github.event.client_payload.pr_num }}
49 changes: 49 additions & 0 deletions .github/workflows/validate-binaries.yml
@@ -0,0 +1,49 @@
name: Validate binaries

on:
workflow_call:
inputs:
channel:
description: "Channel to use (nightly, test, release, all)"
required: false
type: string
default: release
ref:
description: "Reference to checkout, defaults to empty"
default: ""
required: false
type: string
workflow_dispatch:
inputs:
channel:
description: "Channel to use (nightly, test, release, all)"
required: true
type: choice
options:
- release
- nightly
- test
- all
ref:
description: "Reference to checkout, defaults to empty"
default: ""
required: false
type: string
pytorch_version:
description: "PyTorch version to validate (ie. 2.0, 2.2.2, etc.) - optional"
default: ""
required: false
type: string
jobs:
validate-binaries:
uses: pytorch/test-infra/.github/workflows/validate-domain-library.yml@main
with:
package_type: "wheel"
version: ${{ inputs.pytorch_version }}
os: "linux"
channel: ${{ inputs.channel }}
repository: "pytorch/ao"
with_cuda: "enable"
with_rocm: "disable"
smoke_test: "source ./.github/scripts/validate_binaries.sh"
install_torch: true
1 change: 1 addition & 0 deletions .gitignore
@@ -127,6 +127,7 @@ env
.circleci/scripts/COMMIT_MSG
scripts/release_notes/*.json
sccache-stats*.json
merge_record.json

# These files get copied over on invoking setup.py
torchgen/packaged/*
77 changes: 61 additions & 16 deletions README.md
@@ -19,8 +19,8 @@ All with no intrusive code changes and minimal accuracy degradation.
Quantizing your models is a one-liner that should work on any model with an `nn.Linear`, including your favorite HuggingFace model. You can find more comprehensive usage instructions [here](torchao/quantization/) and a HuggingFace inference example [here](scripts/hf_eval.py).

```python
from torchao.quantization.quant_api import quantize
m = quantize(m, "int4wo")
from torchao.quantization.quant_api import quantize, int4_weight_only
m = quantize(m, int4_weight_only())
```
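
For a slightly fuller picture, here is a minimal sketch of the new API on a toy module. The toy model, the CUDA/bfloat16 setup, and the `torch.compile` call are illustrative assumptions rather than part of this change:

```python
import torch
from torch import nn
from torchao.quantization.quant_api import quantize, int4_weight_only

# Toy stand-in for any model built from nn.Linear layers (hypothetical example).
model = nn.Sequential(nn.Linear(1024, 1024), nn.ReLU(), nn.Linear(1024, 1024))
model = model.to(device="cuda", dtype=torch.bfloat16)  # int4 weight-only expects bf16 CUDA weights

model = quantize(model, int4_weight_only())        # swap Linear weights for packed int4 tensors
model = torch.compile(model, mode="max-autotune")  # optional: compile for faster kernels

with torch.no_grad():
    out = model(torch.randn(1, 1024, device="cuda", dtype=torch.bfloat16))
```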

Benchmarks are run on a machine with a single A100 GPU using the script in `_models/llama`, which generates text in a latency-optimized way (batch size 1).
@@ -29,15 +29,17 @@ The models used were `meta-llama/Llama-2-7b-chat-hf` and `meta-llama/Meta-Llama-

| Model | Technique | wikitext-perplexity | Tokens/Second | Memory Bandwidth (GB/s) | Peak Memory (GB) | Model Size (GB) |
| ----------- | ------------------ | ------------------- | ------------- | ----------------------- | ---------------- | --------------- |
| Llama-2-7B | Base (bfloat16) | 12.212 | 105.02 | 1387.78 | 13.21 | 13.90 |
| | int8dq | 12.262 | 9.40 | 62.26 | 6.62 | 8.61 |
| | int8wo | 12.204 | 147.03 | 973.54 | 6.62 | 8.95 |
| | int4wo-64 | 12.843 | 199.81 | 746.45 | 3.74 | 4.75 |
| | int4wo-64-GPTQ | 12.489 | 199.81 | 746.45 | 3.74 | 4.75 |
| Llama-3-8B | Base (bfloat16) | | 94.91 | 1424.58 | 15.01 | 16.43 |
| | int8dq | | 8.41 | 63.23 | 7.52 | 9.24 |
| | int8wo | | 136.75 | 1028.38 | 7.52 | 10.42 |
| | int4wo-64 | | 179.41 | 757.45 | 4.22 | 6.88 |
| Llama-2-7B | Base (bfloat16) | 12.212 | 105.14 | 1389.35 | 13.88 | 13.21 |
| | int8dq | 12.262 | 9.20 | 60.93 | 8.33 | 6.62 |
| | int8wo | 12.204 | 150.18 | 994.40 | 8.95 | 6.62 |
| | int4wo-64 | 12.843 | 199.86 | 746.66 | 4.50 | 3.74 |
| | int4wo-64-GPTQ | 12.489 | 199.86 | 746.66 | 4.50 | 3.74 |
| | autoquant | 12.204 | 159.22 | 1069.87 | 8.91 | 6.72 |
| Llama-3-8B | Base (bfloat16) | N/A | 94.97 | 1425.55 | 16.43 | 15.01 |
| | int8dq | N/A | 8.44 | 63.45 | 8.98 | 7.52 |
| | int8wo | N/A | 139.76 | 1051.02 | 10.42 | 7.52 |
| | int4wo-64 | N/A | 179.44 | 757.60 | 6.62 | 4.22 |
| | autoquant | N/A | 137.71 | 1037.74 | 11.08 | 7.54 |

note: int8 dynamic quantization works best on compute-bound as opposed to memory-bound models. A relatable example is [SAM](https://github.com/pytorch-labs/segment-anything-fast), which is compute bound, vs. Llama at batch size 1, which is memory bound.
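
To make the distinction concrete, here is a sketch of how you would pick between the two int8 options in the table. The `int8_weight_only` and `int8_dynamic_activation_int8_weight` configuration names are assumed from the refactored `quant_api` and are not part of this diff:

```python
import torch
from torch import nn
from torchao.quantization.quant_api import (
    quantize,
    int8_weight_only,                     # weight-only int8 (the "int8wo" rows above)
    int8_dynamic_activation_int8_weight,  # dynamic activation + weight int8 (the "int8dq" rows above)
)

model = nn.Sequential(nn.Linear(1024, 1024), nn.ReLU(), nn.Linear(1024, 1024))
model = model.to(device="cuda", dtype=torch.bfloat16)

# Memory-bound decoding (e.g. Llama at batch size 1): weight-only int8 usually wins.
model = quantize(model, int8_weight_only())

# Compute-bound workloads (e.g. a SAM-style encoder at larger batch sizes) tend to
# prefer dynamic quantization of activations as well:
# model = quantize(model, int8_dynamic_activation_int8_weight())
```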

@@ -50,7 +52,20 @@ And a quick crash course on inference quantization to help parse the above table

In some cases we rewrote popular GenAI models to be significantly faster in native PyTorch (no custom C++/CUDA) and achieved what was at the time SOTA inference performance. These rewrites involve more intrusive code changes.

* 8x speedups for Image segmentation models with [sam-fast](https://pytorch.org/blog/accelerating-generative-ai)
* 9.5x speedups for Image segmentation models with [sam-fast](https://pytorch.org/blog/accelerating-generative-ai) compared to vanilla [sam](https://github.com/facebookresearch/segment-anything).
* 1.16x speedup when composing int8 quantization with 2:4 sparsity against the accelerated baseline `bfloat16` dtype and `torch.compile="max_autotune"`.

| Model Type | Technique | img/s | memory (MiB) | mIoU (coco2017 val) | relative speedup | relative accuracy |
|------------|------------------------------------------------------------------------------------------------------|-------|--------------|---------------------|------------------|-------------------|
| ViT-h | sam (float32, eager) | 2.78 | 28806 | 0.58 | baseline | baseline |
| | sam (bfloat16, eager) | 14.85 | 14424 | 0.58 | **5.34x** | **100%** |
| | sam-fast (bfloat16, max-autotune) | 22.75 | 15172 | 0.58 | **8.18x** | **100%** |
| | int8 dynamic quant (attn + mlp) | 24.91 | 15154 | 0.58 | **8.96x** | **100%** |
| | 2:4 sparsity (mlp only) | 24.81 | 15632 | 0.57 | **8.92x** | **98%** |
| | int8 dynamic quant (attn)<br>int8 dynamic quant + 2:4 sparsity (mlp lin1)<br>2:4 sparsity (mlp lin2) | 26.46 | 14865 | 0.57 | **9.52x** | **98%** |

The relative speedup is measured purely across the image encoder (ViT) of the model, where we apply our model optimizations. Benchmarks were run on an NVIDIA A100-80GB with batch_size=32.
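
The mixed rows above come from applying different techniques to different named submodules. Below is a rough sketch of the selective 2:4 swap; the `torchao.sparsity.training` import path and the example module are assumptions (only the `swap_linear_with_semi_sparse_linear` call itself appears elsewhere in this README):

```python
import torch
from torch import nn
# Assumed import path for the swap helper referenced later in this README.
from torchao.sparsity.training import SemiSparseLinear, swap_linear_with_semi_sparse_linear

class Block(nn.Module):
    """Hypothetical stand-in for one transformer MLP block."""
    def __init__(self):
        super().__init__()
        self.seq = nn.Sequential(nn.Linear(1024, 4096), nn.GELU(), nn.Linear(4096, 1024))

    def forward(self, x):
        return self.seq(x)

model = Block().cuda().to(torch.bfloat16)  # the 2:4 sparse kernels expect fp16/bf16 CUDA weights

# Swap only the named Linear layer for its 2:4 semi-structured sparse variant; other
# submodules (e.g. attention projections) can be quantized separately, which is how the
# mixed quant + sparsity rows in the table are produced.
swap_linear_with_semi_sparse_linear(model, {"seq.0": SemiSparseLinear})
```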

* 10x speedups for Language models with [gpt-fast](https://pytorch.org/blog/accelerating-generative-ai-2)
* 3x speedup for Diffusion models with [sd-fast](https://pytorch.org/blog/accelerating-generative-ai-3)

@@ -68,7 +83,7 @@ swap_linear_with_semi_sparse_linear(model, {"seq.0": SemiSparseLinear})

* [MX](torchao/prototype/mx_formats) implementing training and inference support with tensors using the [OCP MX spec](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) data types, which can be described as groupwise scaled float8/float6/float4/int8, with the scales being constrained to powers of two. This work is prototype as the hardware support is not available yet.
* [nf4](torchao/dtypes/nf4tensor.py) which was used to [implement QLoRA](https://github.com/pytorch/torchtune/blob/main/docs/source/tutorials/qlora_finetune.rst) one of the most popular finetuning algorithms without writing custom Triton or CUDA code. Accessible talk [here](https://x.com/HamelHusain/status/1800315287574847701)
* [fp6](torchao/prototype/fp6_llm/) for 2x faster inference over fp16 with an easy to use wrapper api `convert_fp6_llm(model)`
* [fp6](torchao/prototype/quant_llm/) for 2x faster inference over fp16 with an easy to use API `quantize(model, fp6_llm_weight_only())`

## Composability

@@ -79,11 +94,34 @@ A key design principle for us is composability as in any new dtype or layout we


### Installation

`torchao` makes liberal use of several new features in PyTorch, so it's recommended to use it with the current nightly or the latest stable version of PyTorch.

Stable Release
#### Install torch

Install torch stable

```
pip install torch
```

Or torch nightlies

```
pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121
```

#### Install torchao

Stable release from PyPI, which will default to CUDA 12.1

```Shell
pip install torchao --extra-index-url https://download.pytorch.org/whl/test/cu121 # full options are cpu/cu118/cu121/cu124
pip install torchao
```

Stable Release from the PyTorch index
```Shell
pip install torchao --extra-index-url https://download.pytorch.org/whl/cu121 # full options are cpu/cu118/cu121/cu124
```

Nightly Release
@@ -104,10 +142,17 @@ python setup.py install
* [GaLore](torchao/prototype/galore/) a drop-in replacement for the Adam optimizer that allows you to finetune Llama 7B on a single 4090 card with up to 70% speedups relative to eager PyTorch
* [DoRA](torchao/prototype/dora) a newer replacement for QLoRA with more promising convergence characteristics
* [Fused int4/fp16 Quant Matmul](torchao/prototype/hqq) which is particularly useful for compute bound kernels showing 4x speedups over tinygemm for larger batch sizes such as 512
* [gau-nernst](https://github.com/gau-nernst) fp6 kernels that are 4x faster than fp16 [torchao/prototype/fp6_llm](torchao/prototype/fp6_llm)
* [gau-nernst](https://github.com/gau-nernst) fp6 kernels that are 4x faster than fp16 [torchao/prototype/quant_llm](torchao/prototype/quant_llm)
* [vayuda](https://github.com/vayuda) with generic bitpacking kernels that were code generated using pure PyTorch [prototype/common](torchao/prototype/common)
* [andreaskopf](https://github.com/andreaskoepf) and [melvinebenezer](https://github.com/melvinebenezer) with [1 bit LLMs](torchao/prototype/dtypes) Bitnet 1.58 bitpacked into uint2 and fully code-generated with torch.compile

## Blogs and Videos
* [Accelerating Neural Network Training with Semi-Structured (2:4) Sparsity](https://pytorch.org/blog/accelerating-neural-network-training/)
* [https://mobiusml.github.io/whisper-static-cache-blog/](https://mobiusml.github.io/whisper-static-cache-blog/)
* [Slaying OOMs at the Mastering LLM's course](https://x.com/HamelHusain/status/1800315287574847701)
* [Advanced Quantization at CUDA MODE](https://youtu.be/1u9xUK3G4VM?si=4JcPlw2w8chPXW8J)
* [Chip Huyen's GPU Optimization Workshop](https://www.youtube.com/live/v_q2JTIqE20?si=mf7HeZ63rS-uYpS6)

## How to contribute

This repository is currently under heavy development