drisspg · drisspg · Feb 11, 2024 · Feb 11, 2024 · Feb 11, 2024 · Feb 11, 2024
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -1,7 +1,6 @@
-# This workflow will install Python dependencies, run tests and lint with a single version of Python
-# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
+# Basic flake8 + pytest workflow for Python 3.10
 
-name: Python application
+name: Python Lint and Test
 
 on:
   push:
@@ -26,14 +25,11 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install flake8 pytest
-        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+        pip install -e .
+        pip install -e .'[dev]'
     - name: Lint with flake8
       run: |
-        # stop the build if there are Python syntax errors or undefined names
-        flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-        # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+        flake8
     - name: Test with pytest
       run: |
         pytest
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -18,3 +18,9 @@ repos:
           - usort == 1.0.6
           - ufmt == 2.1.0
           - libcst == 1.0.1
+
+-   repo: https://github.com/pycqa/flake8
+    rev: 7.0.0
+    hooks:
+    -   id: flake8
+        additional_dependencies: [flake8-pyproject]
diff --git a/benchmarks/fp8_sat_cast.py b/benchmarks/fp8_sat_cast.py
@@ -1,6 +1,7 @@
 import itertools
-from dataclasses import dataclass
 
+from contextlib import suppress
+from dataclasses import dataclass
 from typing import List
 
 import torch
@@ -80,10 +81,8 @@ def run_experiment(config: ExperimentConfig) -> ExperimentResult:
         high_precision_tensor, eager_abs_max, scale, config.low_precision_dtype, config.saturated
     ).to(config.high_precision_dtype)
     eager_out_hp = eager_out.to(config.high_precision_dtype)
-    try:
+    with suppress(AssertionError):
         torch.testing.assert_close(nuggets_out_hp, eager_out_hp, rtol=1e-3, atol=1e-3)
-    except AssertionError as e:
-        pass
         # investigate why we are seeing small deviations
         # Mismatched elements: 62577 / 2097152 (3.0%)
         # Greatest absolute difference: 2.0 at index (11111,) (up to 0.001 allowed)

diff --git a/benchmarks/llama.py b/benchmarks/llama.py
@@ -82,11 +82,7 @@ def __init__(self, config: LLaMAConfig) -> None:
         self.kv_caches: List[KVCache] = []
 
     def _init_weights(self, module: nn.Module) -> None:
-        if isinstance(module, nn.Linear):
-            torch.nn.init.normal_(
-                module.weight, mean=0.0, std=0.02 / math.sqrt(2 * self.config.n_layer)
-            )
-        elif isinstance(module, nn.Embedding):
+        if isinstance(module, (nn.Linear, nn.Embedding)):
             torch.nn.init.normal_(
                 module.weight, mean=0.0, std=0.02 / math.sqrt(2 * self.config.n_layer)
             )

diff --git a/benchmarks/qlora.py b/benchmarks/qlora.py
@@ -1,13 +1,13 @@
 import argparse
 import csv
-import gc
 import itertools
+
+import logging
 from dataclasses import asdict, dataclass
 from pathlib import Path
-from typing import List, Optional, Tuple
+from typing import List, Optional
 
 import torch
-import torch.nn as nn
 import torch.nn.functional as F
 
 import transformer_nuggets as nugs
@@ -16,12 +16,16 @@
 from transformer_nuggets.quant import NF4Tensor
 
 bnb_available = False
+
+
+logging.getLogger(__name__).addHandler(logging.NullHandler())
+
 try:
-    import bitsandbytes as bnb
+    import bitsandbytes as bnb  # noqa: F401
 
     bnb_available = True
 except ImportError:
-    raise (
+    logging.warning(
         "Could not import bitsandbytes, make sure you have installed it `pip install bitsandbytes` "
     )
 
@@ -137,7 +141,7 @@ def main(output_path: Optional[Path], profile_path: Optional[Path], dynamic: boo
         results = []
         for experiment_config in tqdm(gen_configs()):
             # Since we are changing between dynamic and not
-            import torch._dynamo
+            import torch._dynamo  # noqa: F402
 
             torch._dynamo.reset()
             experiment = experiment_types[experiment_config.op]

diff --git a/pyproject.toml b/pyproject.toml
@@ -20,20 +20,22 @@ classifiers = [
 dependencies = [
     "torch >= 2.1.1",
     "scipy >= 1.9.1",
-    "tqdm",
-    "tabulate"
+    "tqdm >= 4.66",
+    "tabulate >= 0.8"
 ]
 
 [project.optional-dependencies]
 dev = [
     "black==23.3.0",
     "usort==1.0.6",
-    "ufmt==2.1.0",
-    "libcst==1.0.1",
+    "ufmt==2.3.0",
+    "libcst==1.1.0",
     "pre-commit==3.6.0",
     "bumpver",
     "pip-tools",
-    "pytest"
+    "pytest",
+    "flake8==6.1.0",
+    "flake8-pyproject",
 ]
 
 qlora = ['bitsandbytes']
@@ -46,6 +48,14 @@ llama = [
     "float8_experimental",
 ]
 
+# ---------- TOOL CONFIGURATIONS ------------
+[tool.flake8]
+max-line-length = 99
+ignore = ['E231', 'E241', 'E501', 'C408', 'E261', 'E731', 'G004', 'W503', 'E203']
+per-file-ignores = [
+    '__init__.py:F401',
+]
+
 [tool.usort]
 first_party_detection = false
 

diff --git a/test/test_flash.py b/test/test_flash.py
@@ -7,6 +7,7 @@
 @pytest.mark.parametrize("causal", [True, False])
 @pytest.mark.parametrize("bias_choice", [BiasMode.rel_pos, BiasMode.none, BiasMode.alibi])
 @pytest.mark.parametrize("sm_scale", [None, 1])
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
 def test_op(Z, H, N_CTX, D_HEAD, causal, bias_choice, sm_scale, dtype=torch.float16):
     torch.manual_seed(20)
     q = (

diff --git a/test/test_fp8.py b/test/test_fp8.py
@@ -5,6 +5,7 @@
 
 
 @pytest.mark.parametrize("fp8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
 def test_basic_quant(fp8_dtype):
     torch.manual_seed(0)
     torch.cuda.manual_seed_all(0)
@@ -17,6 +18,7 @@ def test_basic_quant(fp8_dtype):
 
 
 @pytest.mark.parametrize("fp8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
 def test_saturated(fp8_dtype):
     torch.manual_seed(0)
     torch.cuda.manual_seed_all(0)

diff --git a/test/test_qlora.py b/test/test_qlora.py
@@ -21,6 +21,7 @@
 @pytest.mark.parametrize(
     "inpt_size, block_size, scaler_block_size", [(16384, 64, 256), (256, 16, 16), (1024, 32, 32)]
 )
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
 def test_reconstruction(inpt_size: int, block_size: int, scaler_block_size: int):
     torch.manual_seed(0)
     device = "cuda"
@@ -37,9 +38,10 @@ def test_reconstruction(inpt_size: int, block_size: int, scaler_block_size: int)
 
 @unittest.skipIf(not bnb_available, "Bitsandbytes not available")
 @pytest.mark.parametrize("embed_dim", [256, 4096, 5120, 6656, 8192])
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
 def test_reconstruction_qlora_vs_bnb(embed_dim: int):
     torch.manual_seed(0)
-    device = "cuda:0"
+    device = "cuda"
     input_weight = qlora.build_input_weight(embed_dim, device)
     nugs_qlora = NF4Tensor.from_tensor(input_weight)
     bnb_linear = qlora.build_bitsandbytes_linear(input_weight, device)
@@ -56,10 +58,11 @@ def test_reconstruction_qlora_vs_bnb(embed_dim: int):
     assert (nugs_diff - bnb_diff).abs() < 2e-1
 
 
-@unittest.skipIf(not bnb_available, "Bitsandbytes not available")
+@pytest.mark.skipIf(not bnb_available, "Bitsandbytes not available")
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
 @pytest.mark.parametrize("embed_dim", [256, 4096, 5120, 6656, 8192])
 def test_binning_distribution(embed_dim: int):
-    device = "cuda:0"
+    device = "cuda"
     input_weight = qlora.build_input_weight(embed_dim, device)
     nugs_qlora = NF4Tensor.from_tensor(input_weight)
     first_elements = (nugs_qlora.quantized_data >> 4).to(torch.long)
@@ -71,21 +74,22 @@ def test_binning_distribution(embed_dim: int):
     bnb_first_elements = (bnb_data >> 4).to(torch.long)
     bnb_second_elements = (bnb_data & 0b1111).to(torch.long)
 
-    bnb_first_counts = torch.unique(bnb_first_elements, return_counts=True)[1]
-    bnb_second_counts = torch.unique(bnb_second_elements, return_counts=True)[1]
+    bnb_first_counts = torch.unique(bnb_first_elements, return_counts=True)[1]  # noqa: F841
+    bnb_second_counts = torch.unique(bnb_second_elements, return_counts=True)[1]  # noqa: F841
 
-    first_counts = torch.unique(first_elements, return_counts=True)[1]
-    second_counts = torch.unique(second_elements, return_counts=True)[1]
+    first_counts = torch.unique(first_elements, return_counts=True)[1]  # noqa: F841
+    second_counts = torch.unique(second_elements, return_counts=True)[1]  # noqa: F841
 
     # Why are these normally distributed and not uniform?
 
 
 @pytest.mark.parametrize("embed_dim", [256, 4096, 5120, 6656, 8192])
 @pytest.mark.parametrize("compile", [True, False])
 @pytest.mark.parametrize("requires_grad", [True, False])
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
 def test_autograd_func_to_eager(embed_dim: int, compile: bool, requires_grad: bool):
     torch.manual_seed(0)
-    device = "cuda:0"
+    device = "cuda"
     input_weight = qlora.build_input_weight(embed_dim, device)
     sample_input = qlora.get_sample_inputs(8, 128, embed_dim, device, requires_grad=requires_grad)
     nugs_qlora = NF4Tensor.from_tensor(input_weight)
@@ -99,9 +103,10 @@ def test_autograd_func_to_eager(embed_dim: int, compile: bool, requires_grad: bo
         out.sum().backward()
 
 
-@unittest.skipIf(not bnb_available, "Bitsandbytes not available")
+@pytest.mark.skipIf(not bnb_available, "Bitsandbytes not available")
 @pytest.mark.parametrize("embed_dim", [256, 4096, 5120, 6656, 8192])
 @pytest.mark.parametrize("compile", [True, False])
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
 def test_bitsandbytes_linear_parity(embed_dim, compile):
     device = torch.device("cuda:0")
     input_weight = qlora.build_input_weight(embed_dim, device)
@@ -127,9 +132,10 @@ def qlora_linear(
     assert bnb_difference.max() < 0.5 * embed_dim
 
 
-@unittest.skipIf(not bnb_available, "Bitsandbytes not available")
+@pytest.mark.skipIf(not bnb_available, "Bitsandbytes not available")
 @pytest.mark.parametrize("embed_dim", [256, 4096, 5120, 6656, 8192])
 @pytest.mark.parametrize("compile", [True, False])
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
 def test_bitsandbytes_mlp_parity(embed_dim, compile):
     device = torch.device("cuda:0")
     weights = qlora.get_mlp_weights(embed_dim, device)
@@ -159,6 +165,7 @@ def test_bitsandbytes_mlp_parity(embed_dim, compile):
 @pytest.mark.parametrize("r", [1, 2])
 @pytest.mark.parametrize("dropout", [0.0, 0.2])
 @pytest.mark.parametrize("run_backward", [True, False])
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available")
 def test_qlora_linear(embed_dim: int, compile: bool, r: int, dropout: float, run_backward: bool):
     torch.manual_seed(0)
     device = "cuda:0"

diff --git a/test/test_utils.py b/test/test_utils.py
@@ -11,9 +11,8 @@ def test_nan():
             0.0,
         ]
     )
-    with pytest.raises(RuntimeError, match="returned a NaN"):
-        with NanInfDetect():
-            print(torch.div(a, a))
+    with pytest.raises(RuntimeError, match="returned a NaN"), NanInfDetect():
+        print(torch.div(a, a))
 
 
 def test_inf():
@@ -23,9 +22,8 @@ def test_inf():
         ],
         dtype=torch.float16,
     )
-    with pytest.raises(RuntimeError, match="returned an Inf"):
-        with NanInfDetect():
-            print(torch.mul(a, 65537))
+    with pytest.raises(RuntimeError, match="returned an Inf"), NanInfDetect():
+        print(torch.mul(a, 65537))
 
 
 def test_breakpoint():
@@ -34,11 +32,11 @@ def test_breakpoint():
             0.0,
         ]
     )
-    with pytest.raises(RuntimeError, match="returned a NaN"):
-        with mock.patch("builtins.breakpoint") as mock_breakpoint:
-            with NanInfDetect(do_breakpoint=True):
-                print(torch.div(a, a))
-            mock_breakpoint.assert_called_once()
+    with pytest.raises(RuntimeError, match="returned a NaN"), mock.patch(
+        "builtins.breakpoint"
+    ) as mock_breakpoint, NanInfDetect(do_breakpoint=True):
+        print(torch.div(a, a))
+        mock_breakpoint.assert_called_once()
 
 
 if __name__ == "__main__":

diff --git a/transformer_nuggets/flash/__init__.py b/transformer_nuggets/flash/__init__.py
@@ -1 +1 @@
-from transformer_nuggets.flash.flash_attention import *
+from transformer_nuggets.flash.flash_attention import *  # noqa: F403
diff --git a/transformer_nuggets/fp8/scaled_quant.py b/transformer_nuggets/fp8/scaled_quant.py
@@ -90,5 +90,5 @@ def eager_scaled_quant(
         out = torch.where(
             out < -1 * torch.finfo(fp8_dtype).max, -1 * torch.finfo(fp8_dtype).max, out
         )
-    abs_max = torch.max(torch.abs(out))
+    _ = torch.max(torch.abs(out))
     return out.to(fp8_dtype)
diff --git a/transformer_nuggets/llama/finetune.py b/transformer_nuggets/llama/finetune.py
@@ -2,16 +2,11 @@
 Used to train a model from scratch on big dense blocks of text data using causal attention.
 """
 import argparse
-import csv
 import logging
-import math
 import os
 import random
-import time
-from contextlib import nullcontext
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from pathlib import Path
-from typing import List, Optional
 
 import numpy as np
 import torch
@@ -22,7 +17,6 @@
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp.wrap import ModuleWrapPolicy
 from torch.utils.data import DataLoader, IterableDataset
-from tqdm import tqdm
 from transformer_nuggets.llama.model import ModelArgs, Transformer, TransformerBlock
 from transformer_nuggets.llama.train import (
     calculate_loss,
@@ -222,15 +216,17 @@ def train(
             # TODO(future): fix this condition, eval currently only happens
             # if eval_interval and batch_size are multiples of each other
             if not is_accumulating and step_count % training_config.eval_interval == 0:
-                t0 = time.time()
-                val_loss = validate(
-                    model, val_data, val_loss_file, training_config, step_count, rank, world_size
-                )
-                t1 = time.time() - t0
-                if rank == 0:
-                    logging.info(
-                        f"step {iter_num}: val loss {val_loss:.4f}, val time: {t1 * 1000:.2f}ms"
-                    )
+                pass
+                # TODO: add validation loop
+                # t0 = time.time()
+                # val_loss = validate(
+                #     model, val_data, val_loss_file, training_config, step_count, rank, world_size
+                # )
+                # t1 = time.time() - t0
+                # if rank == 0:
+                #     logging.info(
+                #         f"step {iter_num}: val loss {val_loss:.4f}, val time: {t1 * 1000:.2f}ms"
+                #     )
 
             if not is_accumulating and step_count % training_config.save_interval == 0:
                 checkpoint_path = training_config.out_dir / f"iter-{iter_num:06d}-ckpt.pth"

diff --git a/transformer_nuggets/llama/model.py b/transformer_nuggets/llama/model.py
@@ -128,7 +128,7 @@ def forward(self, idx: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
         freqs_cis = self.freqs_cis[input_pos]
         x = self.tok_embeddings(idx)
 
-        for i, layer in enumerate(self.layers):
+        for _, layer in enumerate(self.layers):
             x = layer(x, input_pos, freqs_cis)
         x = self.norm(x)
         logits = self.output(x)
@@ -202,7 +202,7 @@ def forward(self, x: Tensor, freqs_cis: Tensor) -> Tensor:
         q = apply_rotary_emb(q, freqs_cis)
         k = apply_rotary_emb(k, freqs_cis)
 
-        q, k, v = map(lambda x: x.transpose(1, 2), (q, k, v))
+        q, k, v = (x.transpose(1, 2) for x in (q, k, v))
 
         k = k.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
         v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1)
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		-from transformer_nuggets.flash.flash_attention import *
		+from transformer_nuggets.flash.flash_attention import *  # noqa: F403