Commit 72f5b93

Merge branch 'main' into kylesayrs/smoothquant-ignore-glm

kylesayrs authored Jan 3, 2025
2 parents ec37204 + 1b8c7bf commit 72f5b93
Showing 6 changed files with 252 additions and 9 deletions.
8 changes: 3 additions & 5 deletions examples/big_models_with_accelerate/README.md
@@ -54,7 +54,7 @@ When working with `accelerate`, it is important to keep in mind that CPU offload

We will show working examples for each use case:
- **CPU Offloading**: Quantize `Llama-70B` to `FP8` using `PTQ` with a single GPU
- **Multi-GPU**: Quantize `Llama-70B` to `INT8` using `GPTQ` and `SmoothQuant` with 8 GPUs
- **Multi-GPU**: Quantize `Llama-70B` to `INT8` using `GPTQ` and `SmoothQuant` with 2 GPUs

### Installation

@@ -81,12 +81,10 @@ The resulting model `./Meta-Llama-3-70B-Instruct-FP8-Dynamic` is ready to run wi

For quantization methods that require calibration data (e.g. `GPTQ`), CPU offloading is too slow. For these methods, `llmcompressor` can use `accelerate` multi-GPU to quantize models that are larger than a single GPU. For example, when quantizing a model to `int8`, we typically use `GPTQ` to statically quantize the weights, which requires calibration data.

Note that running non-sequential `GPTQ` requires significant additional memory beyond the model size. As a rough rule of thumb, running `GPTQModifier` non-sequentially will take up 3x the model size for a 16-bit model and 2x the model size for a 32-bit model (these estimates include the memory required to store the model itself in GPU).

- `multi_gpu_int8.py` demonstrates quantizing the weights and activations of `Llama-70B` to `int8` on 8 A100s:
- `multi_gpu_int8.py` demonstrates quantizing the weights and activations of `Llama-70B` to `int8` on 2 A100s:

```bash
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_VISIBLE_DEVICES=0,1
python multi_gpu_int8.py
```
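
For orientation, here is a minimal sketch of the kind of flow `multi_gpu_int8.py` implements. The model ID, calibration settings, and recipe values below are illustrative assumptions rather than the exact contents of the script:

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"  # illustrative model choice

# device_map="auto" lets accelerate shard the model across the GPUs made
# visible through CUDA_VISIBLE_DEVICES.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Small calibration set drawn from a chat dataset (assumed choice).
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))


def tokenize(example):
    text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return tokenizer(
        text,
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

# SmoothQuant migrates activation outliers into the weights so that GPTQ can
# then statically quantize weights and activations to int8 (W8A8).
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
]

oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

model.save_pretrained("Meta-Llama-3-70B-Instruct-W8A8", save_compressed=True)
tokenizer.save_pretrained("Meta-Llama-3-70B-Instruct-W8A8")
```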

99 changes: 99 additions & 0 deletions examples/quantization_kv_cache/gemma2_fp8_kv_example.py
@@ -0,0 +1,99 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.transformers import oneshot

# Select model and load it.
MODEL_ID = "google/gemma-2-9b-it"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))


def process_and_tokenize(example):
    text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return tokenizer(
        text,
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)

# Configure the quantization algorithm and scheme.
# In this case, we:
# * quantize the weights to fp8 with per-channel scales
# * quantize the activations to fp8 with dynamic per-token scales
# * quantize the kv cache to fp8 with per-tensor scales
recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            ignore: ["lm_head"]
            config_groups:
                group_0:
                    weights:
                        num_bits: 8
                        type: float
                        strategy: channel
                        dynamic: false
                        symmetric: true
                    input_activations:
                        num_bits: 8
                        type: float
                        strategy: token
                        dynamic: true
                        symmetric: true
                    targets: ["Linear"]
            kv_cache_scheme:
                num_bits: 8
                type: float
                strategy: tensor
                dynamic: false
                symmetric: true
"""

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

print(
    "Note: Inference with the quantized kv_cache is not supported. ",
    "Please use vLLM for inference with the quantized kv_cache.",
)
# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
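
As the printed note above indicates, generation with the quantized KV cache itself happens in vLLM rather than in transformers. A minimal, hypothetical serving sketch follows; the checkpoint path matches `SAVE_DIR` above, and whether `kv_cache_dtype` must be set explicitly may depend on the vLLM version:

```python
from vllm import LLM, SamplingParams

# Load the compressed checkpoint saved above and enable the fp8 KV cache.
llm = LLM(model="gemma-2-9b-it-FP8-KV", kv_cache_dtype="fp8")
sampling_params = SamplingParams(temperature=0.8, max_tokens=100)

outputs = llm.generate(["Hello my name is"], sampling_params)
print(outputs[0].outputs[0].text)
```
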
101 changes: 101 additions & 0 deletions examples/quantization_kv_cache/phi3.5_fp8_kv_example.py
@@ -0,0 +1,101 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.transformers import oneshot

# Select model and load it.
# Phi-3.5 is a special case for KV cache quantization because it has
# fused QKV linear layers.
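# (In Phi-3.5 the query, key and value projections are packed into a single
# qkv_proj Linear module rather than separate q_proj/k_proj/v_proj layers.)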
MODEL_ID = "microsoft/Phi-3.5-mini-instruct"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))


def process_and_tokenize(example):
    text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return tokenizer(
        text,
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)

# Configure the quantization algorithm and scheme.
# In this case, we:
# * quantize the weights to fp8 with per-tensor scales
# * quantize the activations to fp8 with per-tensor scales
# * quantize the kv cache to fp8 with per-tensor scales
recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            ignore: ["lm_head"]
            config_groups:
                group_0:
                    weights:
                        num_bits: 8
                        type: float
                        strategy: tensor
                        dynamic: false
                        symmetric: true
                    input_activations:
                        num_bits: 8
                        type: float
                        strategy: tensor
                        dynamic: false
                        symmetric: true
                    targets: ["Linear"]
            kv_cache_scheme:
                num_bits: 8
                type: float
                strategy: tensor
                dynamic: false
                symmetric: true
"""

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

print(
    "Note: Inference with the quantized kv_cache is not supported. ",
    "Please use vLLM for inference with the quantized kv_cache.",
)
# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
@@ -236,6 +236,10 @@ def patch_tied_tensors_bug(model: torch.nn.Module):
    input_embed = model.get_input_embeddings()
    output_embed = model.get_output_embeddings()

    if input_embed is None or output_embed is None:
        # some models fail to properly override the abstract methods
        return

    if storage_ptr(input_embed.weight) == storage_ptr(output_embed.weight):
        for module in (input_embed, output_embed):
            if not is_module_offloaded(module):
47 changes: 45 additions & 2 deletions src/llmcompressor/utils/metric_logging.py
@@ -8,7 +8,17 @@
__all__ = ["get_GPU_memory_usage", "get_layer_size_mb", "CompressionLogger"]


def get_GPU_memory_usage() -> List[Tuple]:
def get_GPU_memory_usage() -> List[Tuple[float, float]]:
    if torch.version.hip:
        return get_GPU_usage_amd()
    else:
        return get_GPU_usage_nv()


def get_GPU_usage_nv() -> List[Tuple[float, float]]:
    """
    get gpu usage for Nvidia GPUs using nvml lib
    """
    try:
        import pynvml
        from pynvml import NVMLError
@@ -39,6 +49,39 @@ def get_GPU_memory_usage() -> List[Tuple]:
        return []


def get_GPU_usage_amd() -> List[Tuple[float, float]]:
    """
    get gpu usage for AMD GPUs using amdsmi lib
    """
    usage = []
    try:
        import amdsmi

        try:
            amdsmi.amdsmi_init()
            devices = amdsmi.amdsmi_get_processor_handles()

            for device in devices:
                vram_memory_usage = amdsmi.amdsmi_get_gpu_memory_usage(
                    device, amdsmi.amdsmi_interface.AmdSmiMemoryType.VRAM
                )
                vram_memory_total = amdsmi.amdsmi_get_gpu_memory_total(
                    device, amdsmi.amdsmi_interface.AmdSmiMemoryType.VRAM
                )

                memory_percentage = vram_memory_usage / vram_memory_total
                usage.append(
                    (memory_percentage, vram_memory_total / (1e9)),
                )
            amdsmi.amdsmi_shut_down()
        except amdsmi.AmdSmiException as error:
            logger.warning(f"amdsmi library error:\n {error}")
    except ImportError:
        logger.warning("Failed to obtain GPU usage from amdsmi")

    return usage


def get_layer_size_mb(module: Module) -> float:
    param_size = 0
    buffer_size = 0
@@ -81,7 +124,7 @@ def __exit__(self, _exc_type, _exc_val, _exc_tb):

        if self.start_tick is not None:
            duration = stop_tick - self.start_tick
            patch.log("METRIC", f"time {duration:.2f}")
            patch.log("METRIC", f"time {duration:.2f}s")
        if self.loss is not None:
            patch.log("METRIC", f"error {self.loss:.2f}")

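For reference, a small hypothetical usage sketch of the dispatching helper above; it assumes the NVML path returns the same `(memory_used_fraction, total_memory_gb)` tuples that `get_GPU_usage_amd` builds:

```python
from llmcompressor.utils.metric_logging import get_GPU_memory_usage

# Each entry is assumed to be (fraction_of_memory_in_use, total_memory_in_GB),
# matching the tuples constructed in get_GPU_usage_amd.
for idx, (used_fraction, total_gb) in enumerate(get_GPU_memory_usage()):
    print(f"GPU {idx}: {used_fraction * 100:.1f}% of {total_gb:.1f} GB in use")
```
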
2 changes: 0 additions & 2 deletions tests/examples/test_big_models_with_accelerate.py
@@ -7,7 +7,6 @@
    copy_and_run_script,
    gen_cmd_fail_message,
    requires_gpu_count,
    requires_gpu_mem,
)


@@ -43,7 +42,6 @@ def test_readme_has_install_command(self, example_dir: str):
            "",
            id="multi_gpu_int8",
            marks=[
                requires_gpu_mem(630),
                requires_gpu_count(2),
                pytest.mark.multi_gpu,
            ],