Commit 72f5b93

Merge branch 'main' into kylesayrs/smoothquant-ignore-glm

kylesayrs authored Jan 3, 2025
2 parents ec37204 + 1b8c7bf commit 72f5b93
Showing 6 changed files with 252 additions and 9 deletions.
8 changes: 3 additions & 5 deletions examples/big_models_with_accelerate/README.md
@@ -54,7 +54,7 @@ When working with `accelerate`, it is important to keep in mind that CPU offload

We will show working examples for each use case:
- **CPU Offloading**: Quantize `Llama-70B` to `FP8` using `PTQ` with a single GPU
- **Multi-GPU**: Quantize `Llama-70B` to `INT8` using `GPTQ` and `SmoothQuant` with 8 GPUs
- **Multi-GPU**: Quantize `Llama-70B` to `INT8` using `GPTQ` and `SmoothQuant` with 2 GPUs

### Installation

@@ -81,12 +81,10 @@ The resulting model `./Meta-Llama-3-70B-Instruct-FP8-Dynamic` is ready to run wi

For quantization methods that require calibration data (e.g. `GPTQ`), CPU offloading is too slow. For these methods, `llmcompressor` can use `accelerate` multi-GPU to quantize models that are larger than a single GPU. For example, when quantizing a model to `int8`, we typically use `GPTQ` to statically quantize the weights, which requires calibration data.

Note that running non-sequential `GPTQ` requires significant additional memory beyond the model size. As a rough rule of thumb, running `GPTQModifier` non-sequentially will take up 3x the model size for a 16-bit model and 2x the model size for a 32-bit model (these estimates include the memory required to store the model itself in GPU).

- `multi_gpu_int8.py` demonstrates quantizing the weights and activations of `Llama-70B` to `int8` on 8 A100s:
- `multi_gpu_int8.py` demonstrates quantizing the weights and activations of `Llama-70B` to `int8` on 2 A100s:

```bash
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export CUDA_VISIBLE_DEVICES=0,1
python multi_gpu_int8.py
```
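
For orientation, here is a minimal sketch of the kind of flow `multi_gpu_int8.py` implements. The model ID, calibration settings, and recipe values below are illustrative assumptions rather than the exact contents of the script:

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.transformers import oneshot

MODEL_ID = "meta-llama/Meta-Llama-3-70B-Instruct"  # illustrative model choice

# device_map="auto" lets accelerate shard the model across the GPUs made
# visible through CUDA_VISIBLE_DEVICES.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Small calibration set drawn from a chat dataset (assumed choice).
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048
ds = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft")
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))


def tokenize(example):
    text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return tokenizer(
        text,
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)

# SmoothQuant migrates activation outliers into the weights so that GPTQ can
# then statically quantize weights and activations to int8 (W8A8).
recipe = [
    SmoothQuantModifier(smoothing_strength=0.8),
    GPTQModifier(targets="Linear", scheme="W8A8", ignore=["lm_head"]),
]

oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

model.save_pretrained("Meta-Llama-3-70B-Instruct-W8A8", save_compressed=True)
tokenizer.save_pretrained("Meta-Llama-3-70B-Instruct-W8A8")
```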

99 changes: 99 additions & 0 deletions examples/quantization_kv_cache/gemma2_fp8_kv_example.py
@@ -0,0 +1,99 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.transformers import oneshot

# Select model and load it.
MODEL_ID = "google/gemma-2-9b-it"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))


def process_and_tokenize(example):
    text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return tokenizer(
        text,
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)

# Configure the quantization algorithm and scheme.
# In this case, we:
# * quantize the weights to fp8 with per-channel scales
# * quantize the activations to fp8 with dynamic per-token scales
# * quantize the kv cache to fp8 with per-tensor scales
recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            ignore: ["lm_head"]
            config_groups:
                group_0:
                    weights:
                        num_bits: 8
                        type: float
                        strategy: channel
                        dynamic: false
                        symmetric: true
                    input_activations:
                        num_bits: 8
                        type: float
                        strategy: token
                        dynamic: true
                        symmetric: true
                    targets: ["Linear"]
            kv_cache_scheme:
                num_bits: 8
                type: float
                strategy: tensor
                dynamic: false
                symmetric: true
"""

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

print(
    "Note: Inference with the quantized kv_cache is not supported. ",
    "Please use vLLM for inference with the quantized kv_cache.",
)
# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
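
As the printed note above indicates, generation with the quantized KV cache itself happens in vLLM rather than in transformers. A minimal, hypothetical serving sketch follows; the checkpoint path matches `SAVE_DIR` above, and whether `kv_cache_dtype` must be set explicitly may depend on the vLLM version:

```python
from vllm import LLM, SamplingParams

# Load the compressed checkpoint saved above and enable the fp8 KV cache.
llm = LLM(model="gemma-2-9b-it-FP8-KV", kv_cache_dtype="fp8")
sampling_params = SamplingParams(temperature=0.8, max_tokens=100)

outputs = llm.generate(["Hello my name is"], sampling_params)
print(outputs[0].outputs[0].text)
```
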
101 changes: 101 additions & 0 deletions examples/quantization_kv_cache/phi3.5_fp8_kv_example.py
@@ -0,0 +1,101 @@
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.transformers import oneshot

# Select model and load it.
# Phi-3.5 is a special case for KV cache quantization because it has
# fused QKV linear layers.
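# (In Phi-3.5 the query, key and value projections are packed into a single
# qkv_proj Linear module rather than separate q_proj/k_proj/v_proj layers.)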
MODEL_ID = "microsoft/Phi-3.5-mini-instruct"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=DATASET_SPLIT)
ds = ds.shuffle(seed=42).select(range(NUM_CALIBRATION_SAMPLES))


def process_and_tokenize(example):
    text = tokenizer.apply_chat_template(example["messages"], tokenize=False)
    return tokenizer(
        text,
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(process_and_tokenize, remove_columns=ds.column_names)

# Configure the quantization algorithm and scheme.
# In this case, we:
# * quantize the weights to fp8 with per-tensor scales
# * quantize the activations to fp8 with per-tensor scales
# * quantize the kv cache to fp8 with per-tensor scales
recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            ignore: ["lm_head"]
            config_groups:
                group_0:
                    weights:
                        num_bits: 8
                        type: float
                        strategy: tensor
                        dynamic: false
                        symmetric: true
                    input_activations:
                        num_bits: 8
                        type: float
                        strategy: tensor
                        dynamic: false
                        symmetric: true
                    targets: ["Linear"]
            kv_cache_scheme:
                num_bits: 8
                type: float
                strategy: tensor
                dynamic: false
                symmetric: true
"""

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)

print(
    "Note: Inference with the quantized kv_cache is not supported. ",
    "Please use vLLM for inference with the quantized kv_cache.",
)
# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
input_ids = tokenizer("Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")

# Save to disk compressed.
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-KV"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
@@ -236,6 +236,10 @@ def patch_tied_tensors_bug(model: torch.nn.Module):
    input_embed = model.get_input_embeddings()
    output_embed = model.get_output_embeddings()

    if input_embed is None or output_embed is None:
        # some models fail to properly override the abstract methods
        return

    if storage_ptr(input_embed.weight) == storage_ptr(output_embed.weight):
        for module in (input_embed, output_embed):
            if not is_module_offloaded(module):
47 changes: 45 additions & 2 deletions src/llmcompressor/utils/metric_logging.py
@@ -8,7 +8,17 @@
__all__ = ["get_GPU_memory_usage", "get_layer_size_mb", "CompressionLogger"]


def get_GPU_memory_usage() -> List[Tuple]:
def get_GPU_memory_usage() -> List[Tuple[float, float]]:
    if torch.version.hip:
        return get_GPU_usage_amd()
    else:
        return get_GPU_usage_nv()


def get_GPU_usage_nv() -> List[Tuple[float, float]]:
    """
    get gpu usage for Nvidia GPUs using nvml lib
    """
    try:
        import pynvml
        from pynvml import NVMLError
@@ -39,6 +49,39 @@ def get_GPU_memory_usage() -> List[Tuple]:
        return []


def get_GPU_usage_amd() -> List[Tuple[float, float]]:
    """
    get gpu usage for AMD GPUs using amdsmi lib
    """
    usage = []
    try:
        import amdsmi

        try:
            amdsmi.amdsmi_init()
            devices = amdsmi.amdsmi_get_processor_handles()

            for device in devices:
                vram_memory_usage = amdsmi.amdsmi_get_gpu_memory_usage(
                    device, amdsmi.amdsmi_interface.AmdSmiMemoryType.VRAM
                )
                vram_memory_total = amdsmi.amdsmi_get_gpu_memory_total(
                    device, amdsmi.amdsmi_interface.AmdSmiMemoryType.VRAM
                )

                memory_percentage = vram_memory_usage / vram_memory_total
                usage.append(
                    (memory_percentage, vram_memory_total / (1e9)),
                )
            amdsmi.amdsmi_shut_down()
        except amdsmi.AmdSmiException as error:
            logger.warning(f"amdsmi library error:\n {error}")
    except ImportError:
        logger.warning("Failed to obtain GPU usage from amdsmi")

    return usage


def get_layer_size_mb(module: Module) -> float:
    param_size = 0
    buffer_size = 0
@@ -81,7 +124,7 @@ def __exit__(self, _exc_type, _exc_val, _exc_tb):

        if self.start_tick is not None:
            duration = stop_tick - self.start_tick
            patch.log("METRIC", f"time {duration:.2f}")
            patch.log("METRIC", f"time {duration:.2f}s")
        if self.loss is not None:
            patch.log("METRIC", f"error {self.loss:.2f}")

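For reference, a small hypothetical usage sketch of the dispatching helper above; it assumes the NVML path returns the same `(memory_used_fraction, total_memory_gb)` tuples that `get_GPU_usage_amd` builds:

```python
from llmcompressor.utils.metric_logging import get_GPU_memory_usage

# Each entry is assumed to be (fraction_of_memory_in_use, total_memory_in_GB),
# matching the tuples constructed in get_GPU_usage_amd.
for idx, (used_fraction, total_gb) in enumerate(get_GPU_memory_usage()):
    print(f"GPU {idx}: {used_fraction * 100:.1f}% of {total_gb:.1f} GB in use")
```
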
2 changes: 0 additions & 2 deletions tests/examples/test_big_models_with_accelerate.py
@@ -7,7 +7,6 @@
    copy_and_run_script,
    gen_cmd_fail_message,
    requires_gpu_count,
    requires_gpu_mem,
)


@@ -43,7 +42,6 @@ def test_readme_has_install_command(self, example_dir: str):
            "",
            id="multi_gpu_int8",
            marks=[
                requires_gpu_mem(630),
                requires_gpu_count(2),
                pytest.mark.multi_gpu,
            ],