[SparseAutoModelForCausalLM Deprecation] Feature change #881

Merged: 45 commits, Nov 18, 2024
Changes from 43 commits

Commits (45)
5c8ff83 - src and tests updates (horheynm, Oct 31, 2024)
4d9f4df - save model if output_dir is provided (horheynm, Nov 1, 2024)
588ee7e - save model if provided as a string (horheynm, Nov 1, 2024)
17d4a9c - typo (horheynm, Nov 1, 2024)
bd98f6d - save if model was provided as a string or custom output_dir was set (horheynm, Nov 1, 2024)
51e1ada - comments (horheynm, Nov 4, 2024)
7b8247b - save tokenizer also if model passed as a string or custom outputdir p… (horheynm, Nov 4, 2024)
2f6d9ef - revert to True (horheynm, Nov 4, 2024)
0425124 - revert model to string (horheynm, Nov 4, 2024)
ce77540 - Merge branch 'main' into deprecate-SparseAutoModelForCausalLM/depreca… (horheynm, Nov 4, 2024)
1ec6974 - merge main (horheynm, Nov 4, 2024)
ff55775 - merge main (horheynm, Nov 4, 2024)
aae73e2 - Merge branch 'main' of github.com:vllm-project/llm-compressor into de… (horheynm, Nov 4, 2024)
a66147e - Merge branch 'main' into deprecate-SparseAutoModelForCausalLM/depreca… (horheynm, Nov 4, 2024)
8d146ad - fix transformers tests (horheynm, Nov 4, 2024)
d3073fe - Merge branch 'deprecate-SparseAutoModelForCausalLM/deprecation' of gi… (horheynm, Nov 4, 2024)
6d02fd5 - Update tests/llmcompressor/transformers/obcq/test_consecutive_runs.py (horheynm, Nov 5, 2024)
30123c3 - Merge branch 'main' into deprecate-SparseAutoModelForCausalLM/depreca… (horheynm, Nov 5, 2024)
9b869f8 - lint (horheynm, Nov 5, 2024)
1d29417 - fix bug (horheynm, Nov 5, 2024)
9024d90 - fix bug (horheynm, Nov 5, 2024)
e0ef750 - comments (horheynm, Nov 6, 2024)
a221ca0 - comments (horheynm, Nov 6, 2024)
57af085 - Merge branch 'main' into deprecate-SparseAutoModelForCausalLM/depreca… (kylesayrs, Nov 6, 2024)
f2ed4e0 - fix saving bug on example script and comments (horheynm, Nov 6, 2024)
71d8683 - Merge branch 'deprecate-SparseAutoModelForCausalLM/deprecation' of gi… (horheynm, Nov 6, 2024)
45994c2 - Merge branch 'main' into deprecate-SparseAutoModelForCausalLM/depreca… (horheynm, Nov 7, 2024)
d0ac63d - Merge branch 'main' into deprecate-SparseAutoModelForCausalLM/depreca… (dsikka, Nov 7, 2024)
8786407 - fix test failure (horheynm, Nov 7, 2024)
8b9baab - Merge branch 'deprecate-SparseAutoModelForCausalLM/deprecation' of gi… (horheynm, Nov 7, 2024)
10f3883 - comments (horheynm, Nov 8, 2024)
a7d0e3e - Merge branch 'main' into deprecate-SparseAutoModelForCausalLM/depreca… (horheynm, Nov 8, 2024)
f33793a - Merge branch 'main' into deprecate-SparseAutoModelForCausalLM/depreca… (dsikka, Nov 11, 2024)
0ce72a5 - Merge branch 'main' into deprecate-SparseAutoModelForCausalLM/depreca… (dsikka, Nov 12, 2024)
2602648 - comments (horheynm, Nov 12, 2024)
0284f0c - comments (horheynm, Nov 13, 2024)
c7c951e - lint (horheynm, Nov 13, 2024)
5a1cc95 - fix test_quantization.py (horheynm, Nov 14, 2024)
acc2776 - fix bugs (horheynm, Nov 14, 2024)
a2992ab - revert to default (horheynm, Nov 14, 2024)
4bcbe03 - revert to default (horheynm, Nov 14, 2024)
5dbb911 - draft (horheynm, Nov 14, 2024)
9418de1 - fix test (horheynm, Nov 14, 2024)
5bc9a25 - logging output fix (horheynm, Nov 15, 2024)
a516950 - Merge branch 'main' into deprecate-SparseAutoModelForCausalLM/depreca… (dsikka, Nov 17, 2024)

6 changes: 3 additions & 3 deletions examples/quantization_kv_cache/llama3_fp8_kv_example.py
@@ -1,11 +1,11 @@
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.transformers import SparseAutoModelForCausalLM, oneshot
from llmcompressor.transformers import oneshot

# Select model and load it.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
model = SparseAutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
device_map="auto",
torch_dtype="auto",
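For scripts being migrated off the deprecated class, the change above reduces to the following sketch, taken directly from the updated example (model ID and device settings are as shown in the diff):

```python
# Migration sketch mirroring the diff above: the stock Hugging Face class
# replaces the deprecated SparseAutoModelForCausalLM import.
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor.transformers import oneshot  # unchanged by the migration

MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"

# Before: model = SparseAutoModelForCausalLM.from_pretrained(...)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```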
29 changes: 22 additions & 7 deletions src/llmcompressor/pytorch/model_load/helpers.py
@@ -16,6 +17,7 @@
"log_model_load",
"initialize_recipe",
"save_model_and_recipe",
"copy_python_files_from_model_cache",
"fallback_to_cpu",
"parse_dtype",
"get_session_model",
@@ -99,7 +100,6 @@ def save_model_and_recipe(
):
"""
Save a model, tokenizer and the currently loaded recipe to file

:param model: pytorch model to save
:param save_path: path to save output to
:param tokenizer: model tokenizer to save
@@ -123,7 +123,7 @@
fp.write(recipe_yaml_str)

# copy python files from cache dir to save_path if any
_copy_python_files_from_model_cache(model, save_path)
copy_python_files_from_model_cache(model, save_path)


def fallback_to_cpu(device: str) -> str:
Expand Down Expand Up @@ -213,16 +213,31 @@ def load_safetensors_state_dict(file_path: str) -> Dict[str, torch.Tensor]:
return {key: f.get_tensor(key) for key in f.keys()}


def _copy_python_files_from_model_cache(model: Module, save_path: str):
def copy_python_files_from_model_cache(model, save_path: str):
config = model.config
cache_dir = None
cache_path = None
if hasattr(config, "_name_or_path"):
import os
import shutil

cache_dir = config._name_or_path
for file in os.listdir(cache_dir):
full_file_name = os.path.join(cache_dir, file)
from huggingface_hub import hf_hub_download
from transformers import TRANSFORMERS_CACHE
from transformers.utils import http_user_agent

cache_path = config._name_or_path
if not os.path.exists(cache_path):
user_agent = http_user_agent()
config_file_path = hf_hub_download(
repo_id=cache_path,
filename="config.json",
cache_dir=TRANSFORMERS_CACHE,
force_download=False,
user_agent=user_agent,
)
cache_path = os.path.sep.join(config_file_path.split(os.path.sep)[:-1])

for file in os.listdir(cache_path):
full_file_name = os.path.join(cache_path, file)
if file.endswith(".py") and os.path.isfile(full_file_name):
logger.debug(f"Transferring {full_file_name} to {save_path}")
shutil.copy(full_file_name, save_path)
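Since the helper is now public (renamed from `_copy_python_files_from_model_cache` and added to `__all__`), it can be called directly when saving models that ship custom modeling code. A hedged usage sketch; the repo id and save path below are illustrative:

```python
# Hedged usage sketch: copy custom modeling *.py files (e.g. from a
# trust_remote_code model) next to a saved checkpoint so it stays loadable.
from transformers import AutoModelForCausalLM

from llmcompressor.pytorch.model_load.helpers import copy_python_files_from_model_cache

model = AutoModelForCausalLM.from_pretrained(
    "my-org/custom-code-model",  # hypothetical repo that ships its own modeling .py files
    trust_remote_code=True,
)

save_path = "./saved-model"
model.save_pretrained(save_path)

# Resolves model.config._name_or_path (local dir or hub repo) and copies any *.py files.
copy_python_files_from_model_cache(model, save_path)
```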
3 changes: 1 addition & 2 deletions src/llmcompressor/transformers/__init__.py
@@ -7,9 +7,8 @@
# isort: skip_file
# (import order matters for circular import avoidance)
from .utils import *

from .sparsification import (
SparseAutoModel,
SparseAutoModelForCausalLM,
wrap_hf_model_class,
)
from .finetune import *
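With `SparseAutoModelForCausalLM` removed from the public imports, `wrap_hf_model_class` remains exported. A hedged sketch of how it might be used, assuming it takes a Hugging Face model class and returns a compression-aware subclass (consult the library source for the exact contract):

```python
# Hedged sketch, assuming wrap_hf_model_class wraps a Hugging Face model class
# to add compressed-saving support; not a confirmed contract.
from transformers import AutoModelForCausalLM

from llmcompressor.transformers import wrap_hf_model_class

WrappedCausalLM = wrap_hf_model_class(AutoModelForCausalLM)
model = WrappedCausalLM.from_pretrained("Xenova/llama2.c-stories15M")
```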
6 changes: 4 additions & 2 deletions src/llmcompressor/transformers/finetune/README.md
@@ -101,7 +101,7 @@ accelerate launch
```python
from llmcompressor.transformers import oneshot

model = "Xenova/llama2.c-stories15M"
model ="Xenova/llama2.c-stories15M"
dataset_name = "open_platypus"
concatenate_data = False
pad_to_max_length = False
@@ -119,7 +119,6 @@ oneshot(
output_dir=output_dir,
recipe=recipe,
overwrite_output_dir=overwrite_output_dir,
concatenate_data = concatenate_data,
pad_to_max_length = pad_to_max_length,
splits = splits
)
@@ -141,8 +140,10 @@ of a staged recipe for Llama.
test_multi.py
```python
from llmcompressor.transformers import apply
from transformers import AutoModelForCausalLM

model = "../ml-experiments/nlg-text_generation/llama_pretrain-llama_7b-base/dense/training"

dataset_name = "open_platypus"
concatenate_data = False
run_stages=True
@@ -167,4 +168,5 @@ apply(
remove_unused_columns = False,
splits = splits
)

```
48 changes: 13 additions & 35 deletions src/llmcompressor/transformers/finetune/runner.py
Expand Up @@ -13,7 +13,6 @@
get_completed_stages,
get_session_model,
save_completed_stages,
save_model_and_recipe,
)
from llmcompressor.pytorch.utils import tensors_to_device
from llmcompressor.recipe import Recipe, StageRunType
@@ -25,11 +24,7 @@
)
from llmcompressor.transformers.finetune.model_args import ModelArguments
from llmcompressor.transformers.finetune.training_args import TrainingArguments
from llmcompressor.utils.fsdp.helpers import (
find_and_move_state_dicts_to_cpu,
is_fsdp_model,
unwrap_and_export_model,
)
from llmcompressor.utils.fsdp.helpers import is_fsdp_model, save_model_and_recipe


class StageRunner:
@@ -170,35 +165,6 @@ def one_shot(self, stage: Optional[str] = None):

self.trainer.one_shot(calibration_data=calib_data, stage=stage)

if is_fsdp_model(self.trainer.model):
try:
self.trainer.save_model(output_dir=self._output_dir, _is_oneshot=True)
except AssertionError:
# fallback to this in the case of quantization
unwrap_and_export_model(
model=self.trainer.model,
accelerator=self.trainer.accelerator,
output_dir=self._output_dir,
tokenizer=self.tokenizer,
)
# only allow the main process move the state
# dicts to cpu
if self.trainer.accelerator.is_main_process:
# assuming quantization is the last step
# we no longer need the original model
# and can safely delete it to save memory
del self.trainer.model
find_and_move_state_dicts_to_cpu(self._output_dir)

else:
save_model_and_recipe(
model=self.trainer.model,
save_path=self._output_dir,
tokenizer=self.tokenizer,
save_safetensors=self._training_args.save_safetensors,
save_compressed=self._training_args.save_compressed,
)

def train(self, checkpoint: str, stage: Optional[str] = None):
"""
Run trainer's training loop on train_dataset, saving the resulting model to
@@ -293,6 +259,18 @@ def run_sequential_stages(self, checkpoint: Optional[str] = None):
self.train(checkpoint=checkpoint, stage=stage_name)
checkpoint = None

if (
self._training_args.output_dir
!= TrainingArguments.__dataclass_fields__["output_dir"].default
):
save_model_and_recipe(
model=self.trainer.model,
save_path=self._output_dir,
tokenizer=self.tokenizer,
save_safetensors=self._training_args.save_safetensors,
save_compressed=self._training_args.save_compressed,
)

# save stage to checkpoint dir
if self.trainer.accelerator.is_main_process:
completed_stages.append(stage_name)
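The new block only saves when the caller actually overrode `output_dir`. A minimal, self-contained sketch of that dataclass-default comparison; the dataclass here is a stand-in, not the real `TrainingArguments`:

```python
# Minimal sketch of the guard above: save only when output_dir differs from
# the dataclass field default, i.e. the user explicitly chose a directory.
from dataclasses import dataclass


@dataclass
class IllustrativeTrainingArguments:
    output_dir: str = "./output"


def should_save(args: IllustrativeTrainingArguments) -> bool:
    default = IllustrativeTrainingArguments.__dataclass_fields__["output_dir"].default
    return args.output_dir != default


print(should_save(IllustrativeTrainingArguments()))                       # False
print(should_save(IllustrativeTrainingArguments(output_dir="./my-run")))  # True
```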
4 changes: 1 addition & 3 deletions src/llmcompressor/transformers/finetune/session_mixin.py
@@ -452,9 +452,7 @@ def one_shot(
# self.maybe_log_model_sparsification()
self.accelerator.wait_for_everyone()

def save_model(
self, output_dir: Optional[str] = None, _internal_call=False, _is_oneshot=False
):
def save_model(self, output_dir: str, _internal_call=False, _is_oneshot=False):
"""
Override of the save_model function and expects it to exist in the parent.
Calls into super() to save the model and additionally saves any recipes
45 changes: 37 additions & 8 deletions src/llmcompressor/transformers/finetune/text_generation.py
@@ -23,6 +23,7 @@
from loguru import logger
from transformers import (
AutoConfig,
AutoModelForCausalLM,
AutoTokenizer,
DefaultDataCollator,
HfArgumentParser,
@@ -42,11 +43,16 @@
from llmcompressor.transformers.finetune.runner import StageRunner
from llmcompressor.transformers.finetune.trainer import Trainer
from llmcompressor.transformers.finetune.training_args import TrainingArguments
from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
modify_fsdp_model_save_pretrained,
modify_save_pretrained,
patch_tied_tensors_bug,
)
from llmcompressor.transformers.sparsification.sparse_model import (
SparseAutoModel,
get_shared_tokenizer_src,
)
from llmcompressor.transformers.utils.helpers import detect_last_checkpoint
from llmcompressor.utils.fsdp.helpers import is_fsdp_model


def train(**kwargs):
@@ -199,21 +205,23 @@ def initialize_model_from_path(
"trust_remote_code": model_args.trust_remote_code_model,
}
# this calls from_pretrained under the hood so should be FSDP safe
model = SparseAutoModel.text_generation_from_pretrained(
model_name_or_path=model_path,
sequence_length=None, # use model default
model = AutoModelForCausalLM.from_pretrained(
model_path,
**model_kwargs,
)
if "sequence_length" in model_kwargs:
model.seqlen = model_kwargs["sequence_length"]

teacher = (
SparseAutoModel.text_generation_from_pretrained(
model_name_or_path=model_args.distill_teacher,
sequence_length=None, # use model default
AutoModelForCausalLM.from_pretrained(
model_args.distill_teacher,
**teacher_kwargs,
)
if model_args.distill_teacher is not None
else None
)
if teacher is not None and "sequence_length" in teacher_kwargs:
teacher.seqlen = teacher_kwargs["sequence_length"]

return teacher, model_path, model

@@ -302,6 +310,10 @@ def main(
training_args,
)

# patch a shared tensor bug in HF transformers
# https://github.com/huggingface/transformers/issues/33689
patch_tied_tensors_bug(model)

if teacher is not None:
teacher.eval()

@@ -337,6 +349,13 @@
tokenizer=tokenizer,
data_collator=data_collator,
)

# wrap model.save_pretrained
if is_fsdp_model(model):
modify_fsdp_model_save_pretrained(trainer, tokenizer)
else:
modify_save_pretrained(model)

stage_runner.trainer = trainer

# alternating Training/One-shot
@@ -348,7 +367,6 @@

# exit immediately
return

# Training
if training_args.do_train:
checkpoint = None
@@ -370,6 +388,17 @@
if training_args.do_predict:
stage_runner.predict()

# save if model was provided as a string or custom output_dir was set
if isinstance(model_args.model, str) or (
training_args.output_dir
!= TrainingArguments.__dataclass_fields__["output_dir"].default
):
model.save_pretrained(
training_args.output_dir, save_compressed=training_args.save_compressed
)
if tokenizer is not None:
tokenizer.save_pretrained(training_args.output_dir)

# Clean up the CompressionSession before exit if requested
if training_args.clear_sparse_session:
reset_session()
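Taken together, the flow after this PR is: load (or name) the model with stock `transformers`, run the entrypoint, and the compressed model plus tokenizer are saved whenever the model was passed as a string or a custom `output_dir` was given. A hedged end-to-end sketch; the dataset, recipe path, and sample count are illustrative:

```python
# Hedged sketch of the post-deprecation flow: a model id string is resolved
# internally via AutoModelForCausalLM, and a non-default output_dir triggers
# the save_pretrained / tokenizer save shown above.
from llmcompressor.transformers import oneshot

oneshot(
    model="Xenova/llama2.c-stories15M",  # or a preloaded AutoModelForCausalLM instance
    dataset="open_platypus",             # illustrative calibration dataset
    recipe="recipe.yaml",                # illustrative recipe path
    output_dir="./oneshot-output",       # custom dir, so model and tokenizer are saved
    num_calibration_samples=64,          # illustrative
)
```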