Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[70B-Part2] Improved save model (that can work with FSDP) #107

Merged
merged 4 commits into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 18 additions & 11 deletions ultravox/model/ultravox_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):

config_class = UltravoxConfig
config: UltravoxConfig # for type hinting
_no_split_modules = ["Wav2Vec2Model", "WhisperEncoder", "LlamaDecoderLayer"]
# We minimize the weights in state_dict in order to reduce the size of the checkpoint
# The issue is that load_pretrained() uses state_dict() keys to know what keys are expected
# As such we have to tell is to ignore some keys that are not always in the model
Expand All @@ -46,6 +45,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):

def __init__(self, config: UltravoxConfig):
super().__init__(config)
self._register_load_state_dict_pre_hook(self._pre_load_state_dict_hook)

self.keep_params: Set[str] = set()
self.vocab_size = config.vocab_size
Expand All @@ -54,6 +54,13 @@ def __init__(self, config: UltravoxConfig):
self.multi_modal_projector = UltravoxProjector(config)
self.language_model = self._create_language_model(config)

# Determine no_split_modules dynamically to use with FSDP auto_wrap policy.
# FSDP throws an error if some of the layer types are not found in the model.
# This would be something like ["LlamaDecoderLayer", "WhisperEncoderLayer"]
self._no_split_modules = (self.language_model._no_split_modules or []) + (
self.audio_tower._no_split_modules or []
)

self.loss_config = LossConfig()
self.post_init()

Expand Down Expand Up @@ -356,26 +363,25 @@ def push_to_hub(self, *args, **kwargs):
self.to(self.language_model.dtype)
return super().push_to_hub(*args, **kwargs)

def state_dict(self, *args, **kwargs):
def save_pretrained(
self, *args, state_dict: Optional[Dict[str, Any]] = None, **kwargs
):
if state_dict is None:
state_dict = super().state_dict()

named_params = dict(self.named_parameters())
state_dict = super().state_dict(*args, **kwargs)

state_dict = {
k: v
for k, v in state_dict.items()
if k in self.keep_params
or (k in named_params and named_params[k].requires_grad)
}
return state_dict

def load_state_dict(
self,
state_dict: Dict[str, Any],
*args,
**kwargs,
):
super().save_pretrained(*args, state_dict=state_dict, **kwargs)

def _pre_load_state_dict_hook(self, state_dict: Dict[str, Any], *args, **kwargs):
self.keep_params.update(set(state_dict.keys()))
return super().load_state_dict(state_dict, *args, **kwargs)

def print_trainable_parameters(self):
"""
Expand Down Expand Up @@ -510,6 +516,7 @@ class ModifiedWhisperEncoder(whisper.WhisperEncoder):
"""

base_model_prefix = "model.encoder"
_no_split_modules = ["WhisperEncoderLayer"]
farzadab marked this conversation as resolved.
Show resolved Hide resolved

def forward(
self,
Expand Down
22 changes: 16 additions & 6 deletions ultravox/training/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,12 +317,22 @@ def train(args: config_base.TrainConfig):
logging.info(f"train end time: {t_end}")
logging.info(f"elapsed: {t_end - t_start}")

if is_master:
# Saving the model using pipeline to ensure its code is saved
pipeline = ultravox_pipeline.UltravoxPipeline(
model, tokenizer=text_tokenizer, device=device
)
pipeline.save_pretrained(args.output_dir)
# We use both pipeline.save_pretrained and trainer.save_model to save everything.
# This is because pipeline.save_pretrained knows how to save the pipeline (code and config),
# but it doesn't know how to save FSDP models correctly (the final tensors could be flattened).
# on the other hand, trainer.save_model knows how to save FSDP models correctly, but it won't save the pipeline.
# Saving FSDP models is already quite slow though, so we don't want to save the model twice.
pipeline = ultravox_pipeline.UltravoxPipeline(
model, tokenizer=text_tokenizer, device=model.device
)
old_save_pretrained = model.save_pretrained
model.save_pretrained = lambda *_, **__: None # type: ignore[method-assign]
# saves the pipeline code and populates the config
pipeline.save_pretrained(args.output_dir)
model.save_pretrained = old_save_pretrained # type: ignore[method-assign]

# saves the model weights correctly (FSDP or otherwise)
trainer.save_model(args.output_dir)


def evaluate(args: config_base.TrainConfig):
Expand Down
Loading