Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[70B-Part2] Improved save model (that can work with FSDP) #107

Merged
merged 4 commits into from
Sep 16, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 18 additions & 11 deletions ultravox/model/ultravox_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):

config_class = UltravoxConfig
config: UltravoxConfig # for type hinting
_no_split_modules = ["Wav2Vec2Model", "WhisperEncoder", "LlamaDecoderLayer"]
# We minimize the weights in state_dict in order to reduce the size of the checkpoint
# The issue is that load_pretrained() uses state_dict() keys to know what keys are expected
# As such we have to tell it to ignore some keys that are not always in the model
Expand All @@ -46,6 +45,7 @@ class UltravoxModel(transformers.LlamaPreTrainedModel):

def __init__(self, config: UltravoxConfig):
super().__init__(config)
self._register_load_state_dict_pre_hook(self._pre_load_state_dict_hook)

self.keep_params: Set[str] = set()
self.vocab_size = config.vocab_size
Expand All @@ -54,6 +54,13 @@ def __init__(self, config: UltravoxConfig):
self.multi_modal_projector = UltravoxProjector(config)
self.language_model = self._create_language_model(config)

# Determine no_split_modules dynamically to use with FSDP auto_wrap policy.
# FSDP throws an error if some of the layer types are not found in the model.
# This would be something like ["LlamaDecoderLayer", "WhisperEncoderLayer"]
self._no_split_modules = (self.language_model._no_split_modules or []) + (
self.audio_tower._no_split_modules or []
)

self.loss_config = LossConfig()
self.post_init()

Expand Down Expand Up @@ -356,26 +363,25 @@ def push_to_hub(self, *args, **kwargs):
self.to(self.language_model.dtype)
return super().push_to_hub(*args, **kwargs)

def state_dict(self, *args, **kwargs):
def save_pretrained(
self, *args, state_dict: Optional[Dict[str, Any]] = None, **kwargs
):
if state_dict is None:
state_dict = super().state_dict()

named_params = dict(self.named_parameters())
state_dict = super().state_dict(*args, **kwargs)

state_dict = {
k: v
for k, v in state_dict.items()
if k in self.keep_params
or (k in named_params and named_params[k].requires_grad)
}
return state_dict

def load_state_dict(
self,
state_dict: Dict[str, Any],
*args,
**kwargs,
):
super().save_pretrained(*args, state_dict=state_dict, **kwargs)

def _pre_load_state_dict_hook(self, state_dict: Dict[str, Any], *args, **kwargs):
self.keep_params.update(set(state_dict.keys()))
return super().load_state_dict(state_dict, *args, **kwargs)

def print_trainable_parameters(self):
"""
Expand Down Expand Up @@ -510,6 +516,7 @@ class ModifiedWhisperEncoder(whisper.WhisperEncoder):
"""

base_model_prefix = "model.encoder"
_no_split_modules = ["WhisperEncoderLayer"]
farzadab marked this conversation as resolved.
Show resolved Hide resolved

def forward(
self,
Expand Down
17 changes: 11 additions & 6 deletions ultravox/training/train.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,12 +317,17 @@ def train(args: config_base.TrainConfig):
logging.info(f"train end time: {t_end}")
logging.info(f"elapsed: {t_end - t_start}")

if is_master:
# Saving the model using pipeline to ensure its code is saved
pipeline = ultravox_pipeline.UltravoxPipeline(
model, tokenizer=text_tokenizer, device=device
)
pipeline.save_pretrained(args.output_dir)
# Save the pipeline code and update the config to include the pipeline
pipeline = ultravox_pipeline.UltravoxPipeline(
model, tokenizer=text_tokenizer, device=model.device
)
# We don't want to save the model twice. Trainer.save_model saves the model to the output_dir.
farzadab marked this conversation as resolved.
Show resolved Hide resolved
old_save_pretrained = model.save_pretrained
model.save_pretrained = lambda *_, **__: None # type: ignore[method-assign]
pipeline.save_pretrained(args.output_dir)
model.save_pretrained = old_save_pretrained # type: ignore[method-assign]

trainer.save_model(args.output_dir)


def evaluate(args: config_base.TrainConfig):
Expand Down
Loading