diff --git a/config/finetune_shakespeare.py b/config/finetune_shakespeare.py index 148a4c4fa3..9343e5c7f8 100644 --- a/config/finetune_shakespeare.py +++ b/config/finetune_shakespeare.py @@ -1,4 +1,8 @@ import time +from functools import partial + +import torch +from minlora import LoRAParametrization out_dir = 'out-shakespeare' eval_interval = 5 @@ -9,6 +13,8 @@ dataset = 'shakespeare' init_from = 'gpt2-xl' # this is the largest GPT-2 model +init_from = 'gpt2-large' # use a smaller for faster training +# xl doesn't fit on 24GB GPU, but with LORA it does # only save checkpoints if the validation loss improves always_save_checkpoint = False @@ -23,3 +29,18 @@ # finetune at constant LR learning_rate = 3e-5 decay_lr = False + + +use_lora = True +learning_rate = 1e-3 # use a higher LR for LoRA +lora_dropout_p = 0.0 +rank=4 +lora_alpha = 64 +lora_config = { + torch.nn.Embedding: { + "weight": partial(LoRAParametrization.from_embedding, rank=rank, lora_alpha=lora_alpha), + }, + torch.nn.Linear: { + "weight": partial(LoRAParametrization.from_linear, rank=rank, lora_alpha=lora_alpha), + }, +} \ No newline at end of file diff --git a/sample.py b/sample.py index 670759bc19..118a40a0d0 100644 --- a/sample.py +++ b/sample.py @@ -7,6 +7,7 @@ import torch import tiktoken from model import GPTConfig, GPT +import minlora # ----------------------------------------------------------------------------- init_from = 'resume' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl') @@ -38,12 +39,23 @@ checkpoint = torch.load(ckpt_path, map_location=device) gptconf = GPTConfig(**checkpoint['model_args']) model = GPT(gptconf) + if use_lora: + minlora.add_lora(model, lora_config) state_dict = checkpoint['model'] unwanted_prefix = '_orig_mod.' for k,v in list(state_dict.items()): if k.startswith(unwanted_prefix): state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k) model.load_state_dict(state_dict) + if use_lora: + # the full state dict includes the LoRA state dict + # so actually we don't need to load it separately again + model.load_state_dict(checkpoint['lora'], strict=False) + print('Loaded LoRA state dict') + # sanity check + #model.apply(minlora.apply_to_lora(lambda m: print((m.lora_A.sum(), m.lora_B.sum())))) + # merge for zero-overhead inference + minlora.merge_lora(model) elif init_from.startswith('gpt2'): # init from a given GPT-2 model model = GPT.from_pretrained(init_from, dict(dropout=0.0)) diff --git a/train.py b/train.py index 30d0145ef7..9e44c114ab 100644 --- a/train.py +++ b/train.py @@ -20,6 +20,7 @@ import time import math import pickle +import inspect from contextlib import nullcontext import numpy as np @@ -29,6 +30,7 @@ from model import GPTConfig, GPT +import minlora # ----------------------------------------------------------------------------- # default config values designed to train a gpt2 (124M) on OpenWebText # I/O @@ -180,13 +182,46 @@ def get_batch(split): if block_size < model.config.block_size: model.crop_block_size(block_size) model_args['block_size'] = block_size # so that the checkpoint will have the right value +if use_lora: + minlora.add_lora(model, lora_config=lora_config) + minlora.tie_weights(linear=model.lm_head, embedding=model.transformer.wte) model.to(device) # initialize a GradScaler. If enabled=False scaler is a no-op scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16')) # optimizer -optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type) +def configure_optimizers_lora(self, weight_decay, learning_rate, betas, device_type): + # we apply weight decay to all lora params + optim_groups = [ + # note: .get_lora_params() returns a generator + # we need to wrap it in a list so we can consume it twice + {"params": list(minlora.get_lora_params(self)) , "weight_decay": weight_decay}, + # you can also add biases for fine-tuning, + # but I want to make sure lora alone works + # {"params": minlora.get_bias_params(self), "weight_decay": 0.0}, # bias params don't get weight decay + ] + + def parameter_count(optim_groups): + n = sum(p.numel() for group in optim_groups for p in group["params"]) + if n < 1e6: + return f"{n/1e3:.1f}k" + else: + return f"{n/1e6:.1f}M" + + print(f"optimizing {parameter_count(optim_groups)} parameters") + + # new PyTorch nightly has a new 'fused' option for AdamW that is much faster + use_fused = (device_type == "cuda") and ("fused" in inspect.signature(torch.optim.AdamW).parameters) + print(f"using fused AdamW: {use_fused}") + extra_args = dict(fused=True) if use_fused else dict() + optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args) + + return optimizer +if use_lora: + optimizer = configure_optimizers_lora(model, weight_decay, learning_rate, (beta1, beta2), device_type) +else: + optimizer = model.configure_optimizers(weight_decay, learning_rate, (beta1, beta2), device_type) if init_from == 'resume': optimizer.load_state_dict(checkpoint['optimizer']) @@ -271,6 +306,8 @@ def get_lr(it): 'best_val_loss': best_val_loss, 'config': config, } + if use_lora: + checkpoint['lora'] = minlora.get_lora_state_dict(raw_model) print(f"saving checkpoint to {out_dir}") torch.save(checkpoint, os.path.join(out_dir, 'ckpt.pt')) if iter_num == 0 and eval_only: