[WIP] Addition of Dora #936

Closed · wants to merge 13 commits
86 changes: 86 additions & 0 deletions recipes/configs/llama3/8B_dora_single_device.yaml
@@ -0,0 +1,86 @@
# Config for single device DoRA finetuning with lora_finetune_single_device.py
# using a Llama3 8B Instruct model
#
# This config assumes that you've run the following command before launching
# this run:
#   tune download meta-llama/Meta-Llama-3-8B-Instruct --output-dir /tmp/Meta-Llama-3-8B-Instruct --hf-token <HF_TOKEN>
#
# To launch on a single device, run the following command from root:
#   tune run lora_finetune_single_device --config llama3/8B_dora_single_device
#
# You can add specific overrides through the command line. For example,
# to override the checkpointer directory while launching training,
# you can run:
#   tune run lora_finetune_single_device --config llama3/8B_dora_single_device checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works only for training on a single device.

# Model Arguments
model:
  _component_: torchtune.models.llama3.dora_llama3_8b
  lora_attn_modules: ['q_proj', 'v_proj', 'k_proj']
  apply_lora_to_mlp: True
  apply_lora_to_output: False
  lora_rank: 8
  lora_alpha: 16
  use_dora: True

# Tokenizer
tokenizer:
  _component_: torchtune.models.llama3.llama3_tokenizer
  path: /teamspace/studios/this_studio/models/Meta-Llama-3-8b-Instruct/original/tokenizer.model
Contributor:

No rush on this, but before merging make sure to change the paths to /tmp/, the metric logger to DiskLogger, etc.

Author:

I have updated the logging to DiskLogger; I will update this path to /tmp/ before the merge.


checkpointer:
  _component_: torchtune.utils.FullModelMetaCheckpointer
  checkpoint_dir: /teamspace/studios/this_studio/models/Meta-Llama-3-8b-Instruct/original/
  checkpoint_files: [
    consolidated.00.pth
  ]
  recipe_checkpoint: null
  output_dir: /teamspace/studios/this_studio/models/Meta-Llama-3-8b-Instruct/
  model_type: LLAMA3
resume_from_checkpoint: False

# Dataset and Sampler
dataset:
  _component_: torchtune.datasets.alpaca_cleaned_dataset
  train_on_input: True
seed: 12345678
shuffle: True
batch_size: 1

# Optimizer and Scheduler
optimizer:
  _component_: torch.optim.AdamW
  weight_decay: 0.01
  lr: 3e-4
lr_scheduler:
  _component_: torchtune.modules.get_cosine_schedule_with_warmup
  num_warmup_steps: 100

loss:
  _component_: torch.nn.CrossEntropyLoss

# Training
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 16
compile: False

# Logging
output_dir: /tmp/dora_finetune_output/
metric_logger:
  _component_: torchtune.utils.metric_logging.DiskLogger
  log_dir: ${output_dir}
log_every_n_steps: 1
log_peak_memory_stats: False

# Environment
device: cuda
dtype: bf16
enable_activation_checkpointing: True

# Profiler (disabled)
profiler:
  _component_: torchtune.utils.profiler
  enabled: False
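As a quick orientation, the sketch below shows how the `model` section of this config could be instantiated on its own (a minimal sketch, assuming torchtune's standard `config.instantiate` and OmegaConf machinery; the YAML path is the one this PR adds):

```python
from omegaconf import OmegaConf
from torchtune import config

# Load the YAML above and build the DoRA-enabled model from its `model` section.
cfg = OmegaConf.load("recipes/configs/llama3/8B_dora_single_device.yaml")
model = config.instantiate(cfg.model)  # resolves _component_ to torchtune.models.llama3.dora_llama3_8b
```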
8 changes: 8 additions & 0 deletions recipes/lora_finetune_single_device.py
@@ -19,6 +19,7 @@
from torch.utils.data import DataLoader, DistributedSampler
from torchtune import config, modules, utils
from torchtune.modules.peft.peft_utils import (
    activate_dora_params,
    get_adapter_params,
    get_merged_lora_ckpt,
    set_trainable_params,
@@ -256,6 +257,7 @@ def _setup_model(

        self._lora_rank = cfg_model.lora_rank
        self._lora_alpha = cfg_model.lora_alpha

        self.adapter_params = get_adapter_params(model)
        set_trainable_params(model, self.adapter_params)

@@ -274,6 +276,12 @@ def _setup_model(
        else:
            lora_missing, lora_unexpected = None, None

        if cfg_model.get("use_dora", False):
            # Magnitude vectors for DoRA are initialized as ones. Once the base
            # weights are loaded, they are replaced by the norm of the linear
            # weights. See https://arxiv.org/pdf/2402.09353 for details.
            activate_dora_params(model)

        validate_missing_and_unexpected_for_lora(
            lora_attn_modules=cfg_model.lora_attn_modules,
            apply_lora_to_mlp=cfg_model.apply_lora_to_mlp,
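For context on the hunk above: a rough sketch of what `activate_dora_params` is expected to do once the checkpoint weights are loaded (illustrative only; the attribute names `use_dora`, `lora_a`, `lora_b`, `magnitude`, `alpha`, and `rank` are assumptions, and the real helper in `torchtune.modules.peft.peft_utils` may differ):

```python
import torch

@torch.no_grad()
def activate_dora_params_sketch(model: torch.nn.Module) -> None:
    """Replace the ones-initialized DoRA magnitude with the norm of the loaded weights."""
    for module in model.modules():
        if getattr(module, "use_dora", False):
            scaling = module.alpha / module.rank
            # Norm of the merged weight (base + scaled low-rank update), one entry per output feature.
            merged = module.weight + scaling * (module.lora_b.weight @ module.lora_a.weight)
            module.magnitude.data = merged.norm(p=2, dim=1)
```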
19 changes: 19 additions & 0 deletions tests/torchtune/modules/peft/test_lora.py
@@ -60,6 +60,19 @@ def lora_linear(self, in_dim, out_dim) -> LoRALinear:
        fixed_init_model(lora_linear)
        return lora_linear

    @pytest.fixture
    def dora_linear(self, in_dim, out_dim) -> LoRALinear:
        lora_linear = LoRALinear(
            in_dim=in_dim,
            out_dim=out_dim,
            rank=RANK,
            alpha=ALPHA,
            use_bias=False,
            use_dora=True,
        )
        fixed_init_model(lora_linear)
        return lora_linear

    @pytest.fixture
    def qlora_linear(self, in_dim, out_dim) -> LoRALinear:
        with utils.set_default_dtype(torch.bfloat16):
@@ -97,6 +110,12 @@ def test_forward(self, inputs, lora_linear, out_dim) -> None:
        assert actual.shape == (BSZ, SEQ_LEN, out_dim)
        torch.testing.assert_close(actual.mean(), expected, atol=1e-4, rtol=1e-6)

    def test_dora_forward(self, inputs, dora_linear, out_dim) -> None:
        expected = torch.tensor(EXPECTED_VAL)
Member:

Does this mean the expected value for DoRA is the same as for LoRA? Why is that? Intuitively, if the result is exactly the same, I don't see how DoRA results in different training than LoRA. I'm probably missing something basic here, but it would be good to clarify.

Author:

Here is the explanation:

$$\mathrm{DoRA}(x) = Wx + \left(\frac{m}{\text{weight\_norm}} - 1\right) Wx + \frac{m}{\text{weight\_norm}} \cdot \mathrm{lora}_b(\mathrm{lora}_a(x)) \cdot \text{scaling}$$

The magnitude vector is initialized so that self.m == weight_norm, so the ratio $\frac{m}{\text{weight\_norm}}$ is 1 on the first forward pass, and LoRA == DoRA for that pass. Training then updates m, which is where DoRA starts to diverge from LoRA.

Better explanation from the author: huggingface/peft#1474 (comment)
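To make the initialization argument above concrete, here is a small hand-rolled numerical check (a sketch only, not the torchtune `LoRALinear` implementation; all tensor names and shapes are assumptions):

```python
import torch

torch.manual_seed(0)
in_dim, out_dim, rank, alpha = 16, 8, 4, 8.0
scaling = alpha / rank

W = torch.randn(out_dim, in_dim)            # frozen base weight
lora_a = torch.randn(rank, in_dim) * 0.01   # LoRA "A"
lora_b = torch.randn(out_dim, rank) * 0.01  # LoRA "B" (nonzero, to show the point holds in general)
x = torch.randn(2, in_dim)

# Plain LoRA forward: base output plus scaled low-rank update.
lora_out = x @ W.T + scaling * (x @ lora_a.T @ lora_b.T)

# DoRA forward: scale the merged weight by m / weight_norm, where the magnitude m
# is initialized to exactly that norm.
merged = W + scaling * (lora_b @ lora_a)
weight_norm = merged.norm(p=2, dim=1)   # one norm per output feature
m = weight_norm.clone()                 # initialization: m == weight_norm
dora_out = (m / weight_norm) * (x @ merged.T)

print(torch.allclose(lora_out, dora_out, atol=1e-6))  # True: identical on the first pass
```

Because m is trainable, the ratio m / weight_norm drifts away from 1 after the first optimizer step, which is where DoRA's behavior departs from LoRA's.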

        actual = dora_linear(inputs)
        assert actual.shape == (BSZ, SEQ_LEN, out_dim)
        torch.testing.assert_close(actual.mean(), expected, atol=1e-4, rtol=1e-6)

    def test_lora_weight_nf4_when_quantized(self, qlora_linear):
        assert isinstance(qlora_linear.weight, NF4Tensor)

4 changes: 4 additions & 0 deletions torchtune/_recipe_registry.py
@@ -106,6 +106,10 @@ class Recipe:
name="gemma/2B_qlora_single_device",
file_path="gemma/2B_qlora_single_device.yaml",
),
Config(
name="llama3/8B_dora_single_device",
file_path="llama3/8B_dora_single_device.yaml",
),
],
supports_distributed=False,
),
2 changes: 2 additions & 0 deletions torchtune/models/llama3/__init__.py
@@ -7,6 +7,7 @@
from ._component_builders import llama3, lora_llama3

from ._model_builders import (  # noqa
    dora_llama3_8b,
    llama3_70b,
    llama3_8b,
    llama3_tokenizer,
@@ -26,4 +27,5 @@
"lora_llama3_70b",
"qlora_llama3_8b",
"scale_hidden_dim_for_mlp",
"dora_llama3_8b",
]
17 changes: 16 additions & 1 deletion torchtune/models/llama3/_component_builders.py
@@ -150,6 +150,8 @@ def lora_llama3(
    lora_rank: int,
    lora_alpha: float,
    lora_dropout: float = 0.0,
    # DoRA args
    use_dora: bool = False,
    # Quantization args
    quantize_base: bool = False,
) -> TransformerDecoder:
@@ -183,6 +185,7 @@ def lora_llama3(
        lora_rank (int): rank of each low-rank approximation
        lora_alpha (float): scaling factor for the low-rank approximation
        lora_dropout (float): LoRA dropout probability. Default: 0.0
        use_dora (bool): Whether to use DoRA. Default is ``False``.
        quantize_base (bool): Whether to quantize base model weights or not. Only applied to base
            weights within linear layers LoRA is applied to. The final output linear projection is not
            supported for quantization currently.
@@ -204,6 +207,7 @@
        lora_rank=lora_rank,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        use_dora=use_dora,
        quantize_base=quantize_base,
    )

@@ -214,6 +218,7 @@
            hidden_dim=hidden_dim,
            lora_rank=lora_rank,
            lora_alpha=lora_alpha,
            use_dora=use_dora,
            quantize_base=quantize_base,
        )
    else:
@@ -230,7 +235,7 @@

    # TODO: quantize_base is not applied to final output_proj currently.
    output_proj = (
-       LoRALinear(embed_dim, vocab_size, rank=lora_rank, alpha=lora_alpha)
+       LoRALinear(embed_dim, vocab_size, rank=lora_rank, alpha=lora_alpha, use_dora=use_dora)
        if apply_lora_to_output
        else nn.Linear(embed_dim, vocab_size, bias=False)
    )
@@ -270,6 +275,7 @@ def lora_llama3_self_attention(
    lora_alpha: float,
    lora_dropout: float = 0.0,
    quantize_base: bool = False,
    use_dora: bool = False,
) -> CausalSelfAttention:
    """
    Return an instance of :func:`~torchtune.modules.CausalSelfAttention` with LoRA
@@ -294,6 +300,7 @@
        lora_dropout (float): LoRA dropout probability. Default: 0.0
        quantize_base (bool): Whether to quantize base model parameters for linear layers
            LoRA is being applied to. Default is ``False``.
        use_dora (bool): Whether to use DoRA. Default is ``False``.

    Returns:
        CausalSelfAttention: instantiation of self-attention module with LoRA
@@ -316,6 +323,7 @@
            rank=lora_rank,
            alpha=lora_alpha,
            quantize_base=quantize_base,
            use_dora=use_dora,
        )
        if "q_proj" in lora_modules
        else nn.Linear(embed_dim, num_heads * head_dim, bias=False)
@@ -327,6 +335,7 @@
            rank=lora_rank,
            alpha=lora_alpha,
            quantize_base=quantize_base,
            use_dora=use_dora,
        )
        if "k_proj" in lora_modules
        else nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False)
@@ -338,6 +347,7 @@
            rank=lora_rank,
            alpha=lora_alpha,
            quantize_base=quantize_base,
            use_dora=use_dora,
        )
        if "v_proj" in lora_modules
        else nn.Linear(embed_dim, num_kv_heads * head_dim, bias=False)
@@ -349,6 +359,7 @@
            rank=lora_rank,
            alpha=lora_alpha,
            quantize_base=quantize_base,
            use_dora=use_dora,
        )
        if "output_proj" in lora_modules
        else nn.Linear(embed_dim, embed_dim, bias=False)
@@ -377,6 +388,7 @@ def lora_llama3_mlp(
    lora_rank: int,
    lora_alpha: float,
    lora_dropout: float = 0.0,
    use_dora: bool = False,
    quantize_base: bool = False,
) -> FeedForward:
    gate_proj = LoRALinear(
@@ -386,6 +398,7 @@
        alpha=lora_alpha,
        dropout=lora_dropout,
        quantize_base=quantize_base,
        use_dora=use_dora,
    )
    down_proj = LoRALinear(
        in_dim=hidden_dim,
@@ -394,6 +407,7 @@
        alpha=lora_alpha,
        dropout=lora_dropout,
        quantize_base=quantize_base,
        use_dora=use_dora,
    )
    up_proj = LoRALinear(
        in_dim=dim,
@@ -402,6 +416,7 @@
        alpha=lora_alpha,
        dropout=lora_dropout,
        quantize_base=quantize_base,
        use_dora=use_dora,
    )
    return FeedForward(
        gate_proj=gate_proj,
12 changes: 12 additions & 0 deletions torchtune/models/llama3/_model_builders.py
@@ -77,6 +77,7 @@ def lora_llama3_8b(
    apply_lora_to_output: bool = False,
    lora_rank: int = 8,
    lora_alpha: float = 16,
    use_dora: bool = False,
    quantize_base: bool = False,
) -> TransformerDecoder:
    """
@@ -96,6 +97,7 @@
            Default: False
        lora_rank (int): rank of each low-rank approximation
        lora_alpha (float): scaling factor for the low-rank approximation
        use_dora (bool): Whether to use DoRA. Default is ``False``.
        quantize_base (bool): Whether to quantize base model weights

    Returns:
@@ -118,6 +120,7 @@
        lora_rank=lora_rank,
        lora_alpha=lora_alpha,
        lora_dropout=0.05,
        use_dora=use_dora,
        quantize_base=quantize_base,
    )

@@ -180,3 +183,12 @@ def lora_llama3_70b(
that LoRA is applied to are quantized per the QLoRA paper: https://arxiv.org/abs/2305.14314.
Please see `lora_llama3_8b` for full API arguments.
"""

dora_llama3_8b = partial(lora_llama3_8b, use_dora=True)

dora_llama3_8b.__doc__ = """
Builder for creating a Llama3 8B model with DoRA enabled. Weights of linear layers that DoRA
is applied to are decomposed into magnitude and direction components per the DoRA paper:
https://arxiv.org/abs/2402.09353. In addition to the LoRA adapter weights, DoRA adds a
trainable magnitude parameter for each adapted linear layer.
Please see `lora_llama3_8b` for full API arguments.
"""