From cbc38bb26be2034f98e12e24e2d376a982fd1a71 Mon Sep 17 00:00:00 2001 From: lakshith Date: Fri, 26 Jul 2024 09:41:13 +0530 Subject: [PATCH 01/11] GPT 2 implementation --- docs/transformers/LoRA/GPT2.py | 239 ++++++++++++++++++++++ docs/transformers/LoRA/gpt2_state_dict.py | 35 ++++ 2 files changed, 274 insertions(+) create mode 100644 docs/transformers/LoRA/GPT2.py create mode 100644 docs/transformers/LoRA/gpt2_state_dict.py diff --git a/docs/transformers/LoRA/GPT2.py b/docs/transformers/LoRA/GPT2.py new file mode 100644 index 00000000..d772874b --- /dev/null +++ b/docs/transformers/LoRA/GPT2.py @@ -0,0 +1,239 @@ +import torch +import torch.nn as nn +from transformers import AutoTokenizer + +tokenizer = AutoTokenizer.from_pretrained("gpt2") + +# config from GPT +config = { + "_name_or_path": "gpt2", + "activation_function": "gelu_new", + "architectures": [ + "GPT2LMHeadModel" + ], + "attn_pdrop": 0.1, + "bos_token_id": 50256, + "embd_pdrop": 0.1, + "eos_token_id": 0, + "initializer_range": 0.02, + "layer_norm_epsilon": 1e-05, + "model_type": "gpt2", + "n_ctx": 1024, + "n_embd": 768, + "n_head": 12, + "n_inner": None, + "n_layer": 12, + "n_positions": 1024, + "reorder_and_upcast_attn": False, + "resid_pdrop": 0.1, + "scale_attn_by_inverse_layer_idx": False, + "scale_attn_weights": True, + "summary_activation": None, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": True, + "summary_type": "cls_index", + "summary_use_proj": True, + "task_specific_params": { + "text-generation": { + "do_sample": True, + "max_length": 50 + } + }, + "transformers_version": "4.42.4", + "use_cache": True, + "vocab_size": 50257 +} + +import math +from torch import Tensor + + +# from transformers +class Conv1D(nn.Module): + """ + 1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2). + + Basically works like a linear layer but the weights are transposed. + + Args: + nf (`int`): The number of output features. + nx (`int`): The number of input features. + """ + + def __init__(self, nf, nx): + super().__init__() + self.nf = nf + self.weight = nn.Parameter(torch.empty(nx, nf)) + self.bias = nn.Parameter(torch.zeros(nf)) + nn.init.normal_(self.weight, std=0.02) + + def forward(self, x): + size_out = x.size()[:-1] + (self.nf,) + x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) + x = x.view(size_out) + return x + + +# from transformers +class NewGELUActivation(nn.Module): + """ + Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). 
Also see + the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def forward(self, input: Tensor) -> Tensor: + return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0)))) + + +class HeadFFN(nn.Module): # todo rename + def __init__(self, dim): + super().__init__() + self.c_fc = Conv1D(dim, config['n_embd']) + self.c_proj = Conv1D(config['n_embd'], dim) + self.act = NewGELUActivation() + self.dropout = nn.Dropout(config['resid_pdrop']) + + def forward(self, hidden_states): + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.c_proj(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class MultiHead(nn.Module): + def __init__(self): + super().__init__() + self.embed_dim = config['n_embd'] + self.num_heads = config['n_head'] + self.head_dim = self.embed_dim // self.num_heads + self.split_size = self.embed_dim + + self.c_att = Conv1D(config['n_embd'] * 3, config['n_embd']) + self.c_proj = Conv1D(config['n_embd'], config['n_embd']) + + self.resid_dropout = nn.Dropout(config['resid_pdrop']) + self.attn_dropout = nn.Dropout(config['attn_pdrop']) + + def _split_heads(self, tensor, num_heads, attn_head_size): + """ + Splits hidden_size dim into attn_head_size and num_heads + """ + new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) + tensor = tensor.view(new_shape) + return tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) + + def forward(self, hidden_states): + batch_size, seq_length, _ = hidden_states.size() + + query, key, value = self.c_att(hidden_states).split(self.split_size, dim=2) + + query = self._split_heads(query, self.num_heads, self.head_dim) + key = self._split_heads(key, self.num_heads, self.head_dim) + value = self._split_heads(value, self.num_heads, self.head_dim) + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query, + key, + value, + attn_mask=None, + dropout_p=self.attn_dropout.p if self.training else 0.0, + is_causal=True, # for the triangular mask + ) + + # todo why this? 
+ attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(batch_size, seq_length, self.embed_dim) + + attn_output = self.c_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + return attn_output + + +class Block(nn.Module): + def __init__(self): + super().__init__() + self.pre_norm = nn.LayerNorm(config['n_embd'], eps=config['layer_norm_epsilon']) + self.attn = MultiHead() + self.post_norm = nn.LayerNorm(config['n_embd'], eps=config['layer_norm_epsilon']) + self.ffn = HeadFFN(config['n_embd'] * 4) + + def forward(self, hidden_states): + residual = hidden_states + hidden_states = self.pre_norm(hidden_states) + + attn_output = self.attn(hidden_states) + + hidden_states = attn_output + residual + residual = hidden_states + hidden_states = self.post_norm(hidden_states) + feed_forward_output = self.ffn(hidden_states) + hidden_states = feed_forward_output + residual + + return hidden_states + + +class GPTModel(nn.Module): + # todo ignored token type embeds, past key values + def __init__(self): + super().__init__() + + self.token_embedding = nn.Embedding(config['vocab_size'], config['n_embd']) + self.position_embedding = nn.Embedding(config['n_positions'], config['n_embd']) + + self.dropout = nn.Dropout(p=config['embd_pdrop'], inplace=False) + + self.blocks = nn.ModuleList([Block() for _ in range(config['n_layer'])]) + + self.final_norm = nn.LayerNorm(config['n_embd'], eps=config['layer_norm_epsilon']) + + self.lm_head = nn.Linear(config['n_embd'], config['vocab_size'], bias=False) + + def forward(self, input_ids): + batch_size, input_shape = input_ids.size() + + token_embeddings = self.token_embedding(input_ids) # B T C + position_ids = torch.arange(input_shape) # T C + position_embeddings = self.position_embedding(position_ids) # B T C + + embeddings = token_embeddings + position_embeddings + + hidden_states = self.dropout(embeddings) + + for block in self.blocks: + hidden_states = block(hidden_states) + + hidden_states = self.final_norm(hidden_states) + + logits = self.lm_head(hidden_states) + + return logits + + +model = GPTModel() + +state_dict = torch.load('transformed.pth') + +missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) +if missing_keys: + print(f"Missing keys: {missing_keys}") +if unexpected_keys: + print(f"Unexpected keys: {unexpected_keys}") + +prompt = "hello how are you" +tokenized = tokenizer(prompt, return_tensors="pt") + +with torch.no_grad(): + model.eval() + res = model(tokenized['input_ids']) + +print(res) + +output_ids = torch.argmax(res, dim=-1) + +# Decode the token indices back to text +output_text = tokenizer.decode(output_ids[0]) + +# Print the tokens of the output +print(output_text) diff --git a/docs/transformers/LoRA/gpt2_state_dict.py b/docs/transformers/LoRA/gpt2_state_dict.py new file mode 100644 index 00000000..09f27eaf --- /dev/null +++ b/docs/transformers/LoRA/gpt2_state_dict.py @@ -0,0 +1,35 @@ +import torch +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained("gpt2") + +state_dict = model.state_dict() + +mapping = { + 'transformer.wte.weight': 'token_embedding.weight', + 'transformer.wpe.weight': 'position_embedding.weight', + 'transformer.ln_f.weight': 'final_norm.weight', + 'transformer.ln_f.bias': 'final_norm.bias', + 'lm_head.weight': 'lm_head.weight' +} + +for i in range(12): + mapping[f'transformer.h.{i}.ln_1.weight'] = f'blocks.{i}.pre_norm.weight' + mapping[f'transformer.h.{i}.ln_1.bias'] = f'blocks.{i}.pre_norm.bias' + 
mapping[f'transformer.h.{i}.attn.c_attn.weight'] = f'blocks.{i}.attn.c_att.weight' + mapping[f'transformer.h.{i}.attn.c_attn.bias'] = f'blocks.{i}.attn.c_att.bias' + mapping[f'transformer.h.{i}.attn.c_proj.weight'] = f'blocks.{i}.attn.c_proj.weight' + mapping[f'transformer.h.{i}.attn.c_proj.bias'] = f'blocks.{i}.attn.c_proj.bias' + mapping[f'transformer.h.{i}.ln_2.weight'] = f'blocks.{i}.post_norm.weight' + mapping[f'transformer.h.{i}.ln_2.bias'] = f'blocks.{i}.post_norm.bias' + mapping[f'transformer.h.{i}.mlp.c_fc.weight'] = f'blocks.{i}.ffn.c_fc.weight' + mapping[f'transformer.h.{i}.mlp.c_fc.bias'] = f'blocks.{i}.ffn.c_fc.bias' + mapping[f'transformer.h.{i}.mlp.c_proj.weight'] = f'blocks.{i}.ffn.c_proj.weight' + mapping[f'transformer.h.{i}.mlp.c_proj.bias'] = f'blocks.{i}.ffn.c_proj.bias' + +new_state_dict = {} +for old_key, new_key in mapping.items(): + if old_key in state_dict: + new_state_dict[new_key] = state_dict[old_key] + +torch.save(new_state_dict, 'transformed.pth') From b3aedf3093272c1f658a09b5a7544e2625c5732c Mon Sep 17 00:00:00 2001 From: lakshith Date: Sat, 27 Jul 2024 21:28:07 +0530 Subject: [PATCH 02/11] remove gelu custom impl and use pytorch impl --- docs/transformers/LoRA/GPT2.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/docs/transformers/LoRA/GPT2.py b/docs/transformers/LoRA/GPT2.py index d772874b..ae47320a 100644 --- a/docs/transformers/LoRA/GPT2.py +++ b/docs/transformers/LoRA/GPT2.py @@ -44,9 +44,6 @@ "vocab_size": 50257 } -import math -from torch import Tensor - # from transformers class Conv1D(nn.Module): @@ -74,23 +71,12 @@ def forward(self, x): return x -# from transformers -class NewGELUActivation(nn.Module): - """ - Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). 
Also see - the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 - """ - - def forward(self, input: Tensor) -> Tensor: - return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0)))) - - class HeadFFN(nn.Module): # todo rename def __init__(self, dim): super().__init__() self.c_fc = Conv1D(dim, config['n_embd']) self.c_proj = Conv1D(config['n_embd'], dim) - self.act = NewGELUActivation() + self.act = nn.functional.gelu self.dropout = nn.Dropout(config['resid_pdrop']) def forward(self, hidden_states): From 106e72605da5831251aa0e2d7b671e0a1175ba97 Mon Sep 17 00:00:00 2001 From: lakshith Date: Sat, 27 Jul 2024 21:30:15 +0530 Subject: [PATCH 03/11] remove droput layers --- docs/transformers/LoRA/GPT2.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/docs/transformers/LoRA/GPT2.py b/docs/transformers/LoRA/GPT2.py index ae47320a..9c7887be 100644 --- a/docs/transformers/LoRA/GPT2.py +++ b/docs/transformers/LoRA/GPT2.py @@ -77,13 +77,11 @@ def __init__(self, dim): self.c_fc = Conv1D(dim, config['n_embd']) self.c_proj = Conv1D(config['n_embd'], dim) self.act = nn.functional.gelu - self.dropout = nn.Dropout(config['resid_pdrop']) def forward(self, hidden_states): hidden_states = self.c_fc(hidden_states) hidden_states = self.act(hidden_states) hidden_states = self.c_proj(hidden_states) - hidden_states = self.dropout(hidden_states) return hidden_states @@ -98,9 +96,6 @@ def __init__(self): self.c_att = Conv1D(config['n_embd'] * 3, config['n_embd']) self.c_proj = Conv1D(config['n_embd'], config['n_embd']) - self.resid_dropout = nn.Dropout(config['resid_pdrop']) - self.attn_dropout = nn.Dropout(config['attn_pdrop']) - def _split_heads(self, tensor, num_heads, attn_head_size): """ Splits hidden_size dim into attn_head_size and num_heads @@ -123,7 +118,7 @@ def forward(self, hidden_states): key, value, attn_mask=None, - dropout_p=self.attn_dropout.p if self.training else 0.0, + dropout_p=0.0, is_causal=True, # for the triangular mask ) @@ -132,7 +127,6 @@ def forward(self, hidden_states): attn_output = attn_output.view(batch_size, seq_length, self.embed_dim) attn_output = self.c_proj(attn_output) - attn_output = self.resid_dropout(attn_output) return attn_output @@ -168,8 +162,6 @@ def __init__(self): self.token_embedding = nn.Embedding(config['vocab_size'], config['n_embd']) self.position_embedding = nn.Embedding(config['n_positions'], config['n_embd']) - self.dropout = nn.Dropout(p=config['embd_pdrop'], inplace=False) - self.blocks = nn.ModuleList([Block() for _ in range(config['n_layer'])]) self.final_norm = nn.LayerNorm(config['n_embd'], eps=config['layer_norm_epsilon']) @@ -183,9 +175,7 @@ def forward(self, input_ids): position_ids = torch.arange(input_shape) # T C position_embeddings = self.position_embedding(position_ids) # B T C - embeddings = token_embeddings + position_embeddings - - hidden_states = self.dropout(embeddings) + hidden_states = token_embeddings + position_embeddings for block in self.blocks: hidden_states = block(hidden_states) From 50c3cc4eab487baa88ca974f5edb379e030a0a95 Mon Sep 17 00:00:00 2001 From: lakshith Date: Sat, 27 Jul 2024 22:01:21 +0530 Subject: [PATCH 04/11] keep only required configs --- docs/transformers/LoRA/GPT2.py | 31 ------------------------------- 1 file changed, 31 deletions(-) diff --git a/docs/transformers/LoRA/GPT2.py b/docs/transformers/LoRA/GPT2.py index 9c7887be..36d9b74c 100644 --- a/docs/transformers/LoRA/GPT2.py +++ b/docs/transformers/LoRA/GPT2.py @@ 
-4,43 +4,12 @@ tokenizer = AutoTokenizer.from_pretrained("gpt2") -# config from GPT config = { - "_name_or_path": "gpt2", - "activation_function": "gelu_new", - "architectures": [ - "GPT2LMHeadModel" - ], - "attn_pdrop": 0.1, - "bos_token_id": 50256, - "embd_pdrop": 0.1, - "eos_token_id": 0, - "initializer_range": 0.02, "layer_norm_epsilon": 1e-05, - "model_type": "gpt2", - "n_ctx": 1024, "n_embd": 768, "n_head": 12, - "n_inner": None, "n_layer": 12, "n_positions": 1024, - "reorder_and_upcast_attn": False, - "resid_pdrop": 0.1, - "scale_attn_by_inverse_layer_idx": False, - "scale_attn_weights": True, - "summary_activation": None, - "summary_first_dropout": 0.1, - "summary_proj_to_labels": True, - "summary_type": "cls_index", - "summary_use_proj": True, - "task_specific_params": { - "text-generation": { - "do_sample": True, - "max_length": 50 - } - }, - "transformers_version": "4.42.4", - "use_cache": True, "vocab_size": 50257 } From d1e8daa1212c6d99de09c5d258fb4a3641d9ab31 Mon Sep 17 00:00:00 2001 From: lakshith Date: Sun, 28 Jul 2024 08:51:03 +0530 Subject: [PATCH 05/11] replace convo1D layers with linear --- docs/transformers/LoRA/GPT2.py | 34 +++-------------------- docs/transformers/LoRA/gpt2_state_dict.py | 9 ++++++ 2 files changed, 13 insertions(+), 30 deletions(-) diff --git a/docs/transformers/LoRA/GPT2.py b/docs/transformers/LoRA/GPT2.py index 36d9b74c..35a65273 100644 --- a/docs/transformers/LoRA/GPT2.py +++ b/docs/transformers/LoRA/GPT2.py @@ -14,37 +14,11 @@ } -# from transformers -class Conv1D(nn.Module): - """ - 1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2). - - Basically works like a linear layer but the weights are transposed. - - Args: - nf (`int`): The number of output features. - nx (`int`): The number of input features. 
- """ - - def __init__(self, nf, nx): - super().__init__() - self.nf = nf - self.weight = nn.Parameter(torch.empty(nx, nf)) - self.bias = nn.Parameter(torch.zeros(nf)) - nn.init.normal_(self.weight, std=0.02) - - def forward(self, x): - size_out = x.size()[:-1] + (self.nf,) - x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) - x = x.view(size_out) - return x - - class HeadFFN(nn.Module): # todo rename def __init__(self, dim): super().__init__() - self.c_fc = Conv1D(dim, config['n_embd']) - self.c_proj = Conv1D(config['n_embd'], dim) + self.c_fc = nn.Linear(config['n_embd'], dim) + self.c_proj = nn.Linear(dim, config['n_embd']) self.act = nn.functional.gelu def forward(self, hidden_states): @@ -62,8 +36,8 @@ def __init__(self): self.head_dim = self.embed_dim // self.num_heads self.split_size = self.embed_dim - self.c_att = Conv1D(config['n_embd'] * 3, config['n_embd']) - self.c_proj = Conv1D(config['n_embd'], config['n_embd']) + self.c_att = nn.Linear(config['n_embd'], config['n_embd'] * 3) + self.c_proj = nn.Linear(config['n_embd'], config['n_embd']) def _split_heads(self, tensor, num_heads, attn_head_size): """ diff --git a/docs/transformers/LoRA/gpt2_state_dict.py b/docs/transformers/LoRA/gpt2_state_dict.py index 09f27eaf..0e8ff6be 100644 --- a/docs/transformers/LoRA/gpt2_state_dict.py +++ b/docs/transformers/LoRA/gpt2_state_dict.py @@ -32,4 +32,13 @@ if old_key in state_dict: new_state_dict[new_key] = state_dict[old_key] +# transpose weight matrices of convo 1d layers to use linear layers instead +convo_layers = ([f'blocks.{i}.ffn.c_fc.weight' for i in range(12)] + + [f'blocks.{i}.ffn.c_proj.weight' for i in range(12)] + + [f'blocks.{i}.attn.c_att.weight' for i in range(12)] + + [f'blocks.{i}.attn.c_proj.weight' for i in range(12)]) + +for layer in convo_layers: + new_state_dict[layer] = torch.transpose(new_state_dict[layer], 0, 1) + torch.save(new_state_dict, 'transformed.pth') From 8e756f292bce5b70453575be997d4e87acd43158 Mon Sep 17 00:00:00 2001 From: Varuna Jayasiri Date: Sun, 28 Jul 2024 11:22:27 +0530 Subject: [PATCH 06/11] lora layers --- docs/transformers/LoRA/__init__.py | 68 ++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 docs/transformers/LoRA/__init__.py diff --git a/docs/transformers/LoRA/__init__.py b/docs/transformers/LoRA/__init__.py new file mode 100644 index 00000000..8955132e --- /dev/null +++ b/docs/transformers/LoRA/__init__.py @@ -0,0 +1,68 @@ +import torch +import torch.nn as nn + + +class Linear(nn.Module): + def __init__( + self, + in_features: int, + out_features: int, + bias: bool, + r: int, + alpha: int = None): + if alpha is None: + alpha = r + super().__init__() + self.weight = nn.Parameter(torch.empty((out_features, in_features))) + self.weight.requires_grad = False + + if bias: + self.bias = nn.Parameter(torch.empty(out_features)) + self.bias.requires_grad = False + else: + self.bias = None + + self.scaling = alpha / r + self.lora_a = nn.Parameter(torch.empty((in_features, r))) + self.lora_b = nn.Parameter(torch.empty((r, out_features))) + + with torch.no_grad(): + nn.init.kaiming_uniform_(self.lora_a, a=5 ** 0.5) + nn.init.zeros_(self.lora_b) + + def forward(self, x: torch.Tensor): + result = nn.functional.linear(x, self.weight, bias=self.bias) + + result += (x @ self.lora_a @ self.lora_b) * self.scaling + + return result + + +class Embedding(nn.Module): + def __init__( + self, + num_embeddings: int, + embedding_dim: int, + r: int, + alpha: int = None, + ): + if alpha is None: + alpha = r + 
super().__init__() + + self.weight = nn.Parameter(torch.empty((num_embeddings, embedding_dim))) + self.weight.requires_grad = False + + self.scaling = alpha / self.r + self.lora_a = nn.Parameter(torch.empty((num_embeddings, r))) + self.lora_b = nn.Parameter(torch.empty((r, embedding_dim))) + + with torch.no_grad(): + nn.init.normal_(self.lora_a) + nn.init.zeros_(self.lora_b) + + def forward(self, x: torch.Tensor): + result = nn.functional.embedding(x, self.weight) + result += (nn.functional.embedding(x, self.lora_a) @ self.lora_b) * self.scaling + + return result From c82529ce6771e3d375c44acd35777992da01a555 Mon Sep 17 00:00:00 2001 From: lakshith Date: Mon, 29 Jul 2024 11:17:38 +0530 Subject: [PATCH 07/11] move LoRA to labml.nn --- {docs => labml_nn}/transformers/LoRA/GPT2.py | 0 {docs => labml_nn}/transformers/LoRA/__init__.py | 0 {docs => labml_nn}/transformers/LoRA/gpt2_state_dict.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename {docs => labml_nn}/transformers/LoRA/GPT2.py (100%) rename {docs => labml_nn}/transformers/LoRA/__init__.py (100%) rename {docs => labml_nn}/transformers/LoRA/gpt2_state_dict.py (100%) diff --git a/docs/transformers/LoRA/GPT2.py b/labml_nn/transformers/LoRA/GPT2.py similarity index 100% rename from docs/transformers/LoRA/GPT2.py rename to labml_nn/transformers/LoRA/GPT2.py diff --git a/docs/transformers/LoRA/__init__.py b/labml_nn/transformers/LoRA/__init__.py similarity index 100% rename from docs/transformers/LoRA/__init__.py rename to labml_nn/transformers/LoRA/__init__.py diff --git a/docs/transformers/LoRA/gpt2_state_dict.py b/labml_nn/transformers/LoRA/gpt2_state_dict.py similarity index 100% rename from docs/transformers/LoRA/gpt2_state_dict.py rename to labml_nn/transformers/LoRA/gpt2_state_dict.py From 23b7e2ee8e077496adf8e76b8435aff67e8d409d Mon Sep 17 00:00:00 2001 From: lakshith Date: Mon, 29 Jul 2024 19:40:39 +0530 Subject: [PATCH 08/11] create experiment notebook and refactoring --- labml_nn/transformers/LoRA/GPT2.py | 38 +----- labml_nn/transformers/LoRA/experiment.ipynb | 125 ++++++++++++++++++ .../LoRA/{gpt2_state_dict.py => load_hf.py} | 0 3 files changed, 129 insertions(+), 34 deletions(-) create mode 100644 labml_nn/transformers/LoRA/experiment.ipynb rename labml_nn/transformers/LoRA/{gpt2_state_dict.py => load_hf.py} (100%) diff --git a/labml_nn/transformers/LoRA/GPT2.py b/labml_nn/transformers/LoRA/GPT2.py index 35a65273..11b92e2d 100644 --- a/labml_nn/transformers/LoRA/GPT2.py +++ b/labml_nn/transformers/LoRA/GPT2.py @@ -14,7 +14,7 @@ } -class HeadFFN(nn.Module): # todo rename +class FFN(nn.Module): def __init__(self, dim): super().__init__() self.c_fc = nn.Linear(config['n_embd'], dim) @@ -28,7 +28,7 @@ def forward(self, hidden_states): return hidden_states -class MultiHead(nn.Module): +class MultiHeadAttention(nn.Module): def __init__(self): super().__init__() self.embed_dim = config['n_embd'] @@ -65,7 +65,6 @@ def forward(self, hidden_states): is_causal=True, # for the triangular mask ) - # todo why this? 
attn_output = attn_output.transpose(1, 2).contiguous() attn_output = attn_output.view(batch_size, seq_length, self.embed_dim) @@ -78,9 +77,9 @@ class Block(nn.Module): def __init__(self): super().__init__() self.pre_norm = nn.LayerNorm(config['n_embd'], eps=config['layer_norm_epsilon']) - self.attn = MultiHead() + self.attn = MultiHeadAttention() self.post_norm = nn.LayerNorm(config['n_embd'], eps=config['layer_norm_epsilon']) - self.ffn = HeadFFN(config['n_embd'] * 4) + self.ffn = FFN(config['n_embd'] * 4) def forward(self, hidden_states): residual = hidden_states @@ -98,7 +97,6 @@ def forward(self, hidden_states): class GPTModel(nn.Module): - # todo ignored token type embeds, past key values def __init__(self): super().__init__() @@ -128,31 +126,3 @@ def forward(self, input_ids): logits = self.lm_head(hidden_states) return logits - - -model = GPTModel() - -state_dict = torch.load('transformed.pth') - -missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False) -if missing_keys: - print(f"Missing keys: {missing_keys}") -if unexpected_keys: - print(f"Unexpected keys: {unexpected_keys}") - -prompt = "hello how are you" -tokenized = tokenizer(prompt, return_tensors="pt") - -with torch.no_grad(): - model.eval() - res = model(tokenized['input_ids']) - -print(res) - -output_ids = torch.argmax(res, dim=-1) - -# Decode the token indices back to text -output_text = tokenizer.decode(output_ids[0]) - -# Print the tokens of the output -print(output_text) diff --git a/labml_nn/transformers/LoRA/experiment.ipynb b/labml_nn/transformers/LoRA/experiment.ipynb new file mode 100644 index 00000000..eb07a516 --- /dev/null +++ b/labml_nn/transformers/LoRA/experiment.ipynb @@ -0,0 +1,125 @@ +{ + "cells": [ + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-29T07:14:27.781097Z", + "start_time": "2024-07-29T07:14:24.819976Z" + } + }, + "cell_type": "code", + "source": [ + "from labml_nn.transformers.LoRA.GPT2 import GPTModel\n", + "import torch" + ], + "id": "cffa3ec341b4905a", + "outputs": [], + "execution_count": 1 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-29T07:14:28.183960Z", + "start_time": "2024-07-29T07:14:27.782683Z" + } + }, + "cell_type": "code", + "source": [ + "from transformers import AutoTokenizer\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")" + ], + "id": "c2b0b7e18394ea9e", + "outputs": [], + "execution_count": 2 + }, + { + "cell_type": "code", + "id": "initial_id", + "metadata": { + "collapsed": true, + "ExecuteTime": { + "end_time": "2024-07-29T07:14:29.840925Z", + "start_time": "2024-07-29T07:14:28.185080Z" + } + }, + "source": [ + "model = GPTModel()\n", + "\n", + "state_dict = torch.load('transformed.pth')\n", + "\n", + "missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)\n", + "if missing_keys:\n", + " print(f\"Missing keys: {missing_keys}\")\n", + "if unexpected_keys:\n", + " print(f\"Unexpected keys: {unexpected_keys}\")" + ], + "outputs": [], + "execution_count": 3 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-29T07:22:30.408855Z", + "start_time": "2024-07-29T07:22:30.168376Z" + } + }, + "cell_type": "code", + "source": [ + "prompt = \"hello how are you\"\n", + "tokenized = tokenizer(prompt, return_tensors=\"pt\")\n", + "\n", + "with torch.no_grad():\n", + " model.eval()\n", + " res = model(tokenized['input_ids'])\n", + "\n", + "output_ids = torch.argmax(res, dim=-1)\n", + "for id in output_ids[0]:\n", + " print(tokenizer.decode(id))" + ], + "id": 
"f4f7826ec3729b66", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + ",\n", + " to\n", + " you\n", + " doing\n" + ] + } + ], + "execution_count": 17 + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": "", + "id": "c12776360008a974" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (ml)", + "language": "python", + "name": "ml" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/labml_nn/transformers/LoRA/gpt2_state_dict.py b/labml_nn/transformers/LoRA/load_hf.py similarity index 100% rename from labml_nn/transformers/LoRA/gpt2_state_dict.py rename to labml_nn/transformers/LoRA/load_hf.py From 0f2a9be6d27023eb4c33130cc10d06d5c71b8f7b Mon Sep 17 00:00:00 2001 From: lakshith Date: Mon, 29 Jul 2024 23:01:06 +0530 Subject: [PATCH 09/11] training loop --- labml_nn/transformers/LoRA/train.ipynb | 162 +++++++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 labml_nn/transformers/LoRA/train.ipynb diff --git a/labml_nn/transformers/LoRA/train.ipynb b/labml_nn/transformers/LoRA/train.ipynb new file mode 100644 index 00000000..342ba78d --- /dev/null +++ b/labml_nn/transformers/LoRA/train.ipynb @@ -0,0 +1,162 @@ +{ + "cells": [ + { + "cell_type": "code", + "id": "initial_id", + "metadata": { + "collapsed": true + }, + "source": "# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "with open('input.txt', 'r', encoding='utf-8') as f:\n", + " text = f.read()" + ], + "id": "3b1e507015ba6b81", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "from transformers import AutoTokenizer\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"gpt2\")\n", + "\n", + "tokens = tokenizer.encode(text, add_special_tokens=False)" + ], + "id": "ac8e51ae5bbfcae7", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "context_length = 10\n", + "batch_size = 64" + ], + "id": "aeefcdf813e427e", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "num_batches = len(tokens) // (batch_size * context_length)\n", + "tokens = tokens[:num_batches * batch_size * context_length]" + ], + "id": "a384b42274f008a2", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "import torch\n", + "\n", + "input_ids = torch.tensor(tokens).view(-1, context_length)" + ], + "id": "5c4cc78ac1a02c1d", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "from torch.utils.data import DataLoader, TensorDataset\n", + "from torch.optim import Adam\n", + "print(input_ids.shape)\n", + "dataset = TensorDataset(input_ids)\n", + "dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)" + ], + "id": "7037fd75e2161382", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "from labml_nn.transformers.LoRA.GPT2 import GPTModel\n", + "\n", + "model = GPTModel()" + ], + "id": 
"a98b7baa064b8494", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": [ + "optimizer = Adam(model.parameters(), lr=5e-5)\n", + "criterion = torch.nn.CrossEntropyLoss()\n", + "\n", + "model.eval()\n", + "epochs = 3\n", + "for epoch in range(epochs):\n", + " for batch in dataloader:\n", + " inputs = batch[0]\n", + " labels = inputs.clone()\n", + " \n", + " outputs = model(inputs)\n", + " \n", + " shift_logits = outputs[..., :-1, :]\n", + " shift_labels = labels[..., 1:]\n", + " \n", + " loss = criterion(shift_logits.reshape(-1, shift_logits.size(-1)), shift_labels.reshape(-1))\n", + " \n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " print(f'Epoch: {epoch + 1}, Loss: {loss.item()}')\n", + " break\n", + "\n", + "print(\"Training complete.\")" + ], + "id": "e2f5076894770740", + "outputs": [], + "execution_count": null + }, + { + "metadata": {}, + "cell_type": "code", + "source": "", + "id": "da2d4023002648dc", + "outputs": [], + "execution_count": null + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python (ml)", + "language": "python", + "name": "ml" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 77d00f089b56870ff9d1240c73dd433767cd366a Mon Sep 17 00:00:00 2001 From: lakshith Date: Wed, 31 Jul 2024 18:29:24 +0530 Subject: [PATCH 10/11] Add LoRA to GPT2 --- labml_nn/transformers/LoRA/GPT2.py | 20 +- labml_nn/transformers/LoRA/__init__.py | 2 +- labml_nn/transformers/LoRA/experiment.ipynb | 55 ++-- labml_nn/transformers/LoRA/train.ipynb | 262 +++++++++++++++----- 4 files changed, 255 insertions(+), 84 deletions(-) diff --git a/labml_nn/transformers/LoRA/GPT2.py b/labml_nn/transformers/LoRA/GPT2.py index 11b92e2d..a7a59342 100644 --- a/labml_nn/transformers/LoRA/GPT2.py +++ b/labml_nn/transformers/LoRA/GPT2.py @@ -1,6 +1,7 @@ import torch import torch.nn as nn from transformers import AutoTokenizer +from labml_nn.transformers.LoRA import Linear, Embedding tokenizer = AutoTokenizer.from_pretrained("gpt2") @@ -10,15 +11,16 @@ "n_head": 12, "n_layer": 12, "n_positions": 1024, - "vocab_size": 50257 + "vocab_size": 50257, + "device": "cuda" } class FFN(nn.Module): def __init__(self, dim): super().__init__() - self.c_fc = nn.Linear(config['n_embd'], dim) - self.c_proj = nn.Linear(dim, config['n_embd']) + self.c_fc = Linear(config['n_embd'], dim, r=32, bias=True) + self.c_proj = Linear(dim, config['n_embd'], r=32, bias=True) self.act = nn.functional.gelu def forward(self, hidden_states): @@ -36,8 +38,8 @@ def __init__(self): self.head_dim = self.embed_dim // self.num_heads self.split_size = self.embed_dim - self.c_att = nn.Linear(config['n_embd'], config['n_embd'] * 3) - self.c_proj = nn.Linear(config['n_embd'], config['n_embd']) + self.c_att = Linear(config['n_embd'], config['n_embd'] * 3, r=32, bias=True) + self.c_proj = Linear(config['n_embd'], config['n_embd'], r=32, bias=True) def _split_heads(self, tensor, num_heads, attn_head_size): """ @@ -100,20 +102,20 @@ class GPTModel(nn.Module): def __init__(self): super().__init__() - self.token_embedding = nn.Embedding(config['vocab_size'], config['n_embd']) - self.position_embedding = nn.Embedding(config['n_positions'], config['n_embd']) + self.token_embedding = 
Embedding(config['vocab_size'], config['n_embd'], r=32) + self.position_embedding = Embedding(config['n_positions'], config['n_embd'], r=32) self.blocks = nn.ModuleList([Block() for _ in range(config['n_layer'])]) self.final_norm = nn.LayerNorm(config['n_embd'], eps=config['layer_norm_epsilon']) - self.lm_head = nn.Linear(config['n_embd'], config['vocab_size'], bias=False) + self.lm_head = Linear(config['n_embd'], config['vocab_size'], r=32, bias=False) def forward(self, input_ids): batch_size, input_shape = input_ids.size() token_embeddings = self.token_embedding(input_ids) # B T C - position_ids = torch.arange(input_shape) # T C + position_ids = torch.arange(input_shape, device=config['device']) # T C position_embeddings = self.position_embedding(position_ids) # B T C hidden_states = token_embeddings + position_embeddings diff --git a/labml_nn/transformers/LoRA/__init__.py b/labml_nn/transformers/LoRA/__init__.py index 8955132e..302a4bf9 100644 --- a/labml_nn/transformers/LoRA/__init__.py +++ b/labml_nn/transformers/LoRA/__init__.py @@ -53,7 +53,7 @@ def __init__( self.weight = nn.Parameter(torch.empty((num_embeddings, embedding_dim))) self.weight.requires_grad = False - self.scaling = alpha / self.r + self.scaling = alpha / r self.lora_a = nn.Parameter(torch.empty((num_embeddings, r))) self.lora_b = nn.Parameter(torch.empty((r, embedding_dim))) diff --git a/labml_nn/transformers/LoRA/experiment.ipynb b/labml_nn/transformers/LoRA/experiment.ipynb index eb07a516..7070991d 100644 --- a/labml_nn/transformers/LoRA/experiment.ipynb +++ b/labml_nn/transformers/LoRA/experiment.ipynb @@ -3,8 +3,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-07-29T07:14:27.781097Z", - "start_time": "2024-07-29T07:14:24.819976Z" + "end_time": "2024-07-31T12:22:57.496965Z", + "start_time": "2024-07-31T12:22:55.151730Z" } }, "cell_type": "code", @@ -19,8 +19,8 @@ { "metadata": { "ExecuteTime": { - "end_time": "2024-07-29T07:14:28.183960Z", - "start_time": "2024-07-29T07:14:27.782683Z" + "end_time": "2024-07-31T12:22:57.986397Z", + "start_time": "2024-07-31T12:22:57.498305Z" } }, "cell_type": "code", @@ -39,8 +39,8 @@ "metadata": { "collapsed": true, "ExecuteTime": { - "end_time": "2024-07-29T07:14:29.840925Z", - "start_time": "2024-07-29T07:14:28.185080Z" + "end_time": "2024-07-31T12:22:58.562136Z", + "start_time": "2024-07-31T12:22:57.987296Z" } }, "source": [ @@ -54,20 +54,38 @@ "if unexpected_keys:\n", " print(f\"Unexpected keys: {unexpected_keys}\")" ], - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_7130/2581223434.py:3: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. 
Please open an issue on GitHub for any issues related to this experimental feature.\n", + " state_dict = torch.load('transformed.pth')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Missing keys: ['token_embedding.lora_a', 'token_embedding.lora_b', 'position_embedding.lora_a', 'position_embedding.lora_b', 'blocks.0.attn.c_att.lora_a', 'blocks.0.attn.c_att.lora_b', 'blocks.0.attn.c_proj.lora_a', 'blocks.0.attn.c_proj.lora_b', 'blocks.0.ffn.c_fc.lora_a', 'blocks.0.ffn.c_fc.lora_b', 'blocks.0.ffn.c_proj.lora_a', 'blocks.0.ffn.c_proj.lora_b', 'blocks.1.attn.c_att.lora_a', 'blocks.1.attn.c_att.lora_b', 'blocks.1.attn.c_proj.lora_a', 'blocks.1.attn.c_proj.lora_b', 'blocks.1.ffn.c_fc.lora_a', 'blocks.1.ffn.c_fc.lora_b', 'blocks.1.ffn.c_proj.lora_a', 'blocks.1.ffn.c_proj.lora_b', 'blocks.2.attn.c_att.lora_a', 'blocks.2.attn.c_att.lora_b', 'blocks.2.attn.c_proj.lora_a', 'blocks.2.attn.c_proj.lora_b', 'blocks.2.ffn.c_fc.lora_a', 'blocks.2.ffn.c_fc.lora_b', 'blocks.2.ffn.c_proj.lora_a', 'blocks.2.ffn.c_proj.lora_b', 'blocks.3.attn.c_att.lora_a', 'blocks.3.attn.c_att.lora_b', 'blocks.3.attn.c_proj.lora_a', 'blocks.3.attn.c_proj.lora_b', 'blocks.3.ffn.c_fc.lora_a', 'blocks.3.ffn.c_fc.lora_b', 'blocks.3.ffn.c_proj.lora_a', 'blocks.3.ffn.c_proj.lora_b', 'blocks.4.attn.c_att.lora_a', 'blocks.4.attn.c_att.lora_b', 'blocks.4.attn.c_proj.lora_a', 'blocks.4.attn.c_proj.lora_b', 'blocks.4.ffn.c_fc.lora_a', 'blocks.4.ffn.c_fc.lora_b', 'blocks.4.ffn.c_proj.lora_a', 'blocks.4.ffn.c_proj.lora_b', 'blocks.5.attn.c_att.lora_a', 'blocks.5.attn.c_att.lora_b', 'blocks.5.attn.c_proj.lora_a', 'blocks.5.attn.c_proj.lora_b', 'blocks.5.ffn.c_fc.lora_a', 'blocks.5.ffn.c_fc.lora_b', 'blocks.5.ffn.c_proj.lora_a', 'blocks.5.ffn.c_proj.lora_b', 'blocks.6.attn.c_att.lora_a', 'blocks.6.attn.c_att.lora_b', 'blocks.6.attn.c_proj.lora_a', 'blocks.6.attn.c_proj.lora_b', 'blocks.6.ffn.c_fc.lora_a', 'blocks.6.ffn.c_fc.lora_b', 'blocks.6.ffn.c_proj.lora_a', 'blocks.6.ffn.c_proj.lora_b', 'blocks.7.attn.c_att.lora_a', 'blocks.7.attn.c_att.lora_b', 'blocks.7.attn.c_proj.lora_a', 'blocks.7.attn.c_proj.lora_b', 'blocks.7.ffn.c_fc.lora_a', 'blocks.7.ffn.c_fc.lora_b', 'blocks.7.ffn.c_proj.lora_a', 'blocks.7.ffn.c_proj.lora_b', 'blocks.8.attn.c_att.lora_a', 'blocks.8.attn.c_att.lora_b', 'blocks.8.attn.c_proj.lora_a', 'blocks.8.attn.c_proj.lora_b', 'blocks.8.ffn.c_fc.lora_a', 'blocks.8.ffn.c_fc.lora_b', 'blocks.8.ffn.c_proj.lora_a', 'blocks.8.ffn.c_proj.lora_b', 'blocks.9.attn.c_att.lora_a', 'blocks.9.attn.c_att.lora_b', 'blocks.9.attn.c_proj.lora_a', 'blocks.9.attn.c_proj.lora_b', 'blocks.9.ffn.c_fc.lora_a', 'blocks.9.ffn.c_fc.lora_b', 'blocks.9.ffn.c_proj.lora_a', 'blocks.9.ffn.c_proj.lora_b', 'blocks.10.attn.c_att.lora_a', 'blocks.10.attn.c_att.lora_b', 'blocks.10.attn.c_proj.lora_a', 'blocks.10.attn.c_proj.lora_b', 'blocks.10.ffn.c_fc.lora_a', 'blocks.10.ffn.c_fc.lora_b', 'blocks.10.ffn.c_proj.lora_a', 'blocks.10.ffn.c_proj.lora_b', 'blocks.11.attn.c_att.lora_a', 'blocks.11.attn.c_att.lora_b', 'blocks.11.attn.c_proj.lora_a', 'blocks.11.attn.c_proj.lora_b', 'blocks.11.ffn.c_fc.lora_a', 'blocks.11.ffn.c_fc.lora_b', 'blocks.11.ffn.c_proj.lora_a', 'blocks.11.ffn.c_proj.lora_b', 'lm_head.lora_a', 'lm_head.lora_b']\n" + ] + } + ], "execution_count": 3 }, { "metadata": { "ExecuteTime": { - "end_time": "2024-07-29T07:22:30.408855Z", - "start_time": "2024-07-29T07:22:30.168376Z" + "end_time": "2024-07-31T12:23:00.447976Z", + "start_time": "2024-07-31T12:22:58.566527Z" } }, "cell_type": "code", "source": [ "prompt = 
\"hello how are you\"\n", "tokenized = tokenizer(prompt, return_tensors=\"pt\")\n", + "tokenized['input_ids'] = tokenized['input_ids'].to('cuda')\n", + "model = model.to('cuda')\n", "\n", "with torch.no_grad():\n", " model.eval()\n", @@ -90,22 +108,27 @@ ] } ], - "execution_count": 17 + "execution_count": 4 }, { - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-31T12:23:00.452060Z", + "start_time": "2024-07-31T12:23:00.448904Z" + } + }, "cell_type": "code", - "outputs": [], - "execution_count": null, "source": "", - "id": "c12776360008a974" + "id": "c12776360008a974", + "outputs": [], + "execution_count": 4 } ], "metadata": { "kernelspec": { - "display_name": "Python (ml)", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "ml" + "name": "python3" }, "language_info": { "codemirror_mode": { diff --git a/labml_nn/transformers/LoRA/train.ipynb b/labml_nn/transformers/LoRA/train.ipynb index 342ba78d..cd70bfb3 100644 --- a/labml_nn/transformers/LoRA/train.ipynb +++ b/labml_nn/transformers/LoRA/train.ipynb @@ -4,26 +4,44 @@ "cell_type": "code", "id": "initial_id", "metadata": { - "collapsed": true + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "ExecuteTime": { + "end_time": "2024-07-31T12:57:37.296030Z", + "start_time": "2024-07-31T12:57:37.292368Z" + } }, - "source": "# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt", + "source": "# !wget https://raw.github/zusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt", "outputs": [], - "execution_count": null + "execution_count": 1 }, { - "metadata": {}, "cell_type": "code", + "id": "3b1e507015ba6b81", + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-31T12:57:37.317651Z", + "start_time": "2024-07-31T12:57:37.313808Z" + } + }, "source": [ "with open('input.txt', 'r', encoding='utf-8') as f:\n", " text = f.read()" ], - "id": "3b1e507015ba6b81", "outputs": [], - "execution_count": null + "execution_count": 2 }, { - "metadata": {}, "cell_type": "code", + "id": "ac8e51ae5bbfcae7", + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-31T12:57:40.488939Z", + "start_time": "2024-07-31T12:57:37.319486Z" + } + }, "source": [ "from transformers import AutoTokenizer\n", "\n", @@ -31,130 +49,258 @@ "\n", "tokens = tokenizer.encode(text, add_special_tokens=False)" ], - "id": "ac8e51ae5bbfcae7", - "outputs": [], - "execution_count": null + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). 
Running this sequence through the model will result in indexing errors\n" + ] + } + ], + "execution_count": 3 }, { - "metadata": {}, "cell_type": "code", + "id": "aeefcdf813e427e", + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-31T12:57:40.495510Z", + "start_time": "2024-07-31T12:57:40.490341Z" + } + }, "source": [ - "context_length = 10\n", - "batch_size = 64" + "context_length = 512\n", + "batch_size = 2" ], - "id": "aeefcdf813e427e", "outputs": [], - "execution_count": null + "execution_count": 4 }, { - "metadata": {}, "cell_type": "code", + "id": "a384b42274f008a2", + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-31T12:57:40.522050Z", + "start_time": "2024-07-31T12:57:40.496842Z" + } + }, "source": [ "num_batches = len(tokens) // (batch_size * context_length)\n", "tokens = tokens[:num_batches * batch_size * context_length]" ], - "id": "a384b42274f008a2", "outputs": [], - "execution_count": null + "execution_count": 5 }, { - "metadata": {}, "cell_type": "code", + "id": "5c4cc78ac1a02c1d", + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-31T12:57:40.592272Z", + "start_time": "2024-07-31T12:57:40.524063Z" + } + }, "source": [ "import torch\n", "\n", "input_ids = torch.tensor(tokens).view(-1, context_length)" ], - "id": "5c4cc78ac1a02c1d", "outputs": [], - "execution_count": null + "execution_count": 6 }, { - "metadata": {}, "cell_type": "code", + "id": "7037fd75e2161382", + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-31T12:57:40.601199Z", + "start_time": "2024-07-31T12:57:40.593250Z" + } + }, "source": [ "from torch.utils.data import DataLoader, TensorDataset\n", "from torch.optim import Adam\n", - "print(input_ids.shape)\n", + "from torch.utils.data import random_split\n", + "\n", "dataset = TensorDataset(input_ids)\n", - "dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)" + "\n", + "train_ratio = 0.8\n", + "test_ratio = 0.2\n", + "\n", + "train_size = int(train_ratio * len(dataset))\n", + "test_size = len(dataset) - train_size\n", + "\n", + "train_dataset, test_dataset = random_split(dataset, [train_size, test_size])\n", + "\n", + "train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)\n", + "test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)" ], - "id": "7037fd75e2161382", "outputs": [], - "execution_count": null + "execution_count": 7 }, { - "metadata": {}, "cell_type": "code", + "id": "a98b7baa064b8494", + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-31T12:57:41.577878Z", + "start_time": "2024-07-31T12:57:40.602187Z" + } + }, "source": [ "from labml_nn.transformers.LoRA.GPT2 import GPTModel\n", "\n", - "model = GPTModel()" + "model = GPTModel()\n", + "state_dict = torch.load('transformed.pth', weights_only=True)\n", + "\n", + "_ = model.load_state_dict(state_dict, strict=False)" ], - "id": "a98b7baa064b8494", "outputs": [], - "execution_count": null + "execution_count": 8 + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-31T12:57:43.098187Z", + "start_time": "2024-07-31T12:57:41.578713Z" + } + }, + "cell_type": "code", + "source": [ + "device = \"cuda\"\n", + "model = model.to(device=\"cuda\")" + ], + "id": "2e0fa8b3082df716", + "outputs": [], + "execution_count": 9 }, { - "metadata": {}, "cell_type": "code", + "id": "e2f5076894770740", + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-31T12:57:57.044755Z", + "start_time": "2024-07-31T12:57:43.099050Z" + } + }, "source": [ + "from labml import tracker, experiment\n", + "\n", 
"optimizer = Adam(model.parameters(), lr=5e-5)\n", "criterion = torch.nn.CrossEntropyLoss()\n", "\n", - "model.eval()\n", + "model.train()\n", "epochs = 3\n", - "for epoch in range(epochs):\n", - " for batch in dataloader:\n", - " inputs = batch[0]\n", - " labels = inputs.clone()\n", - " \n", - " outputs = model(inputs)\n", - " \n", - " shift_logits = outputs[..., :-1, :]\n", - " shift_labels = labels[..., 1:]\n", - " \n", - " loss = criterion(shift_logits.reshape(-1, shift_logits.size(-1)), shift_labels.reshape(-1))\n", - " \n", - " optimizer.zero_grad()\n", - " loss.backward()\n", - " optimizer.step()\n", + "step = 0\n", "\n", + "with experiment.record(name='LoRA.GPT2', app_url='http://localhost:5005/api/v1/track'):\n", + " for epoch in range(epochs):\n", + " for batch in train_dataloader:\n", + " inputs = batch[0]\n", + " inputs = inputs.to(device)\n", + " labels = inputs.clone()\n", + " \n", + " outputs = model(inputs)\n", + " \n", + " shift_logits = outputs[..., :-1, :]\n", + " shift_labels = labels[..., 1:]\n", + " \n", + " loss = criterion(shift_logits.reshape(-1, shift_logits.size(-1)), shift_labels.reshape(-1))\n", + " \n", + " optimizer.zero_grad()\n", + " loss.backward()\n", + " optimizer.step()\n", + " \n", + " tracker.save(step, {'loss': loss})\n", + " step += 1\n", " print(f'Epoch: {epoch + 1}, Loss: {loss.item()}')\n", - " break\n", + " \n", + " test_loss = 0\n", + " for batch in test_dataloader:\n", + " inputs = batch[0]\n", + " inputs = inputs.to(device)\n", + " labels = inputs.clone()\n", + " \n", + " outputs = model(inputs)\n", + " \n", + " shift_logits = outputs[..., :-1, :]\n", + " shift_labels = labels[..., 1:]\n", + " \n", + " loss = criterion(shift_logits.reshape(-1, shift_logits.size(-1)), shift_labels.reshape(-1))\n", + " \n", + " test_loss += loss.item()\n", + " test_loss /= len(test_dataloader)\n", + " tracker.save(step, {'test_loss': test_loss})\n", + " \n", "\n", "print(\"Training complete.\")" ], - "id": "e2f5076894770740", - "outputs": [], - "execution_count": null + "outputs": [ + { + "data": { + "text/plain": [ + "" + ], + "text/html": [ + "
\n",
+       "LoRA.GPT2: 7a14822c4f3c11efad8354ef33f17c7c\n",
+       "\t[dirty]: \"training loop\"\n",
+       "Monitor experiment at http://localhost:5005/run/7a14822c4f3c11efad8354ef33f17c7c\n",
+       "Still updating labml server, please wait for it to complete...
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)", + "Cell \u001B[0;32mIn[10], line 25\u001B[0m\n\u001B[1;32m 22\u001B[0m loss \u001B[38;5;241m=\u001B[39m criterion(shift_logits\u001B[38;5;241m.\u001B[39mreshape(\u001B[38;5;241m-\u001B[39m\u001B[38;5;241m1\u001B[39m, shift_logits\u001B[38;5;241m.\u001B[39msize(\u001B[38;5;241m-\u001B[39m\u001B[38;5;241m1\u001B[39m)), shift_labels\u001B[38;5;241m.\u001B[39mreshape(\u001B[38;5;241m-\u001B[39m\u001B[38;5;241m1\u001B[39m))\n\u001B[1;32m 24\u001B[0m optimizer\u001B[38;5;241m.\u001B[39mzero_grad()\n\u001B[0;32m---> 25\u001B[0m loss\u001B[38;5;241m.\u001B[39mbackward()\n\u001B[1;32m 26\u001B[0m optimizer\u001B[38;5;241m.\u001B[39mstep()\n\u001B[1;32m 28\u001B[0m tracker\u001B[38;5;241m.\u001B[39msave(step, {\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mloss\u001B[39m\u001B[38;5;124m'\u001B[39m: loss})\n", + "File \u001B[0;32m~/miniconda3/lib/python3.12/site-packages/torch/_tensor.py:521\u001B[0m, in \u001B[0;36mTensor.backward\u001B[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001B[0m\n\u001B[1;32m 511\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m has_torch_function_unary(\u001B[38;5;28mself\u001B[39m):\n\u001B[1;32m 512\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m handle_torch_function(\n\u001B[1;32m 513\u001B[0m Tensor\u001B[38;5;241m.\u001B[39mbackward,\n\u001B[1;32m 514\u001B[0m (\u001B[38;5;28mself\u001B[39m,),\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 519\u001B[0m inputs\u001B[38;5;241m=\u001B[39minputs,\n\u001B[1;32m 520\u001B[0m )\n\u001B[0;32m--> 521\u001B[0m torch\u001B[38;5;241m.\u001B[39mautograd\u001B[38;5;241m.\u001B[39mbackward(\n\u001B[1;32m 522\u001B[0m \u001B[38;5;28mself\u001B[39m, gradient, retain_graph, create_graph, inputs\u001B[38;5;241m=\u001B[39minputs\n\u001B[1;32m 523\u001B[0m )\n", + "File \u001B[0;32m~/miniconda3/lib/python3.12/site-packages/torch/autograd/__init__.py:289\u001B[0m, in \u001B[0;36mbackward\u001B[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001B[0m\n\u001B[1;32m 284\u001B[0m retain_graph \u001B[38;5;241m=\u001B[39m create_graph\n\u001B[1;32m 286\u001B[0m \u001B[38;5;66;03m# The reason we repeat the same comment below is that\u001B[39;00m\n\u001B[1;32m 287\u001B[0m \u001B[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001B[39;00m\n\u001B[1;32m 288\u001B[0m \u001B[38;5;66;03m# calls in the traceback and some print out the last line\u001B[39;00m\n\u001B[0;32m--> 289\u001B[0m _engine_run_backward(\n\u001B[1;32m 290\u001B[0m tensors,\n\u001B[1;32m 291\u001B[0m grad_tensors_,\n\u001B[1;32m 292\u001B[0m retain_graph,\n\u001B[1;32m 293\u001B[0m create_graph,\n\u001B[1;32m 294\u001B[0m inputs,\n\u001B[1;32m 295\u001B[0m allow_unreachable\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m,\n\u001B[1;32m 296\u001B[0m accumulate_grad\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m,\n\u001B[1;32m 297\u001B[0m )\n", + "File \u001B[0;32m~/miniconda3/lib/python3.12/site-packages/torch/autograd/graph.py:768\u001B[0m, in \u001B[0;36m_engine_run_backward\u001B[0;34m(t_outputs, *args, **kwargs)\u001B[0m\n\u001B[1;32m 766\u001B[0m unregister_hooks \u001B[38;5;241m=\u001B[39m 
_register_logging_hooks_on_whole_graph(t_outputs)\n\u001B[1;32m 767\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m--> 768\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m Variable\u001B[38;5;241m.\u001B[39m_execution_engine\u001B[38;5;241m.\u001B[39mrun_backward( \u001B[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001B[39;00m\n\u001B[1;32m 769\u001B[0m t_outputs, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs\n\u001B[1;32m 770\u001B[0m ) \u001B[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001B[39;00m\n\u001B[1;32m 771\u001B[0m \u001B[38;5;28;01mfinally\u001B[39;00m:\n\u001B[1;32m 772\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m attach_logging_hooks:\n", + "\u001B[0;31mKeyboardInterrupt\u001B[0m: " + ] + } + ], + "execution_count": 10 }, { - "metadata": {}, "cell_type": "code", - "source": "", "id": "da2d4023002648dc", + "metadata": { + "ExecuteTime": { + "end_time": "2024-07-31T12:57:57.046254Z", + "start_time": "2024-07-31T12:57:57.045954Z" + } + }, + "source": [], "outputs": [], "execution_count": null } ], "metadata": { "kernelspec": { - "display_name": "Python (ml)", + "display_name": "base", "language": "python", - "name": "ml" + "name": "base" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.12.4" } }, "nbformat": 4, From bc32b507ea06a51390ddc3d15dc5bdbf19f10986 Mon Sep 17 00:00:00 2001 From: lakshith Date: Wed, 31 Jul 2024 20:39:46 +0530 Subject: [PATCH 11/11] clear notebook outputs --- labml_nn/transformers/LoRA/experiment.ipynb | 75 ++--------- labml_nn/transformers/LoRA/train.ipynb | 137 ++++---------------- 2 files changed, 34 insertions(+), 178 deletions(-) diff --git a/labml_nn/transformers/LoRA/experiment.ipynb b/labml_nn/transformers/LoRA/experiment.ipynb index 7070991d..f0ae1c84 100644 --- a/labml_nn/transformers/LoRA/experiment.ipynb +++ b/labml_nn/transformers/LoRA/experiment.ipynb @@ -1,12 +1,7 @@ { "cells": [ { - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:22:57.496965Z", - "start_time": "2024-07-31T12:22:55.151730Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "from labml_nn.transformers.LoRA.GPT2 import GPTModel\n", @@ -14,15 +9,10 @@ ], "id": "cffa3ec341b4905a", "outputs": [], - "execution_count": 1 + "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:22:57.986397Z", - "start_time": "2024-07-31T12:22:57.498305Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "from transformers import AutoTokenizer\n", @@ -31,17 +21,13 @@ ], "id": "c2b0b7e18394ea9e", "outputs": [], - "execution_count": 2 + "execution_count": null }, { "cell_type": "code", "id": "initial_id", "metadata": { - "collapsed": true, - "ExecuteTime": { - "end_time": "2024-07-31T12:22:58.562136Z", - "start_time": "2024-07-31T12:22:57.987296Z" - } + "collapsed": true }, "source": [ "model = GPTModel()\n", @@ -54,32 +40,11 @@ "if unexpected_keys:\n", " print(f\"Unexpected keys: {unexpected_keys}\")" ], - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_7130/2581223434.py:3: FutureWarning: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. 
It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.\n", - " state_dict = torch.load('transformed.pth')\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Missing keys: ['token_embedding.lora_a', 'token_embedding.lora_b', 'position_embedding.lora_a', 'position_embedding.lora_b', 'blocks.0.attn.c_att.lora_a', 'blocks.0.attn.c_att.lora_b', 'blocks.0.attn.c_proj.lora_a', 'blocks.0.attn.c_proj.lora_b', 'blocks.0.ffn.c_fc.lora_a', 'blocks.0.ffn.c_fc.lora_b', 'blocks.0.ffn.c_proj.lora_a', 'blocks.0.ffn.c_proj.lora_b', 'blocks.1.attn.c_att.lora_a', 'blocks.1.attn.c_att.lora_b', 'blocks.1.attn.c_proj.lora_a', 'blocks.1.attn.c_proj.lora_b', 'blocks.1.ffn.c_fc.lora_a', 'blocks.1.ffn.c_fc.lora_b', 'blocks.1.ffn.c_proj.lora_a', 'blocks.1.ffn.c_proj.lora_b', 'blocks.2.attn.c_att.lora_a', 'blocks.2.attn.c_att.lora_b', 'blocks.2.attn.c_proj.lora_a', 'blocks.2.attn.c_proj.lora_b', 'blocks.2.ffn.c_fc.lora_a', 'blocks.2.ffn.c_fc.lora_b', 'blocks.2.ffn.c_proj.lora_a', 'blocks.2.ffn.c_proj.lora_b', 'blocks.3.attn.c_att.lora_a', 'blocks.3.attn.c_att.lora_b', 'blocks.3.attn.c_proj.lora_a', 'blocks.3.attn.c_proj.lora_b', 'blocks.3.ffn.c_fc.lora_a', 'blocks.3.ffn.c_fc.lora_b', 'blocks.3.ffn.c_proj.lora_a', 'blocks.3.ffn.c_proj.lora_b', 'blocks.4.attn.c_att.lora_a', 'blocks.4.attn.c_att.lora_b', 'blocks.4.attn.c_proj.lora_a', 'blocks.4.attn.c_proj.lora_b', 'blocks.4.ffn.c_fc.lora_a', 'blocks.4.ffn.c_fc.lora_b', 'blocks.4.ffn.c_proj.lora_a', 'blocks.4.ffn.c_proj.lora_b', 'blocks.5.attn.c_att.lora_a', 'blocks.5.attn.c_att.lora_b', 'blocks.5.attn.c_proj.lora_a', 'blocks.5.attn.c_proj.lora_b', 'blocks.5.ffn.c_fc.lora_a', 'blocks.5.ffn.c_fc.lora_b', 'blocks.5.ffn.c_proj.lora_a', 'blocks.5.ffn.c_proj.lora_b', 'blocks.6.attn.c_att.lora_a', 'blocks.6.attn.c_att.lora_b', 'blocks.6.attn.c_proj.lora_a', 'blocks.6.attn.c_proj.lora_b', 'blocks.6.ffn.c_fc.lora_a', 'blocks.6.ffn.c_fc.lora_b', 'blocks.6.ffn.c_proj.lora_a', 'blocks.6.ffn.c_proj.lora_b', 'blocks.7.attn.c_att.lora_a', 'blocks.7.attn.c_att.lora_b', 'blocks.7.attn.c_proj.lora_a', 'blocks.7.attn.c_proj.lora_b', 'blocks.7.ffn.c_fc.lora_a', 'blocks.7.ffn.c_fc.lora_b', 'blocks.7.ffn.c_proj.lora_a', 'blocks.7.ffn.c_proj.lora_b', 'blocks.8.attn.c_att.lora_a', 'blocks.8.attn.c_att.lora_b', 'blocks.8.attn.c_proj.lora_a', 'blocks.8.attn.c_proj.lora_b', 'blocks.8.ffn.c_fc.lora_a', 'blocks.8.ffn.c_fc.lora_b', 'blocks.8.ffn.c_proj.lora_a', 'blocks.8.ffn.c_proj.lora_b', 'blocks.9.attn.c_att.lora_a', 'blocks.9.attn.c_att.lora_b', 'blocks.9.attn.c_proj.lora_a', 'blocks.9.attn.c_proj.lora_b', 'blocks.9.ffn.c_fc.lora_a', 'blocks.9.ffn.c_fc.lora_b', 'blocks.9.ffn.c_proj.lora_a', 'blocks.9.ffn.c_proj.lora_b', 'blocks.10.attn.c_att.lora_a', 'blocks.10.attn.c_att.lora_b', 'blocks.10.attn.c_proj.lora_a', 'blocks.10.attn.c_proj.lora_b', 'blocks.10.ffn.c_fc.lora_a', 'blocks.10.ffn.c_fc.lora_b', 
'blocks.10.ffn.c_proj.lora_a', 'blocks.10.ffn.c_proj.lora_b', 'blocks.11.attn.c_att.lora_a', 'blocks.11.attn.c_att.lora_b', 'blocks.11.attn.c_proj.lora_a', 'blocks.11.attn.c_proj.lora_b', 'blocks.11.ffn.c_fc.lora_a', 'blocks.11.ffn.c_fc.lora_b', 'blocks.11.ffn.c_proj.lora_a', 'blocks.11.ffn.c_proj.lora_b', 'lm_head.lora_a', 'lm_head.lora_b']\n"
-     ]
-    }
-   ],
-   "execution_count": 3
+   "outputs": [],
+   "execution_count": null
  },
  {
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-07-31T12:23:00.447976Z",
-     "start_time": "2024-07-31T12:22:58.566527Z"
-    }
-   },
+   "metadata": {},
   "cell_type": "code",
   "source": [
    "prompt = \"hello how are you\"\n",
@@ -96,32 +61,16 @@
    "    print(tokenizer.decode(id))"
   ],
   "id": "f4f7826ec3729b66",
-  "outputs": [
-   {
-    "name": "stdout",
-    "output_type": "stream",
-    "text": [
-     ",\n",
-     " to\n",
-     " you\n",
-     " doing\n"
-    ]
-   }
-  ],
-  "execution_count": 4
+  "outputs": [],
+  "execution_count": null
  },
  {
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2024-07-31T12:23:00.452060Z",
-     "start_time": "2024-07-31T12:23:00.448904Z"
-    }
-   },
+   "metadata": {},
   "cell_type": "code",
   "source": "",
   "id": "c12776360008a974",
   "outputs": [],
-  "execution_count": 4
+  "execution_count": null
  }
 ],
 "metadata": {
diff --git a/labml_nn/transformers/LoRA/train.ipynb b/labml_nn/transformers/LoRA/train.ipynb
index cd70bfb3..b2e3038e 100644
--- a/labml_nn/transformers/LoRA/train.ipynb
+++ b/labml_nn/transformers/LoRA/train.ipynb
@@ -7,41 +7,27 @@
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
-    },
-    "ExecuteTime": {
-     "end_time": "2024-07-31T12:57:37.296030Z",
-     "start_time": "2024-07-31T12:57:37.292368Z"
    }
   },
   "source": "# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt",
   "outputs": [],
-  "execution_count": 1
+  "execution_count": null
 },
 {
  "cell_type": "code",
  "id": "3b1e507015ba6b81",
-  "metadata": {
-   "ExecuteTime": {
-    "end_time": "2024-07-31T12:57:37.317651Z",
-    "start_time": "2024-07-31T12:57:37.313808Z"
-   }
-  },
+  "metadata": {},
  "source": [
   "with open('input.txt', 'r', encoding='utf-8') as f:\n",
   "    text = f.read()"
  ],
  "outputs": [],
- "execution_count": 2
+ "execution_count": null
 },
 {
  "cell_type": "code",
  "id": "ac8e51ae5bbfcae7",
- "metadata": {
-  "ExecuteTime": {
-   "end_time": "2024-07-31T12:57:40.488939Z",
-   "start_time": "2024-07-31T12:57:37.319486Z"
-  }
- },
+ "metadata": {},
 "source": [
  "from transformers import AutoTokenizer\n",
  "\n",
@@ -49,75 +35,47 @@
  "\n",
  "tokens = tokenizer.encode(text, add_special_tokens=False)"
 ],
- "outputs": [
-  {
-   "name": "stderr",
-   "output_type": "stream",
-   "text": [
-    "Token indices sequence length is longer than the specified maximum sequence length for this model (338025 > 1024). 
Running this sequence through the model will result in indexing errors\n" - ] - } - ], - "execution_count": 3 + "outputs": [], + "execution_count": null }, { "cell_type": "code", "id": "aeefcdf813e427e", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:57:40.495510Z", - "start_time": "2024-07-31T12:57:40.490341Z" - } - }, + "metadata": {}, "source": [ "context_length = 512\n", "batch_size = 2" ], "outputs": [], - "execution_count": 4 + "execution_count": null }, { "cell_type": "code", "id": "a384b42274f008a2", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:57:40.522050Z", - "start_time": "2024-07-31T12:57:40.496842Z" - } - }, + "metadata": {}, "source": [ "num_batches = len(tokens) // (batch_size * context_length)\n", "tokens = tokens[:num_batches * batch_size * context_length]" ], "outputs": [], - "execution_count": 5 + "execution_count": null }, { "cell_type": "code", "id": "5c4cc78ac1a02c1d", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:57:40.592272Z", - "start_time": "2024-07-31T12:57:40.524063Z" - } - }, + "metadata": {}, "source": [ "import torch\n", "\n", "input_ids = torch.tensor(tokens).view(-1, context_length)" ], "outputs": [], - "execution_count": 6 + "execution_count": null }, { "cell_type": "code", "id": "7037fd75e2161382", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:57:40.601199Z", - "start_time": "2024-07-31T12:57:40.593250Z" - } - }, + "metadata": {}, "source": [ "from torch.utils.data import DataLoader, TensorDataset\n", "from torch.optim import Adam\n", @@ -137,17 +95,12 @@ "test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)" ], "outputs": [], - "execution_count": 7 + "execution_count": null }, { "cell_type": "code", "id": "a98b7baa064b8494", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:57:41.577878Z", - "start_time": "2024-07-31T12:57:40.602187Z" - } - }, + "metadata": {}, "source": [ "from labml_nn.transformers.LoRA.GPT2 import GPTModel\n", "\n", @@ -157,15 +110,10 @@ "_ = model.load_state_dict(state_dict, strict=False)" ], "outputs": [], - "execution_count": 8 + "execution_count": null }, { - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:57:43.098187Z", - "start_time": "2024-07-31T12:57:41.578713Z" - } - }, + "metadata": {}, "cell_type": "code", "source": [ "device = \"cuda\"\n", @@ -173,17 +121,12 @@ ], "id": "2e0fa8b3082df716", "outputs": [], - "execution_count": 9 + "execution_count": null }, { "cell_type": "code", "id": "e2f5076894770740", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:57:57.044755Z", - "start_time": "2024-07-31T12:57:43.099050Z" - } - }, + "metadata": {}, "source": [ "from labml import tracker, experiment\n", "\n", @@ -236,49 +179,13 @@ "\n", "print(\"Training complete.\")" ], - "outputs": [ - { - "data": { - "text/plain": [ - "" - ], - "text/html": [ - "
\n",
-       "LoRA.GPT2: 7a14822c4f3c11efad8354ef33f17c7c\n",
-       "\t[dirty]: \"training loop\"\n",
-       "Monitor experiment at http://localhost:5005/run/7a14822c4f3c11efad8354ef33f17c7c\n",
-       "Still updating labml server, please wait for it to complete...
" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[0;31mKeyboardInterrupt\u001B[0m Traceback (most recent call last)", - "Cell \u001B[0;32mIn[10], line 25\u001B[0m\n\u001B[1;32m 22\u001B[0m loss \u001B[38;5;241m=\u001B[39m criterion(shift_logits\u001B[38;5;241m.\u001B[39mreshape(\u001B[38;5;241m-\u001B[39m\u001B[38;5;241m1\u001B[39m, shift_logits\u001B[38;5;241m.\u001B[39msize(\u001B[38;5;241m-\u001B[39m\u001B[38;5;241m1\u001B[39m)), shift_labels\u001B[38;5;241m.\u001B[39mreshape(\u001B[38;5;241m-\u001B[39m\u001B[38;5;241m1\u001B[39m))\n\u001B[1;32m 24\u001B[0m optimizer\u001B[38;5;241m.\u001B[39mzero_grad()\n\u001B[0;32m---> 25\u001B[0m loss\u001B[38;5;241m.\u001B[39mbackward()\n\u001B[1;32m 26\u001B[0m optimizer\u001B[38;5;241m.\u001B[39mstep()\n\u001B[1;32m 28\u001B[0m tracker\u001B[38;5;241m.\u001B[39msave(step, {\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mloss\u001B[39m\u001B[38;5;124m'\u001B[39m: loss})\n", - "File \u001B[0;32m~/miniconda3/lib/python3.12/site-packages/torch/_tensor.py:521\u001B[0m, in \u001B[0;36mTensor.backward\u001B[0;34m(self, gradient, retain_graph, create_graph, inputs)\u001B[0m\n\u001B[1;32m 511\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m has_torch_function_unary(\u001B[38;5;28mself\u001B[39m):\n\u001B[1;32m 512\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m handle_torch_function(\n\u001B[1;32m 513\u001B[0m Tensor\u001B[38;5;241m.\u001B[39mbackward,\n\u001B[1;32m 514\u001B[0m (\u001B[38;5;28mself\u001B[39m,),\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 519\u001B[0m inputs\u001B[38;5;241m=\u001B[39minputs,\n\u001B[1;32m 520\u001B[0m )\n\u001B[0;32m--> 521\u001B[0m torch\u001B[38;5;241m.\u001B[39mautograd\u001B[38;5;241m.\u001B[39mbackward(\n\u001B[1;32m 522\u001B[0m \u001B[38;5;28mself\u001B[39m, gradient, retain_graph, create_graph, inputs\u001B[38;5;241m=\u001B[39minputs\n\u001B[1;32m 523\u001B[0m )\n", - "File \u001B[0;32m~/miniconda3/lib/python3.12/site-packages/torch/autograd/__init__.py:289\u001B[0m, in \u001B[0;36mbackward\u001B[0;34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001B[0m\n\u001B[1;32m 284\u001B[0m retain_graph \u001B[38;5;241m=\u001B[39m create_graph\n\u001B[1;32m 286\u001B[0m \u001B[38;5;66;03m# The reason we repeat the same comment below is that\u001B[39;00m\n\u001B[1;32m 287\u001B[0m \u001B[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001B[39;00m\n\u001B[1;32m 288\u001B[0m \u001B[38;5;66;03m# calls in the traceback and some print out the last line\u001B[39;00m\n\u001B[0;32m--> 289\u001B[0m _engine_run_backward(\n\u001B[1;32m 290\u001B[0m tensors,\n\u001B[1;32m 291\u001B[0m grad_tensors_,\n\u001B[1;32m 292\u001B[0m retain_graph,\n\u001B[1;32m 293\u001B[0m create_graph,\n\u001B[1;32m 294\u001B[0m inputs,\n\u001B[1;32m 295\u001B[0m allow_unreachable\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m,\n\u001B[1;32m 296\u001B[0m accumulate_grad\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m,\n\u001B[1;32m 297\u001B[0m )\n", - "File \u001B[0;32m~/miniconda3/lib/python3.12/site-packages/torch/autograd/graph.py:768\u001B[0m, in \u001B[0;36m_engine_run_backward\u001B[0;34m(t_outputs, *args, **kwargs)\u001B[0m\n\u001B[1;32m 766\u001B[0m unregister_hooks \u001B[38;5;241m=\u001B[39m 
_register_logging_hooks_on_whole_graph(t_outputs)\n\u001B[1;32m 767\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m--> 768\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m Variable\u001B[38;5;241m.\u001B[39m_execution_engine\u001B[38;5;241m.\u001B[39mrun_backward( \u001B[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001B[39;00m\n\u001B[1;32m 769\u001B[0m t_outputs, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs\n\u001B[1;32m 770\u001B[0m ) \u001B[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001B[39;00m\n\u001B[1;32m 771\u001B[0m \u001B[38;5;28;01mfinally\u001B[39;00m:\n\u001B[1;32m 772\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m attach_logging_hooks:\n", - "\u001B[0;31mKeyboardInterrupt\u001B[0m: " - ] - } - ], - "execution_count": 10 + "outputs": [], + "execution_count": null }, { "cell_type": "code", "id": "da2d4023002648dc", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-31T12:57:57.046254Z", - "start_time": "2024-07-31T12:57:57.045954Z" - } - }, + "metadata": {}, "source": [], "outputs": [], "execution_count": null