main.py
from typing import Literal

import jax
from jaxonloader import get_tiny_shakespeare, make

from kira import Kira, Mamba
from kira.generate import generate_text
from kira.model_args import get_kira_args, get_mamba_args
from kira.train import train


def main():
    max_seq_len = 8
    early_stop = 500
    batch_size = 64
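
    # Load TinyShakespeare: the loader returns the train/test splits along
    # with the vocabulary size and the encode/decode tokenizer functions.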
    tinyshakespeare = get_tiny_shakespeare()
    train_dataset, test_dataset, vocab_size, encode, decode = tinyshakespeare

    key = jax.random.PRNGKey(100)
    key, subkey = jax.random.split(key)
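
    # Shuffled dataloaders over both splits; the training loader is also
    # JIT-compiled via jit=True.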
    train_dataloader, train_index = make(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        key=key,
        jit=True,
    )
    test_dataloader, test_index = make(
        test_dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=True,
        key=subkey,
    )
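
    # Model and optimisation hyperparameters; the values in the trailing
    # comments are a larger configuration.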
    n_dims = vocab_size
    n_embd = 64  # 384
    learning_rate = 3e-4
    num_heads = 4  # 6
    query_multihead_dim = num_heads
    kv_multihead_dim = 2
    n_layers = 3  # 6
    max_new_tokens = 200
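
    # Train the Kira model (see train_kira below); it is configured with
    # fewer KV heads than query heads.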
    kira = train_kira(
        train_dataloader,
        train_index,
        n_dims,
        n_embd,
        n_layers,
        max_seq_len,
        num_heads,
        query_multihead_dim,
        kv_multihead_dim,
        learning_rate,
        early_stop,
        kv_interpolation_mode="repeat",
    )
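
    # Alternatively, train a Mamba model instead of Kira: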
    # train_mamba(
    #     train_dataloader,
    #     train_index,
    #     n_dims,
    #     n_embd,
    #     n_layers,
    #     learning_rate,
    #     early_stop,
    #     key,
    # )

    generate_text(kira, max_seq_len, max_new_tokens, decode, vocab_size)


def train_mamba(
    train_dataloader,
    train_index,
    n_dims,
    n_embd,
    n_layers,
    learning_rate,
    early_stop,
    key,
):
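    # Build Mamba with a small state dimension (d_state=4).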
    model_args = get_mamba_args(
        n_embd=n_embd, n_dims=n_dims, n_layers=n_layers, d_state=4
    )
    mamba = Mamba(model_args=model_args, key=key)

    key, subkey = jax.random.split(key)
    mamba = train(
        train_dataloader,
        train_index,
        learning_rate,
        mamba,
        subkey,
        early_stop=early_stop,
    )
    return mamba


def train_kira(
    train_dataloader,
    train_index,
    n_dims,
    n_embd,
    n_layers,
    max_seq_len,
    num_heads,
    query_multihead_dim,
    kv_multihead_dim,
    learning_rate,
    early_stop,
    kv_interpolation_mode: Literal["average", "repeat"] = "average",
):
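    # key_seed is fixed inside the model args, so model initialisation is
    # reproducible across runs.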
    kira_model_args = get_kira_args(
        n_dims=n_dims,
        n_embd=n_embd,
        n_layers=n_layers,
        max_seq_len=max_seq_len,
        num_heads=num_heads,
        num_query_heads=query_multihead_dim,
        num_kv_heads=kv_multihead_dim,
        width_size=256,
        depth=4,
        key_seed=0,
        kv_interpolation_mode=kv_interpolation_mode,
    )
    key = jax.random.PRNGKey(kira_model_args.key_seed)
    kira = Kira(
        model_args=kira_model_args,
        key=key,
    )

    key, subkey = jax.random.split(key)
    kira = train(
        train_dataloader,
        train_index,
        learning_rate,
        kira,
        early_stop=early_stop,
        key=subkey,
    )
    return kira


if __name__ == "__main__":
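    # checking_leaks() makes JAX raise if tracers escape their trace scope.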
    with jax.checking_leaks():
        main()