-
Notifications
You must be signed in to change notification settings - Fork 10.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Introduce GGML migration tool for new file format
If you deleted your old Meta LLaMA .pth files, then the migrate-ggml-2023-03-30-pr613.py script will allow you to convert your old ggml files into the new mmap()'able format. See #613
- Loading branch information
Showing
3 changed files
with
325 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,317 @@ | ||
# Migrate ggml file(s) with ggmf magic to ggml file with ggjt magic | ||
# | ||
# We caused a breaking change to the file format on 2023-03-30 in: | ||
# https://github.com/ggerganov/llama.cpp/pull/613 | ||
# | ||
# (1) If you still have the Meta LLaMA .pth files, then close this | ||
# file now; you can just run `convert-pth-to-ggml.py` again to | ||
# migrate to the new format. The tool is easier to use too. It | ||
# isn't necessary anymore to manage split output files because | ||
# the new format always combines things into a single file. | ||
# | ||
# (2) If you deleted the Meta LLaMA .pth files due to save on disk | ||
# space, then this tool is intended to help you. Please check | ||
# out the instructions below. | ||
# | ||
# USAGE | ||
# | ||
# python migrate-ggml-2023-03-30-pr613.py INPUT OUTPUT | ||
# | ||
# PREREQUISITES | ||
# | ||
# pip install numpy | ||
# cd llama.cpp | ||
# make -j4 | ||
# | ||
# EXAMPLE (7B MODEL) | ||
# | ||
# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights | ||
# python migrate-ggml-2023-03-30-pr613.py models/7B/ggml-model-f16.bin models/7B/ggml-model-f16-ggjt.bin | ||
# | ||
# # check that it works | ||
# ./main -m models/7B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?' | ||
# | ||
# # you can delete the old files | ||
# rm -f models/7B/ggml-model-f16.bin | ||
# mv models/7B/ggml-model-f16-ggjt.bin models/7B/ggml-model-f16.bin | ||
# | ||
# EXAMPLE (13B MODEL) | ||
# | ||
# # you can replace all the 'f16' with 'q4_0' if you're using quantized weights | ||
# python migrate-ggml-2023-03-30-pr613.py models/13B/ggml-model-f16.bin models/13B/ggml-model-f16-ggjt.bin | ||
# | ||
# # check that it works | ||
# ./main -m models/13B/ggml-model-f16-ggjt.bin -p 'Question: Do you love me?' | ||
# | ||
# # you can delete the old files | ||
# rm -f models/13B/ggml-model-f16.bin* | ||
# mv models/13B/ggml-model-f16-ggjt.bin models/13B/ggml-model-f16.bin | ||
# | ||
|
||
import argparse | ||
import os | ||
import sys | ||
import json | ||
import struct | ||
import numpy as np | ||
|
||
QK = 32 | ||
|
||
GGML_TYPE_Q4_0 = 0 | ||
GGML_TYPE_Q4_1 = 1 | ||
GGML_TYPE_I8 = 2 | ||
GGML_TYPE_I16 = 3 | ||
GGML_TYPE_I32 = 4 | ||
GGML_TYPE_F16 = 5 | ||
GGML_TYPE_F32 = 6 | ||
|
||
WTYPE_NAMES = { | ||
0: "F32", | ||
1: "F16", | ||
2: "Q4_0", | ||
3: "Q4_1", | ||
} | ||
|
||
WTYPES = { | ||
0: GGML_TYPE_F32, | ||
1: GGML_TYPE_F16, | ||
2: GGML_TYPE_Q4_0, | ||
3: GGML_TYPE_Q4_1, | ||
} | ||
|
||
GGML_BLCK_SIZE = { | ||
GGML_TYPE_Q4_0: QK, | ||
GGML_TYPE_Q4_1: QK, | ||
GGML_TYPE_I8: 1, | ||
GGML_TYPE_I16: 1, | ||
GGML_TYPE_I32: 1, | ||
GGML_TYPE_F16: 1, | ||
GGML_TYPE_F32: 1, | ||
} | ||
|
||
GGML_TYPE_SIZE = { | ||
GGML_TYPE_Q4_0: 4 + QK//2, | ||
GGML_TYPE_Q4_1: 4*2 + QK//2, | ||
GGML_TYPE_I8: 1, | ||
GGML_TYPE_I16: 2, | ||
GGML_TYPE_I32: 4, | ||
GGML_TYPE_F16: 2, | ||
GGML_TYPE_F32: 4, | ||
} | ||
|
||
HPARAMS = [ | ||
'magic', # int32 | ||
'version', # int32 | ||
'n_vocab', # int32 | ||
'n_embd', # int32 | ||
'n_mult', # int32 | ||
'n_head', # int32 | ||
'n_layer', # int32 | ||
'n_rot', # int32 | ||
'f16', # int32 | ||
] | ||
|
||
def read_hparams(fin): | ||
struct_fmt = "i" * len(HPARAMS) | ||
struct_size = struct.calcsize(struct_fmt) | ||
buf = fin.read(struct_size) | ||
ints = struct.unpack(struct_fmt, buf) | ||
hparams = dict(zip(HPARAMS, ints)) | ||
return hparams | ||
|
||
def write_hparams(fout, hparams): | ||
struct_fmt = "i" * len(HPARAMS) | ||
struct_size = struct.calcsize(struct_fmt) | ||
ints = [hparams[h] for h in HPARAMS] | ||
fout.write(struct.pack(struct_fmt, *ints)) | ||
|
||
def read_tokens(fin, hparams): | ||
tokens = [] | ||
for i in range(hparams['n_vocab']): | ||
len_b = fin.read(4) | ||
(length,) = struct.unpack("i", len_b) | ||
word = fin.read(length) | ||
score_b = fin.read(4) | ||
(score,) = struct.unpack("f", score_b) | ||
tokens.append((word, score)) | ||
return tokens | ||
|
||
def write_tokens(fout, tokens): | ||
for word, score in tokens: | ||
fout.write(struct.pack("i", len(word))) | ||
fout.write(word) | ||
fout.write(struct.pack("f", score)) | ||
|
||
def ggml_nelements(shape): | ||
r = 1 | ||
for i in shape: | ||
r *= i | ||
return r | ||
|
||
def ggml_nbytes(shape, ftype): | ||
x = ggml_nelements(shape) | ||
t = WTYPES[ftype] | ||
x *= GGML_TYPE_SIZE[t] | ||
x //= GGML_BLCK_SIZE[t] | ||
return x | ||
|
||
def get_n_parts(dim): | ||
mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8} | ||
n_parts = mappings.get(dim) | ||
if n_parts is None: | ||
print(f"Invalid dim: {dim}") | ||
sys.exit(1) | ||
print(f"n_parts = {n_parts}\n") | ||
return n_parts | ||
|
||
def copy_tensors(fin, fout, part_id, n_parts): | ||
while True: | ||
|
||
b = fin.read(4) | ||
if not b: break | ||
(n_dims,) = struct.unpack("i", b) | ||
b = fin.read(4) | ||
(length,) = struct.unpack("i", b) | ||
b = fin.read(4) | ||
(ftype,) = struct.unpack("i", b) | ||
|
||
assert n_dims in (1, 2) | ||
|
||
nelements = 1 | ||
partshape = list(range(n_dims)) | ||
for i in range(n_dims): | ||
b = fin.read(4) | ||
partshape[i] = struct.unpack("i", b)[0] | ||
nelements *= partshape[i] | ||
partshape = list(reversed(partshape)) | ||
|
||
name = fin.read(length) | ||
data = fin.read(ggml_nbytes(partshape, ftype)) | ||
|
||
blck_size = GGML_BLCK_SIZE[WTYPES[ftype]] | ||
type_size = GGML_TYPE_SIZE[WTYPES[ftype]] | ||
|
||
print(f"Processing tensor {name} with shape: {partshape} and type: {WTYPE_NAMES[ftype]}") | ||
|
||
# determine dimension along which multipart tensor is sharded | ||
# | ||
# split_dim 0 regex: | ||
# - output.* | ||
# - layers.*.attention.wq.weight | ||
# - layers.*.attention.wk.weight | ||
# - layers.*.attention.wv.weight | ||
# - layers.*.feed_forward.w1.weight | ||
# - layers.*.feed_forward.w3.weight | ||
# | ||
# split_dim 1 regex: | ||
# - tok_embeddings.* | ||
# - layers.*.attention.wo.weight | ||
# - layers.*.feed_forward.w2.weight | ||
# | ||
if n_dims > 1: | ||
split_dim = 1 | ||
if b"tok_embeddings" in name: | ||
split_dim = 1 | ||
elif b"layers" in name: | ||
if b"attention.wo.weight" in name: | ||
split_dim = 1 | ||
elif b"feed_forward.w2.weight" in name: | ||
split_dim = 1 | ||
else: | ||
split_dim = 0 | ||
elif b"output" in name: | ||
split_dim = 0 | ||
|
||
# output tensor header | ||
fullshape = list(partshape) | ||
if n_dims > 1: | ||
fullshape[split_dim] *= n_parts | ||
fout.write(struct.pack("iii", n_dims, len(name), ftype)) | ||
for dim in reversed(fullshape): | ||
fout.write(struct.pack("i", dim)) | ||
fout.write(name) | ||
|
||
# ensure tensor data is aligned | ||
tensor_data_offset = fout.tell() | ||
while tensor_data_offset % QK != 0: | ||
fout.write(struct.pack("B", 0)) | ||
tensor_data_offset += 1 | ||
|
||
# output unified mappable tensor data | ||
if n_dims == 1 or n_parts == 1: | ||
# copy tensor which we thankfully received in one piece | ||
if part_id == 0: | ||
fout.write(data) | ||
elif split_dim == 0: | ||
# reassemble multifile tensor containing some of the rows | ||
rows_per_chunk = partshape[0] | ||
current_row = part_id * rows_per_chunk | ||
bytes_per_row = fullshape[1] // blck_size * type_size | ||
offset = current_row * bytes_per_row | ||
fout.seek(tensor_data_offset + offset) | ||
fout.write(data) | ||
elif split_dim == 1: | ||
# reassemble multifile tensor containing some of the cols | ||
cols_per_chunk = partshape[1] | ||
current_col = part_id * cols_per_chunk | ||
bpr = partshape[1] // blck_size * type_size | ||
bytes_per_row = fullshape[1] // blck_size * type_size | ||
offset_current_col = current_col // blck_size * type_size | ||
for row in range(partshape[0]): | ||
offset_row = row * bytes_per_row | ||
offset = offset_row + offset_current_col | ||
fout.seek(tensor_data_offset + offset) | ||
fout.write(data[row * bpr:row * bpr + bpr]) | ||
|
||
# advance file position to next tensor | ||
fout.seek(tensor_data_offset + ggml_nbytes(fullshape, ftype)) | ||
|
||
def parse_args(): | ||
parser = argparse.ArgumentParser(description='Migrate from GGML to new GGJT file format') | ||
parser.add_argument('fin_path', help='your old ggml file (leave out the .1 .2 etc.)') | ||
parser.add_argument('fout_path', help='your new ggjt file name') | ||
return parser.parse_args() | ||
|
||
def main(): | ||
args = parse_args() | ||
assert args.fin_path | ||
assert args.fout_path | ||
assert args.fin_path != args.fout_path | ||
|
||
with open(args.fin_path, "rb") as fin: | ||
hparams = read_hparams(fin) | ||
tokens = read_tokens(fin, hparams) | ||
|
||
if hparams['magic'] == 0x67676a74: # ggjt | ||
print("%s: input ggml has already been converted to 'ggjt' magic\n" % | ||
(args.fin_path)) | ||
sys.exit(1) | ||
|
||
if hparams['magic'] != 0x67676d66: # ggmf | ||
print("%s: input ggml file doesn't have expected 'ggmf' magic: %#x\n" % | ||
(args.fin_path, hparams['magic'])) | ||
sys.exit(1) | ||
|
||
hparams['magic'] = 0x67676a74 # ggjt | ||
n_parts = get_n_parts(hparams['n_embd']) | ||
|
||
# we output a single file for ggml | ||
with open(args.fout_path, "wb") as fout: | ||
write_hparams(fout, hparams) | ||
write_tokens(fout, tokens) | ||
offset_of_tensors = fout.tell() | ||
# the tensors we load could be split across multiple files | ||
for part_id in range(n_parts): | ||
fout.seek(offset_of_tensors) | ||
print(f"Processing part {part_id+1} of {n_parts}\n") | ||
fin_path = args.fin_path | ||
if part_id > 0: | ||
fin_path += ".%d" % (part_id) | ||
with open(fin_path, "rb") as fin: | ||
read_tokens(fin, read_hparams(fin)) | ||
copy_tensors(fin, fout, part_id, n_parts) | ||
|
||
print(f"Done. Output file: {args.fout_path}\n") | ||
|
||
if __name__ == "__main__": | ||
main() |