Enable llama.cpp on s390x big endian platform #3552

Merged: 10 commits, Oct 20, 2023
7 changes: 6 additions & 1 deletion convert-baichuan-hf-to-gguf.py
@@ -76,6 +76,7 @@ def parse_args() -> argparse.Namespace:
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
return parser.parse_args()

args = parse_args()
@@ -86,6 +87,10 @@ def parse_args() -> argparse.Namespace:
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)

endianess = gguf.GGUFEndian.LITTLE
if args.bigendian:
endianess = gguf.GGUFEndian.BIG
print(f"gguf: Conversion Endianess {endianess}")
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
@@ -113,7 +118,7 @@ def parse_args() -> argparse.Namespace:
num_parts = count_model_parts(dir_model)
print(f"num_parts:{num_parts}\n")
ARCH=gguf.MODEL_ARCH.BAICHUAN
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

print("gguf: get model metadata")

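With the new flag, running the Baichuan converter for a big-endian target might look like the following (the model path is a placeholder; the trailing 1 is the existing ftype positional selecting float16):

python3 convert-baichuan-hf-to-gguf.py /path/to/Baichuan-7B 1 --bigendian
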
22 changes: 14 additions & 8 deletions convert.py
@@ -803,8 +803,8 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:


class OutputFile:
def __init__(self, fname_out: Path) -> None:
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

def add_meta_arch(self, params: Params) -> None:
name = "LLaMA"
@@ -875,10 +875,10 @@ def close(self) -> None:
self.gguf.close()

@staticmethod
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
check_vocab_size(params, vocab)

of = OutputFile(fname_out)
of = OutputFile(fname_out, endianess=endianess)

# meta data
of.add_meta_arch(params)
@@ -903,10 +903,10 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
return dt.quantize(arr)

@staticmethod
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
check_vocab_size(params, vocab)

of = OutputFile(fname_out)
of = OutputFile(fname_out, endianess=endianess)

# meta data
of.add_meta_arch(params)
@@ -932,6 +932,8 @@ def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyM
elapsed = time.time() - start
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
padi = len(str(len(model)))
if endianess==gguf.GGUFEndian.BIG:
ndarray.byteswap(inplace=True)
Collaborator

This should be handled in GGUFWriter.write_tensor_data, just like you do in add_tensor. The conversion script should have no responsibility for handling endianness other than setting it in the constructor.

Contributor Author

@monatis updated as per your comments
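
For reference, a minimal sketch of what moving the swap into the writer could look like (a hypothetical standalone helper, not the exact code in this PR; the real GGUFWriter.write_tensor_data also has to respect the file's alignment padding):

    import numpy as np

    def write_tensor_data(fout, tensor: np.ndarray, big_endian: bool) -> None:
        # Swap bytes once at write time so conversion scripts never have to
        # care about the target byte order themselves.
        if big_endian:
            tensor.byteswap(inplace=True)
        tensor.tofile(fout)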

print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}")
of.gguf.write_tensor_data(ndarray)

@@ -1123,8 +1125,9 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
args = parser.parse_args(args_in)
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")

args = parser.parse_args(args_in)
if args.dump_single:
model_plus = lazy_load_file(args.model)
do_dump_model(model_plus)
@@ -1138,6 +1141,9 @@ def main(args_in: list[str] | None = None) -> None:
if args.dump:
do_dump_model(model_plus)
return
endianess = gguf.GGUFEndian.LITTLE
if args.bigendian:
endianess = gguf.GGUFEndian.BIG

params = Params.load(model_plus)
if params.n_ctx == -1:
@@ -1185,7 +1191,7 @@ def main(args_in: list[str] | None = None) -> None:
params.ftype = ftype
print(f"Writing {outfile}, format {ftype}")

OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
print(f"Wrote {outfile}")


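Usage is unchanged apart from the new switch; a conversion run targeting a big-endian host might look like this (the model path is a placeholder, other output options as usual):

python3 convert.py /path/to/llama-model --bigendian
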
69 changes: 42 additions & 27 deletions gguf-py/gguf/gguf.py
@@ -22,6 +22,7 @@
GGUF_VERSION = 2
GGUF_DEFAULT_ALIGNMENT = 32


# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
@@ -569,6 +570,10 @@ class GGMLQuantizationType(IntEnum):
Q6_K = 14
Q8_K = 15

class GGUFEndian(IntEnum):
LITTLE = 0
BIG = 1


class GGUFValueType(IntEnum):
UINT8 = 0
@@ -616,18 +621,39 @@ class GGUFWriter:
temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
tensors: list[tuple[np.ndarray[Any, Any], int]]

def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
def get_pack_prefix(self):
if self.endianess==GGUFEndian.LITTLE:
return "<"
else:
return ">"

def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True, endianess=GGUFEndian.LITTLE):
self.fout = open(path, "wb")
self.arch = arch
self.endianess = endianess
self._simple_value_packing = {
GGUFValueType.UINT8: f"{self.get_pack_prefix()}B",
GGUFValueType.INT8: f"{self.get_pack_prefix()}b",
GGUFValueType.UINT16: f"{self.get_pack_prefix()}H",
GGUFValueType.INT16: f"{self.get_pack_prefix()}h",
GGUFValueType.UINT32: f"{self.get_pack_prefix()}I",
GGUFValueType.INT32: f"{self.get_pack_prefix()}i",
GGUFValueType.FLOAT32: f"{self.get_pack_prefix()}f",
GGUFValueType.UINT64: f"{self.get_pack_prefix()}Q",
GGUFValueType.INT64: f"{self.get_pack_prefix()}q",
GGUFValueType.FLOAT64: f"{self.get_pack_prefix()}d",
GGUFValueType.BOOL: "?" ,
}
self.add_architecture()
self.use_temp_file = use_temp_file
self.tensors = []
print(f"This gguf file is for {self.endianess} only")

def write_header_to_file(self):
self.fout.write(struct.pack("<I", GGUF_MAGIC))
self.fout.write(struct.pack("<I", GGUF_VERSION))
self.fout.write(struct.pack("<Q", self.ti_data_count))
self.fout.write(struct.pack("<Q", self.kv_data_count))
self.fout.write(struct.pack(f"{self.get_pack_prefix()}I", GGUF_MAGIC))
Collaborator

The magic is meant to be exactly the ASCII bytes G G U F in the file, regardless of the system endianness.

Contributor Author

I think this requires bumping the GGUF version, because the current spec is explicitly little-endian. The spec should also be updated to reflect this change. We cannot simply trust that people will not distribute big-endian files.

And of course bump the package version in pyproject.toml

I suggest checking the magic code instead. If the endianness does not match, the magic code reads as 0x47475546, and we can warn the user: "Endianess of the GGUF file and platform do not match".

Suggested change
self.fout.write(struct.pack(f"{self.get_pack_prefix()}I", GGUF_MAGIC))
diff --git a/ggml.c b/ggml.c
index 6d1776c..04b88c9 100644
--- a/ggml.c
+++ b/ggml.c
@@ -20916,7 +20916,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
gguf_fread_el(file, &magic, sizeof(magic), &offset);
if (magic != GGUF_MAGIC) {
- fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
+ if (magic == GGUF_WRONG_ENIAN_MAGIC)
+ {
+ fprintf(stderr, "Endianess of the GGUF file and platform do not match.%s: invalid magic number %08x.\n", __func__, magic);
+ }
+ else {
+ fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
+ }
fclose(file);
return NULL;
}
diff --git a/ggml.h b/ggml.h
index 3eddc44..2ecf893 100644
--- a/ggml.h
+++ b/ggml.h
@@ -232,6 +232,7 @@
#define GGML_EXIT_ABORTED 1
#define GGUF_MAGIC 0x46554747 // "GGUF"
+#define GGUF_WRONG_ENIAN_MAGIC 0x47475546
#define GGUF_VERSION 2
#define GGUF_DEFAULT_ALIGNMENT 32

Result after applying the fix:

~/code/aiu/work/llama.cpp> ./main -m ~/gguf-s390/Baichuan-7B-f16.gguf
Log start
main: build = 1360 (51e9d39)
main: built with cc (SUSE Linux) 7.5.0 for x86_64-suse-linux
main: seed  = 1697040195
Endianess of the GGUF file and platform do not match.gguf_init_from_file: invalid magic number 47475546.
error loading model: llama_model_loader: failed to load model from /home/cqy/gguf-s390/Baichuan-7B-f16.ggufllama_load_model_from_file: failed to load model
llama_init_from_gpt_params: error: failed to load model '/home/cqy/gguf-s390/Baichuan-7B-f16.gguf'
main: error: unable to load model

Collaborator

If you do want to start loading and saving files that start with F U G G (look in a hex editor), you will have to request a spec change, because that's no longer a GGUF file by its current definition.

Contributor Author

@cebtenzzre 

I added an endianness check.

Contributor Author

If you do want to start loading and saving files that start with F U G G (look in a hex editor), you will have to request a spec change, because that's no longer a GGUF file by its current definition.

@cebtenzzre This depends on whether we think the magic number is a number or a string.

ggml.c reads the magic number as a uint32_t, which is endianness-sensitive. If we treat the magic as an integer, I think my update is compatible with the spec. But if we treat it as a string, then we need to update ggml.h, ggml.c, and gguf.py.

@ggerganov what's your opinion?

struct gguf_header {
    uint32_t magic;
    uint32_t version;
    uint64_t n_tensors; // GGUFv2
    uint64_t n_kv;      // GGUFv2
};
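
To make the number-vs-string distinction concrete, here is a small illustrative check (not part of the PR): the same four bytes "GGUF" decode to 0x46554747 when read as a little-endian uint32 and to 0x47475546 when read big-endian, which is exactly the mismatch value mentioned above.

    import struct

    magic_bytes = b"GGUF"
    print(hex(struct.unpack("<I", magic_bytes)[0]))  # 0x46554747, GGUF_MAGIC in ggml.h
    print(hex(struct.unpack(">I", magic_bytes)[0]))  # 0x47475546, what a mismatched reader sees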

Owner

Isn't it better to fix ggml.c to read and write the magic byte-per-byte to match the spec?
Currently, technically it does not match the spec

Contributor Author

Isn't it better to fix ggml.c to read and write the magic byte-per-byte to match the spec? Currently, technically it does not match the spec

@ggerganov @cebtenzzre

I appreciate your comments.

Yes. Let me clarify my update. I fixed ggml.h to use a different integer magic value depending on endianness, so that the bytes always represent the "GGUF" characters. Now the file is always compatible with the spec: a big-endian GGUF file starts with "GGUF" just as a little-endian GGUF file does.

See the hexdump of a LLaMA 2 GGUF file on s390x (big endian):

[aiu gguf-s390]$  hexdump -C gguf-s390/llama-2-7b-f16-new.gguf|head -n 20
00000000  47 47 55 46 00 00 00 03  00 00 00 00 00 00 01 23  |GGUF...........#|
00000010  00 00 00 00 00 00 00 0f  00 00 00 00 00 00 00 14  |................|
00000020  67 65 6e 65 72 61 6c 2e  61 72 63 68 69 74 65 63  |general.architec|
00000030  74 75 72 65 00 00 00 08  00 00 00 00 00 00 00 05  |ture............|
00000040  6c 6c 61 6d 61 00 00 00  00 00 00 00 0c 67 65 6e  |llama........gen|
00000050  65 72 61 6c 2e 6e 61 6d  65 00 00 00 08 00 00 00  |eral.name.......|
00000060  00 00 00 00 08 4c 4c 61  4d 41 20 76 32 00 00 00  |.....LLaMA v2...|
00000070  00 00 00 00 14 6c 6c 61  6d 61 2e 63 6f 6e 74 65  |.....llama.conte|
00000080  78 74 5f 6c 65 6e 67 74  68 00 00 00 04 00 00 10  |xt_length.......|
00000090  00 00 00 00 00 00 00 00  16 6c 6c 61 6d 61 2e 65  |.........llama.e|
000000a0  6d 62 65 64 64 69 6e 67  5f 6c 65 6e 67 74 68 00  |mbedding_length.|
000000b0  00 00 04 00 00 10 00 00  00 00 00 00 00 00 11 6c  |...............l|
000000c0  6c 61 6d 61 2e 62 6c 6f  63 6b 5f 63 6f 75 6e 74  |lama.block_count|
000000d0  00 00 00 04 00 00 00 20  00 00 00 00 00 00 00 19  |....... ........|
000000e0  6c 6c 61 6d 61 2e 66 65  65 64 5f 66 6f 72 77 61  |llama.feed_forwa|
000000f0  72 64 5f 6c 65 6e 67 74  68 00 00 00 04 00 00 2b  |rd_length......+|
00000100  00 00 00 00 00 00 00 00  1a 6c 6c 61 6d 61 2e 72  |.........llama.r|
00000110  6f 70 65 2e 64 69 6d 65  6e 73 69 6f 6e 5f 63 6f  |ope.dimension_co|
00000120  75 6e 74 00 00 00 04 00  00 00 80 00 00 00 00 00  |unt.............|
00000130  00 00 1a 6c 6c 61 6d 61  2e 61 74 74 65 6e 74 69  |...llama.attenti|

And I rolled back the line that writes GGUF_MAGIC in gguf.py. It now always writes the magic with little-endian packing, so the file always begins with the literal bytes "GGUF".

    def write_header_to_file(self):
        self.fout.write(struct.pack("<I", GGUF_MAGIC))
        self.fout.write(struct.pack(f"{self.pack_prefix}I", GGUF_VERSION))
        self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.ti_data_count))
        self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.kv_data_count))
        self.flush()

Owner

Yes, this works, but I wish to avoid the ifdef in the header and the inclusion of extra headers (endian.h).
We should implement the multi-character constant alternative as proposed by @cebtenzzre and, instead of reading / writing the uint32_t at once, read / write byte-by-byte and compare against the multi-byte constant.
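
As a rough illustration of that byte-by-byte idea (a Python sketch only, assuming the file simply begins with the raw bytes "GGUF"; the actual change belongs in ggml.c):

    def check_magic(path: str) -> None:
        # Compare the first four bytes against the literal characters,
        # so the check is independent of the host byte order.
        with open(path, "rb") as f:
            if f.read(4) != b"GGUF":
                raise ValueError(f"{path}: not a GGUF file")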

Contributor Author

@ggerganov @monatis 

I like this choice. Previously I thought this change might be too big.

I will also need to change the magic in struct gguf_header to a char array.

If you agree, I will update according to your comments.

struct gguf_header {
    uint32_t magic; => char magic[4];
    uint32_t version;
    uint64_t n_tensors; // GGUFv2
    uint64_t n_kv;      // GGUFv2
};

Contributor Author

@ggerganov updated according to your comments

self.fout.write(struct.pack(f"{self.get_pack_prefix()}I", GGUF_VERSION))
self.fout.write(struct.pack(f"{self.get_pack_prefix()}Q", self.ti_data_count))
self.fout.write(struct.pack(f"{self.get_pack_prefix()}Q", self.kv_data_count))
self.flush()
# print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))

@@ -699,40 +725,27 @@ def add_array(self, key: str, val: Sequence[Any]):
self.add_key(key)
self.add_val(val, GGUFValueType.ARRAY)

_simple_value_packing = {
GGUFValueType.UINT8: "<B",
GGUFValueType.INT8: "<b",
GGUFValueType.UINT16: "<H",
GGUFValueType.INT16: "<h",
GGUFValueType.UINT32: "<I",
GGUFValueType.INT32: "<i",
GGUFValueType.FLOAT32: "<f",
GGUFValueType.UINT64: "<Q",
GGUFValueType.INT64: "<q",
GGUFValueType.FLOAT64: "<d",
GGUFValueType.BOOL: "?" ,
}
def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
if vtype is None:
vtype = GGUFValueType.get_type(val)

if add_vtype:
self.kv_data += struct.pack("<I", vtype)
self.kv_data += struct.pack(f"{self.get_pack_prefix()}I", vtype)
self.kv_data_count += 1

pack_fmt = self._simple_value_packing.get(vtype)
if pack_fmt is not None:
self.kv_data += struct.pack(pack_fmt, val)
elif vtype == GGUFValueType.STRING:
encoded_val = val.encode("utf8") if isinstance(val, str) else val
self.kv_data += struct.pack("<Q", len(encoded_val))
self.kv_data += struct.pack(f"{self.get_pack_prefix()}Q", len(encoded_val))
self.kv_data += encoded_val
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
ltype = GGUFValueType.get_type(val[0])
if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
raise ValueError("All items in a GGUF array should be of the same type")
self.kv_data += struct.pack("<I", ltype)
self.kv_data += struct.pack("<Q", len(val))
self.kv_data += struct.pack(f"{self.get_pack_prefix()}I", ltype)
self.kv_data += struct.pack(f"{self.get_pack_prefix()}Q", len(val))
for item in val:
self.add_val(item, add_vtype=False)
else:
@@ -746,22 +759,24 @@ def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype:
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

encoded_name = name.encode("utf8")
self.ti_data += struct.pack("<Q", len(encoded_name))
self.ti_data += struct.pack(f"{self.get_pack_prefix()}Q", len(encoded_name))
self.ti_data += encoded_name
n_dims = len(tensor_shape)
self.ti_data += struct.pack("<I", n_dims)
self.ti_data += struct.pack(f"{self.get_pack_prefix()}I", n_dims)
for i in range(n_dims):
self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
self.ti_data += struct.pack(f"{self.get_pack_prefix()}Q", tensor_shape[n_dims - 1 - i])
if raw_dtype is None:
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
else:
dtype = raw_dtype
self.ti_data += struct.pack("<I", dtype)
self.ti_data += struct.pack("<Q", self.offset_tensor)
self.ti_data += struct.pack(f"{self.get_pack_prefix()}I", dtype)
self.ti_data += struct.pack(f"{self.get_pack_prefix()}Q", self.offset_tensor)
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
self.ti_data_count += 1

def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
if self.endianess == GGUFEndian.BIG:
tensor.byteswap(inplace=True)
if self.use_temp_file and self.temp_file is None:
fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
fp.seek(0)
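Both add_tensor and the tensor-writing path now rely on numpy's in-place byte swap; a quick illustration of what byteswap(inplace=True) does to the stored bytes (illustrative only; the first output assumes a little-endian host):

    import numpy as np

    a = np.array([1.0], dtype=np.float32)
    print(a.tobytes().hex())   # 0000803f  (little-endian float32 for 1.0)
    a.byteswap(inplace=True)
    print(a.tobytes().hex())   # 3f800000  (same value, bytes reversed for a big-endian file)
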
2 changes: 1 addition & 1 deletion k_quants.c
@@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if !defined(__riscv)
#if !defined(__riscv) && !defined(__s390__)
#include <immintrin.h>
#endif
#endif
2 changes: 2 additions & 0 deletions tests/test-double-float.cpp
@@ -4,7 +4,9 @@

#undef NDEBUG
#include <cassert>
#if !defined(__riscv) && !defined(__s390__)
#include <immintrin.h>
#endif
#include <cmath>
#include <cstdint>
#include <cstring>