Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable llama.cpp on s390x big endian platform #3552

Merged
merged 10 commits into from
Oct 20, 2023
8 changes: 7 additions & 1 deletion convert-baichuan-hf-to-gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def parse_args() -> argparse.Namespace:
"ftype", type=int, choices=[0, 1], default=1, nargs='?',
help="output format - use 0 for float32, 1 for float16",
)
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
return parser.parse_args()

args = parse_args()
Expand All @@ -86,6 +87,11 @@ def parse_args() -> argparse.Namespace:
print(f'Error: {args.model} is not a directory', file = sys.stderr)
sys.exit(1)

endianess = gguf.GGUFEndian.LITTLE
if args.bigendian:
endianess = gguf.GGUFEndian.BIG
endianess_str = "Big Endian" if args.bigendian else "Little Endian"
print(f"gguf: Conversion Endianess {endianess}")
# possible tensor data types
# ftype == 0 -> float32
# ftype == 1 -> float16
Expand Down Expand Up @@ -113,7 +119,7 @@ def parse_args() -> argparse.Namespace:
num_parts = count_model_parts(dir_model)
print(f"num_parts:{num_parts}\n")
ARCH=gguf.MODEL_ARCH.BAICHUAN
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

print("gguf: get model metadata")

Expand Down
20 changes: 12 additions & 8 deletions convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -803,8 +803,8 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:


class OutputFile:
def __init__(self, fname_out: Path) -> None:
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

def add_meta_arch(self, params: Params) -> None:
name = "LLaMA"
Expand Down Expand Up @@ -875,10 +875,10 @@ def close(self) -> None:
self.gguf.close()

@staticmethod
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
check_vocab_size(params, vocab)

of = OutputFile(fname_out)
of = OutputFile(fname_out, endianess=endianess)

# meta data
of.add_meta_arch(params)
Expand All @@ -903,10 +903,10 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
return dt.quantize(arr)

@staticmethod
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
check_vocab_size(params, vocab)

of = OutputFile(fname_out)
of = OutputFile(fname_out, endianess=endianess)

# meta data
of.add_meta_arch(params)
Expand Down Expand Up @@ -1123,8 +1123,9 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
args = parser.parse_args(args_in)
parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")

args = parser.parse_args(args_in)
if args.dump_single:
model_plus = lazy_load_file(args.model)
do_dump_model(model_plus)
Expand All @@ -1138,6 +1139,9 @@ def main(args_in: list[str] | None = None) -> None:
if args.dump:
do_dump_model(model_plus)
return
endianess = gguf.GGUFEndian.LITTLE
if args.bigendian:
endianess = gguf.GGUFEndian.BIG

params = Params.load(model_plus)
if params.n_ctx == -1:
Expand Down Expand Up @@ -1185,7 +1189,7 @@ def main(args_in: list[str] | None = None) -> None:
params.ftype = ftype
print(f"Writing {outfile}, format {ftype}")

OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
print(f"Wrote {outfile}")


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -536,7 +536,7 @@ static bool is_ggml_file(const char * filename) {
if (file.size < 4) {
return false;
}
uint32_t magic = file.read_u32();
std::string magic = file.read_string(4);
return magic == GGUF_MAGIC;
}

Expand Down
19 changes: 11 additions & 8 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -20845,7 +20845,7 @@ struct gguf_kv {
};

struct gguf_header {
uint32_t magic;
char magic[4];
uint32_t version;
uint64_t n_tensors; // GGUFv2
uint64_t n_kv; // GGUFv2
Expand Down Expand Up @@ -20915,7 +20915,7 @@ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset)
struct gguf_context * gguf_init_empty(void) {
struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));

ctx->header.magic = GGUF_MAGIC;
memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
ctx->header.version = GGUF_VERSION;
ctx->header.n_tensors = 0;
ctx->header.n_kv = 0;
Expand All @@ -20941,16 +20941,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// offset from start of file
size_t offset = 0;

uint32_t magic = 0;
char magic[4];

// check the magic before making allocations
{
gguf_fread_el(file, &magic, sizeof(magic), &offset);

if (magic != GGUF_MAGIC) {
fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
fclose(file);
return NULL;
for (uint32_t i = 0; i < sizeof(magic); i++) {
if (magic[i] != GGUF_MAGIC[i]) {
fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
fclose(file);
return NULL;
}
}
}

Expand All @@ -20960,7 +20962,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

// read the header
{
ctx->header.magic = magic;
strncpy(ctx->header.magic, magic, 4);


ctx->kv = NULL;
ctx->infos = NULL;
Expand Down
5 changes: 3 additions & 2 deletions ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,9 @@
#define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1

#define GGUF_MAGIC 0x46554747 // "GGUF"
#define GGUF_VERSION 2
#define GGUF_MAGIC "GGUF"

#define GGUF_VERSION 3

#define GGUF_DEFAULT_ALIGNMENT 32

Expand Down
73 changes: 46 additions & 27 deletions gguf-py/gguf/gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@
#

GGUF_MAGIC = 0x46554747
GGUF_VERSION = 2
GGUF_VERSION = 3
GGUF_DEFAULT_ALIGNMENT = 32


# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
Expand Down Expand Up @@ -597,6 +598,10 @@ class GGMLQuantizationType(IntEnum):
Q6_K = 14
Q8_K = 15

class GGUFEndian(IntEnum):
LITTLE = 0
BIG = 1


class GGUFValueType(IntEnum):
UINT8 = 0
Expand Down Expand Up @@ -644,18 +649,41 @@ class GGUFWriter:
temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
tensors: list[tuple[np.ndarray[Any, Any], int]]

def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
@property
def pack_prefix(self):
if self.endianess==GGUFEndian.LITTLE:
return "<"
else:
return ">"

def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True, endianess=GGUFEndian.LITTLE):
self.fout = open(path, "wb")
self.arch = arch
self.endianess = endianess
self._simple_value_packing = {
GGUFValueType.UINT8: f"{self.pack_prefix}B",
GGUFValueType.INT8: f"{self.pack_prefix}b",
GGUFValueType.UINT16: f"{self.pack_prefix}H",
GGUFValueType.INT16: f"{self.pack_prefix}h",
GGUFValueType.UINT32: f"{self.pack_prefix}I",
GGUFValueType.INT32: f"{self.pack_prefix}i",
GGUFValueType.FLOAT32: f"{self.pack_prefix}f",
GGUFValueType.UINT64: f"{self.pack_prefix}Q",
GGUFValueType.INT64: f"{self.pack_prefix}q",
GGUFValueType.FLOAT64: f"{self.pack_prefix}d",
GGUFValueType.BOOL: "?" ,
}
self.add_architecture()
self.use_temp_file = use_temp_file
self.tensors = []
endianess_str = "Big Endian" if self.endianess == GGUFEndian.BIG else "Little Endian"
print(f"This gguf file is for {endianess_str} only")

def write_header_to_file(self):
self.fout.write(struct.pack("<I", GGUF_MAGIC))
self.fout.write(struct.pack("<I", GGUF_VERSION))
self.fout.write(struct.pack("<Q", self.ti_data_count))
self.fout.write(struct.pack("<Q", self.kv_data_count))
self.fout.write(struct.pack(f"{self.pack_prefix}I", GGUF_VERSION))
self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.ti_data_count))
self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.kv_data_count))
self.flush()
# print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))

Expand Down Expand Up @@ -727,40 +755,27 @@ def add_array(self, key: str, val: Sequence[Any]):
self.add_key(key)
self.add_val(val, GGUFValueType.ARRAY)

_simple_value_packing = {
GGUFValueType.UINT8: "<B",
GGUFValueType.INT8: "<b",
GGUFValueType.UINT16: "<H",
GGUFValueType.INT16: "<h",
GGUFValueType.UINT32: "<I",
GGUFValueType.INT32: "<i",
GGUFValueType.FLOAT32: "<f",
GGUFValueType.UINT64: "<Q",
GGUFValueType.INT64: "<q",
GGUFValueType.FLOAT64: "<d",
GGUFValueType.BOOL: "?" ,
}
def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
if vtype is None:
vtype = GGUFValueType.get_type(val)

if add_vtype:
self.kv_data += struct.pack("<I", vtype)
self.kv_data += struct.pack(f"{self.pack_prefix}I", vtype)
self.kv_data_count += 1

pack_fmt = self._simple_value_packing.get(vtype)
if pack_fmt is not None:
self.kv_data += struct.pack(pack_fmt, val)
elif vtype == GGUFValueType.STRING:
encoded_val = val.encode("utf8") if isinstance(val, str) else val
self.kv_data += struct.pack("<Q", len(encoded_val))
self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_val))
self.kv_data += encoded_val
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
ltype = GGUFValueType.get_type(val[0])
if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
raise ValueError("All items in a GGUF array should be of the same type")
self.kv_data += struct.pack("<I", ltype)
self.kv_data += struct.pack("<Q", len(val))
self.kv_data += struct.pack(f"{self.pack_prefix}I", ltype)
self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(val))
for item in val:
self.add_val(item, add_vtype=False)
else:
Expand All @@ -774,22 +789,24 @@ def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype:
assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

encoded_name = name.encode("utf8")
self.ti_data += struct.pack("<Q", len(encoded_name))
self.ti_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_name))
self.ti_data += encoded_name
n_dims = len(tensor_shape)
self.ti_data += struct.pack("<I", n_dims)
self.ti_data += struct.pack(f"{self.pack_prefix}I", n_dims)
for i in range(n_dims):
self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
self.ti_data += struct.pack(f"{self.pack_prefix}Q", tensor_shape[n_dims - 1 - i])
if raw_dtype is None:
dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
else:
dtype = raw_dtype
self.ti_data += struct.pack("<I", dtype)
self.ti_data += struct.pack("<Q", self.offset_tensor)
self.ti_data += struct.pack(f"{self.pack_prefix}I", dtype)
self.ti_data += struct.pack(f"{self.pack_prefix}Q", self.offset_tensor)
self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
self.ti_data_count += 1

def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
if self.endianess == GGUFEndian.BIG:
tensor.byteswap(inplace=True)
if self.use_temp_file and self.temp_file is None:
fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
fp.seek(0)
Expand All @@ -815,6 +832,8 @@ def write_padding(self, fp: BinaryIO, n: int, align: int | None = None):
fp.write(bytes([0] * pad))

def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
if self.endianess==GGUFEndian.BIG:
tensor.byteswap(inplace=True)
self.write_padding(self.fout, self.fout.tell())
tensor.tofile(self.fout)
self.write_padding(self.fout, tensor.nbytes)
Expand Down
2 changes: 1 addition & 1 deletion gguf-py/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "gguf"
version = "0.4.4"
version = "0.4.5"
description = "Write ML models in GGUF for GGML"
authors = ["GGML <ggml@ggml.ai>"]
packages = [
Expand Down
2 changes: 1 addition & 1 deletion k_quants.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if !defined(__riscv)
#if !defined(__riscv) && !defined(__s390__)
#include <immintrin.h>
#endif
#endif
Expand Down
2 changes: 2 additions & 0 deletions tests/test-double-float.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@

#undef NDEBUG
#include <cassert>
#if !defined(__riscv) && !defined(__s390__)
#include <immintrin.h>
#endif
#include <cmath>
#include <cstdint>
#include <cstring>
Expand Down
Loading