From 061356241297c0f389ea743ce932a302961aed75 Mon Sep 17 00:00:00 2001
From: chenqiny
Date: Mon, 2 Oct 2023 04:28:22 -0400
Subject: [PATCH 1/8] check whether platform is 390x if yes->do not import
 immintrin.h

---
 k_quants.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/k_quants.c b/k_quants.c
index 62085882df71c..7db605c11ee56 100644
--- a/k_quants.c
+++ b/k_quants.c
@@ -46,7 +46,7 @@ inline static int32_t vaddvq_s32(int32x4_t v) {
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
-#if !defined(__riscv)
+#if !defined(__riscv) && !defined(__s390__)
 #include <immintrin.h>
 #endif
 #endif
From fa62c8c73a58f4f6ac88097a936862f90c666e0e Mon Sep 17 00:00:00 2001
From: chenqiny
Date: Sun, 8 Oct 2023 11:47:39 +0800
Subject: [PATCH 2/8] support s390x big endian

---
 convert.py                  |  1 +
 gguf-py/gguf/gguf.py        | 49 +++++++++++++++++++-------------------
 tests/test-double-float.cpp |  2 ++
 3 files changed, 28 insertions(+), 24 deletions(-)

diff --git a/convert.py b/convert.py
index 8bb6c7e410852..20e27aa426cfc 100755
--- a/convert.py
+++ b/convert.py
@@ -947,6 +947,7 @@ def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyM
             elapsed = time.time() - start
             size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
             padi = len(str(len(model)))
+            ndarray.byteswap(inplace=True)
             print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}")
             of.gguf.write_tensor_data(ndarray)

diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py
index 598cf8e594aa8..2e997f72a11d1 100644
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -22,6 +22,7 @@
 GGUF_VERSION = 2
 GGUF_DEFAULT_ALIGNMENT = 32

+
 # general
 KEY_GENERAL_ARCHITECTURE = "general.architecture"
 KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
@@ -428,7 +429,6 @@ class GGMLQuantizationType(IntEnum):
     Q6_K = 14
     Q8_K = 15

-
 class GGUFValueType(IntEnum):
     UINT8 = 0
     INT8 = 1
@@ -483,10 +483,10 @@ def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True
         self.tensors = []

     def write_header_to_file(self):
-        self.fout.write(struct.pack("<I", GGUF_MAGIC))
-        self.fout.write(struct.pack("<I", GGUF_VERSION))
-        self.fout.write(struct.pack("<Q", self.ti_data_count))
-        self.fout.write(struct.pack("<Q", self.kv_data_count))
+        self.fout.write(struct.pack(">I", GGUF_MAGIC))
+        self.fout.write(struct.pack(">I", GGUF_VERSION))
+        self.fout.write(struct.pack(">Q", self.ti_data_count))
+        self.fout.write(struct.pack(">Q", self.kv_data_count))
         self.flush()
         # print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
@@ -559,16 +559,16 @@ def add_array(self, key: str, val: Sequence[Any]):
         self.add_val(val, GGUFValueType.ARRAY)

     _simple_value_packing = {
-        GGUFValueType.UINT8:   "<B",
-        GGUFValueType.INT8:    "<b",
-        GGUFValueType.UINT16:  "<H",
-        GGUFValueType.INT16:   "<h",
-        GGUFValueType.UINT32:  "<I",
-        GGUFValueType.INT32:   "<i",
-        GGUFValueType.FLOAT32: "<f",
-        GGUFValueType.UINT64:  "<Q",
-        GGUFValueType.INT64:   "<q",
-        GGUFValueType.FLOAT64: "<d",
+        GGUFValueType.UINT8:   f"{GGUF_ENDIANESS}B",
+        GGUFValueType.INT8:    f"{GGUF_ENDIANESS.}b",
+        GGUFValueType.UINT16:  f"{GGUF_ENDIANESS.get}H",
+        GGUFValueType.INT16:   ">h",
+        GGUFValueType.UINT32:  ">I",
+        GGUFValueType.INT32:   ">i",
+        GGUFValueType.FLOAT32: ">f",
+        GGUFValueType.UINT64:  ">Q",
+        GGUFValueType.INT64:   ">q",
+        GGUFValueType.FLOAT64: ">d",
         GGUFValueType.BOOL:    "?" ,
     }
@@ -576,7 +576,7 @@ def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool
             vtype = GGUFValueType.get_type(val)

         if add_vtype:
-            self.kv_data += struct.pack("<I", vtype)
+            self.kv_data += struct.pack(">I", vtype)
             self.kv_data_count += 1

         pack_fmt = self._simple_value_packing.get(vtype)
@@ -584,14 +584,14 @@ def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool
             self.kv_data += struct.pack(pack_fmt, val)
         elif vtype == GGUFValueType.STRING:
             encoded_val = val.encode("utf8") if isinstance(val, str) else val
-            self.kv_data += struct.pack("<Q", len(encoded_val))
+            self.kv_data += struct.pack(">Q", len(encoded_val))
             self.kv_data += encoded_val
         elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
             ltype = GGUFValueType.get_type(val[0])
             if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
                 raise ValueError("All items in a GGUF array should be of the same type")
-            self.kv_data += struct.pack("<I", ltype)
-            self.kv_data += struct.pack("<Q", len(val))
+            self.kv_data += struct.pack(">I", ltype)
+            self.kv_data += struct.pack(">Q", len(val))
             for item in val:
                 self.add_val(item, add_vtype=False)
         else:
@@ -605,22 +605,23 @@ def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype:
         assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

         encoded_name = name.encode("utf8")
-        self.ti_data += struct.pack("<Q", len(encoded_name))
+        self.ti_data += struct.pack(">Q", len(encoded_name))
         self.ti_data += encoded_name
         n_dims = len(tensor_shape)
-        self.ti_data += struct.pack("<I", n_dims)
+        self.ti_data += struct.pack(">I", n_dims)
         for i in range(n_dims):
-            self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
+            self.ti_data += struct.pack(">Q", tensor_shape[n_dims - 1 - i])
         if raw_dtype is None:
             dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
         else:
             dtype = raw_dtype
-        self.ti_data += struct.pack("<I", dtype)
-        self.ti_data += struct.pack("<Q", self.offset_tensor)
+        self.ti_data += struct.pack(">I", dtype)
+        self.ti_data += struct.pack(">Q", self.offset_tensor)
         self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
         self.ti_data_count += 1

     def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
+        tensor.byteswap(inplace=True)
         if self.use_temp_file and self.temp_file is None:
             fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
             fp.seek(0)

diff --git a/tests/test-double-float.cpp b/tests/test-double-float.cpp
index b506f273fee9f..afd7bf77fcb55 100644
--- a/tests/test-double-float.cpp
+++ b/tests/test-double-float.cpp
@@ -4,7 +4,9 @@
 #undef NDEBUG
 #include <cassert>
+#if !defined(__riscv) && !defined(__s390__)
 #include <immintrin.h>
+#endif
 #include <cmath>
 #include <cstdint>
 #include <cstring>
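Note, as a standalone illustration separate from the patch itself: in Python's struct module the format prefix selects byte order, which is why the patch flips "<" (little-endian) to ">" (big-endian); numpy arrays bypass struct entirely, so tensor payloads must be byte-swapped on their own. A minimal sketch of both effects:

    import struct
    import numpy as np

    GGUF_MAGIC = 0x46554747  # the bytes "GGUF" when laid out little-endian

    print(struct.pack("<I", GGUF_MAGIC))  # b'GGUF' - little-endian layout
    print(struct.pack(">I", GGUF_MAGIC))  # b'FUGG' - big-endian layout

    # Tensor data is written via numpy, not struct, so each element's bytes
    # must be reordered explicitly when targeting a big-endian file:
    a = np.array([1.0], dtype=np.float32)
    print(a.tobytes().hex())                          # 0000803f on a little-endian host
    print(a.byteswap(inplace=False).tobytes().hex())  # 3f800000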
From 1ce890a7c0007564dc57c8c70ff37fec83973d12 Mon Sep 17 00:00:00 2001
From: chenqiny
Date: Mon, 9 Oct 2023 10:40:41 +0800
Subject: [PATCH 3/8] support --bigendian option for s390x

1. verified with baichuan7b-chat with float 16 on s390x
2. verified with baichuan7b-chat
3. verified with chinese-alpaca-2-13b-f16

---
 convert-baichuan-hf-to-gguf.py |  7 +++-
 convert.py                     | 23 ++++++-----
 gguf-py/gguf/gguf.py           | 73 +++++++++++++++++++++-------------
 3 files changed, 65 insertions(+), 38 deletions(-)

diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py
index 8bd34dc440769..eac2663fc7a63 100755
--- a/convert-baichuan-hf-to-gguf.py
+++ b/convert-baichuan-hf-to-gguf.py
@@ -73,6 +73,7 @@ def parse_args() -> argparse.Namespace:
         "ftype", type=int, choices=[0, 1], default=1, nargs='?',
         help="output format - use 0 for float32, 1 for float16",
     )
+    parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")
     return parser.parse_args()

 args = parse_args()
@@ -83,6 +84,10 @@ def parse_args() -> argparse.Namespace:
     print(f'Error: {args.model} is not a directory', file = sys.stderr)
     sys.exit(1)

+endianess =gguf.GGUFEndian.LITTLE
+if args.bigendian:
+    endianess = gguf.GGUFEndian.BIG
+print(f"gguf: Conversion Endianess {endianess}")
 # possible tensor data types
 # ftype == 0 -> float32
 # ftype == 1 -> float16
@@ -110,7 +115,7 @@ def parse_args() -> argparse.Namespace:
 num_parts = count_model_parts(dir_model)
 print(f"num_parts:{num_parts}\n")
 ARCH=gguf.MODEL_ARCH.BAICHUAN
-gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

 print("gguf: get model metadata")

diff --git a/convert.py b/convert.py
index 20e27aa426cfc..444b04ca6b6bc 100755
--- a/convert.py
+++ b/convert.py
@@ -818,8 +818,8 @@ def check_vocab_size(params: Params, vocab: Vocab) -> None:

 class OutputFile:
-    def __init__(self, fname_out: Path) -> None:
-        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+    def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
+        self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

     def add_meta_arch(self, params: Params) -> None:
         name = "LLaMA"
@@ -890,10 +890,10 @@ def close(self) -> None:
         self.gguf.close()

     @staticmethod
-    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab) -> None:
+    def write_vocab_only(fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab, endianess:gguf.GGUFEndian=gguf.GGUFEndian.LITTLE) -> None:
         check_vocab_size(params, vocab)

-        of = OutputFile(fname_out)
+        of = OutputFile(fname_out, endianess=endianess)

         # meta data
         of.add_meta_arch(params)
@@ -918,10 +918,10 @@ def maybe_do_quantize(item: tuple[DataType, NDArray]) -> NDArray:
         return dt.quantize(arr)

     @staticmethod
-    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY) -> None:
+    def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess=gguf.GGUFEndian.LITTLE) -> None:
         check_vocab_size(params, vocab)

-        of = OutputFile(fname_out)
+        of = OutputFile(fname_out, endianess=endianess)

         # meta data
         of.add_meta_arch(params)
@@ -947,7 +947,8 @@ def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyM
             elapsed = time.time() - start
             size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
             padi = len(str(len(model)))
-            ndarray.byteswap(inplace=True)
+            if endianess==gguf.GGUFEndian.BIG:
+                ndarray.byteswap(inplace=True)
             print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}")
             of.gguf.write_tensor_data(ndarray)

@@ -1139,8 +1140,9 @@ def main(args_in: list[str] | None = None) -> None:
     parser.add_argument("--vocabtype", choices=["spm", "bpe"], help="vocab format (default: spm)", default="spm")
     parser.add_argument("--ctx", type=int, help="model training context (default: based on input)")
     parser.add_argument("--concurrency", type=int, help=f"concurrency used for conversion (default: {DEFAULT_CONCURRENCY})", default = DEFAULT_CONCURRENCY)
-    args = parser.parse_args(args_in)
+    parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine")

+    args = parser.parse_args(args_in)
     if args.dump_single:
         model_plus = lazy_load_file(args.model)
         do_dump_model(model_plus)
@@ -1154,6 +1156,9 @@ def main(args_in: list[str] | None = None) -> None:
     if args.dump:
         do_dump_model(model_plus)
         return
+    endianess = gguf.GGUFEndian.LITTLE
+    if args.bigendian:
+        endianess = gguf.GGUFEndian.BIG

     params = Params.load(model_plus)
     if params.n_ctx == -1:
@@ -1201,7 +1206,7 @@ def main(args_in: list[str] | None = None) -> None:
     params.ftype = ftype
     print(f"Writing {outfile}, format {ftype}")

-    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency)
+    OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency = args.concurrency, endianess=endianess)
     print(f"Wrote {outfile}")
diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py
index 2e997f72a11d1..bcb543eaedce1 100644
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -429,6 +429,11 @@ class GGMLQuantizationType(IntEnum):
     Q6_K = 14
     Q8_K = 15

+class GGUFEndian(IntEnum):
+    LITTLE = 0
+    BIG = 1
+
+
 class GGUFValueType(IntEnum):
     UINT8 = 0
     INT8 = 1
@@ -475,18 +480,41 @@ class GGUFWriter:
     temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
     tensors: list[tuple[np.ndarray[Any, Any], int]]

-    def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True):
+    def get_pack_prefix(self):
+        if self.endianess==GGUFEndian.LITTLE:
+            return "<"
+        else:
+            return ">"
+
+    def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True, endianess=GGUFEndian.LITTLE):
         self.fout = open(path, "wb")
         self.arch = arch
+        self.endianess = endianess
+        self._simple_value_packing = {
+            GGUFValueType.UINT8: f"{self.get_pack_prefix()}B",
+            GGUFValueType.INT8: f"{self.get_pack_prefix()}b",
+            GGUFValueType.UINT16: f"{self.get_pack_prefix()}H",
+            GGUFValueType.INT16: f"{self.get_pack_prefix()}h",
+            GGUFValueType.UINT32: f"{self.get_pack_prefix()}I",
+            GGUFValueType.INT32: f"{self.get_pack_prefix()}i",
+            GGUFValueType.FLOAT32: f"{self.get_pack_prefix()}f",
+            GGUFValueType.UINT64: f"{self.get_pack_prefix()}Q",
+            GGUFValueType.INT64: f"{self.get_pack_prefix()}q",
+            GGUFValueType.FLOAT64: f"{self.get_pack_prefix()}d",
+            GGUFValueType.BOOL: "?" ,
+        }
         self.add_architecture()
         self.use_temp_file = use_temp_file
         self.tensors = []
+
+
+        print(f"This gguf file is for {self.endianess} only")

     def write_header_to_file(self):
-        self.fout.write(struct.pack(">I", GGUF_MAGIC))
-        self.fout.write(struct.pack(">I", GGUF_VERSION))
-        self.fout.write(struct.pack(">Q", self.ti_data_count))
-        self.fout.write(struct.pack(">Q", self.kv_data_count))
+        self.fout.write(struct.pack(f"{self.get_pack_prefix()}I", GGUF_MAGIC))
+        self.fout.write(struct.pack(f"{self.get_pack_prefix()}I", GGUF_VERSION))
+        self.fout.write(struct.pack(f"{self.get_pack_prefix()}Q", self.ti_data_count))
+        self.fout.write(struct.pack(f"{self.get_pack_prefix()}Q", self.kv_data_count))
         self.flush()
         # print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
@@ -558,25 +586,13 @@ def add_array(self, key: str, val: Sequence[Any]):
         self.add_key(key)
         self.add_val(val, GGUFValueType.ARRAY)

-    _simple_value_packing = {
-        GGUFValueType.UINT8: f"{GGUF_ENDIANESS}B",
-        GGUFValueType.INT8: f"{GGUF_ENDIANESS.}b",
-        GGUFValueType.UINT16: f"{GGUF_ENDIANESS.get}H",
-        GGUFValueType.INT16: ">h",
-        GGUFValueType.UINT32: ">I",
-        GGUFValueType.INT32: ">i",
-        GGUFValueType.FLOAT32: ">f",
-        GGUFValueType.UINT64: ">Q",
-        GGUFValueType.INT64: ">q",
-        GGUFValueType.FLOAT64: ">d",
-        GGUFValueType.BOOL: "?" ,
-    }
+
     def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
         if vtype is None:
             vtype = GGUFValueType.get_type(val)

         if add_vtype:
-            self.kv_data += struct.pack(">I", vtype)
+            self.kv_data += struct.pack(f"{self.get_pack_prefix()}I", vtype)
             self.kv_data_count += 1

         pack_fmt = self._simple_value_packing.get(vtype)
@@ -584,14 +600,14 @@ def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool
             self.kv_data += struct.pack(pack_fmt, val)
         elif vtype == GGUFValueType.STRING:
             encoded_val = val.encode("utf8") if isinstance(val, str) else val
-            self.kv_data += struct.pack(">Q", len(encoded_val))
+            self.kv_data += struct.pack(f"{self.get_pack_prefix()}Q", len(encoded_val))
             self.kv_data += encoded_val
         elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
             ltype = GGUFValueType.get_type(val[0])
             if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
                 raise ValueError("All items in a GGUF array should be of the same type")
-            self.kv_data += struct.pack(">I", ltype)
-            self.kv_data += struct.pack(">Q", len(val))
+            self.kv_data += struct.pack(f"{self.get_pack_prefix()}I", ltype)
+            self.kv_data += struct.pack(f"{self.get_pack_prefix()}Q", len(val))
             for item in val:
                 self.add_val(item, add_vtype=False)
         else:
@@ -605,22 +621,24 @@ def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype:
         assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

         encoded_name = name.encode("utf8")
-        self.ti_data += struct.pack(">Q", len(encoded_name))
+        self.ti_data += struct.pack(f"{self.get_pack_prefix()}Q", len(encoded_name))
         self.ti_data += encoded_name
         n_dims = len(tensor_shape)
-        self.ti_data += struct.pack(">I", n_dims)
+        self.ti_data += struct.pack(f"{self.get_pack_prefix()}I", n_dims)
         for i in range(n_dims):
-            self.ti_data += struct.pack(">Q", tensor_shape[n_dims - 1 - i])
+            self.ti_data += struct.pack(f"{self.get_pack_prefix()}Q", tensor_shape[n_dims - 1 - i])
         if raw_dtype is None:
             dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
         else:
             dtype = raw_dtype
-        self.ti_data += struct.pack(">I", dtype)
-        self.ti_data += struct.pack(">Q", self.offset_tensor)
+        self.ti_data += struct.pack(f"{self.get_pack_prefix()}I", dtype)
+        self.ti_data += struct.pack(f"{self.get_pack_prefix()}Q", self.offset_tensor)
         self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
         self.ti_data_count += 1

     def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
-        tensor.byteswap(inplace=True)
+        if self.endianess == GGUFEndian.BIG:
+            tensor.byteswap(inplace=True)
         if self.use_temp_file and self.temp_file is None:
             fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
             fp.seek(0)
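The selection logic this patch introduces is small enough to demonstrate in isolation. A self-contained sketch (names mirror the patch; the GGUFEndian values are the ones defined above):

    import struct
    from enum import IntEnum

    class GGUFEndian(IntEnum):  # as added to gguf.py in this patch
        LITTLE = 0
        BIG = 1

    def pack_prefix(endianess: GGUFEndian) -> str:
        # the same decision GGUFWriter.get_pack_prefix() makes
        return "<" if endianess == GGUFEndian.LITTLE else ">"

    # every header, KV, and tensor-info field is then packed with that prefix:
    for e in (GGUFEndian.LITTLE, GGUFEndian.BIG):
        print(e.name, struct.pack(f"{pack_prefix(e)}I", 2).hex())
    # LITTLE 02000000
    # BIG    00000002

With the new flag, a big-endian file would be produced by something like `python convert.py <model dir> --bigendian` (the model path is an example).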
, + } self.add_architecture() self.use_temp_file = use_temp_file self.tensors = [] + + + print(f"This gguf file is for {self.endianess} only") def write_header_to_file(self): - self.fout.write(struct.pack(">I", GGUF_MAGIC)) - self.fout.write(struct.pack(">I", GGUF_VERSION)) - self.fout.write(struct.pack(">Q", self.ti_data_count)) - self.fout.write(struct.pack(">Q", self.kv_data_count)) + self.fout.write(struct.pack(f"{self.get_pack_prefix()}I", GGUF_MAGIC)) + self.fout.write(struct.pack(f"{self.get_pack_prefix()}I", GGUF_VERSION)) + self.fout.write(struct.pack(f"{self.get_pack_prefix()}Q", self.ti_data_count)) + self.fout.write(struct.pack(f"{self.get_pack_prefix()}Q", self.kv_data_count)) self.flush() # print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count)) @@ -558,25 +586,13 @@ def add_array(self, key: str, val: Sequence[Any]): self.add_key(key) self.add_val(val, GGUFValueType.ARRAY) - _simple_value_packing = { - GGUFValueType.UINT8: f"{GGUF_ENDIANESS}B", - GGUFValueType.INT8: f"{GGUF_ENDIANESS.}b", - GGUFValueType.UINT16: f"{GGUF_ENDIANESS.get}H", - GGUFValueType.INT16: ">h", - GGUFValueType.UINT32: ">I", - GGUFValueType.INT32: ">i", - GGUFValueType.FLOAT32: ">f", - GGUFValueType.UINT64: ">Q", - GGUFValueType.INT64: ">q", - GGUFValueType.FLOAT64: ">d", - GGUFValueType.BOOL: "?" , - } + def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True): if vtype is None: vtype = GGUFValueType.get_type(val) if add_vtype: - self.kv_data += struct.pack(">I", vtype) + self.kv_data += struct.pack(f"{self.get_pack_prefix()}I", vtype) self.kv_data_count += 1 pack_fmt = self._simple_value_packing.get(vtype) @@ -584,14 +600,14 @@ def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool self.kv_data += struct.pack(pack_fmt, val) elif vtype == GGUFValueType.STRING: encoded_val = val.encode("utf8") if isinstance(val, str) else val - self.kv_data += struct.pack(">Q", len(encoded_val)) + self.kv_data += struct.pack(f"{self.get_pack_prefix()}Q", len(encoded_val)) self.kv_data += encoded_val elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0: ltype = GGUFValueType.get_type(val[0]) if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]): raise ValueError("All items in a GGUF array should be of the same type") - self.kv_data += struct.pack(">I", ltype) - self.kv_data += struct.pack(">Q", len(val)) + self.kv_data += struct.pack(f"{self.get_pack_prefix()}I", ltype) + self.kv_data += struct.pack(f"{self.get_pack_prefix()}Q", len(val)) for item in val: self.add_val(item, add_vtype=False) else: @@ -605,23 +621,24 @@ def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype: assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now" encoded_name = name.encode("utf8") - self.ti_data += struct.pack(">Q", len(encoded_name)) + self.ti_data += struct.pack(f"{self.get_pack_prefix()}Q", len(encoded_name)) self.ti_data += encoded_name n_dims = len(tensor_shape) - self.ti_data += struct.pack(">I", n_dims) + self.ti_data += struct.pack(f"{self.get_pack_prefix()}I", n_dims) for i in range(n_dims): - self.ti_data += struct.pack(">Q", tensor_shape[n_dims - 1 - i]) + self.ti_data += struct.pack(f"{self.get_pack_prefix()}Q", tensor_shape[n_dims - 1 - i]) if raw_dtype is None: dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16 else: dtype = raw_dtype - self.ti_data += struct.pack(">I", dtype) - 
From 51e9d39117cb99aa48e00917402563eaf7bda1d1 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 11 Oct 2023 09:55:15 +0300
Subject: [PATCH 5/8] Update convert-baichuan-hf-to-gguf.py

---
 convert-baichuan-hf-to-gguf.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py
index 8054823cf027b..c1c080f608584 100755
--- a/convert-baichuan-hf-to-gguf.py
+++ b/convert-baichuan-hf-to-gguf.py
@@ -87,7 +87,7 @@ def parse_args() -> argparse.Namespace:
     print(f'Error: {args.model} is not a directory', file = sys.stderr)
     sys.exit(1)

-endianess =gguf.GGUFEndian.LITTLE
+endianess = gguf.GGUFEndian.LITTLE
 if args.bigendian:
     endianess = gguf.GGUFEndian.BIG
 print(f"gguf: Conversion Endianess {endianess}")
From 7fc0250d1556de0353cfe31bfeea313cd758f57b Mon Sep 17 00:00:00 2001
From: chenqiny
Date: Fri, 13 Oct 2023 00:23:16 +0800
Subject: [PATCH 6/8] 1. check in ggml.c if endianness does not match 2. update
 GGUF version 3. change get_pack_prefix to property 4. update information log

---
 convert-baichuan-hf-to-gguf.py |  1 +
 ggml.c                         |  8 ++++-
 ggml.h                         |  3 +-
 gguf-py/gguf/gguf.py           | 54 ++++++++++++++++++----------------
 gguf-py/pyproject.toml         |  2 +-
 5 files changed, 39 insertions(+), 29 deletions(-)

diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py
index c1c080f608584..a1783f71fb668 100755
--- a/convert-baichuan-hf-to-gguf.py
+++ b/convert-baichuan-hf-to-gguf.py
@@ -90,6 +90,7 @@ def parse_args() -> argparse.Namespace:
 endianess = gguf.GGUFEndian.LITTLE
 if args.bigendian:
     endianess = gguf.GGUFEndian.BIG
+endianess_str = "Big Endian" if args.bigendian else "Little Endian"
 print(f"gguf: Conversion Endianess {endianess}")
 # possible tensor data types
 # ftype == 0 -> float32

diff --git a/ggml.c b/ggml.c
index 6d1776ca46741..04b88c98a837a 100644
--- a/ggml.c
+++ b/ggml.c
@@ -20916,7 +20916,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         gguf_fread_el(file, &magic, sizeof(magic), &offset);

         if (magic != GGUF_MAGIC) {
-            fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
+            if (magic == GGUF_WRONG_ENIAN_MAGIC)
+            {
+                fprintf(stderr, "Endianess of the GGUF file and platform do not match.%s: invalid magic number %08x.\n", __func__, magic);
+            }
+            else {
+                fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
+            }
             fclose(file);
             return NULL;
         }

diff --git a/ggml.h b/ggml.h
index 3eddc44b90fdd..fdd8e31bec70e 100644
--- a/ggml.h
+++ b/ggml.h
@@ -232,7 +232,8 @@
 #define GGML_EXIT_ABORTED 1

 #define GGUF_MAGIC 0x46554747 // "GGUF"
-#define GGUF_VERSION 2
+#define GGUF_WRONG_ENIAN_MAGIC 0x47475546
+#define GGUF_VERSION 3

 #define GGUF_DEFAULT_ALIGNMENT 32
diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py
index c24bbc1ba6400..748d59343c27b 100644
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -19,7 +19,7 @@
 #
 GGUF_MAGIC = 0x46554747
-GGUF_VERSION = 2
+GGUF_VERSION = 3
 GGUF_DEFAULT_ALIGNMENT = 32
@@ -621,7 +621,8 @@ class GGUFWriter:
     temp_file: tempfile.SpooledTemporaryFile[bytes] | None = None
     tensors: list[tuple[np.ndarray[Any, Any], int]]

-    def get_pack_prefix(self):
+    @property
+    def pack_prefix(self):
         if self.endianess==GGUFEndian.LITTLE:
             return "<"
         else:
             return ">"
@@ -632,28 +633,29 @@ def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True
         self.arch = arch
         self.endianess = endianess
         self._simple_value_packing = {
-            GGUFValueType.UINT8: f"{self.get_pack_prefix()}B",
-            GGUFValueType.INT8: f"{self.get_pack_prefix()}b",
-            GGUFValueType.UINT16: f"{self.get_pack_prefix()}H",
-            GGUFValueType.INT16: f"{self.get_pack_prefix()}h",
-            GGUFValueType.UINT32: f"{self.get_pack_prefix()}I",
-            GGUFValueType.INT32: f"{self.get_pack_prefix()}i",
-            GGUFValueType.FLOAT32: f"{self.get_pack_prefix()}f",
-            GGUFValueType.UINT64: f"{self.get_pack_prefix()}Q",
-            GGUFValueType.INT64: f"{self.get_pack_prefix()}q",
-            GGUFValueType.FLOAT64: f"{self.get_pack_prefix()}d",
+            GGUFValueType.UINT8: f"{self.pack_prefix}B",
+            GGUFValueType.INT8: f"{self.pack_prefix}b",
+            GGUFValueType.UINT16: f"{self.pack_prefix}H",
+            GGUFValueType.INT16: f"{self.pack_prefix}h",
+            GGUFValueType.UINT32: f"{self.pack_prefix}I",
+            GGUFValueType.INT32: f"{self.pack_prefix}i",
+            GGUFValueType.FLOAT32: f"{self.pack_prefix}f",
+            GGUFValueType.UINT64: f"{self.pack_prefix}Q",
+            GGUFValueType.INT64: f"{self.pack_prefix}q",
+            GGUFValueType.FLOAT64: f"{self.pack_prefix}d",
             GGUFValueType.BOOL: "?" ,
         }
         self.add_architecture()
         self.use_temp_file = use_temp_file
         self.tensors = []
-        print(f"This gguf file is for {self.endianess} only")
+        endianess_str = "Big Endian" if self.endianess == GGUFEndian.BIG else "Little Endian"
+        print(f"This gguf file is for {endianess_str} only")

     def write_header_to_file(self):
-        self.fout.write(struct.pack(f"{self.get_pack_prefix()}I", GGUF_MAGIC))
-        self.fout.write(struct.pack(f"{self.get_pack_prefix()}I", GGUF_VERSION))
-        self.fout.write(struct.pack(f"{self.get_pack_prefix()}Q", self.ti_data_count))
-        self.fout.write(struct.pack(f"{self.get_pack_prefix()}Q", self.kv_data_count))
+        self.fout.write(struct.pack(f"{self.pack_prefix}I", GGUF_MAGIC))
+        self.fout.write(struct.pack(f"{self.pack_prefix}I", GGUF_VERSION))
+        self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.ti_data_count))
+        self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.kv_data_count))
         self.flush()
         # print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
@@ -730,7 +732,7 @@ def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool
             vtype = GGUFValueType.get_type(val)

         if add_vtype:
-            self.kv_data += struct.pack(f"{self.get_pack_prefix()}I", vtype)
+            self.kv_data += struct.pack(f"{self.pack_prefix}I", vtype)
             self.kv_data_count += 1

         pack_fmt = self._simple_value_packing.get(vtype)
@@ -738,14 +740,14 @@ def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool
             self.kv_data += struct.pack(pack_fmt, val)
         elif vtype == GGUFValueType.STRING:
             encoded_val = val.encode("utf8") if isinstance(val, str) else val
-            self.kv_data += struct.pack(f"{self.get_pack_prefix()}Q", len(encoded_val))
+            self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_val))
             self.kv_data += encoded_val
         elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
             ltype = GGUFValueType.get_type(val[0])
             if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
                 raise ValueError("All items in a GGUF array should be of the same type")
-            self.kv_data += struct.pack(f"{self.get_pack_prefix()}I", ltype)
-            self.kv_data += struct.pack(f"{self.get_pack_prefix()}Q", len(val))
+            self.kv_data += struct.pack(f"{self.pack_prefix}I", ltype)
+            self.kv_data += struct.pack(f"{self.pack_prefix}Q", len(val))
             for item in val:
                 self.add_val(item, add_vtype=False)
         else:
@@ -759,18 +761,18 @@ def add_tensor_info(self, name: str, tensor_shape: Sequence[int], tensor_dtype:
         assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"

         encoded_name = name.encode("utf8")
-        self.ti_data += struct.pack(f"{self.get_pack_prefix()}Q", len(encoded_name))
+        self.ti_data += struct.pack(f"{self.pack_prefix}Q", len(encoded_name))
         self.ti_data += encoded_name
         n_dims = len(tensor_shape)
-        self.ti_data += struct.pack(f"{self.get_pack_prefix()}I", n_dims)
+        self.ti_data += struct.pack(f"{self.pack_prefix}I", n_dims)
         for i in range(n_dims):
-            self.ti_data += struct.pack(f"{self.get_pack_prefix()}Q", tensor_shape[n_dims - 1 - i])
+            self.ti_data += struct.pack(f"{self.pack_prefix}Q", tensor_shape[n_dims - 1 - i])
         if raw_dtype is None:
             dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
         else:
             dtype = raw_dtype
-        self.ti_data += struct.pack(f"{self.get_pack_prefix()}I", dtype)
-        self.ti_data += struct.pack(f"{self.get_pack_prefix()}Q", self.offset_tensor)
+        self.ti_data += struct.pack(f"{self.pack_prefix}I", dtype)
+        self.ti_data += struct.pack(f"{self.pack_prefix}Q", self.offset_tensor)
         self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
         self.ti_data_count += 1

diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml
index 07a7ab4dd11fc..f0741a7c23e03 100644
--- a/gguf-py/pyproject.toml
+++ b/gguf-py/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gguf"
-version = "0.4.4"
+version = "0.4.5"
 description = "Write ML models in GGUF for GGML"
 authors = ["GGML "]
 packages = [
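The wrong-endian check added to ggml.c works because a byte-swapped reader sees the magic "GGUF" as 0x47475546 rather than 0x46554747. A standalone sketch of the same detection (Python for brevity; the constant is spelled GGUF_WRONG_ENIAN_MAGIC in the patch, and the read below is explicitly little-endian where the C code uses the host's native order):

    import struct

    GGUF_MAGIC              = 0x46554747  # b"GGUF" interpreted little-endian
    GGUF_WRONG_ENDIAN_MAGIC = 0x47475546  # the same bytes with swapped order

    def check_magic(header: bytes) -> str:
        # mirrors the check this commit adds to gguf_init_from_file()
        (magic,) = struct.unpack("<I", header[:4])
        if magic == GGUF_MAGIC:
            return "ok"
        if magic == GGUF_WRONG_ENDIAN_MAGIC:
            return "endianness of the GGUF file and platform do not match"
        return f"invalid magic number {magic:08x}"

    print(check_magic(b"GGUF"))  # ok
    print(check_magic(b"FUGG"))  # the wrong-endian case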
From e513abe37e5994873a6d3f8a1adcb1bcfc6a8b40 Mon Sep 17 00:00:00 2001
From: chenqiny
Date: Sun, 15 Oct 2023 23:59:53 +0800
Subject: [PATCH 7/8] always use "GGUF" as beginning of GGUF file

---
 ggml.c               |  8 +-------
 ggml.h               | 14 ++++++++++++--
 gguf-py/gguf/gguf.py |  2 +-
 3 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/ggml.c b/ggml.c
index 04b88c98a837a..6d1776ca46741 100644
--- a/ggml.c
+++ b/ggml.c
@@ -20916,13 +20916,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         gguf_fread_el(file, &magic, sizeof(magic), &offset);

         if (magic != GGUF_MAGIC) {
-            if (magic == GGUF_WRONG_ENIAN_MAGIC)
-            {
-                fprintf(stderr, "Endianess of the GGUF file and platform do not match.%s: invalid magic number %08x.\n", __func__, magic);
-            }
-            else {
-                fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
-            }
+            fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
             fclose(file);
             return NULL;
         }

diff --git a/ggml.h b/ggml.h
index fdd8e31bec70e..10ae3c033bab9 100644
--- a/ggml.h
+++ b/ggml.h
@@ -231,8 +231,18 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1

-#define GGUF_MAGIC 0x46554747 // "GGUF"
-#define GGUF_WRONG_ENIAN_MAGIC 0x47475546
+#if defined(__linux__)
+    #include <endian.h>
+    #if BYTE_ORDER == LITTLE_ENDIAN
+        #define GGUF_MAGIC 0x46554747
+    #elif BYTE_ORDER == BIG_ENDIAN
+        #define GGUF_MAGIC 0x47475546
+    #endif
+#else
+    // Use little endian magic uint_32 value
+    #define GGUF_MAGIC 0x46554747
+#endif
+
 #define GGUF_VERSION 3

 #define GGUF_DEFAULT_ALIGNMENT 32

diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py
index 748d59343c27b..a5f92dd6df958 100644
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -652,7 +652,7 @@ def __init__(self, path: os.PathLike[str] | str, arch: str, use_temp_file = True
         print(f"This gguf file is for {endianess_str} only")

     def write_header_to_file(self):
-        self.fout.write(struct.pack(f"{self.pack_prefix}I", GGUF_MAGIC))
+        self.fout.write(struct.pack("<I", GGUF_MAGIC))
         self.fout.write(struct.pack(f"{self.pack_prefix}I", GGUF_VERSION))
         self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.ti_data_count))
         self.fout.write(struct.pack(f"{self.pack_prefix}Q", self.kv_data_count))
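On Linux the revised ggml.h now picks GGUF_MAGIC from the host byte order via <endian.h>, so the file always begins with the literal characters "GGUF". A sketch of the same invariant (sys.byteorder plays the role of BYTE_ORDER here):

    import struct
    import sys

    # the value ggml.h would select on this host
    native_magic = 0x46554747 if sys.byteorder == "little" else 0x47475546

    # packed with native byte order ("="), both choices yield the same
    # four bytes on disk: the characters "GGUF"
    assert struct.pack("=I", native_magic) == b"GGUF"
    print("first four bytes:", struct.pack("=I", native_magic))

The gguf.py side achieves the same thing by always packing the magic with "<I", which emits the literal bytes b"GGUF" for the 0x46554747 constant regardless of the target endianness.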
From ... Mon Sep 17 00:00:00 2001
From: chenqiny
Date: Fri, 20 Oct 2023 18:45:19 +0800
Subject: [PATCH 8/8] Compare "GGUF" with file header char by char

1. Set GGUF_MAGIC to "GGUF" string instead of int value
2. Compare "GGUF" char by char to ensure its byte order
3. Move bytes swap code from convert.py to gguf.py write_tensor_data

---
 convert.py                      |  2 --
 .../convert-llama2c-to-ggml.cpp |  2 +-
 ggml.c                          | 19 +++++++++++--------
 ggml.h                          | 12 +-----------
 gguf-py/gguf/gguf.py            |  2 ++
 5 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/convert.py b/convert.py
index 59f6222ed5a68..24da25efcaca1 100755
--- a/convert.py
+++ b/convert.py
@@ -932,8 +932,6 @@ def write_all(fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyM
             elapsed = time.time() - start
             size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
             padi = len(str(len(model)))
-            if endianess==gguf.GGUFEndian.BIG:
-                ndarray.byteswap(inplace=True)
             print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}")
             of.gguf.write_tensor_data(ndarray)

diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index c291f0adf20e1..cae3bf3c3dc65 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -536,7 +536,7 @@ static bool is_ggml_file(const char * filename) {
     if (file.size < 4) {
         return false;
     }
-    uint32_t magic = file.read_u32();
+    std::string magic = file.read_string(4);
     return magic == GGUF_MAGIC;
 }

diff --git a/ggml.c b/ggml.c
index 6d1776ca46741..a24933a1b9907 100644
--- a/ggml.c
+++ b/ggml.c
@@ -20813,7 +20813,7 @@ struct gguf_kv {
 };

 struct gguf_header {
-    uint32_t magic;
+    char magic[4];
     uint32_t version;
     uint64_t n_tensors; // GGUFv2
     uint64_t n_kv;      // GGUFv2
@@ -20883,7 +20883,7 @@ static bool gguf_fread_str_v1(FILE * file, struct gguf_str * p, size_t * offset)
 struct gguf_context * gguf_init_empty(void) {
     struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context));

-    ctx->header.magic = GGUF_MAGIC;
+    memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic));
     ctx->header.version = GGUF_VERSION;
     ctx->header.n_tensors = 0;
     ctx->header.n_kv = 0;
@@ -20909,16 +20909,18 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
     // offset from start of file
     size_t offset = 0;

-    uint32_t magic = 0;
+    char magic[4];

     // check the magic before making allocations
     {
         gguf_fread_el(file, &magic, sizeof(magic), &offset);

-        if (magic != GGUF_MAGIC) {
-            fprintf(stderr, "%s: invalid magic number %08x\n", __func__, magic);
-            fclose(file);
-            return NULL;
+        for (uint32_t i = 0; i < sizeof(magic); i++) {
+            if (magic[i] != GGUF_MAGIC[i]) {
+                fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
+                fclose(file);
+                return NULL;
+            }
         }
     }
@@ -20928,7 +20930,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p

     // read the header
     {
-        ctx->header.magic = magic;
+        strncpy(ctx->header.magic, magic, 4);
+
         ctx->kv    = NULL;
         ctx->infos = NULL;
diff --git a/ggml.h b/ggml.h
index 10ae3c033bab9..c748fea941cff 100644
--- a/ggml.h
+++ b/ggml.h
@@ -231,17 +231,7 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1

-#if defined(__linux__)
-    #include <endian.h>
-    #if BYTE_ORDER == LITTLE_ENDIAN
-        #define GGUF_MAGIC 0x46554747
-    #elif BYTE_ORDER == BIG_ENDIAN
-        #define GGUF_MAGIC 0x47475546
-    #endif
-#else
-    // Use little endian magic uint_32 value
-    #define GGUF_MAGIC 0x46554747
-#endif
+#define GGUF_MAGIC "GGUF"

 #define GGUF_VERSION 3

 #define GGUF_DEFAULT_ALIGNMENT 32

diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py
index a5f92dd6df958..16e7359792632 100644
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -804,6 +804,8 @@ def write_padding(self, fp: BinaryIO, n: int, align: int | None = None):
             fp.write(bytes([0] * pad))

     def write_tensor_data(self, tensor: np.ndarray[Any, Any]):
+        if self.endianess==GGUFEndian.BIG:
+            tensor.byteswap(inplace=True)
         self.write_padding(self.fout, self.fout.tell())
         tensor.tofile(self.fout)
         self.write_padding(self.fout, tensor.nbytes)
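The end state of the series is easy to restate: the magic is the four characters "GGUF" compared byte by byte, which is endian-independent by construction, and all byte swapping for big-endian targets now happens in exactly one place, GGUFWriter.write_tensor_data. A minimal reader-side sketch of the final check:

    GGUF_MAGIC = b"GGUF"  # a byte string now, not an integer

    def is_gguf(path: str) -> bool:
        # byte-by-byte comparison, as in the final gguf_init_from_file();
        # the result is the same on x86 and s390x because no integer
        # interpretation of the header is involved
        with open(path, "rb") as f:
            return f.read(4) == GGUF_MAGIC

Moving the swap into write_tensor_data keeps convert.py oblivious to endianness and guarantees each tensor is swapped exactly once, immediately before it is written to the file.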