From 85e4e2b77790bf304318422e837f9a4d5cb61a6a Mon Sep 17 00:00:00 2001
From: Galunid
Date: Tue, 21 May 2024 19:06:24 +0200
Subject: [PATCH] Fix CI, scripts, readme files

---
 CMakeLists.txt                     |  2 +-
 README.md                          |  7 ++-
 ci/run.sh                          |  4 +-
 docs/HOWTO-add-model.md            |  2 +-
 examples/llava/MobileVLM-README.md |  4 +-
 examples/llava/README.md           |  6 +-
 examples/make-ggml.py              | 98 ------------------------------
 scripts/convert-gg.sh              | 20 +++---
 scripts/pod-llama.sh               | 14 ++---
 9 files changed, 30 insertions(+), 127 deletions(-)
 delete mode 100755 examples/make-ggml.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9cc60039a8416..9abf7d91de804 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1305,7 +1305,7 @@ set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}
 install(TARGETS llama LIBRARY PUBLIC_HEADER)

 install(
-    FILES convert.py
+    FILES convert-hf-to-gguf.py
     PERMISSIONS
         OWNER_READ
         OWNER_WRITE
diff --git a/README.md b/README.md
index f4088c05e6eee..6b64d2acbc341 100644
--- a/README.md
+++ b/README.md
@@ -690,7 +690,8 @@ Building the program with BLAS support may lead to some performance improvements

 To obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.

-Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
+Note: `convert.py` has been moved to `examples/convert-no-torch.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
+It does not support LLaMA 3; use `convert-hf-to-gguf.py` with LLaMA 3 models downloaded from Hugging Face.

 ```bash
 # obtain the official LLaMA model weights and place them in ./models
@@ -707,10 +708,10 @@ ls ./models
 python3 -m pip install -r requirements.txt

 # convert the model to ggml FP16 format
-python3 convert.py models/mymodel/
+python3 convert-hf-to-gguf.py models/mymodel/

 # [Optional] for models using BPE tokenizers
-python convert.py models/mymodel/ --vocab-type bpe
+python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe

 # quantize the model to 4-bits (using Q4_K_M method)
 ./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
diff --git a/ci/run.sh b/ci/run.sh
index d5972480bc6c1..e86b7f3f36844 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -282,7 +282,7 @@ function gg_run_open_llama_3b_v2 {
     (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
     (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert.py ${path_models}
+    python3 ../examples/convert-no-torch.py ${path_models}

     model_f16="${path_models}/ggml-model-f16.gguf"
     model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -417,7 +417,7 @@ function gg_run_open_llama_7b_v2 {
     (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
     (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert.py ${path_models}
+    python3 ../examples/convert-no-torch.py ${path_models}

     model_f16="${path_models}/ggml-model-f16.gguf"
     model_q8_0="${path_models}/ggml-model-q8_0.gguf"
diff --git a/docs/HOWTO-add-model.md b/docs/HOWTO-add-model.md
index 48769cdf61092..873ebcd0b0bb3 100644
--- a/docs/HOWTO-add-model.md
+++ b/docs/HOWTO-add-model.md
@@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
 ### 1. Convert the model to GGUF

 This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
-Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).
+Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-no-torch.py](../examples/convert-no-torch.py) (for `llama/llama2` models in `.pth` format).

 The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.

diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md
index 413e433dd9c07..3b107709b2f59 100644
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -54,10 +54,10 @@ python ./examples/llava/convert-image-encoder-to-gguf \
     --projector-type ldpv2
 ```

-4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+4. Use `examples/convert-no-torch.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./convert.py path/to/MobileVLM-1.7B
+python ./examples/convert-no-torch.py path/to/MobileVLM-1.7B
 ```

 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
diff --git a/examples/llava/README.md b/examples/llava/README.md
index 4fb0cf3816383..17becafde21f6 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -50,10 +50,10 @@ python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
 python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
 ```

-5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+5. Use `examples/convert-no-torch.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./convert.py ../llava-v1.5-7b --skip-unknown
+python ./examples/convert-no-torch.py ../llava-v1.5-7b --skip-unknown
 ```

 Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
@@ -92,7 +92,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projecto
 6) Then convert the model to gguf format:

 ```console
-python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
+python ./examples/convert-no-torch.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```

 7) And finally we can run the llava-cli using the 1.6 model version:
diff --git a/examples/make-ggml.py b/examples/make-ggml.py
deleted file mode 100755
index c73485ebf1eff..0000000000000
--- a/examples/make-ggml.py
+++ /dev/null
@@ -1,98 +0,0 @@
-#!/usr/bin/env python3
-"""
-This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them.
-
-Usage:
-python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
-
-Arguments:
-- model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
-- --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.
-- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
-- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
-- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
-- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
-
-Old quant types (some base model types require these):
-- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
-- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
-- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
-- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
-
-New quant types (recommended):
-- Q2_K: smallest, extreme quality loss - not recommended
-- Q3_K: alias for Q3_K_M
-- Q3_K_S: very small, very high quality loss
-- Q3_K_M: very small, very high quality loss
-- Q3_K_L: small, substantial quality loss
-- Q4_K: alias for Q4_K_M
-- Q4_K_S: small, significant quality loss
-- Q4_K_M: medium, balanced quality - recommended
-- Q5_K: alias for Q5_K_M
-- Q5_K_S: large, low quality loss - recommended
-- Q5_K_M: large, very low quality loss - recommended
-- Q6_K: very large, extremely low quality loss
-- Q8_0: very large, extremely low quality loss - not recommended
-- F16: extremely large, virtually no quality loss - not recommended
-- F32: absolutely huge, lossless - not recommended
-"""
-import subprocess
-subprocess.run(f"pip install huggingface-hub==0.16.4", shell=True, check=True)
-
-import argparse
-import os
-from huggingface_hub import snapshot_download
-
-def main(model, model_type, outname, outdir, quants, keep_fp16):
-    if not os.path.isdir(model):
-        print(f"Model not found at {model}. Downloading...")
-        try:
-            if outname is None:
-                outname = model.split('/')[-1]
-            model = snapshot_download(repo_id=model, cache_dir='../models/hf_cache')
-        except Exception as e:
-            raise Exception(f"Could not download the model: {e}")
-
-    if outdir is None:
-        outdir = f'../models/{outname}'
-
-    if not os.path.isfile(f"{model}/config.json"):
-        raise Exception(f"Could not find config.json in {model}")
-
-    os.makedirs(outdir, exist_ok=True)
-
-    print("Building llama.cpp")
-    subprocess.run(f"cd .. && make quantize", shell=True, check=True)
-
-    fp16 = f"{outdir}/{outname}.gguf.fp16.bin"
-
-    print(f"Making unquantised GGUF at {fp16}")
-    if not os.path.isfile(fp16):
-        if model_type != "llama":
-            subprocess.run(f"python3 ../convert-{model_type}-hf-to-gguf.py {model} 1 --outfile {fp16}", shell=True, check=True)
-        else:
-            subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
-    else:
-        print(f"Unquantised GGML already exists at: {fp16}")
-
-    print("Making quants")
-    for type in quants:
-        outfile = f"{outdir}/{outname}.gguf.{type}.bin"
-        print(f"Making {type} : {outfile}")
-        subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)
-
-    if not keep_fp16:
-        os.remove(fp16)
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
-    parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name')
-    parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.')
-    parser.add_argument('--outname', default=None, help='Output model(s) name')
-    parser.add_argument('--outdir', default=None, help='Output directory')
-    parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
-    parser.add_argument('--keep_fp16', action='store_true', help='Keep fp16 model', default=False)
-
-    args = parser.parse_args()
-
-    main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16)
diff --git a/scripts/convert-gg.sh b/scripts/convert-gg.sh
index 01fda16fd7efc..3e31bf7f40751 100755
--- a/scripts/convert-gg.sh
+++ b/scripts/convert-gg.sh
@@ -3,20 +3,20 @@
 set -e

 # LLaMA v1
-python3 convert.py ../llama1/7B --outfile models/llama-7b/ggml-model-f16.gguf --outtype f16
-python3 convert.py ../llama1/13B --outfile models/llama-13b/ggml-model-f16.gguf --outtype f16
-python3 convert.py ../llama1/30B --outfile models/llama-30b/ggml-model-f16.gguf --outtype f16
-python3 convert.py ../llama1/65B --outfile models/llama-65b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-no-torch.py ../llama1/7B --outfile models/llama-7b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-no-torch.py ../llama1/13B --outfile models/llama-13b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-no-torch.py ../llama1/30B --outfile models/llama-30b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-no-torch.py ../llama1/65B --outfile models/llama-65b/ggml-model-f16.gguf --outtype f16

 # LLaMA v2
-python3 convert.py ../llama2/llama-2-7b --outfile models/llama-7b-v2/ggml-model-f16.gguf --outtype f16
-python3 convert.py ../llama2/llama-2-13b --outfile models/llama-13b-v2/ggml-model-f16.gguf --outtype f16
-python3 convert.py ../llama2/llama-2-70b --outfile models/llama-70b-v2/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-no-torch.py ../llama2/llama-2-7b --outfile models/llama-7b-v2/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-no-torch.py ../llama2/llama-2-13b --outfile models/llama-13b-v2/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-no-torch.py ../llama2/llama-2-70b --outfile models/llama-70b-v2/ggml-model-f16.gguf --outtype f16

 # Code Llama
-python3 convert.py ../codellama/CodeLlama-7b/ --outfile models/codellama-7b/ggml-model-f16.gguf --outtype f16
-python3 convert.py ../codellama/CodeLlama-13b/ --outfile models/codellama-13b/ggml-model-f16.gguf --outtype f16
-python3 convert.py ../codellama/CodeLlama-34b/ --outfile models/codellama-34b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-no-torch.py ../codellama/CodeLlama-7b/ --outfile models/codellama-7b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-no-torch.py ../codellama/CodeLlama-13b/ --outfile models/codellama-13b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-no-torch.py ../codellama/CodeLlama-34b/ --outfile models/codellama-34b/ggml-model-f16.gguf --outtype f16

 # Falcon
 python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-7b 1
diff --git a/scripts/pod-llama.sh b/scripts/pod-llama.sh
index 2058ceabf9730..500494330a4ff 100644
--- a/scripts/pod-llama.sh
+++ b/scripts/pod-llama.sh
@@ -75,7 +75,7 @@ if [ "$1" -eq "1" ]; then

     cd /workspace/llama.cpp

-    python3 convert.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16
+    python3 examples/convert-no-torch.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16

     ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0
     ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k
@@ -90,7 +90,7 @@ if [ "$1" -eq "2" ]; then

     cd /workspace/llama.cpp

-    python3 convert.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16
+    python3 examples/convert-no-torch.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16

     ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0
     ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k
@@ -105,7 +105,7 @@ if [ "$1" -eq "3" ]; then

     cd /workspace/llama.cpp

-    python3 convert.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16
+    python3 examples/convert-no-torch.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16

     ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0
     ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k
@@ -120,7 +120,7 @@ if [ "$1" -eq "4" ]; then

     cd /workspace/llama.cpp

-    python3 convert.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16
+    python3 examples/convert-no-torch.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16

     ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0
     ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k
@@ -135,7 +135,7 @@ if [ "$1" -eq "5" ]; then

     cd /workspace/llama.cpp

-    python3 convert.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16
+    python3 examples/convert-no-torch.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16

     ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0
     ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k
@@ -150,7 +150,7 @@ if [ "$1" -eq "6" ]; then

     cd /workspace/llama.cpp

-    python3 convert.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16
+    python3 examples/convert-no-torch.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16

     ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0
     ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k
@@ -165,7 +165,7 @@ if [ "$1" -eq "7" ]; then

     cd /workspace/llama.cpp

-    python3 convert.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16
+    python3 examples/convert-no-torch.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16

     ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0
     ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k
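
For reference, every `convert` script repointed above implements the flow that docs/HOWTO-add-model.md describes: read the model configuration, tokenizer, and tensor names+data, then write them back out as GGUF metadata and tensors via the [gguf](https://pypi.org/project/gguf/) library. Below is a minimal sketch of that write path; the file name, metadata values, and tensor are hypothetical placeholders (a real converter derives them from the source model's `config.json`, tokenizer, and checkpoint), so treat it as an illustration rather than a substitute for `convert-hf-to-gguf.py` or `examples/convert-no-torch.py`.

```python
# Minimal sketch of the GGUF write path shared by the convert scripts.
# All concrete values are placeholders; a real converter reads them from
# the source model's config.json, tokenizer, and checkpoint tensors.
import numpy as np
from gguf import GGUFWriter

writer = GGUFWriter("example.gguf", arch="llama")
writer.add_architecture()        # records general.architecture = "llama"

# Model metadata, normally taken from config.json.
writer.add_name("example-model")
writer.add_context_length(2048)
writer.add_embedding_length(4096)
writer.add_block_count(32)

# Tensor data, normally the converted checkpoint weights
# (a tiny dummy tensor here, not a real embedding matrix).
writer.add_tensor("token_embd.weight", np.zeros((8, 16), dtype=np.float32))

# A GGUF file is serialized in three parts: the header, the key/value
# metadata, and then the tensor data.
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
```

Quantization remains a separate step: an f16 GGUF produced this way is what `./quantize` (as used in `scripts/pod-llama.sh` above) turns into the `q4_0`/`q4_k` files.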