support convert and quantize from hf models #167

Merged: 17 commits, Oct 23, 2024
5 changes: 4 additions & 1 deletion .gitignore
@@ -86,4 +86,7 @@ build_*/
*.sqlite

# Other
.cache/
.cache/

# tests
quantization_test.py
62 changes: 62 additions & 0 deletions CLI.md
@@ -13,6 +13,7 @@ positional arguments:
run Run inference for various tasks using GGUF models.
onnx Run inference for various tasks using ONNX models.
embed Generate embeddings for text.
convert Convert and quantize a Hugging Face model to GGUF format.
server Run the Nexa AI Text Generation Service.
eval Run the Nexa AI Evaluation Tasks.
pull Pull a model from official or hub.
@@ -268,6 +269,63 @@ nexa embed nomic-embed-text-v1.5:fp16 "I love Nexa AI."
nexa embed sentence-transformers/all-MiniLM-L6-v2:gguf-fp16 "I love Nexa AI." >> generated_embeddings.txt
```

### Convert and quantize a Hugging Face Model to GGUF

```
nexa convert HF_MODEL_PATH [ftype] [output_file]
usage: nexa convert [-h] [-t NTHREAD] [--convert_type CONVERT_TYPE] [--bigendian] [--use_temp_file] [--no_lazy]
[--metadata METADATA] [--split_max_tensors SPLIT_MAX_TENSORS] [--split_max_size SPLIT_MAX_SIZE]
[--no_tensor_first_split] [--vocab_only] [--dry_run] [--output_tensor_type OUTPUT_TENSOR_TYPE]
[--token_embedding_type TOKEN_EMBEDDING_TYPE] [--allow_requantize] [--quantize_output_tensor]
[--only_copy] [--pure] [--keep_split] input_path [ftype] [output_file]

positional arguments:
input_path Path to the input Hugging Face model directory or GGUF file
ftype Quantization type (default: q4_0)
output_file Path to the output quantized GGUF file

options:
-h, --help show this help message and exit
-t, --nthread NTHREAD Number of threads to use (default: 4)
--convert_type CONVERT_TYPE
Conversion type for safetensors to GGUF (default: f16)
--bigendian Use big endian format
--use_temp_file Use a temporary file during conversion
--no_lazy Disable lazy loading
--metadata METADATA Additional metadata as JSON string
--split_max_tensors SPLIT_MAX_TENSORS
Maximum number of tensors per split
--split_max_size SPLIT_MAX_SIZE
Maximum size per split
--no_tensor_first_split
Disable tensor-first splitting
--vocab_only Only process vocabulary
--dry_run Perform a dry run without actual conversion
--output_tensor_type OUTPUT_TENSOR_TYPE
Output tensor type
--token_embedding_type TOKEN_EMBEDDING_TYPE
Token embedding type
--allow_requantize Allow quantizing non-f32/f16 tensors
--quantize_output_tensor
Quantize output.weight
--only_copy Only copy tensors (ignores ftype, allow_requantize, and quantize_output_tensor)
--pure Quantize all tensors to the default type
--keep_split Quantize to the same number of shards
```

#### Example

```
# Default quantization type (q4_0) and output file in current directory
nexa convert meta-llama/Llama-3.2-1B-Instruct

# Equivalent to:
# nexa convert meta-llama/Llama-3.2-1B-Instruct q4_0 ./Llama-3.2-1B-Instruct-q4_0.gguf

# Specifying quantization type and output file
nexa convert meta-llama/Llama-3.2-1B-Instruct q6_k llama3.2-1b-instruct-q6_k.gguf
```

Note: If not specified, the quantization type defaults to q4_0 and the output file is created in the current directory as `<model_name>-q4_0.gguf`.
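
For finer control, the optional flags above can be combined in a single invocation. The command below is an illustrative sketch built from the flags documented in this section; the thread count and embedding type are arbitrary example values:

```
# Quantize to q5_k_m with 8 threads, keeping token embeddings at q8_0
nexa convert meta-llama/Llama-3.2-1B-Instruct q5_k_m llama3.2-1b-instruct-q5_k_m.gguf --nthread 8 --token_embedding_type q8_0
```

After a successful conversion the CLI offers to run the resulting model; the equivalent manual command is `nexa run <output_file> -lp -mt NLP`.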

### Start Local Server

Start a local server using models on your local computer.
@@ -329,3 +387,7 @@ For `model_path` in nexa commands, it's better to follow the standard format to
- `gemma-2b:q4_0`
- `Meta-Llama-3-8B-Instruct:onnx-cpu-int8`
- `liuhaotian/llava-v1.6-vicuna-7b:gguf-q4_0`

166 changes: 87 additions & 79 deletions README.md

Large diffs are not rendered by default.

99 changes: 89 additions & 10 deletions nexa/cli/entry.py
@@ -2,6 +2,7 @@
import os
from nexa import __version__
from nexa.constants import ModelType
import json


def _choose_files(local_path):
@@ -260,6 +261,54 @@ def run_embedding_generation(args):
print(f"Error generating embedding: {e}")
print("Please refer to our docs to install nexaai package: https://docs.nexaai.com/getting-started/installation")

def run_convert(args):
input_path = args.input_path

# Check if input_path is a valid directory
if not os.path.isdir(input_path):
from nexa.general import download_repo_from_hf
success, local_path = download_repo_from_hf(input_path)

if success:
input_path = local_path
else:
print("Error: Failed to download the repository and the provided path is not a valid directory.")
return

# Input_path here should be a valid directory
kwargs = {k: v for k, v in vars(args).items() if v is not None and k not in ['input_path', 'ftype', 'output_file', 'convert_type']}

try:
from nexa.gguf.converter.nexa_convert import convert_hf_to_quantized_gguf
converted_path = convert_hf_to_quantized_gguf(
input_path,
output_file=args.output_file,
ftype=args.ftype,
convert_type=args.convert_type,
**kwargs
)
if converted_path:
print(f"Conversion completed successfully. Output file: {converted_path}")

# Ask user if they want to run the converted model
user_choice = input("Would you like to run the converted model? (y/N) (Currently only supports NLP): ").strip().lower()
if user_choice == 'y':
try:
import subprocess
command = f"nexa run {converted_path} -lp -mt NLP"
print(f"Running command: {command}")
subprocess.run(command.split(), check=True, text=True)
except subprocess.CalledProcessError as e:
print("Error running the converted model.")
print("Change model type with -mt to run the model correctly. Or refer to our docs: https://docs.nexa.ai/sdk/cli-reference")
else:
print("Exiting without running the model.")
return
else:
print("Conversion failed.")
except Exception as e:
print(f"Error during conversion: {e}")

def main():
parser = argparse.ArgumentParser(description="Nexa CLI tool for handling various model operations.")
parser.add_argument("-V", "--version", action="version", version=__version__, help="Show the version of the Nexa SDK.")
@@ -336,6 +385,43 @@ def main():
onnx_voice_group = onnx_parser.add_argument_group('Voice generation options')
onnx_voice_group.add_argument("-o", "--output_dir", type=str, default="voice_output", help="Output directory for audio processing")
onnx_voice_group.add_argument("-r", "--sampling_rate", type=int, default=16000, help="Sampling rate for audio processing")

# Embed command
embed_parser = subparsers.add_parser("embed", help="Generate embeddings for a given prompt.")
embed_parser.add_argument("model_path", type=str, help="Path or identifier for the model in Nexa Model Hub")
embed_parser.add_argument("prompt", type=str, help="The prompt to generate an embedding for")
embed_parser.add_argument("-lp", "--local_path", action="store_true", help="Indicate that the model path provided is the local path")
embed_parser.add_argument("-hf", "--huggingface", action="store_true", help="Load model from Hugging Face Hub")
embed_parser.add_argument("-n", "--normalize", action="store_true", help="Normalize the embeddings")
embed_parser.add_argument("-nt", "--no_truncate", action="store_true", help="Not truncate the embeddings")

# Convert command
convert_parser = subparsers.add_parser("convert", help="Convert and quantize a Hugging Face model to GGUF format.")
convert_parser.add_argument("input_path", type=str, help="Path to the input Hugging Face model directory or GGUF file")
convert_parser.add_argument("ftype", nargs='?', type=str, default="q4_0", help="Quantization type (default: q4_0)")
convert_parser.add_argument("output_file", nargs='?', type=str, help="Path to the output quantized GGUF file")

convert_hf_parser = convert_parser.add_argument_group('Convert from safetensors options')
convert_hf_parser.add_argument("--convert_type", type=str, default="f16", help="Conversion type for safetensors to GGUF (default: f16)")
convert_hf_parser.add_argument("--bigendian", action="store_true", help="Use big endian format")
convert_hf_parser.add_argument("--use_temp_file", action="store_true", help="Use a temporary file during conversion")
convert_hf_parser.add_argument("--no_lazy", action="store_true", help="Disable lazy loading")
convert_hf_parser.add_argument("--metadata", type=json.loads, help="Additional metadata as JSON string")
convert_hf_parser.add_argument("--split_max_tensors", type=int, default=0, help="Maximum number of tensors per split")
convert_hf_parser.add_argument("--split_max_size", type=str, default="0", help="Maximum size per split")
convert_hf_parser.add_argument("--no_tensor_first_split", action="store_true", help="Disable tensor-first splitting")
convert_hf_parser.add_argument("--vocab_only", action="store_true", help="Only process vocabulary")
convert_hf_parser.add_argument("--dry_run", action="store_true", help="Perform a dry run without actual conversion")

quantization_parser = convert_parser.add_argument_group('Quantization options')
quantization_parser.add_argument("--nthread", type=int, default=4, help="Number of threads to use (default: 4)")
quantization_parser.add_argument("--output_tensor_type", type=str, help="Output tensor type")
quantization_parser.add_argument("--token_embedding_type", type=str, help="Token embedding type")
quantization_parser.add_argument("--allow_requantize", action="store_true", help="Allow quantizing non-f32/f16 tensors")
quantization_parser.add_argument("--quantize_output_tensor", action="store_true", help="Quantize output.weight")
quantization_parser.add_argument("--only_copy", action="store_true", help="Only copy tensors (ignores ftype, allow_requantize, and quantize_output_tensor)")
quantization_parser.add_argument("--pure", action="store_true", help="Quantize all tensors to the default type")
quantization_parser.add_argument("--keep_split", action="store_true", help="Quantize to the same number of shards")

# GGML server parser
server_parser = subparsers.add_parser("server", help="Run the Nexa AI Text Generation Service")
@@ -378,15 +464,6 @@ def main():
perf_eval_group.add_argument("--device", type=str, help="Device to run performance evaluation on, choose from 'cpu', 'cuda', 'mps'", default="cpu")
perf_eval_group.add_argument("--new_tokens", type=int, help="Number of new tokens to evaluate", default=100)

# Embed command
embed_parser = subparsers.add_parser("embed", help="Generate embeddings for a given prompt.")
embed_parser.add_argument("model_path", type=str, help="Path or identifier for the model in Nexa Model Hub")
embed_parser.add_argument("prompt", type=str, help="The prompt to generate an embedding for")
embed_parser.add_argument("-lp", "--local_path", action="store_true", help="Indicate that the model path provided is the local path")
embed_parser.add_argument("-hf", "--huggingface", action="store_true", help="Load model from Hugging Face Hub")
embed_parser.add_argument("-n", "--normalize", action="store_true", help="Normalize the embeddings")
embed_parser.add_argument("-nt", "--no_truncate", action="store_true", help="Not truncate the embeddings")

args = parser.parse_args()

if args.command == "run":
@@ -418,6 +495,8 @@ def main():
from nexa.general import pull_model
hf = getattr(args, 'huggingface', False)
pull_model(args.model_path, hf)
elif args.command == "convert":
run_convert(args)
elif args.command == "remove":
from nexa.general import remove_model
remove_model(args.model_path)
@@ -440,4 +519,4 @@ def main():
parser.print_help()

if __name__ == "__main__":
main()
main()
150 changes: 150 additions & 0 deletions nexa/constants.py
@@ -404,3 +404,153 @@ class ModelType(Enum):
"all-MiniLM-L6-v2": ModelType.TEXT_EMBEDDING,
"all-MiniLM-L12-v2": ModelType.TEXT_EMBEDDING,
}

from nexa.gguf.llama.llama_cpp import (
LLAMA_FTYPE_ALL_F32,
LLAMA_FTYPE_MOSTLY_F16,
LLAMA_FTYPE_MOSTLY_Q4_0,
LLAMA_FTYPE_MOSTLY_Q4_1,
LLAMA_FTYPE_MOSTLY_Q8_0,
LLAMA_FTYPE_MOSTLY_Q5_0,
LLAMA_FTYPE_MOSTLY_Q5_1,
LLAMA_FTYPE_MOSTLY_Q2_K,
LLAMA_FTYPE_MOSTLY_Q3_K_S,
LLAMA_FTYPE_MOSTLY_Q3_K_M,
LLAMA_FTYPE_MOSTLY_Q3_K_L,
LLAMA_FTYPE_MOSTLY_Q4_K_S,
LLAMA_FTYPE_MOSTLY_Q4_K_M,
LLAMA_FTYPE_MOSTLY_Q5_K_S,
LLAMA_FTYPE_MOSTLY_Q5_K_M,
LLAMA_FTYPE_MOSTLY_Q6_K,
LLAMA_FTYPE_MOSTLY_IQ2_XXS,
LLAMA_FTYPE_MOSTLY_IQ2_XS,
LLAMA_FTYPE_MOSTLY_Q2_K_S,
LLAMA_FTYPE_MOSTLY_IQ3_XS,
LLAMA_FTYPE_MOSTLY_IQ3_XXS,
LLAMA_FTYPE_MOSTLY_IQ1_S,
LLAMA_FTYPE_MOSTLY_IQ4_NL,
LLAMA_FTYPE_MOSTLY_IQ3_S,
LLAMA_FTYPE_MOSTLY_IQ3_M,
LLAMA_FTYPE_MOSTLY_IQ2_S,
LLAMA_FTYPE_MOSTLY_IQ2_M,
LLAMA_FTYPE_MOSTLY_IQ4_XS,
LLAMA_FTYPE_MOSTLY_IQ1_M,
LLAMA_FTYPE_MOSTLY_BF16,
LLAMA_FTYPE_MOSTLY_Q4_0_4_4,
LLAMA_FTYPE_MOSTLY_Q4_0_4_8,
LLAMA_FTYPE_MOSTLY_Q4_0_8_8,
LLAMA_FTYPE_MOSTLY_TQ1_0,
LLAMA_FTYPE_MOSTLY_TQ2_0,
)
from nexa.gguf.llama.llama_cpp import (
GGML_TYPE_F32,
GGML_TYPE_F16,
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
GGML_TYPE_Q5_0,
GGML_TYPE_Q5_1,
GGML_TYPE_Q8_0,
GGML_TYPE_Q8_1,
GGML_TYPE_Q2_K,
GGML_TYPE_Q3_K,
GGML_TYPE_Q4_K,
GGML_TYPE_Q5_K,
GGML_TYPE_Q6_K,
GGML_TYPE_Q8_K,
GGML_TYPE_IQ2_XXS,
GGML_TYPE_IQ2_XS,
GGML_TYPE_IQ3_XXS,
GGML_TYPE_IQ1_S,
GGML_TYPE_IQ4_NL,
GGML_TYPE_IQ3_S,
GGML_TYPE_IQ2_S,
GGML_TYPE_IQ4_XS,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
GGML_TYPE_I64,
GGML_TYPE_F64,
GGML_TYPE_IQ1_M,
GGML_TYPE_BF16,
GGML_TYPE_Q4_0_4_4,
GGML_TYPE_Q4_0_4_8,
GGML_TYPE_Q4_0_8_8,
GGML_TYPE_COUNT,
)

# From quantize.cpp
# For mapping of general quantization options (ftypes)
LLAMA_QUANTIZATION_TYPES = {
"q4_0": LLAMA_FTYPE_MOSTLY_Q4_0,
"q4_1": LLAMA_FTYPE_MOSTLY_Q4_1,
"q5_0": LLAMA_FTYPE_MOSTLY_Q5_0,
"q5_1": LLAMA_FTYPE_MOSTLY_Q5_1,
"q8_0": LLAMA_FTYPE_MOSTLY_Q8_0,
"q2_k": LLAMA_FTYPE_MOSTLY_Q2_K,
"q3_k_s": LLAMA_FTYPE_MOSTLY_Q3_K_S,
"q3_k_m": LLAMA_FTYPE_MOSTLY_Q3_K_M,
"q3_k_l": LLAMA_FTYPE_MOSTLY_Q3_K_L,
"q4_k_s": LLAMA_FTYPE_MOSTLY_Q4_K_S,
"q4_k_m": LLAMA_FTYPE_MOSTLY_Q4_K_M,
"q5_k_s": LLAMA_FTYPE_MOSTLY_Q5_K_S,
"q5_k_m": LLAMA_FTYPE_MOSTLY_Q5_K_M,
"q6_k": LLAMA_FTYPE_MOSTLY_Q6_K,
"iq2_xxs": LLAMA_FTYPE_MOSTLY_IQ2_XXS,
"iq2_xs": LLAMA_FTYPE_MOSTLY_IQ2_XS,
"q2_k_s": LLAMA_FTYPE_MOSTLY_Q2_K_S,
"iq3_xs": LLAMA_FTYPE_MOSTLY_IQ3_XS,
"iq3_xxs": LLAMA_FTYPE_MOSTLY_IQ3_XXS,
"iq1_s": LLAMA_FTYPE_MOSTLY_IQ1_S,
"iq4_nl": LLAMA_FTYPE_MOSTLY_IQ4_NL,
"iq3_s": LLAMA_FTYPE_MOSTLY_IQ3_S,
"iq3_m": LLAMA_FTYPE_MOSTLY_IQ3_M,
"iq2_s": LLAMA_FTYPE_MOSTLY_IQ2_S,
"iq2_m": LLAMA_FTYPE_MOSTLY_IQ2_M,
"iq4_xs": LLAMA_FTYPE_MOSTLY_IQ4_XS,
"iq1_m": LLAMA_FTYPE_MOSTLY_IQ1_M,
"f16": LLAMA_FTYPE_MOSTLY_F16,
"f32": LLAMA_FTYPE_ALL_F32,
"bf16": LLAMA_FTYPE_MOSTLY_BF16,
"q4_0_4_4": LLAMA_FTYPE_MOSTLY_Q4_0_4_4,
"q4_0_4_8": LLAMA_FTYPE_MOSTLY_Q4_0_4_8,
"q4_0_8_8": LLAMA_FTYPE_MOSTLY_Q4_0_8_8,
"tq1_0": LLAMA_FTYPE_MOSTLY_TQ1_0,
"tq2_0": LLAMA_FTYPE_MOSTLY_TQ2_0,
}

# From ggml.h
# For mapping of output_tensor_type and token_embedding_type only
GGML_TYPES = {
"f32": GGML_TYPE_F32,
"f16": GGML_TYPE_F16,
"q4_0": GGML_TYPE_Q4_0,
"q4_1": GGML_TYPE_Q4_1,
"q5_0": GGML_TYPE_Q5_0,
"q5_1": GGML_TYPE_Q5_1,
"q8_0": GGML_TYPE_Q8_0,
"q8_1": GGML_TYPE_Q8_1,
"q2_k": GGML_TYPE_Q2_K,
"q3_k": GGML_TYPE_Q3_K,
"q4_k": GGML_TYPE_Q4_K,
"q5_k": GGML_TYPE_Q5_K,
"q6_k": GGML_TYPE_Q6_K,
"q8_k": GGML_TYPE_Q8_K,
"iq2_xxs": GGML_TYPE_IQ2_XXS,
"iq2_xs": GGML_TYPE_IQ2_XS,
"iq3_xxs": GGML_TYPE_IQ3_XXS,
"iq1_s": GGML_TYPE_IQ1_S,
"iq4_nl": GGML_TYPE_IQ4_NL,
"iq3_s": GGML_TYPE_IQ3_S,
"iq2_s": GGML_TYPE_IQ2_S,
"iq4_xs": GGML_TYPE_IQ4_XS,
"i8": GGML_TYPE_I8,
"i16": GGML_TYPE_I16,
"i32": GGML_TYPE_I32,
"i64": GGML_TYPE_I64,
"f64": GGML_TYPE_F64,
"iq1_m": GGML_TYPE_IQ1_M,
"bf16": GGML_TYPE_BF16,
"q4_0_4_4": GGML_TYPE_Q4_0_4_4,
"q4_0_4_8": GGML_TYPE_Q4_0_4_8,
"q4_0_8_8": GGML_TYPE_Q4_0_8_8,
}