
Commit

Merge pull request #167 from NexaAI/perry/convert-and-quantize
support convert and quantize from hf models
zhiyuan8 authored Oct 23, 2024
2 parents 5ddc035 + 468d35f commit 12b1bb4
Showing 10 changed files with 667 additions and 91 deletions.
5 changes: 4 additions & 1 deletion .gitignore
@@ -86,4 +86,7 @@ build_*/
*.sqlite

# Other
.cache/
.cache/

# tests
quantization_test.py
62 changes: 62 additions & 0 deletions CLI.md
@@ -13,6 +13,7 @@ positional arguments:
run Run inference for various tasks using GGUF models.
onnx Run inference for various tasks using ONNX models.
embed Generate embeddings for text.
convert Convert and quantize a Hugging Face model to GGUF format.
server Run the Nexa AI Text Generation Service.
eval Run the Nexa AI Evaluation Tasks.
pull Pull a model from official or hub.
@@ -268,6 +269,63 @@ nexa embed nomic-embed-text-v1.5:fp16 "I love Nexa AI."
nexa embed sentence-transformers/all-MiniLM-L6-v2:gguf-fp16 "I love Nexa AI." >> generated_embeddings.txt
```

### Convert and Quantize a Hugging Face Model to GGUF

```
nexa convert HF_MODEL_PATH [ftype] [output_file]
usage: nexa convert [-h] [--nthread NTHREAD] [--convert_type CONVERT_TYPE] [--bigendian] [--use_temp_file] [--no_lazy]
[--metadata METADATA] [--split_max_tensors SPLIT_MAX_TENSORS] [--split_max_size SPLIT_MAX_SIZE]
[--no_tensor_first_split] [--vocab_only] [--dry_run] [--output_tensor_type OUTPUT_TENSOR_TYPE]
[--token_embedding_type TOKEN_EMBEDDING_TYPE] [--allow_requantize] [--quantize_output_tensor]
[--only_copy] [--pure] [--keep_split] input_path [ftype] [output_file]
positional arguments:
input_path Path to the input Hugging Face model directory or GGUF file
ftype Quantization type (default: q4_0)
output_file Path to the output quantized GGUF file
options:
-h, --help show this help message and exit
--nthread NTHREAD Number of threads to use (default: 4)
--convert_type CONVERT_TYPE
Conversion type for safetensors to GGUF (default: f16)
--bigendian Use big endian format
--use_temp_file Use a temporary file during conversion
--no_lazy Disable lazy loading
--metadata METADATA Additional metadata as JSON string
--split_max_tensors SPLIT_MAX_TENSORS
Maximum number of tensors per split
--split_max_size SPLIT_MAX_SIZE
Maximum size per split
--no_tensor_first_split
Disable tensor-first splitting
--vocab_only Only process vocabulary
--dry_run Perform a dry run without actual conversion
--output_tensor_type OUTPUT_TENSOR_TYPE
Output tensor type
--token_embedding_type TOKEN_EMBEDDING_TYPE
Token embedding type
--allow_requantize Allow quantizing non-f32/f16 tensors
--quantize_output_tensor
Quantize output.weight
--only_copy Only copy tensors (ignores ftype, allow_requantize, and quantize_output_tensor)
--pure Quantize all tensors to the default type
--keep_split Quantize to the same number of shards
```

#### Example

```
# Default quantization type (q4_0) and output file in current directory
nexa convert meta-llama/Llama-3.2-1B-Instruct
# Equivalent to:
# nexa convert meta-llama/Llama-3.2-1B-Instruct q4_0 ./Llama-3.2-1B-Instruct-q4_0.gguf
# Specifying quantization type and output file
nexa convert meta-llama/Llama-3.2-1B-Instruct q6_k llama3.2-1b-instruct-q6_k.gguf
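# Illustrative only (placeholder paths): optional flags from the usage above, such as
# --nthread and --convert_type, can be appended after the positional arguments
nexa convert ./path/to/local-hf-model q5_k_m ./local-hf-model-q5_k_m.gguf --nthread 8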
```

Note: If `ftype` and `output_file` are omitted, the quantization type defaults to q4_0 and the output file is created in the current directory as `<model_name>-q4_0.gguf`.

### Start Local Server

Start a local server using models on your local computer.
@@ -329,3 +387,7 @@ For `model_path` in nexa commands, it's better to follow the standard format to
- `gemma-2b:q4_0`
- `Meta-Llama-3-8B-Instruct:onnx-cpu-int8`
- `liuhaotian/llava-v1.6-vicuna-7b:gguf-q4_0`

166 changes: 87 additions & 79 deletions README.md

Large diffs are not rendered by default.

99 changes: 89 additions & 10 deletions nexa/cli/entry.py
@@ -2,6 +2,7 @@
import os
from nexa import __version__
from nexa.constants import ModelType
import json


def _choose_files(local_path):
@@ -260,6 +261,54 @@ def run_embedding_generation(args):
print(f"Error generating embedding: {e}")
print("Please refer to our docs to install nexaai package: https://docs.nexaai.com/getting-started/installation")

def run_convert(args):
input_path = args.input_path

# Check if input_path is a valid directory
if not os.path.isdir(input_path):
from nexa.general import download_repo_from_hf
success, local_path = download_repo_from_hf(input_path)

if success:
input_path = local_path
else:
print("Error: Failed to download the repository and the provided path is not a valid directory.")
return

# At this point input_path should be a valid local model directory
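# Forward the remaining CLI options the user set (e.g. nthread, bigendian, vocab_only); arguments passed explicitly below are excluded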
kwargs = {k: v for k, v in vars(args).items() if v is not None and k not in ['input_path', 'ftype', 'output_file', 'convert_type']}

try:
from nexa.gguf.converter.nexa_convert import convert_hf_to_quantized_gguf
converted_path = convert_hf_to_quantized_gguf(
input_path,
output_file=args.output_file,
ftype=args.ftype,
convert_type=args.convert_type,
**kwargs
)
if converted_path:
print(f"Conversion completed successfully. Output file: {converted_path}")

# Ask user if they want to run the converted model
user_choice = input("Would you like to run the converted model? (y/N) (Currently only supports NLP): ").strip().lower()
if user_choice == 'y':
try:
import subprocess
command = f"nexa run {converted_path} -lp -mt NLP"
print(f"Running command: {command}")
subprocess.run(command.split(), check=True, text=True)
except subprocess.CalledProcessError as e:
print("Error running the converted model.")
print("Change model type with -mt to run the model correctly. Or refer to our docs: https://docs.nexa.ai/sdk/cli-reference")
else:
print("Exiting without running the model.")
return
else:
print("Conversion failed.")
except Exception as e:
print(f"Error during conversion: {e}")

def main():
parser = argparse.ArgumentParser(description="Nexa CLI tool for handling various model operations.")
parser.add_argument("-V", "--version", action="version", version=__version__, help="Show the version of the Nexa SDK.")
@@ -336,6 +385,43 @@ def main():
onnx_voice_group = onnx_parser.add_argument_group('Voice generation options')
onnx_voice_group.add_argument("-o", "--output_dir", type=str, default="voice_output", help="Output directory for audio processing")
onnx_voice_group.add_argument("-r", "--sampling_rate", type=int, default=16000, help="Sampling rate for audio processing")

# Embed command
embed_parser = subparsers.add_parser("embed", help="Generate embeddings for a given prompt.")
embed_parser.add_argument("model_path", type=str, help="Path or identifier for the model in Nexa Model Hub")
embed_parser.add_argument("prompt", type=str, help="The prompt to generate an embedding for")
embed_parser.add_argument("-lp", "--local_path", action="store_true", help="Indicate that the model path provided is the local path")
embed_parser.add_argument("-hf", "--huggingface", action="store_true", help="Load model from Hugging Face Hub")
embed_parser.add_argument("-n", "--normalize", action="store_true", help="Normalize the embeddings")
embed_parser.add_argument("-nt", "--no_truncate", action="store_true", help="Not truncate the embeddings")

# Convert command
convert_parser = subparsers.add_parser("convert", help="Convert and quantize a Hugging Face model to GGUF format.")
convert_parser.add_argument("input_path", type=str, help="Path to the input Hugging Face model directory or GGUF file")
convert_parser.add_argument("ftype", nargs='?', type=str, default="q4_0", help="Quantization type (default: q4_0)")
convert_parser.add_argument("output_file", nargs='?', type=str, help="Path to the output quantized GGUF file")

convert_hf_parser = convert_parser.add_argument_group('Convert from safetensors options')
convert_hf_parser.add_argument("--convert_type", type=str, default="f16", help="Conversion type for safetensors to GGUF (default: f16)")
convert_hf_parser.add_argument("--bigendian", action="store_true", help="Use big endian format")
convert_hf_parser.add_argument("--use_temp_file", action="store_true", help="Use a temporary file during conversion")
convert_hf_parser.add_argument("--no_lazy", action="store_true", help="Disable lazy loading")
convert_hf_parser.add_argument("--metadata", type=json.loads, help="Additional metadata as JSON string")
convert_hf_parser.add_argument("--split_max_tensors", type=int, default=0, help="Maximum number of tensors per split")
convert_hf_parser.add_argument("--split_max_size", type=str, default="0", help="Maximum size per split")
convert_hf_parser.add_argument("--no_tensor_first_split", action="store_true", help="Disable tensor-first splitting")
convert_hf_parser.add_argument("--vocab_only", action="store_true", help="Only process vocabulary")
convert_hf_parser.add_argument("--dry_run", action="store_true", help="Perform a dry run without actual conversion")

quantization_parser = convert_parser.add_argument_group('Quantization options')
quantization_parser.add_argument("--nthread", type=int, default=4, help="Number of threads to use (default: 4)")
quantization_parser.add_argument("--output_tensor_type", type=str, help="Output tensor type")
quantization_parser.add_argument("--token_embedding_type", type=str, help="Token embedding type")
quantization_parser.add_argument("--allow_requantize", action="store_true", help="Allow quantizing non-f32/f16 tensors")
quantization_parser.add_argument("--quantize_output_tensor", action="store_true", help="Quantize output.weight")
quantization_parser.add_argument("--only_copy", action="store_true", help="Only copy tensors (ignores ftype, allow_requantize, and quantize_output_tensor)")
quantization_parser.add_argument("--pure", action="store_true", help="Quantize all tensors to the default type")
quantization_parser.add_argument("--keep_split", action="store_true", help="Quantize to the same number of shards")

# GGML server parser
server_parser = subparsers.add_parser("server", help="Run the Nexa AI Text Generation Service")
@@ -378,15 +464,6 @@ def main():
perf_eval_group.add_argument("--device", type=str, help="Device to run performance evaluation on, choose from 'cpu', 'cuda', 'mps'", default="cpu")
perf_eval_group.add_argument("--new_tokens", type=int, help="Number of new tokens to evaluate", default=100)

# Embed command
embed_parser = subparsers.add_parser("embed", help="Generate embeddings for a given prompt.")
embed_parser.add_argument("model_path", type=str, help="Path or identifier for the model in Nexa Model Hub")
embed_parser.add_argument("prompt", type=str, help="The prompt to generate an embedding for")
embed_parser.add_argument("-lp", "--local_path", action="store_true", help="Indicate that the model path provided is the local path")
embed_parser.add_argument("-hf", "--huggingface", action="store_true", help="Load model from Hugging Face Hub")
embed_parser.add_argument("-n", "--normalize", action="store_true", help="Normalize the embeddings")
embed_parser.add_argument("-nt", "--no_truncate", action="store_true", help="Not truncate the embeddings")

args = parser.parse_args()

if args.command == "run":
@@ -418,6 +495,8 @@ def main():
from nexa.general import pull_model
hf = getattr(args, 'huggingface', False)
pull_model(args.model_path, hf)
elif args.command == "convert":
run_convert(args)
elif args.command == "remove":
from nexa.general import remove_model
remove_model(args.model_path)
@@ -440,4 +519,4 @@ def main():
parser.print_help()

if __name__ == "__main__":
main()
main()
150 changes: 150 additions & 0 deletions nexa/constants.py
@@ -404,3 +404,153 @@ class ModelType(Enum):
"all-MiniLM-L6-v2": ModelType.TEXT_EMBEDDING,
"all-MiniLM-L12-v2": ModelType.TEXT_EMBEDDING,
}

from nexa.gguf.llama.llama_cpp import (
LLAMA_FTYPE_ALL_F32,
LLAMA_FTYPE_MOSTLY_F16,
LLAMA_FTYPE_MOSTLY_Q4_0,
LLAMA_FTYPE_MOSTLY_Q4_1,
LLAMA_FTYPE_MOSTLY_Q8_0,
LLAMA_FTYPE_MOSTLY_Q5_0,
LLAMA_FTYPE_MOSTLY_Q5_1,
LLAMA_FTYPE_MOSTLY_Q2_K,
LLAMA_FTYPE_MOSTLY_Q3_K_S,
LLAMA_FTYPE_MOSTLY_Q3_K_M,
LLAMA_FTYPE_MOSTLY_Q3_K_L,
LLAMA_FTYPE_MOSTLY_Q4_K_S,
LLAMA_FTYPE_MOSTLY_Q4_K_M,
LLAMA_FTYPE_MOSTLY_Q5_K_S,
LLAMA_FTYPE_MOSTLY_Q5_K_M,
LLAMA_FTYPE_MOSTLY_Q6_K,
LLAMA_FTYPE_MOSTLY_IQ2_XXS,
LLAMA_FTYPE_MOSTLY_IQ2_XS,
LLAMA_FTYPE_MOSTLY_Q2_K_S,
LLAMA_FTYPE_MOSTLY_IQ3_XS,
LLAMA_FTYPE_MOSTLY_IQ3_XXS,
LLAMA_FTYPE_MOSTLY_IQ1_S,
LLAMA_FTYPE_MOSTLY_IQ4_NL,
LLAMA_FTYPE_MOSTLY_IQ3_S,
LLAMA_FTYPE_MOSTLY_IQ3_M,
LLAMA_FTYPE_MOSTLY_IQ2_S,
LLAMA_FTYPE_MOSTLY_IQ2_M,
LLAMA_FTYPE_MOSTLY_IQ4_XS,
LLAMA_FTYPE_MOSTLY_IQ1_M,
LLAMA_FTYPE_MOSTLY_BF16,
LLAMA_FTYPE_MOSTLY_Q4_0_4_4,
LLAMA_FTYPE_MOSTLY_Q4_0_4_8,
LLAMA_FTYPE_MOSTLY_Q4_0_8_8,
LLAMA_FTYPE_MOSTLY_TQ1_0,
LLAMA_FTYPE_MOSTLY_TQ2_0,
)
from nexa.gguf.llama.llama_cpp import (
GGML_TYPE_F32,
GGML_TYPE_F16,
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
GGML_TYPE_Q5_0,
GGML_TYPE_Q5_1,
GGML_TYPE_Q8_0,
GGML_TYPE_Q8_1,
GGML_TYPE_Q2_K,
GGML_TYPE_Q3_K,
GGML_TYPE_Q4_K,
GGML_TYPE_Q5_K,
GGML_TYPE_Q6_K,
GGML_TYPE_Q8_K,
GGML_TYPE_IQ2_XXS,
GGML_TYPE_IQ2_XS,
GGML_TYPE_IQ3_XXS,
GGML_TYPE_IQ1_S,
GGML_TYPE_IQ4_NL,
GGML_TYPE_IQ3_S,
GGML_TYPE_IQ2_S,
GGML_TYPE_IQ4_XS,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
GGML_TYPE_I64,
GGML_TYPE_F64,
GGML_TYPE_IQ1_M,
GGML_TYPE_BF16,
GGML_TYPE_Q4_0_4_4,
GGML_TYPE_Q4_0_4_8,
GGML_TYPE_Q4_0_8_8,
GGML_TYPE_COUNT,
)

# From quantize.cpp
# For mapping of general quantization options (ftypes)
LLAMA_QUANTIZATION_TYPES = {
"q4_0": LLAMA_FTYPE_MOSTLY_Q4_0,
"q4_1": LLAMA_FTYPE_MOSTLY_Q4_1,
"q5_0": LLAMA_FTYPE_MOSTLY_Q5_0,
"q5_1": LLAMA_FTYPE_MOSTLY_Q5_1,
"q8_0": LLAMA_FTYPE_MOSTLY_Q8_0,
"q2_k": LLAMA_FTYPE_MOSTLY_Q2_K,
"q3_k_s": LLAMA_FTYPE_MOSTLY_Q3_K_S,
"q3_k_m": LLAMA_FTYPE_MOSTLY_Q3_K_M,
"q3_k_l": LLAMA_FTYPE_MOSTLY_Q3_K_L,
"q4_k_s": LLAMA_FTYPE_MOSTLY_Q4_K_S,
"q4_k_m": LLAMA_FTYPE_MOSTLY_Q4_K_M,
"q5_k_s": LLAMA_FTYPE_MOSTLY_Q5_K_S,
"q5_k_m": LLAMA_FTYPE_MOSTLY_Q5_K_M,
"q6_k": LLAMA_FTYPE_MOSTLY_Q6_K,
"iq2_xxs": LLAMA_FTYPE_MOSTLY_IQ2_XXS,
"iq2_xs": LLAMA_FTYPE_MOSTLY_IQ2_XS,
"q2_k_s": LLAMA_FTYPE_MOSTLY_Q2_K_S,
"iq3_xs": LLAMA_FTYPE_MOSTLY_IQ3_XS,
"iq3_xxs": LLAMA_FTYPE_MOSTLY_IQ3_XXS,
"iq1_s": LLAMA_FTYPE_MOSTLY_IQ1_S,
"iq4_nl": LLAMA_FTYPE_MOSTLY_IQ4_NL,
"iq3_s": LLAMA_FTYPE_MOSTLY_IQ3_S,
"iq3_m": LLAMA_FTYPE_MOSTLY_IQ3_M,
"iq2_s": LLAMA_FTYPE_MOSTLY_IQ2_S,
"iq2_m": LLAMA_FTYPE_MOSTLY_IQ2_M,
"iq4_xs": LLAMA_FTYPE_MOSTLY_IQ4_XS,
"iq1_m": LLAMA_FTYPE_MOSTLY_IQ1_M,
"f16": LLAMA_FTYPE_MOSTLY_F16,
"f32": LLAMA_FTYPE_ALL_F32,
"bf16": LLAMA_FTYPE_MOSTLY_BF16,
"q4_0_4_4": LLAMA_FTYPE_MOSTLY_Q4_0_4_4,
"q4_0_4_8": LLAMA_FTYPE_MOSTLY_Q4_0_4_8,
"q4_0_8_8": LLAMA_FTYPE_MOSTLY_Q4_0_8_8,
"tq1_0": LLAMA_FTYPE_MOSTLY_TQ1_0,
"tq2_0": LLAMA_FTYPE_MOSTLY_TQ2_0,
}

# From ggml.h
# For mapping of output_tensor_type and token_embedding_type only
GGML_TYPES = {
"f32": GGML_TYPE_F32,
"f16": GGML_TYPE_F16,
"q4_0": GGML_TYPE_Q4_0,
"q4_1": GGML_TYPE_Q4_1,
"q5_0": GGML_TYPE_Q5_0,
"q5_1": GGML_TYPE_Q5_1,
"q8_0": GGML_TYPE_Q8_0,
"q8_1": GGML_TYPE_Q8_1,
"q2_k": GGML_TYPE_Q2_K,
"q3_k": GGML_TYPE_Q3_K,
"q4_k": GGML_TYPE_Q4_K,
"q5_k": GGML_TYPE_Q5_K,
"q6_k": GGML_TYPE_Q6_K,
"q8_k": GGML_TYPE_Q8_K,
"iq2_xxs": GGML_TYPE_IQ2_XXS,
"iq2_xs": GGML_TYPE_IQ2_XS,
"iq3_xxs": GGML_TYPE_IQ3_XXS,
"iq1_s": GGML_TYPE_IQ1_S,
"iq4_nl": GGML_TYPE_IQ4_NL,
"iq3_s": GGML_TYPE_IQ3_S,
"iq2_s": GGML_TYPE_IQ2_S,
"iq4_xs": GGML_TYPE_IQ4_XS,
"i8": GGML_TYPE_I8,
"i16": GGML_TYPE_I16,
"i32": GGML_TYPE_I32,
"i64": GGML_TYPE_I64,
"f64": GGML_TYPE_F64,
"iq1_m": GGML_TYPE_IQ1_M,
"bf16": GGML_TYPE_BF16,
"q4_0_4_4": GGML_TYPE_Q4_0_4_4,
"q4_0_4_8": GGML_TYPE_Q4_0_4_8,
"q4_0_8_8": GGML_TYPE_Q4_0_8_8,
}
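
How these two tables might be consumed: a minimal sketch in which `resolve_quant_args` is a hypothetical helper (not part of the SDK) that maps the CLI's string arguments to llama.cpp enum values before quantization:

```
from typing import Optional

from nexa.constants import LLAMA_QUANTIZATION_TYPES, GGML_TYPES

def resolve_quant_args(ftype: str = "q4_0",
                       output_tensor_type: Optional[str] = None,
                       token_embedding_type: Optional[str] = None) -> dict:
    """Hypothetical helper: map CLI strings to llama.cpp enum values, failing fast on unknown names."""
    if ftype not in LLAMA_QUANTIZATION_TYPES:
        raise ValueError(f"Unknown quantization type: {ftype}")
    resolved = {"ftype": LLAMA_QUANTIZATION_TYPES[ftype]}
    if output_tensor_type is not None:
        resolved["output_tensor_type"] = GGML_TYPES[output_tensor_type]   # e.g. "f16" -> GGML_TYPE_F16
    if token_embedding_type is not None:
        resolved["token_embedding_type"] = GGML_TYPES[token_embedding_type]
    return resolved

print(resolve_quant_args("q6_k", output_tensor_type="f16"))
```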