support convert and quantize from hf models #167

Merged: 17 commits, Oct 23, 2024
5 changes: 4 additions & 1 deletion .gitignore
@@ -86,4 +86,7 @@ build_*/
*.sqlite

# Other
.cache/
.cache/

# tests
quantization_test.py
62 changes: 62 additions & 0 deletions CLI.md
@@ -13,6 +13,7 @@ positional arguments:
run Run inference for various tasks using GGUF models.
onnx Run inference for various tasks using ONNX models.
embed Generate embeddings for text.
convert Convert and quantize a Hugging Face model to GGUF format.
server Run the Nexa AI Text Generation Service.
eval Run the Nexa AI Evaluation Tasks.
pull Pull a model from official or hub.
@@ -268,6 +269,63 @@ nexa embed nomic-embed-text-v1.5:fp16 "I love Nexa AI."
nexa embed sentence-transformers/all-MiniLM-L6-v2:gguf-fp16 "I love Nexa AI." >> generated_embeddings.txt
```

### Convert and quantize a Hugging Face Model to GGUF

```
nexa convert HF_MODEL_PATH [ftype] [output_file]
usage: nexa convert [-h] [-t NTHREAD] [--convert_type CONVERT_TYPE] [--bigendian] [--use_temp_file] [--no_lazy]
[--metadata METADATA] [--split_max_tensors SPLIT_MAX_TENSORS] [--split_max_size SPLIT_MAX_SIZE]
[--no_tensor_first_split] [--vocab_only] [--dry_run] [--output_tensor_type OUTPUT_TENSOR_TYPE]
[--token_embedding_type TOKEN_EMBEDDING_TYPE] [--allow_requantize] [--quantize_output_tensor]
[--only_copy] [--pure] [--keep_split] input_path [ftype] [output_file]

positional arguments:
input_path Path to the input Hugging Face model directory or GGUF file
ftype Quantization type (default: q4_0)
output_file Path to the output quantized GGUF file

options:
-h, --help show this help message and exit
-t, --nthread NTHREAD Number of threads to use (default: 4)
--convert_type CONVERT_TYPE
Conversion type for safetensors to GGUF (default: f16)
--bigendian Use big endian format
--use_temp_file Use a temporary file during conversion
--no_lazy Disable lazy loading
--metadata METADATA Additional metadata as JSON string
--split_max_tensors SPLIT_MAX_TENSORS
Maximum number of tensors per split
--split_max_size SPLIT_MAX_SIZE
Maximum size per split
--no_tensor_first_split
Disable tensor-first splitting
--vocab_only Only process vocabulary
--dry_run Perform a dry run without actual conversion
--output_tensor_type OUTPUT_TENSOR_TYPE
Output tensor type
--token_embedding_type TOKEN_EMBEDDING_TYPE
Token embedding type
--allow_requantize Allow quantizing non-f32/f16 tensors
--quantize_output_tensor
Quantize output.weight
--only_copy Only copy tensors (ignores ftype, allow_requantize, and quantize_output_tensor)
--pure Quantize all tensors to the default type
--keep_split Quantize to the same number of shards
```

#### Example

```
# Default quantization type (q4_0) and output file in current directory
nexa convert meta-llama/Llama-3.2-1B-Instruct

# Equivalent to:
# nexa convert meta-llama/Llama-3.2-1B-Instruct q4_0 ./Llama-3.2-1B-Instruct-q4_0.gguf

# Specifying quantization type and output file
nexa convert meta-llama/Llama-3.2-1B-Instruct q6_k llama3.2-1b-instruct-q6_k.gguf
```

Note: If not specified, the quantization type defaults to q4_0 and the output file is created in the current directory as `<model_name>-q4_0.gguf`.
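
For finer control, the optional flags above can be combined in a single invocation. The command below is an illustrative sketch built from the flags documented in this section; the thread count and embedding type are arbitrary example values:

```
# Quantize to q5_k_m with 8 threads, keeping token embeddings at q8_0
nexa convert meta-llama/Llama-3.2-1B-Instruct q5_k_m llama3.2-1b-instruct-q5_k_m.gguf --nthread 8 --token_embedding_type q8_0
```

After a successful conversion the CLI offers to run the resulting model; the equivalent manual command is `nexa run <output_file> -lp -mt NLP`.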

### Start Local Server

Start a local server using models on your local computer.
@@ -329,3 +387,7 @@ For `model_path` in nexa commands, it's better to follow the standard format to
- `gemma-2b:q4_0`
- `Meta-Llama-3-8B-Instruct:onnx-cpu-int8`
- `liuhaotian/llava-v1.6-vicuna-7b:gguf-q4_0`

166 changes: 87 additions & 79 deletions README.md

Large diffs are not rendered by default.

99 changes: 89 additions & 10 deletions nexa/cli/entry.py
@@ -2,6 +2,7 @@
import os
from nexa import __version__
from nexa.constants import ModelType
import json


def _choose_files(local_path):
@@ -260,6 +261,54 @@ def run_embedding_generation(args):
print(f"Error generating embedding: {e}")
print("Please refer to our docs to install nexaai package: https://docs.nexaai.com/getting-started/installation")

def run_convert(args):
input_path = args.input_path

# Check if input_path is a valid directory
if not os.path.isdir(input_path):
from nexa.general import download_repo_from_hf
success, local_path = download_repo_from_hf(input_path)

if success:
input_path = local_path
else:
print("Error: Failed to download the repository and the provided path is not a valid directory.")
return

# Input_path here should be a valid directory
kwargs = {k: v for k, v in vars(args).items() if v is not None and k not in ['input_path', 'ftype', 'output_file', 'convert_type']}

try:
from nexa.gguf.converter.nexa_convert import convert_hf_to_quantized_gguf
converted_path = convert_hf_to_quantized_gguf(
input_path,
output_file=args.output_file,
ftype=args.ftype,
convert_type=args.convert_type,
**kwargs
)
if converted_path:
print(f"Conversion completed successfully. Output file: {converted_path}")

# Ask user if they want to run the converted model
user_choice = input("Would you like to run the converted model? (y/N) (Currently only supports NLP): ").strip().lower()
if user_choice == 'y':
try:
import subprocess
command = f"nexa run {converted_path} -lp -mt NLP"
print(f"Running command: {command}")
subprocess.run(command.split(), check=True, text=True)
except subprocess.CalledProcessError as e:
print("Error running the converted model.")
print("Change model type with -mt to run the model correctly. Or refer to our docs: https://docs.nexa.ai/sdk/cli-reference")
else:
print("Exiting without running the model.")
return
else:
print("Conversion failed.")
except Exception as e:
print(f"Error during conversion: {e}")

def main():
parser = argparse.ArgumentParser(description="Nexa CLI tool for handling various model operations.")
parser.add_argument("-V", "--version", action="version", version=__version__, help="Show the version of the Nexa SDK.")
@@ -336,6 +385,43 @@ def main():
onnx_voice_group = onnx_parser.add_argument_group('Voice generation options')
onnx_voice_group.add_argument("-o", "--output_dir", type=str, default="voice_output", help="Output directory for audio processing")
onnx_voice_group.add_argument("-r", "--sampling_rate", type=int, default=16000, help="Sampling rate for audio processing")

# Embed command
embed_parser = subparsers.add_parser("embed", help="Generate embeddings for a given prompt.")
embed_parser.add_argument("model_path", type=str, help="Path or identifier for the model in Nexa Model Hub")
embed_parser.add_argument("prompt", type=str, help="The prompt to generate an embedding for")
embed_parser.add_argument("-lp", "--local_path", action="store_true", help="Indicate that the model path provided is the local path")
embed_parser.add_argument("-hf", "--huggingface", action="store_true", help="Load model from Hugging Face Hub")
embed_parser.add_argument("-n", "--normalize", action="store_true", help="Normalize the embeddings")
embed_parser.add_argument("-nt", "--no_truncate", action="store_true", help="Not truncate the embeddings")

# Convert command
convert_parser = subparsers.add_parser("convert", help="Convert and quantize a Hugging Face model to GGUF format.")
convert_parser.add_argument("input_path", type=str, help="Path to the input Hugging Face model directory or GGUF file")
convert_parser.add_argument("ftype", nargs='?', type=str, default="q4_0", help="Quantization type (default: q4_0)")
convert_parser.add_argument("output_file", nargs='?', type=str, help="Path to the output quantized GGUF file")

convert_hf_parser = convert_parser.add_argument_group('Convert from safetensors options')
convert_hf_parser.add_argument("--convert_type", type=str, default="f16", help="Conversion type for safetensors to GGUF (default: f16)")
convert_hf_parser.add_argument("--bigendian", action="store_true", help="Use big endian format")
convert_hf_parser.add_argument("--use_temp_file", action="store_true", help="Use a temporary file during conversion")
convert_hf_parser.add_argument("--no_lazy", action="store_true", help="Disable lazy loading")
convert_hf_parser.add_argument("--metadata", type=json.loads, help="Additional metadata as JSON string")
convert_hf_parser.add_argument("--split_max_tensors", type=int, default=0, help="Maximum number of tensors per split")
convert_hf_parser.add_argument("--split_max_size", type=str, default="0", help="Maximum size per split")
convert_hf_parser.add_argument("--no_tensor_first_split", action="store_true", help="Disable tensor-first splitting")
convert_hf_parser.add_argument("--vocab_only", action="store_true", help="Only process vocabulary")
convert_hf_parser.add_argument("--dry_run", action="store_true", help="Perform a dry run without actual conversion")

quantization_parser = convert_parser.add_argument_group('Quantization options')
quantization_parser.add_argument("--nthread", type=int, default=4, help="Number of threads to use (default: 4)")
quantization_parser.add_argument("--output_tensor_type", type=str, help="Output tensor type")
quantization_parser.add_argument("--token_embedding_type", type=str, help="Token embedding type")
quantization_parser.add_argument("--allow_requantize", action="store_true", help="Allow quantizing non-f32/f16 tensors")
quantization_parser.add_argument("--quantize_output_tensor", action="store_true", help="Quantize output.weight")
quantization_parser.add_argument("--only_copy", action="store_true", help="Only copy tensors (ignores ftype, allow_requantize, and quantize_output_tensor)")
quantization_parser.add_argument("--pure", action="store_true", help="Quantize all tensors to the default type")
quantization_parser.add_argument("--keep_split", action="store_true", help="Quantize to the same number of shards")

# GGML server parser
server_parser = subparsers.add_parser("server", help="Run the Nexa AI Text Generation Service")
@@ -378,15 +464,6 @@ def main():
perf_eval_group.add_argument("--device", type=str, help="Device to run performance evaluation on, choose from 'cpu', 'cuda', 'mps'", default="cpu")
perf_eval_group.add_argument("--new_tokens", type=int, help="Number of new tokens to evaluate", default=100)

# Embed command
embed_parser = subparsers.add_parser("embed", help="Generate embeddings for a given prompt.")
embed_parser.add_argument("model_path", type=str, help="Path or identifier for the model in Nexa Model Hub")
embed_parser.add_argument("prompt", type=str, help="The prompt to generate an embedding for")
embed_parser.add_argument("-lp", "--local_path", action="store_true", help="Indicate that the model path provided is the local path")
embed_parser.add_argument("-hf", "--huggingface", action="store_true", help="Load model from Hugging Face Hub")
embed_parser.add_argument("-n", "--normalize", action="store_true", help="Normalize the embeddings")
embed_parser.add_argument("-nt", "--no_truncate", action="store_true", help="Not truncate the embeddings")

args = parser.parse_args()

if args.command == "run":
@@ -418,6 +495,8 @@ def main():
from nexa.general import pull_model
hf = getattr(args, 'huggingface', False)
pull_model(args.model_path, hf)
elif args.command == "convert":
run_convert(args)
elif args.command == "remove":
from nexa.general import remove_model
remove_model(args.model_path)
@@ -440,4 +519,4 @@ def main():
parser.print_help()

if __name__ == "__main__":
main()
main()
150 changes: 150 additions & 0 deletions nexa/constants.py
@@ -404,3 +404,153 @@ class ModelType(Enum):
"all-MiniLM-L6-v2": ModelType.TEXT_EMBEDDING,
"all-MiniLM-L12-v2": ModelType.TEXT_EMBEDDING,
}

from nexa.gguf.llama.llama_cpp import (
LLAMA_FTYPE_ALL_F32,
LLAMA_FTYPE_MOSTLY_F16,
LLAMA_FTYPE_MOSTLY_Q4_0,
LLAMA_FTYPE_MOSTLY_Q4_1,
LLAMA_FTYPE_MOSTLY_Q8_0,
LLAMA_FTYPE_MOSTLY_Q5_0,
LLAMA_FTYPE_MOSTLY_Q5_1,
LLAMA_FTYPE_MOSTLY_Q2_K,
LLAMA_FTYPE_MOSTLY_Q3_K_S,
LLAMA_FTYPE_MOSTLY_Q3_K_M,
LLAMA_FTYPE_MOSTLY_Q3_K_L,
LLAMA_FTYPE_MOSTLY_Q4_K_S,
LLAMA_FTYPE_MOSTLY_Q4_K_M,
LLAMA_FTYPE_MOSTLY_Q5_K_S,
LLAMA_FTYPE_MOSTLY_Q5_K_M,
LLAMA_FTYPE_MOSTLY_Q6_K,
LLAMA_FTYPE_MOSTLY_IQ2_XXS,
LLAMA_FTYPE_MOSTLY_IQ2_XS,
LLAMA_FTYPE_MOSTLY_Q2_K_S,
LLAMA_FTYPE_MOSTLY_IQ3_XS,
LLAMA_FTYPE_MOSTLY_IQ3_XXS,
LLAMA_FTYPE_MOSTLY_IQ1_S,
LLAMA_FTYPE_MOSTLY_IQ4_NL,
LLAMA_FTYPE_MOSTLY_IQ3_S,
LLAMA_FTYPE_MOSTLY_IQ3_M,
LLAMA_FTYPE_MOSTLY_IQ2_S,
LLAMA_FTYPE_MOSTLY_IQ2_M,
LLAMA_FTYPE_MOSTLY_IQ4_XS,
LLAMA_FTYPE_MOSTLY_IQ1_M,
LLAMA_FTYPE_MOSTLY_BF16,
LLAMA_FTYPE_MOSTLY_Q4_0_4_4,
LLAMA_FTYPE_MOSTLY_Q4_0_4_8,
LLAMA_FTYPE_MOSTLY_Q4_0_8_8,
LLAMA_FTYPE_MOSTLY_TQ1_0,
LLAMA_FTYPE_MOSTLY_TQ2_0,
)
from nexa.gguf.llama.llama_cpp import (
GGML_TYPE_F32,
GGML_TYPE_F16,
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
GGML_TYPE_Q5_0,
GGML_TYPE_Q5_1,
GGML_TYPE_Q8_0,
GGML_TYPE_Q8_1,
GGML_TYPE_Q2_K,
GGML_TYPE_Q3_K,
GGML_TYPE_Q4_K,
GGML_TYPE_Q5_K,
GGML_TYPE_Q6_K,
GGML_TYPE_Q8_K,
GGML_TYPE_IQ2_XXS,
GGML_TYPE_IQ2_XS,
GGML_TYPE_IQ3_XXS,
GGML_TYPE_IQ1_S,
GGML_TYPE_IQ4_NL,
GGML_TYPE_IQ3_S,
GGML_TYPE_IQ2_S,
GGML_TYPE_IQ4_XS,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
GGML_TYPE_I64,
GGML_TYPE_F64,
GGML_TYPE_IQ1_M,
GGML_TYPE_BF16,
GGML_TYPE_Q4_0_4_4,
GGML_TYPE_Q4_0_4_8,
GGML_TYPE_Q4_0_8_8,
GGML_TYPE_COUNT,
)

# From quantize.cpp
# For mapping of general quantization options (ftypes)
LLAMA_QUANTIZATION_TYPES = {
"q4_0": LLAMA_FTYPE_MOSTLY_Q4_0,
"q4_1": LLAMA_FTYPE_MOSTLY_Q4_1,
"q5_0": LLAMA_FTYPE_MOSTLY_Q5_0,
"q5_1": LLAMA_FTYPE_MOSTLY_Q5_1,
"q8_0": LLAMA_FTYPE_MOSTLY_Q8_0,
"q2_k": LLAMA_FTYPE_MOSTLY_Q2_K,
"q3_k_s": LLAMA_FTYPE_MOSTLY_Q3_K_S,
"q3_k_m": LLAMA_FTYPE_MOSTLY_Q3_K_M,
"q3_k_l": LLAMA_FTYPE_MOSTLY_Q3_K_L,
"q4_k_s": LLAMA_FTYPE_MOSTLY_Q4_K_S,
"q4_k_m": LLAMA_FTYPE_MOSTLY_Q4_K_M,
"q5_k_s": LLAMA_FTYPE_MOSTLY_Q5_K_S,
"q5_k_m": LLAMA_FTYPE_MOSTLY_Q5_K_M,
"q6_k": LLAMA_FTYPE_MOSTLY_Q6_K,
"iq2_xxs": LLAMA_FTYPE_MOSTLY_IQ2_XXS,
"iq2_xs": LLAMA_FTYPE_MOSTLY_IQ2_XS,
"q2_k_s": LLAMA_FTYPE_MOSTLY_Q2_K_S,
"iq3_xs": LLAMA_FTYPE_MOSTLY_IQ3_XS,
"iq3_xxs": LLAMA_FTYPE_MOSTLY_IQ3_XXS,
"iq1_s": LLAMA_FTYPE_MOSTLY_IQ1_S,
"iq4_nl": LLAMA_FTYPE_MOSTLY_IQ4_NL,
"iq3_s": LLAMA_FTYPE_MOSTLY_IQ3_S,
"iq3_m": LLAMA_FTYPE_MOSTLY_IQ3_M,
"iq2_s": LLAMA_FTYPE_MOSTLY_IQ2_S,
"iq2_m": LLAMA_FTYPE_MOSTLY_IQ2_M,
"iq4_xs": LLAMA_FTYPE_MOSTLY_IQ4_XS,
"iq1_m": LLAMA_FTYPE_MOSTLY_IQ1_M,
"f16": LLAMA_FTYPE_MOSTLY_F16,
"f32": LLAMA_FTYPE_ALL_F32,
"bf16": LLAMA_FTYPE_MOSTLY_BF16,
"q4_0_4_4": LLAMA_FTYPE_MOSTLY_Q4_0_4_4,
"q4_0_4_8": LLAMA_FTYPE_MOSTLY_Q4_0_4_8,
"q4_0_8_8": LLAMA_FTYPE_MOSTLY_Q4_0_8_8,
"tq1_0": LLAMA_FTYPE_MOSTLY_TQ1_0,
"tq2_0": LLAMA_FTYPE_MOSTLY_TQ2_0,
}

# From ggml.h
# For mapping of output_tensor_type and token_embedding_type only
GGML_TYPES = {
"f32": GGML_TYPE_F32,
"f16": GGML_TYPE_F16,
"q4_0": GGML_TYPE_Q4_0,
"q4_1": GGML_TYPE_Q4_1,
"q5_0": GGML_TYPE_Q5_0,
"q5_1": GGML_TYPE_Q5_1,
"q8_0": GGML_TYPE_Q8_0,
"q8_1": GGML_TYPE_Q8_1,
"q2_k": GGML_TYPE_Q2_K,
"q3_k": GGML_TYPE_Q3_K,
"q4_k": GGML_TYPE_Q4_K,
"q5_k": GGML_TYPE_Q5_K,
"q6_k": GGML_TYPE_Q6_K,
"q8_k": GGML_TYPE_Q8_K,
"iq2_xxs": GGML_TYPE_IQ2_XXS,
"iq2_xs": GGML_TYPE_IQ2_XS,
"iq3_xxs": GGML_TYPE_IQ3_XXS,
"iq1_s": GGML_TYPE_IQ1_S,
"iq4_nl": GGML_TYPE_IQ4_NL,
"iq3_s": GGML_TYPE_IQ3_S,
"iq2_s": GGML_TYPE_IQ2_S,
"iq4_xs": GGML_TYPE_IQ4_XS,
"i8": GGML_TYPE_I8,
"i16": GGML_TYPE_I16,
"i32": GGML_TYPE_I32,
"i64": GGML_TYPE_I64,
"f64": GGML_TYPE_F64,
"iq1_m": GGML_TYPE_IQ1_M,
"bf16": GGML_TYPE_BF16,
"q4_0_4_4": GGML_TYPE_Q4_0_4_4,
"q4_0_4_8": GGML_TYPE_Q4_0_4_8,
"q4_0_8_8": GGML_TYPE_Q4_0_8_8,
}