diff --git a/.gitignore b/.gitignore index 00d554d9..58f3930d 100644 --- a/.gitignore +++ b/.gitignore @@ -86,4 +86,7 @@ build_*/ *.sqlite # Other -.cache/ \ No newline at end of file +.cache/ + +# tests +quantization_test.py \ No newline at end of file diff --git a/CLI.md b/CLI.md index aad72de0..2a706353 100644 --- a/CLI.md +++ b/CLI.md @@ -13,6 +13,7 @@ positional arguments: run Run inference for various tasks using GGUF models. onnx Run inference for various tasks using ONNX models. embed Generate embeddings for text. + convert Convert and quantize a Hugging Face model to GGUF format. server Run the Nexa AI Text Generation Service. eval Run the Nexa AI Evaluation Tasks. pull Pull a model from official or hub. @@ -268,6 +269,63 @@ nexa embed nomic-embed-text-v1.5:fp16 "I love Nexa AI." nexa embed sentence-transformers/all-MiniLM-L6-v2:gguf-fp16 "I love Nexa AI." >> generated_embeddings.txt ``` +### Convert and quantize a Hugging Face Model to GGUF + +``` +nexa convert HF_MODEL_PATH [ftype] [output_file] +usage: nexa convert [-h] [-t NTHREAD] [--convert_type CONVERT_TYPE] [--bigendian] [--use_temp_file] [--no_lazy] + [--metadata METADATA] [--split_max_tensors SPLIT_MAX_TENSORS] [--split_max_size SPLIT_MAX_SIZE] + [--no_tensor_first_split] [--vocab_only] [--dry_run] [--output_tensor_type OUTPUT_TENSOR_TYPE] + [--token_embedding_type TOKEN_EMBEDDING_TYPE] [--allow_requantize] [--quantize_output_tensor] + [--only_copy] [--pure] [--keep_split] input_path [ftype] [output_file] + +positional arguments: + input_path Path to the input Hugging Face model directory or GGUF file + ftype Quantization type (default: q4_0) + output_file Path to the output quantized GGUF file + +options: + -h, --help show this help message and exit + -t, --nthread NTHREAD Number of threads to use (default: 4) + --convert_type CONVERT_TYPE + Conversion type for safetensors to GGUF (default: f16) + --bigendian Use big endian format + --use_temp_file Use a temporary file during conversion + --no_lazy Disable lazy loading + --metadata METADATA Additional metadata as JSON string + --split_max_tensors SPLIT_MAX_TENSORS + Maximum number of tensors per split + --split_max_size SPLIT_MAX_SIZE + --no_tensor_first_split + Disable tensor-first splitting + --vocab_only Only process vocabulary + --dry_run Perform a dry run without actual conversion + --output_tensor_type Output tensor type + --token_embedding_type + Token embedding type + --allow_requantize Allow quantizing non-f32/f16 tensors + --quantize_output_tensor + Quantize output.weight + --only_copy Only copy tensors (ignores ftype, allow_requantize, and quantize_output_tensor) + --pure Quantize all tensors to the default type + --keep_split Quantize to the same number of shards +``` + +#### Example + +``` +# Default quantization type (q4_0) and output file in current directory +nexa convert meta-llama/Llama-3.2-1B-Instruct + +# Equivalent to: +# nexa convert meta-llama/Llama-3.2-1B-Instruct q4_0 ./Llama-3.2-1B-Instruct-q4_0.gguf + +# Specifying quantization type and output file +nexa convert meta-llama/Llama-3.2-1B-Instruct q6_k llama3.2-1b-instruct-q6_k.gguf +``` + +Note: When not specified, the default quantization type is set to q4_0, and the output file will be created in the current directory with the name format: `-q4_0.gguf`. + ### Start Local Server Start a local server using models on your local computer. 
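The `nexa convert` command documented in the hunk above is a thin CLI wrapper around the converter module this PR adds (`nexa/gguf/converter/nexa_convert.py`). For reference, a minimal sketch of driving the same flow from Python; the model directory and output filename are illustrative, and the directory is assumed to already hold a downloaded `.safetensors` checkpoint (the CLI additionally falls back to `download_repo_from_hf` when the argument is not a local path, which this sketch skips):

```python
# Sketch only: exercises the converter introduced in this PR.
# "./Llama-3.2-1B-Instruct" and the output filename are placeholders.
from nexa.gguf.converter.nexa_convert import convert_hf_to_quantized_gguf

out = convert_hf_to_quantized_gguf(
    "./Llama-3.2-1B-Instruct",                      # local HF model dir (.safetensors) or an existing .gguf file
    output_file="Llama-3.2-1B-Instruct-q6_k.gguf",  # where the quantized model is written
    ftype="q6_k",                                   # quantization type, same values as the CLI's ftype argument
    convert_type="f16",                             # intermediate safetensors -> GGUF precision
)
print(out if out else "conversion failed")          # returns the output path on success, None otherwise
```

Note that this path requires the optional `nexa-gguf` dependency declared under the new `convert` extra in `pyproject.toml`.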
@@ -329,3 +387,7 @@ For `model_path` in nexa commands, it's better to follow the standard format to - `gemma-2b:q4_0` - `Meta-Llama-3-8B-Instruct:onnx-cpu-int8` - `liuhaotian/llava-v1.6-vicuna-7b:gguf-q4_0` + +``` + +``` diff --git a/README.md b/README.md index d6be39be..b9716afd 100644 --- a/README.md +++ b/README.md @@ -24,38 +24,43 @@ Nexa SDK is a comprehensive toolkit for supporting **ONNX** and **GGML** models. It supports text generation, image generation, vision-language models (VLM), and speech-to-text (ASR), and text-to-speech (TTS) capabilities. Additionally, it offers an OpenAI-compatible API server with JSON schema mode for function calling and streaming support, and a user-friendly Streamlit UI. Users can run Nexa SDK in any device with Python environment, and GPU acceleration is supported, including CUDA, Metal, and ROCm. An executable version is also available. ## Latest News 🔥 -* [2024/10] Support embedding model: `nexa embed ` -* [2024/10] Support pull and run supported Computer Vision models in GGUF format from HuggingFace: `nexa run -hf -mt COMPUTER_VISION` -* [2024/10] Support VLM in local server. -* [2024/10] Added option to customize maximum context window for NLP and VLM models. -* [2024/10] Support running model from user's local path -* [2024/10] Added LoRA support for NLP models. -* [2024/10] Added support for whisper-large-v3-turbo: `nexa run faster-whisper-large-turbo` -* [2024/10] Added support for AMD-Llama-135m: `nexa run AMD-Llama-135m:fp16` -* [2024/09] Nexa now has executables for easy installation: [Install Nexa SDK](https://nexaai.com/download-sdk) ✨ -* [2024/09] Added support for Llama 3.2 models: `nexa run llama3.2` -* [2024/09] Added support for Qwen2.5, Qwen2.5-coder and Qwen2.5-Math models: `nexa run qwen2.5` -* [2024/09] Support pull and run NLP models in GGUF format from HuggingFace: `nexa run -hf -mt NLP` -* [2024/09] Added support for ROCm -* [2024/09] Added support for Phi-3.5 models: `nexa run phi3.5` -* [2024/09] Added support for OpenELM models: `nexa run openelm` -* [2024/09] Introduced logits API support for more advanced model interactions -* [2024/09] Added support for Flux models: `nexa run flux` -* [2024/09] Added support for Stable Diffusion 3 model: `nexa run sd3` -* [2024/09] Added support for Stable Diffusion 2.1 model: `nexa run sd2-1` + +- [2024/10] Support embedding model: `nexa embed ` +- [2024/10] Support pull and run supported Computer Vision models in GGUF format from HuggingFace: `nexa run -hf -mt COMPUTER_VISION` +- [2024/10] Support VLM in local server. +- [2024/10] Added option to customize maximum context window for NLP and VLM models. +- [2024/10] Support running model from user's local path +- [2024/10] Added LoRA support for NLP models. 
+- [2024/10] Added support for whisper-large-v3-turbo: `nexa run faster-whisper-large-turbo` +- [2024/10] Added support for AMD-Llama-135m: `nexa run AMD-Llama-135m:fp16` +- [2024/09] Nexa now has executables for easy installation: [Install Nexa SDK](https://nexaai.com/download-sdk) ✨ +- [2024/09] Added support for Llama 3.2 models: `nexa run llama3.2` +- [2024/09] Added support for Qwen2.5, Qwen2.5-coder and Qwen2.5-Math models: `nexa run qwen2.5` +- [2024/09] Support pull and run NLP models in GGUF format from HuggingFace: `nexa run -hf -mt NLP` +- [2024/09] Added support for ROCm +- [2024/09] Added support for Phi-3.5 models: `nexa run phi3.5` +- [2024/09] Added support for OpenELM models: `nexa run openelm` +- [2024/09] Introduced logits API support for more advanced model interactions +- [2024/09] Added support for Flux models: `nexa run flux` +- [2024/09] Added support for Stable Diffusion 3 model: `nexa run sd3` +- [2024/09] Added support for Stable Diffusion 2.1 model: `nexa run sd2-1` Welcome to submit your requests through [issues](https://github.com/NexaAI/nexa-sdk/issues/new/choose), we ship weekly. ## Installation - Executable ### macOS + [Download](https://public-storage.nexa4ai.com/nexa-sdk-executable-installer/nexa-macos-installer.pkg) ### Linux + ```bash -curl -fsSL https://public-storage.nexa4ai.com/install.sh | sh +curl -fsSL https://public-storage.nexa4ai.com/install.sh | sh ``` + ### Windows + Coming soon. Install with Python package below 👇 ## Installation - Python Package @@ -65,7 +70,8 @@ We have released pre-built wheels for various Python versions, platforms, and ba > [!NOTE] > > 1. If you want to use ONNX model, just replace `pip install nexaai` with `pip install "nexaai[onnx]"` in provided commands. -> 2. For Chinese developers, we recommend you to use Tsinghua Open Source Mirror as extra index url, just replace `--extra-index-url https://pypi.org/simple` with `--extra-index-url https://pypi.tuna.tsinghua.edu.cn/simple` in provided commands. +> 2. If you want to convert and quantize huggingface models to GGUF models, just replace `pip install nexaai` with `pip install "nexaai[nexa-gguf]"`. +> 3. For Chinese developers, we recommend you to use Tsinghua Open Source Mirror as extra index url, just replace `--extra-index-url https://pypi.org/simple` with `--extra-index-url https://pypi.tuna.tsinghua.edu.cn/simple` in provided commands. #### CPU @@ -199,68 +205,69 @@ Below is our differentiation from other similar tools: | **User Interface** | ✅ | ❌ | ❌ | ✅ | ## Supported Models & Model Hub + Our on-device model hub offers all types of quantized models (text, image, audio, multimodal) with filters for RAM, file size, Tasks, etc. to help you easily explore models with UI. 
Explore on-device models at [On-device Model Hub](https://model-hub.nexa4ai.com/) Supported models (full list at [Model Hub](https://nexa.ai/models)): -| Model | Type | Format | Command | +| Model | Type | Format | Command | | ------------------------------------------------------------------------------------------------------- | --------------- | --------- | -------------------------------------- | -| [octopus-v2](https://www.nexaai.com/NexaAI/Octopus-v2/gguf-q4_0/readme) | NLP | GGUF | `nexa run octopus-v2` | -| [octopus-v4](https://www.nexaai.com/NexaAI/Octopus-v4/gguf-q4_0/readme) | NLP | GGUF | `nexa run octopus-v4` | -| [gpt2](https://nexaai.com/openai/gpt2/gguf-q4_0/readme) | NLP | GGUF | `nexa run gpt2` | -| [tinyllama](https://www.nexaai.com/TinyLlama/TinyLlama-1.1B-Chat-v1.0/gguf-fp16/readme) | NLP | GGUF | `nexa run tinyllama` | -| [llama2](https://www.nexaai.com/meta/Llama2-7b-chat/gguf-q4_0/readme) | NLP | GGUF/ONNX | `nexa run llama2` | -| [llama2-uncensored](https://www.nexaai.com/georgesung/Llama2-7b-chat-uncensored/gguf-q4_0/readme) | NLP | GGUF | `nexa run llama2-uncensored` | -| [llama2-function-calling](https://www.nexaai.com/Trelis/Llama2-7b-function-calling/gguf-q4_K_M/readme) | NLP | GGUF | `nexa run llama2-function-calling` | -| [llama3](https://www.nexaai.com/meta/Llama3-8B-Instruct/gguf-q4_0/readme) | NLP | GGUF/ONNX | `nexa run llama3` | -| [llama3.1](https://www.nexaai.com/meta/Llama3.1-8B-Instruct/gguf-q4_0/readme) | NLP | GGUF/ONNX | `nexa run llama3.1` | -| [llama3.2](https://nexaai.com/meta/Llama3.2-3B-Instruct/gguf-q4_0/readme) | NLP | GGUF | `nexa run llama3.2` | -| [llama3-uncensored](https://www.nexaai.com/Orenguteng/Llama3-8B-Lexi-Uncensored/gguf-q4_K_M/readme) | NLP | GGUF | `nexa run llama3-uncensored` | -| [gemma](https://www.nexaai.com/google/gemma-1.1-2b-instruct/gguf-q4_0/readme) | NLP | GGUF/ONNX | `nexa run gemma` | -| [gemma2](https://www.nexaai.com/google/gemma-2-2b-instruct/gguf-q4_0/readme) | NLP | GGUF | `nexa run gemma2` | -| [qwen1.5](https://www.nexaai.com/Qwen/Qwen1.5-7B-Instruct/gguf-q4_0/readme) | NLP | GGUF | `nexa run qwen1.5` | -| [qwen2](https://www.nexaai.com/Qwen/Qwen2-1.5B-Instruct/gguf-q4_0/readme) | NLP | GGUF/ONNX | `nexa run qwen2` | -| [qwen2.5](https://www.nexaai.com/Qwen/Qwen2.5-1.5B-Instruct/gguf-q4_0/readme) | NLP | GGUF | `nexa run qwen2.5` | -| [mathqwen](https://nexaai.com/Qwen/Qwen2.5-Math-1.5B-Instruct/gguf-q4_0/readme) | NLP | GGUF | `nexa run mathqwen` | -| [codeqwen](https://www.nexaai.com/Qwen/CodeQwen1.5-7B-Instruct/gguf-q4_0/readme) | NLP | GGUF | `nexa run codeqwen` | -| [mistral](https://www.nexaai.com/mistralai/Mistral-7B-Instruct-v0.3/gguf-q4_0/readme) | NLP | GGUF/ONNX | `nexa run mistral` | -| [dolphin-mistral](https://www.nexaai.com/CognitiveComputations/dolphin-2.8-mistral-7b/gguf-q4_0/readme) | NLP | GGUF | `nexa run dolphin-mistral` | -| [codegemma](https://www.nexaai.com/google/codegemma-2b/gguf-q4_0/readme) | NLP | GGUF | `nexa run codegemma` | -| [codellama](https://www.nexaai.com/meta/CodeLlama-7b-Instruct/gguf-q2_K/readme) | NLP | GGUF | `nexa run codellama` | -| [deepseek-coder](https://www.nexaai.com/DeepSeek/deepseek-coder-1.3b-instruct/gguf-q4_0/readme) | NLP | GGUF | `nexa run deepseek-coder` | -| [phi2](https://www.nexaai.com/microsoft/Phi-2/gguf-q4_0/readme) | NLP | GGUF | `nexa run phi2` | -| [phi3](https://www.nexaai.com/microsoft/Phi-3-mini-128k-instruct/gguf-q4_0/readme) | NLP | GGUF/ONNX | `nexa run phi3` | -| 
[phi3.5](https://nexaai.com/microsoft/Phi-3.5-mini-instruct/gguf-q4_0/readme) | NLP | GGUF | `nexa run phi3.5` | -| [openelm](https://nexaai.com/apple/OpenELM-3B/gguf-q4_K_M/readme) | NLP | GGUF | `nexa run openelm` | -| [AMD-Llama-135m](https://nexaai.com/amd/AMD-Llama-135m/gguf-fp16/readme) | NLP | GGUF | `nexa run AMD-Llama-135m:fp16` | -| [nanollava](https://www.nexaai.com/qnguyen3/nanoLLaVA/gguf-fp16/readme) | Multimodal | GGUF | `nexa run nanollava` | -| [llava-phi3](https://www.nexaai.com/xtuner/llava-phi-3-mini/gguf-q4_0/readme) | Multimodal | GGUF | `nexa run llava-phi3` | -| [llava-llama3](https://www.nexaai.com/xtuner/llava-llama-3-8b-v1.1/gguf-q4_0/readme) | Multimodal | GGUF | `nexa run llava-llama3` | -| [llava1.6-mistral](https://www.nexaai.com/liuhaotian/llava-v1.6-mistral-7b/gguf-q4_0/readme) | Multimodal | GGUF | `nexa run llava1.6-mistral` | -| [llava1.6-vicuna](https://www.nexaai.com/liuhaotian/llava-v1.6-vicuna-7b/gguf-q4_0/readme) | Multimodal | GGUF | `nexa run llava1.6-vicuna` | -| [stable-diffusion-v1-4](https://www.nexaai.com/runwayml/stable-diffusion-v1-4/gguf-q4_0/readme) | Computer Vision | GGUF | `nexa run sd1-4` | -| [stable-diffusion-v1-5](https://www.nexaai.com/runwayml/stable-diffusion-v1-5/gguf-q4_0/readme) | Computer Vision | GGUF/ONNX | `nexa run sd1-5` | -| [stable-diffusion-v2-1](https://nexaai.com/StabilityAI/stable-diffusion-v2-1/gguf-q4_0/readme) | Computer Vision | GGUF | `nexa run sd2-1` | -| [stable-diffusion-3-medium](https://nexaai.com/StabilityAI/stable-diffusion-3-medium/gguf-q4_0/readme) | Computer Vision | GGUF | `nexa run sd3` | -| [FLUX.1-schnell](https://nexaai.com/BlackForestLabs/FLUX.1-schnell/gguf-q4_0/readme) | Computer Vision | GGUF | `nexa run flux` | -| [lcm-dreamshaper](https://www.nexaai.com/SimianLuo/lcm-dreamshaper-v7/gguf-fp16/readme) | Computer Vision | GGUF/ONNX | `nexa run lcm-dreamshaper` | -| [hassaku-lcm](https://nexaai.com/stablediffusionapi/hassaku-hentai-model-v13-LCM/gguf-fp16/readme) | Computer Vision | GGUF | `nexa run hassaku-lcm` | -| [anything-lcm](https://www.nexaai.com/Linaqruf/anything-v30-LCM/gguf-fp16/readme) | Computer Vision | GGUF | `nexa run anything-lcm` | -| [faster-whisper-tiny](https://www.nexaai.com/Systran/faster-whisper-tiny/bin-cpu-fp16/readme) | Audio | BIN | `nexa run faster-whisper-tiny` | -| [faster-whisper-small](https://www.nexaai.com/Systran/faster-whisper-small/bin-cpu-fp16/readme) | Audio | BIN | `nexa run faster-whisper-small` | -| [faster-whisper-medium](https://www.nexaai.com/Systran/faster-whisper-medium/bin-cpu-fp16/readme) | Audio | BIN | `nexa run faster-whisper-medium` | -| [faster-whisper-base](https://www.nexaai.com/Systran/faster-whisper-base/bin-cpu-fp16/readme) | Audio | BIN | `nexa run faster-whisper-base` | -| [faster-whisper-large](https://www.nexaai.com/Systran/faster-whisper-large-v3/bin-cpu-fp16/readme) | Audio | BIN | `nexa run faster-whisper-large` | -| [whisper-large-v3-turbo](https://nexaai.com/Systran/faster-whisper-large-v3-turbo/bin-cpu-fp16/readme) | Audio | BIN | `nexa run faster-whisper-large-turbo` | -| [whisper-tiny.en](https://nexaai.com/openai/whisper-tiny.en/onnx-cpu-fp32/readme) | Audio | ONNX | `nexa run whisper-tiny.en` | -| [whisper-tiny](https://nexaai.com/openai/whisper-tiny/onnx-cpu-fp32/readme) | Audio | ONNX | `nexa run whisper-tiny` | -| [whisper-small.en](https://nexaai.com/openai/whisper-small.en/onnx-cpu-fp32/readme) | Audio | ONNX | `nexa run whisper-small.en` | -| 
[whisper-small](https://nexaai.com/openai/whisper-small/onnx-cpu-fp32/readme) | Audio | ONNX | `nexa run whisper-small` | -| [whisper-base.en](https://nexaai.com/openai/whisper-base.en/onnx-cpu-fp32/readme) | Audio | ONNX | `nexa run whisper-base.en` | -| [whisper-base](https://nexaai.com/openai/whisper-base/onnx-cpu-fp32/readme) | Audio | ONNX | `nexa run whisper-base` | -| [mxbai-embed-large-v1](https://nexa.ai/mixedbread-ai/mxbai-embed-large-v1/gguf-fp16/readme) | Embedding | GGUF | `nexa embed mxbai` | -| [nomic-embed-text-v1.5](https://nexa.ai/nomic-ai/nomic-embed-text-v1.5/gguf-fp16/readme) | Embedding | GGUF | `nexa embed nomic` | -| [all-MiniLM-L6-v2](https://nexa.ai/sentence-transformers/all-MiniLM-L6-v2/gguf-fp16/readme) | Embedding | GGUF | `nexa embed all-MiniLM-L6-v2:fp16` | -| [all-MiniLM-L12-v2](https://nexa.ai/sentence-transformers/all-MiniLM-L12-v2/gguf-fp16/readme) | Embedding | GGUF | `nexa embed all-MiniLM-L12-v2:fp16` | +| [octopus-v2](https://www.nexaai.com/NexaAI/Octopus-v2/gguf-q4_0/readme) | NLP | GGUF | `nexa run octopus-v2` | +| [octopus-v4](https://www.nexaai.com/NexaAI/Octopus-v4/gguf-q4_0/readme) | NLP | GGUF | `nexa run octopus-v4` | +| [gpt2](https://nexaai.com/openai/gpt2/gguf-q4_0/readme) | NLP | GGUF | `nexa run gpt2` | +| [tinyllama](https://www.nexaai.com/TinyLlama/TinyLlama-1.1B-Chat-v1.0/gguf-fp16/readme) | NLP | GGUF | `nexa run tinyllama` | +| [llama2](https://www.nexaai.com/meta/Llama2-7b-chat/gguf-q4_0/readme) | NLP | GGUF/ONNX | `nexa run llama2` | +| [llama2-uncensored](https://www.nexaai.com/georgesung/Llama2-7b-chat-uncensored/gguf-q4_0/readme) | NLP | GGUF | `nexa run llama2-uncensored` | +| [llama2-function-calling](https://www.nexaai.com/Trelis/Llama2-7b-function-calling/gguf-q4_K_M/readme) | NLP | GGUF | `nexa run llama2-function-calling` | +| [llama3](https://www.nexaai.com/meta/Llama3-8B-Instruct/gguf-q4_0/readme) | NLP | GGUF/ONNX | `nexa run llama3` | +| [llama3.1](https://www.nexaai.com/meta/Llama3.1-8B-Instruct/gguf-q4_0/readme) | NLP | GGUF/ONNX | `nexa run llama3.1` | +| [llama3.2](https://nexaai.com/meta/Llama3.2-3B-Instruct/gguf-q4_0/readme) | NLP | GGUF | `nexa run llama3.2` | +| [llama3-uncensored](https://www.nexaai.com/Orenguteng/Llama3-8B-Lexi-Uncensored/gguf-q4_K_M/readme) | NLP | GGUF | `nexa run llama3-uncensored` | +| [gemma](https://www.nexaai.com/google/gemma-1.1-2b-instruct/gguf-q4_0/readme) | NLP | GGUF/ONNX | `nexa run gemma` | +| [gemma2](https://www.nexaai.com/google/gemma-2-2b-instruct/gguf-q4_0/readme) | NLP | GGUF | `nexa run gemma2` | +| [qwen1.5](https://www.nexaai.com/Qwen/Qwen1.5-7B-Instruct/gguf-q4_0/readme) | NLP | GGUF | `nexa run qwen1.5` | +| [qwen2](https://www.nexaai.com/Qwen/Qwen2-1.5B-Instruct/gguf-q4_0/readme) | NLP | GGUF/ONNX | `nexa run qwen2` | +| [qwen2.5](https://www.nexaai.com/Qwen/Qwen2.5-1.5B-Instruct/gguf-q4_0/readme) | NLP | GGUF | `nexa run qwen2.5` | +| [mathqwen](https://nexaai.com/Qwen/Qwen2.5-Math-1.5B-Instruct/gguf-q4_0/readme) | NLP | GGUF | `nexa run mathqwen` | +| [codeqwen](https://www.nexaai.com/Qwen/CodeQwen1.5-7B-Instruct/gguf-q4_0/readme) | NLP | GGUF | `nexa run codeqwen` | +| [mistral](https://www.nexaai.com/mistralai/Mistral-7B-Instruct-v0.3/gguf-q4_0/readme) | NLP | GGUF/ONNX | `nexa run mistral` | +| [dolphin-mistral](https://www.nexaai.com/CognitiveComputations/dolphin-2.8-mistral-7b/gguf-q4_0/readme) | NLP | GGUF | `nexa run dolphin-mistral` | +| [codegemma](https://www.nexaai.com/google/codegemma-2b/gguf-q4_0/readme) | NLP | GGUF | `nexa run codegemma` | +| 
[codellama](https://www.nexaai.com/meta/CodeLlama-7b-Instruct/gguf-q2_K/readme) | NLP | GGUF | `nexa run codellama` | +| [deepseek-coder](https://www.nexaai.com/DeepSeek/deepseek-coder-1.3b-instruct/gguf-q4_0/readme) | NLP | GGUF | `nexa run deepseek-coder` | +| [phi2](https://www.nexaai.com/microsoft/Phi-2/gguf-q4_0/readme) | NLP | GGUF | `nexa run phi2` | +| [phi3](https://www.nexaai.com/microsoft/Phi-3-mini-128k-instruct/gguf-q4_0/readme) | NLP | GGUF/ONNX | `nexa run phi3` | +| [phi3.5](https://nexaai.com/microsoft/Phi-3.5-mini-instruct/gguf-q4_0/readme) | NLP | GGUF | `nexa run phi3.5` | +| [openelm](https://nexaai.com/apple/OpenELM-3B/gguf-q4_K_M/readme) | NLP | GGUF | `nexa run openelm` | +| [AMD-Llama-135m](https://nexaai.com/amd/AMD-Llama-135m/gguf-fp16/readme) | NLP | GGUF | `nexa run AMD-Llama-135m:fp16` | +| [nanollava](https://www.nexaai.com/qnguyen3/nanoLLaVA/gguf-fp16/readme) | Multimodal | GGUF | `nexa run nanollava` | +| [llava-phi3](https://www.nexaai.com/xtuner/llava-phi-3-mini/gguf-q4_0/readme) | Multimodal | GGUF | `nexa run llava-phi3` | +| [llava-llama3](https://www.nexaai.com/xtuner/llava-llama-3-8b-v1.1/gguf-q4_0/readme) | Multimodal | GGUF | `nexa run llava-llama3` | +| [llava1.6-mistral](https://www.nexaai.com/liuhaotian/llava-v1.6-mistral-7b/gguf-q4_0/readme) | Multimodal | GGUF | `nexa run llava1.6-mistral` | +| [llava1.6-vicuna](https://www.nexaai.com/liuhaotian/llava-v1.6-vicuna-7b/gguf-q4_0/readme) | Multimodal | GGUF | `nexa run llava1.6-vicuna` | +| [stable-diffusion-v1-4](https://www.nexaai.com/runwayml/stable-diffusion-v1-4/gguf-q4_0/readme) | Computer Vision | GGUF | `nexa run sd1-4` | +| [stable-diffusion-v1-5](https://www.nexaai.com/runwayml/stable-diffusion-v1-5/gguf-q4_0/readme) | Computer Vision | GGUF/ONNX | `nexa run sd1-5` | +| [stable-diffusion-v2-1](https://nexaai.com/StabilityAI/stable-diffusion-v2-1/gguf-q4_0/readme) | Computer Vision | GGUF | `nexa run sd2-1` | +| [stable-diffusion-3-medium](https://nexaai.com/StabilityAI/stable-diffusion-3-medium/gguf-q4_0/readme) | Computer Vision | GGUF | `nexa run sd3` | +| [FLUX.1-schnell](https://nexaai.com/BlackForestLabs/FLUX.1-schnell/gguf-q4_0/readme) | Computer Vision | GGUF | `nexa run flux` | +| [lcm-dreamshaper](https://www.nexaai.com/SimianLuo/lcm-dreamshaper-v7/gguf-fp16/readme) | Computer Vision | GGUF/ONNX | `nexa run lcm-dreamshaper` | +| [hassaku-lcm](https://nexaai.com/stablediffusionapi/hassaku-hentai-model-v13-LCM/gguf-fp16/readme) | Computer Vision | GGUF | `nexa run hassaku-lcm` | +| [anything-lcm](https://www.nexaai.com/Linaqruf/anything-v30-LCM/gguf-fp16/readme) | Computer Vision | GGUF | `nexa run anything-lcm` | +| [faster-whisper-tiny](https://www.nexaai.com/Systran/faster-whisper-tiny/bin-cpu-fp16/readme) | Audio | BIN | `nexa run faster-whisper-tiny` | +| [faster-whisper-small](https://www.nexaai.com/Systran/faster-whisper-small/bin-cpu-fp16/readme) | Audio | BIN | `nexa run faster-whisper-small` | +| [faster-whisper-medium](https://www.nexaai.com/Systran/faster-whisper-medium/bin-cpu-fp16/readme) | Audio | BIN | `nexa run faster-whisper-medium` | +| [faster-whisper-base](https://www.nexaai.com/Systran/faster-whisper-base/bin-cpu-fp16/readme) | Audio | BIN | `nexa run faster-whisper-base` | +| [faster-whisper-large](https://www.nexaai.com/Systran/faster-whisper-large-v3/bin-cpu-fp16/readme) | Audio | BIN | `nexa run faster-whisper-large` | +| [whisper-large-v3-turbo](https://nexaai.com/Systran/faster-whisper-large-v3-turbo/bin-cpu-fp16/readme) | Audio | BIN | `nexa run 
faster-whisper-large-turbo` | +| [whisper-tiny.en](https://nexaai.com/openai/whisper-tiny.en/onnx-cpu-fp32/readme) | Audio | ONNX | `nexa run whisper-tiny.en` | +| [whisper-tiny](https://nexaai.com/openai/whisper-tiny/onnx-cpu-fp32/readme) | Audio | ONNX | `nexa run whisper-tiny` | +| [whisper-small.en](https://nexaai.com/openai/whisper-small.en/onnx-cpu-fp32/readme) | Audio | ONNX | `nexa run whisper-small.en` | +| [whisper-small](https://nexaai.com/openai/whisper-small/onnx-cpu-fp32/readme) | Audio | ONNX | `nexa run whisper-small` | +| [whisper-base.en](https://nexaai.com/openai/whisper-base.en/onnx-cpu-fp32/readme) | Audio | ONNX | `nexa run whisper-base.en` | +| [whisper-base](https://nexaai.com/openai/whisper-base/onnx-cpu-fp32/readme) | Audio | ONNX | `nexa run whisper-base` | +| [mxbai-embed-large-v1](https://nexa.ai/mixedbread-ai/mxbai-embed-large-v1/gguf-fp16/readme) | Embedding | GGUF | `nexa embed mxbai` | +| [nomic-embed-text-v1.5](https://nexa.ai/nomic-ai/nomic-embed-text-v1.5/gguf-fp16/readme) | Embedding | GGUF | `nexa embed nomic` | +| [all-MiniLM-L6-v2](https://nexa.ai/sentence-transformers/all-MiniLM-L6-v2/gguf-fp16/readme) | Embedding | GGUF | `nexa embed all-MiniLM-L6-v2:fp16` | +| [all-MiniLM-L12-v2](https://nexa.ai/sentence-transformers/all-MiniLM-L12-v2/gguf-fp16/readme) | Embedding | GGUF | `nexa embed all-MiniLM-L12-v2:fp16` | ## CLI Reference @@ -268,6 +275,7 @@ Here's a brief overview of the main CLI commands: - `nexa run`: Run inference for various tasks using GGUF models. - `nexa onnx`: Run inference for various tasks using ONNX models. +- `nexa convert`: Convert and quantize huggingface models to GGUF models. - `nexa server`: Run the Nexa AI Text Generation Service. - `nexa eval`: Run the Nexa AI Evaluation Tasks. - `nexa pull`: Pull a model from official or hub. diff --git a/nexa/cli/entry.py b/nexa/cli/entry.py index 85751627..d7ee4ee0 100644 --- a/nexa/cli/entry.py +++ b/nexa/cli/entry.py @@ -2,6 +2,7 @@ import os from nexa import __version__ from nexa.constants import ModelType +import json def _choose_files(local_path): @@ -260,6 +261,54 @@ def run_embedding_generation(args): print(f"Error generating embedding: {e}") print("Please refer to our docs to install nexaai package: https://docs.nexaai.com/getting-started/installation") +def run_convert(args): + input_path = args.input_path + + # Check if input_path is a valid directory + if not os.path.isdir(input_path): + from nexa.general import download_repo_from_hf + success, local_path = download_repo_from_hf(input_path) + + if success: + input_path = local_path + else: + print("Error: Failed to download the repository and the provided path is not a valid directory.") + return + + # Input_path here should be a valid directory + kwargs = {k: v for k, v in vars(args).items() if v is not None and k not in ['input_path', 'ftype', 'output_file', 'convert_type']} + + try: + from nexa.gguf.converter.nexa_convert import convert_hf_to_quantized_gguf + converted_path = convert_hf_to_quantized_gguf( + input_path, + output_file=args.output_file, + ftype=args.ftype, + convert_type=args.convert_type, + **kwargs + ) + if converted_path: + print(f"Conversion completed successfully. Output file: {converted_path}") + + # Ask user if they want to run the converted model + user_choice = input("Would you like to run the converted model? 
(y/N) (Currently only supports NLP): ").strip().lower() + if user_choice == 'y': + try: + import subprocess + command = f"nexa run {converted_path} -lp -mt NLP" + print(f"Running command: {command}") + subprocess.run(command.split(), check=True, text=True) + except subprocess.CalledProcessError as e: + print("Error running the converted model.") + print("Change model type with -mt to run the model correctly. Or refer to our docs: https://docs.nexa.ai/sdk/cli-reference") + else: + print("Exiting without running the model.") + return + else: + print("Conversion failed.") + except Exception as e: + print(f"Error during conversion: {e}") + def main(): parser = argparse.ArgumentParser(description="Nexa CLI tool for handling various model operations.") parser.add_argument("-V", "--version", action="version", version=__version__, help="Show the version of the Nexa SDK.") @@ -336,6 +385,43 @@ def main(): onnx_voice_group = onnx_parser.add_argument_group('Voice generation options') onnx_voice_group.add_argument("-o", "--output_dir", type=str, default="voice_output", help="Output directory for audio processing") onnx_voice_group.add_argument("-r", "--sampling_rate", type=int, default=16000, help="Sampling rate for audio processing") + + # Embed command + embed_parser = subparsers.add_parser("embed", help="Generate embeddings for a given prompt.") + embed_parser.add_argument("model_path", type=str, help="Path or identifier for the model in Nexa Model Hub") + embed_parser.add_argument("prompt", type=str, help="The prompt to generate an embedding for") + embed_parser.add_argument("-lp", "--local_path", action="store_true", help="Indicate that the model path provided is the local path") + embed_parser.add_argument("-hf", "--huggingface", action="store_true", help="Load model from Hugging Face Hub") + embed_parser.add_argument("-n", "--normalize", action="store_true", help="Normalize the embeddings") + embed_parser.add_argument("-nt", "--no_truncate", action="store_true", help="Not truncate the embeddings") + + # Convert command + convert_parser = subparsers.add_parser("convert", help="Convert and quantize a Hugging Face model to GGUF format.") + convert_parser.add_argument("input_path", type=str, help="Path to the input Hugging Face model directory or GGUF file") + convert_parser.add_argument("ftype", nargs='?', type=str, default="q4_0", help="Quantization type (default: q4_0)") + convert_parser.add_argument("output_file", nargs='?', type=str, help="Path to the output quantized GGUF file") + + convert_hf_parser = convert_parser.add_argument_group('Convert from safetensors options') + convert_hf_parser.add_argument("--convert_type", type=str, default="f16", help="Conversion type for safetensors to GGUF (default: f16)") + convert_hf_parser.add_argument("--bigendian", action="store_true", help="Use big endian format") + convert_hf_parser.add_argument("--use_temp_file", action="store_true", help="Use a temporary file during conversion") + convert_hf_parser.add_argument("--no_lazy", action="store_true", help="Disable lazy loading") + convert_hf_parser.add_argument("--metadata", type=json.loads, help="Additional metadata as JSON string") + convert_hf_parser.add_argument("--split_max_tensors", type=int, default=0, help="Maximum number of tensors per split") + convert_hf_parser.add_argument("--split_max_size", type=str, default="0", help="Maximum size per split") + convert_hf_parser.add_argument("--no_tensor_first_split", action="store_true", help="Disable tensor-first splitting") + 
convert_hf_parser.add_argument("--vocab_only", action="store_true", help="Only process vocabulary") + convert_hf_parser.add_argument("--dry_run", action="store_true", help="Perform a dry run without actual conversion") + + quantization_parser = convert_parser.add_argument_group('Quantization options') + quantization_parser.add_argument("--nthread", type=int, default=4, help="Number of threads to use (default: 4)") + quantization_parser.add_argument("--output_tensor_type", type=str, help="Output tensor type") + quantization_parser.add_argument("--token_embedding_type", type=str, help="Token embedding type") + quantization_parser.add_argument("--allow_requantize", action="store_true", help="Allow quantizing non-f32/f16 tensors") + quantization_parser.add_argument("--quantize_output_tensor", action="store_true", help="Quantize output.weight") + quantization_parser.add_argument("--only_copy", action="store_true", help="Only copy tensors (ignores ftype, allow_requantize, and quantize_output_tensor)") + quantization_parser.add_argument("--pure", action="store_true", help="Quantize all tensors to the default type") + quantization_parser.add_argument("--keep_split", action="store_true", help="Quantize to the same number of shards") # GGML server parser server_parser = subparsers.add_parser("server", help="Run the Nexa AI Text Generation Service") @@ -378,15 +464,6 @@ def main(): perf_eval_group.add_argument("--device", type=str, help="Device to run performance evaluation on, choose from 'cpu', 'cuda', 'mps'", default="cpu") perf_eval_group.add_argument("--new_tokens", type=int, help="Number of new tokens to evaluate", default=100) - # Embed command - embed_parser = subparsers.add_parser("embed", help="Generate embeddings for a given prompt.") - embed_parser.add_argument("model_path", type=str, help="Path or identifier for the model in Nexa Model Hub") - embed_parser.add_argument("prompt", type=str, help="The prompt to generate an embedding for") - embed_parser.add_argument("-lp", "--local_path", action="store_true", help="Indicate that the model path provided is the local path") - embed_parser.add_argument("-hf", "--huggingface", action="store_true", help="Load model from Hugging Face Hub") - embed_parser.add_argument("-n", "--normalize", action="store_true", help="Normalize the embeddings") - embed_parser.add_argument("-nt", "--no_truncate", action="store_true", help="Not truncate the embeddings") - args = parser.parse_args() if args.command == "run": @@ -418,6 +495,8 @@ def main(): from nexa.general import pull_model hf = getattr(args, 'huggingface', False) pull_model(args.model_path, hf) + elif args.command == "convert": + run_convert(args) elif args.command == "remove": from nexa.general import remove_model remove_model(args.model_path) @@ -440,4 +519,4 @@ def main(): parser.print_help() if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/nexa/constants.py b/nexa/constants.py index 65663b7e..c78c4a43 100644 --- a/nexa/constants.py +++ b/nexa/constants.py @@ -404,3 +404,153 @@ class ModelType(Enum): "all-MiniLM-L6-v2": ModelType.TEXT_EMBEDDING, "all-MiniLM-L12-v2": ModelType.TEXT_EMBEDDING, } + +from nexa.gguf.llama.llama_cpp import ( + LLAMA_FTYPE_ALL_F32, + LLAMA_FTYPE_MOSTLY_F16, + LLAMA_FTYPE_MOSTLY_Q4_0, + LLAMA_FTYPE_MOSTLY_Q4_1, + LLAMA_FTYPE_MOSTLY_Q8_0, + LLAMA_FTYPE_MOSTLY_Q5_0, + LLAMA_FTYPE_MOSTLY_Q5_1, + LLAMA_FTYPE_MOSTLY_Q2_K, + LLAMA_FTYPE_MOSTLY_Q3_K_S, + LLAMA_FTYPE_MOSTLY_Q3_K_M, + LLAMA_FTYPE_MOSTLY_Q3_K_L, + LLAMA_FTYPE_MOSTLY_Q4_K_S, + 
LLAMA_FTYPE_MOSTLY_Q4_K_M, + LLAMA_FTYPE_MOSTLY_Q5_K_S, + LLAMA_FTYPE_MOSTLY_Q5_K_M, + LLAMA_FTYPE_MOSTLY_Q6_K, + LLAMA_FTYPE_MOSTLY_IQ2_XXS, + LLAMA_FTYPE_MOSTLY_IQ2_XS, + LLAMA_FTYPE_MOSTLY_Q2_K_S, + LLAMA_FTYPE_MOSTLY_IQ3_XS, + LLAMA_FTYPE_MOSTLY_IQ3_XXS, + LLAMA_FTYPE_MOSTLY_IQ1_S, + LLAMA_FTYPE_MOSTLY_IQ4_NL, + LLAMA_FTYPE_MOSTLY_IQ3_S, + LLAMA_FTYPE_MOSTLY_IQ3_M, + LLAMA_FTYPE_MOSTLY_IQ2_S, + LLAMA_FTYPE_MOSTLY_IQ2_M, + LLAMA_FTYPE_MOSTLY_IQ4_XS, + LLAMA_FTYPE_MOSTLY_IQ1_M, + LLAMA_FTYPE_MOSTLY_BF16, + LLAMA_FTYPE_MOSTLY_Q4_0_4_4, + LLAMA_FTYPE_MOSTLY_Q4_0_4_8, + LLAMA_FTYPE_MOSTLY_Q4_0_8_8, + LLAMA_FTYPE_MOSTLY_TQ1_0, + LLAMA_FTYPE_MOSTLY_TQ2_0, +) +from nexa.gguf.llama.llama_cpp import ( + GGML_TYPE_F32, + GGML_TYPE_F16, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, + GGML_TYPE_Q5_0, + GGML_TYPE_Q5_1, + GGML_TYPE_Q8_0, + GGML_TYPE_Q8_1, + GGML_TYPE_Q2_K, + GGML_TYPE_Q3_K, + GGML_TYPE_Q4_K, + GGML_TYPE_Q5_K, + GGML_TYPE_Q6_K, + GGML_TYPE_Q8_K, + GGML_TYPE_IQ2_XXS, + GGML_TYPE_IQ2_XS, + GGML_TYPE_IQ3_XXS, + GGML_TYPE_IQ1_S, + GGML_TYPE_IQ4_NL, + GGML_TYPE_IQ3_S, + GGML_TYPE_IQ2_S, + GGML_TYPE_IQ4_XS, + GGML_TYPE_I8, + GGML_TYPE_I16, + GGML_TYPE_I32, + GGML_TYPE_I64, + GGML_TYPE_F64, + GGML_TYPE_IQ1_M, + GGML_TYPE_BF16, + GGML_TYPE_Q4_0_4_4, + GGML_TYPE_Q4_0_4_8, + GGML_TYPE_Q4_0_8_8, + GGML_TYPE_COUNT, +) + +# From quantize.cpp +# For mapping of general quantization options (ftypes) +LLAMA_QUANTIZATION_TYPES = { + "q4_0": LLAMA_FTYPE_MOSTLY_Q4_0, + "q4_1": LLAMA_FTYPE_MOSTLY_Q4_1, + "q5_0": LLAMA_FTYPE_MOSTLY_Q5_0, + "q5_1": LLAMA_FTYPE_MOSTLY_Q5_1, + "q8_0": LLAMA_FTYPE_MOSTLY_Q8_0, + "q2_k": LLAMA_FTYPE_MOSTLY_Q2_K, + "q3_k_s": LLAMA_FTYPE_MOSTLY_Q3_K_S, + "q3_k_m": LLAMA_FTYPE_MOSTLY_Q3_K_M, + "q3_k_l": LLAMA_FTYPE_MOSTLY_Q3_K_L, + "q4_k_s": LLAMA_FTYPE_MOSTLY_Q4_K_S, + "q4_k_m": LLAMA_FTYPE_MOSTLY_Q4_K_M, + "q5_k_s": LLAMA_FTYPE_MOSTLY_Q5_K_S, + "q5_k_m": LLAMA_FTYPE_MOSTLY_Q5_K_M, + "q6_k": LLAMA_FTYPE_MOSTLY_Q6_K, + "iq2_xxs": LLAMA_FTYPE_MOSTLY_IQ2_XXS, + "iq2_xs": LLAMA_FTYPE_MOSTLY_IQ2_XS, + "q2_k_s": LLAMA_FTYPE_MOSTLY_Q2_K_S, + "iq3_xs": LLAMA_FTYPE_MOSTLY_IQ3_XS, + "iq3_xxs": LLAMA_FTYPE_MOSTLY_IQ3_XXS, + "iq1_s": LLAMA_FTYPE_MOSTLY_IQ1_S, + "iq4_nl": LLAMA_FTYPE_MOSTLY_IQ4_NL, + "iq3_s": LLAMA_FTYPE_MOSTLY_IQ3_S, + "iq3_m": LLAMA_FTYPE_MOSTLY_IQ3_M, + "iq2_s": LLAMA_FTYPE_MOSTLY_IQ2_S, + "iq2_m": LLAMA_FTYPE_MOSTLY_IQ2_M, + "iq4_xs": LLAMA_FTYPE_MOSTLY_IQ4_XS, + "iq1_m": LLAMA_FTYPE_MOSTLY_IQ1_M, + "f16": LLAMA_FTYPE_MOSTLY_F16, + "f32": LLAMA_FTYPE_ALL_F32, + "bf16": LLAMA_FTYPE_MOSTLY_BF16, + "q4_0_4_4": LLAMA_FTYPE_MOSTLY_Q4_0_4_4, + "q4_0_4_8": LLAMA_FTYPE_MOSTLY_Q4_0_4_8, + "q4_0_8_8": LLAMA_FTYPE_MOSTLY_Q4_0_8_8, + "tq1_0": LLAMA_FTYPE_MOSTLY_TQ1_0, + "tq2_0": LLAMA_FTYPE_MOSTLY_TQ2_0, +} + +# From ggml.h +# For mapping of output_tensor_type and token_embedding_type only +GGML_TYPES = { + "f32": GGML_TYPE_F32, + "f16": GGML_TYPE_F16, + "q4_0": GGML_TYPE_Q4_0, + "q4_1": GGML_TYPE_Q4_1, + "q5_0": GGML_TYPE_Q5_0, + "q5_1": GGML_TYPE_Q5_1, + "q8_0": GGML_TYPE_Q8_0, + "q8_1": GGML_TYPE_Q8_1, + "q2_k": GGML_TYPE_Q2_K, + "q3_k": GGML_TYPE_Q3_K, + "q4_k": GGML_TYPE_Q4_K, + "q5_k": GGML_TYPE_Q5_K, + "q6_k": GGML_TYPE_Q6_K, + "q8_k": GGML_TYPE_Q8_K, + "iq2_xxs": GGML_TYPE_IQ2_XXS, + "iq2_xs": GGML_TYPE_IQ2_XS, + "iq3_xxs": GGML_TYPE_IQ3_XXS, + "iq1_s": GGML_TYPE_IQ1_S, + "iq4_nl": GGML_TYPE_IQ4_NL, + "iq3_s": GGML_TYPE_IQ3_S, + "iq2_s": GGML_TYPE_IQ2_S, + "iq4_xs": GGML_TYPE_IQ4_XS, + "i8": GGML_TYPE_I8, + "i16": GGML_TYPE_I16, + "i32": GGML_TYPE_I32, + "i64": GGML_TYPE_I64, + "f64": 
GGML_TYPE_F64, + "iq1_m": GGML_TYPE_IQ1_M, + "bf16": GGML_TYPE_BF16, + "q4_0_4_4": GGML_TYPE_Q4_0_4_4, + "q4_0_4_8": GGML_TYPE_Q4_0_4_8, + "q4_0_8_8": GGML_TYPE_Q4_0_8_8, +} \ No newline at end of file diff --git a/nexa/general.py b/nexa/general.py index e9eb1016..456eee3e 100644 --- a/nexa/general.py +++ b/nexa/general.py @@ -429,6 +429,33 @@ def download_model_from_official(model_path, model_type, **kwargs): except Exception as e: print(f"An error occurred while downloading or processing the model: {e}") return False, None + +def download_repo_from_hf(repo_id): + try: + from huggingface_hub import snapshot_download + from pathlib import Path + except ImportError: + print("The huggingface-hub package is required. Please install it with `pip install huggingface-hub`.") + return False, None + + # Define the local directory to save the model + local_dir = NEXA_MODELS_HUB_HF_DIR / Path(repo_id) + local_dir.mkdir(parents=True, exist_ok=True) + + try: + # Download the entire repository + repo_path = snapshot_download( + repo_id=repo_id, + local_dir=local_dir, + local_dir_use_symlinks=False, + revision="main" + ) + + print(f"Successfully downloaded repository '{repo_id}' to {repo_path}") + return True, repo_path + except Exception as e: + print(f"Failed to download the repository: {e}") + return False, None def download_gguf_from_hf(repo_id, filename): try: diff --git a/nexa/gguf/converter/__init__.py b/nexa/gguf/converter/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nexa/gguf/converter/nexa_convert.py b/nexa/gguf/converter/nexa_convert.py new file mode 100644 index 00000000..c38970cd --- /dev/null +++ b/nexa/gguf/converter/nexa_convert.py @@ -0,0 +1,238 @@ +import os +import logging +import argparse +from typing import Optional +from pathlib import Path +import json + +from nexa.gguf.llama.llama_cpp import GGML_TYPE_COUNT, LLAMA_FTYPE_MOSTLY_Q4_0 +from nexa.constants import LLAMA_QUANTIZATION_TYPES, GGML_TYPES +from nexa.gguf.llama.llama_cpp import llama_model_quantize_params, llama_model_quantize + +# Set up logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def quantize_model( + input_file: str, + output_file: Optional[str] = None, + ftype: str = "q4_0", + nthread: int = 4, + **kwargs +) -> None: + """ + Quantize a GGUF model file. + + Args: + input_file (str): Path to the input GGUF file. + output_file (Optional[str]): Path to the output quantized file. If None, a default path will be used. + ftype (str): Quantization type (default: "q4_0"). + nthread (int): Number of threads to use for quantization (default: 4). + **kwargs: Additional parameters for quantization: + output_tensor_type (str): Output tensor type. + token_embedding_type (str): Token embeddings tensor type. + allow_requantize (bool): Allow quantizing non-f32/f16 tensors. + quantize_output_tensor (bool): Quantize output.weight. + only_copy (bool): Only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored. + pure (bool): Quantize all tensors to the default type. + keep_split (bool): Quantize to the same number of shards. + imatrix (ctypes.c_void_p): Pointer to importance matrix data. + kv_overrides (ctypes.c_void_p): Pointer to vector containing overrides. + + Raises: + FileNotFoundError: If the input file doesn't exist. + ValueError: If an invalid quantization type is provided. 
+ """ + # Check if input file exists + if not os.path.isfile(input_file): + raise FileNotFoundError(f"Input file does not exist: {input_file}") + + # Set up output file path + if output_file is None: + output_dir = os.path.join(os.path.dirname(input_file), "quantized_models") + os.makedirs(output_dir, exist_ok=True) + output_file = os.path.join(output_dir, f"{os.path.basename(input_file).split('.')[0]}_{ftype}.gguf") + else: + output_dir = os.path.dirname(output_file) + os.makedirs(output_dir, exist_ok=True) + + # Set up quantization parameters + params = llama_model_quantize_params() + params.nthread = nthread + + # Handle ftype + if ftype in LLAMA_QUANTIZATION_TYPES: + params.ftype = LLAMA_QUANTIZATION_TYPES[ftype] + else: + logger.warning(f"Provided ftype '{ftype}' not found in LLAMA_QUANTIZATION_TYPES. Using default Q4_0.") + params.ftype = LLAMA_FTYPE_MOSTLY_Q4_0 + + # Handle output_tensor_type + output_tensor_type = kwargs.get('output_tensor_type', '') + if output_tensor_type: + if output_tensor_type in GGML_TYPES: + params.output_tensor_type = GGML_TYPES[output_tensor_type] + else: + logger.warning(f"Provided output_tensor_type '{output_tensor_type}' not found in GGML_TYPES. Using default COUNT.") + params.output_tensor_type = GGML_TYPE_COUNT + else: + params.output_tensor_type = GGML_TYPE_COUNT + + # Handle token_embedding_type + token_embedding_type = kwargs.get('token_embedding_type', '') + if token_embedding_type: + if token_embedding_type in GGML_TYPES: + params.token_embedding_type = GGML_TYPES[token_embedding_type] + else: + logger.warning(f"Provided token_embedding_type '{token_embedding_type}' not found in GGML_TYPES. Using default COUNT.") + params.token_embedding_type = GGML_TYPE_COUNT + else: + params.token_embedding_type = GGML_TYPE_COUNT + + logger.info(f"Starting quantization of {input_file}") + logger.info(f"Output file: {output_file}") + + try: + llama_model_quantize( + input_file.encode("utf-8"), + output_file.encode("utf-8"), + params, + ) + except Exception as e: + logger.error(f"Quantization failed: {str(e)}") + raise + + +def convert_hf_to_quantized_gguf( + input_path: str, + output_file: str = None, + ftype: str = "q4_0", + convert_type: str = "f16", + **kwargs +) -> Optional[str]: + """ + Convert a model in safetensors format to a quantized GGUF file. + + This function handles the conversion of Hugging Face models to GGUF format and subsequent quantization. + It can process both directories containing .safetensors files and existing .gguf files. + + Args: + input_path (str): Path to the input Hugging Face model directory or GGUF file. + output_file (str, optional): Path to the output quantized GGUF file. If None, a default path will be used. + ftype (str, optional): Quantization type (default: "q4_0"). + convert_type (str, optional): Conversion type for safetensors to GGUF (default: "f16"). + **kwargs: Additional keyword arguments for the conversion and quantization process. + + Returns: + Optional[str]: Path to the output quantized GGUF file if successful, None otherwise. + + Raises: + FileNotFoundError: If the input directory or file does not exist. + ValueError: If the input path is invalid or no .safetensors files are found in the directory. + + Note: + - For directory inputs, this function first converts the model to GGUF format, then quantizes it. + - For .gguf file inputs, it directly applies quantization. + - Temporary files are created and cleaned up during the process. 
+ """ + # Convert input path to absolute path + input_path = os.path.abspath(input_path) + + # Set default output file if not provided + if not output_file: + input_name = os.path.basename(input_path) + output_file = os.path.abspath(f"./{input_name}-{ftype}.gguf") + else: + output_file = os.path.abspath(output_file) + + if os.path.isdir(input_path): + if not os.path.exists(input_path): + logger.error(f"Input directory does not exist: {input_path}") + return None + + safetensors_files = [f for f in os.listdir(input_path) if f.endswith('.safetensors')] + if safetensors_files: + # Create tmp file path + tmp_dir = Path.home().absolute() / ".cache" / "nexa" / "tmp_models" + tmp_dir.mkdir(parents=True, exist_ok=True) + tmp_file_name = f"{Path(input_path).name}-{convert_type}.gguf" + tmp_file_path = tmp_dir / tmp_file_name + + try: + # Convert HF model to GGUF + from nexa_gguf.convert_hf_to_gguf import nexa_convert_hf_to_gguf + nexa_convert_hf_to_gguf(model=input_path, outfile=str(tmp_file_path.absolute()), outtype=convert_type, **kwargs) + + # Quantize GGUF model + quantize_model(str(tmp_file_path.absolute()), output_file, ftype, **kwargs) + return output_file + finally: + # Delete the temporary file + if tmp_file_path.exists(): + tmp_file_path.unlink() + else: + logger.error(f"No .safetensors files found in directory: {input_path}") + return None + elif input_path.endswith('.gguf'): + # Directly call quantize_model with input_path + quantize_model(input_file=input_path, output_file=output_file, ftype=ftype, **kwargs) + return output_file + else: + logger.error(f"Invalid input path: {input_path}. Must be a directory with .safetensors files or a .gguf file.") + return None + + +def main(): + parser = argparse.ArgumentParser(description="Convert and quantize a Hugging Face model to GGUF format.") + # nexa convert specific arguments + parser.add_argument("input_path", type=str, help="Path to the input Hugging Face model directory or GGUF file") + parser.add_argument("ftype", nargs='?', type=str, default="q4_0", help="Quantization type (default: q4_0)") + parser.add_argument("output_file", nargs='?', type=str, help="Path to the output quantized GGUF file") + + # Arguments for convert_hf_to_gguf + # Reference: https://github.com/ggerganov/llama.cpp/blob/c8c07d658a6cefc5a50cfdf6be7d726503612303/convert_hf_to_gguf.py#L4284-L4344 + parser.add_argument("--convert_type", type=str, default="f16", help="Conversion type for safetensors to GGUF (default: f16)") + parser.add_argument("--bigendian", action="store_true", help="Use big endian format") + parser.add_argument("--use_temp_file", action="store_true", help="Use a temporary file during conversion") + parser.add_argument("--no_lazy", action="store_true", help="Disable lazy loading") + parser.add_argument("--metadata", type=json.loads, help="Additional metadata as JSON string") + parser.add_argument("--split_max_tensors", type=int, default=0, help="Maximum number of tensors per split") + parser.add_argument("--split_max_size", type=str, default="0", help="Maximum size per split") + parser.add_argument("--no_tensor_first_split", action="store_true", help="Disable tensor-first splitting") + parser.add_argument("--vocab_only", action="store_true", help="Only process vocabulary") + parser.add_argument("--dry_run", action="store_true", help="Perform a dry run without actual conversion") + + # Arguments for quantization + # Reference: https://github.com/ggerganov/llama.cpp/blob/c8c07d658a6cefc5a50cfdf6be7d726503612303/examples/quantize/quantize.cpp#L109-L133 + 
parser.add_argument("--nthread", type=int, default=4, help="Number of threads to use (default: 4)") + parser.add_argument("--output_tensor_type", type=str, help="Output tensor type") + parser.add_argument("--token_embedding_type", type=str, help="Token embedding type") + parser.add_argument("--allow_requantize", action="store_true", help="Allow quantizing non-f32/f16 tensors") + parser.add_argument("--quantize_output_tensor", action="store_true", help="Quantize output.weight") + parser.add_argument("--only_copy", action="store_true", help="Only copy tensors (ignores ftype, allow_requantize, and quantize_output_tensor)") + parser.add_argument("--pure", action="store_true", help="Quantize all tensors to the default type") + parser.add_argument("--keep_split", action="store_true", help="Quantize to the same number of shards") + + args = parser.parse_args() + + # Prepare kwargs for additional parameters + kwargs = { + k: v for k, v in vars(args).items() + if k not in ["input_path", "output_file", "ftype", "convert_type"] and v is not None + } + + # Convert string types to GGML types if specified + if args.output_tensor_type: + kwargs["output_tensor_type"] = GGML_TYPES.get(args.output_tensor_type, GGML_TYPE_COUNT) + if args.token_embedding_type: + kwargs["token_embedding_type"] = GGML_TYPES.get(args.token_embedding_type, GGML_TYPE_COUNT) + + try: + convert_hf_to_quantized_gguf(args.input_path, args.output_file, args.ftype, args.convert_type, **kwargs) + except Exception as e: + logger.error(f"Error during conversion and quantization: {str(e)}") + exit(1) + +if __name__ == "__main__": + main() diff --git a/nexa/gguf/llama/llama_cpp.py b/nexa/gguf/llama/llama_cpp.py index f970a739..442d2e86 100644 --- a/nexa/gguf/llama/llama_cpp.py +++ b/nexa/gguf/llama/llama_cpp.py @@ -148,7 +148,11 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa GGML_TYPE_I64 = 27 GGML_TYPE_F64 = 28 GGML_TYPE_IQ1_M = 29 -GGML_TYPE_COUNT = 30 +GGML_TYPE_BF16 = 30, +GGML_TYPE_Q4_0_4_4 = 31 +GGML_TYPE_Q4_0_4_8 = 32 +GGML_TYPE_Q4_0_8_8 = 33 +GGML_TYPE_COUNT = 34 # from ggml-backend.h # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); diff --git a/pyproject.toml b/pyproject.toml index b54895cc..2ae2637e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,6 +76,10 @@ eval = [ "codecarbon", ] +convert = [ + "nexa-gguf", +] + [project.urls] Homepage = "https://github.com/NexaAI/nexa-sdk" Issues = "https://github.com/NexaAI/nexa-sdk/issues" @@ -96,6 +100,7 @@ wheel.packages = [ "nexa.gguf.sd", "nexa.gguf.streamlit", "nexa.gguf.server", + "nexa.gguf.converter", "nexa.onnx", "nexa.onnx.streamlit", "nexa.onnx.server",