diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json index d8c2f4aa29..472f35b6eb 100644 --- a/xinference/model/llm/llm_family.json +++ b/xinference/model/llm/llm_family.json @@ -8205,6 +8205,16 @@ ], "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).", "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "0_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-Coder-0.5B" + }, { "model_format": "pytorch", "model_size_in_billions": "1_5", @@ -8213,8 +8223,17 @@ "8-bit", "none" ], - "model_id": "Qwen/Qwen2.5-Coder-1.5B", - "model_revision": "d3586cfe793730945f8e4d7ef31032a3ee50247d" + "model_id": "Qwen/Qwen2.5-Coder-1.5B" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 3, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-Coder-3B" }, { "model_format": "pytorch", @@ -8224,8 +8243,27 @@ "8-bit", "none" ], - "model_id": "Qwen/Qwen2.5-Coder-7B", - "model_revision": "30b6a7e874a78d46b80fa1db3194ea427dd41b08" + "model_id": "Qwen/Qwen2.5-Coder-7B" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-Coder-14B" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 32, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-Coder-32B" } ] }, @@ -8243,6 +8281,16 @@ ], "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).", "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "0_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-Coder-0.5B-Instruct" + }, { "model_format": "pytorch", "model_size_in_billions": "1_5", @@ -8253,6 +8301,16 @@ ], "model_id": 
"Qwen/Qwen2.5-Coder-1.5B-Instruct" }, + { + "model_format": "pytorch", + "model_size_in_billions": 3, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-Coder-3B-Instruct" + }, { "model_format": "pytorch", "model_size_in_billions": 7, @@ -8263,6 +8321,53 @@ ], "model_id": "Qwen/Qwen2.5-Coder-7B-Instruct" }, + { + "model_format": "pytorch", + "model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-Coder-14B-Instruct" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 32, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "Qwen/Qwen2.5-Coder-32B-Instruct" + }, + { + "model_format": "gptq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": "1_5", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": "3", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-{quantization}" + }, { "model_format": "gptq", "model_size_in_billions": "7", @@ -8272,6 +8377,73 @@ ], "model_id": "Qwen/Qwen2.5-Coder-7B-Instruct-GPTQ-{quantization}" }, + { + "model_format": "gptq", + "model_size_in_billions": "14", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-{quantization}" + }, + { + "model_format": "gptq", + "model_size_in_billions": "32", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-{quantization}" + }, + { + "model_format": "awq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": "1_5", + "quantizations": [ 
+ "Int4" + ], + "model_id": "Qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": "3", + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen2.5-Coder-3B-Instruct-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": "7", + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen2.5-Coder-7B-Instruct-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": "14", + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen2.5-Coder-14B-Instruct-AWQ" + }, + { + "model_format": "awq", + "model_size_in_billions": "32", + "quantizations": [ + "Int4" + ], + "model_id": "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ" + }, + { "model_format": "ggufv2", "model_size_in_billions": "1_5", diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json index bfd3d09a4a..f8598d3602 100644 --- a/xinference/model/llm/llm_family_modelscope.json +++ b/xinference/model/llm/llm_family_modelscope.json @@ -5907,6 +5907,18 @@ ], "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).", "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "0_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-Coder-0.5B", + "model_revision": "master", + "model_hub": "modelscope" + }, { "model_format": "pytorch", "model_size_in_billions": "1_5", @@ -5919,6 +5931,18 @@ "model_revision": "master", "model_hub": "modelscope" }, + { + "model_format": "pytorch", + "model_size_in_billions": 3, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-Coder-3B", + "model_revision": "master", + "model_hub": "modelscope" + }, { "model_format": "pytorch", "model_size_in_billions": 7, @@ -5930,6 +5954,30 @@ "model_id": "qwen/Qwen2.5-Coder-7B", "model_revision": "master", "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + 
"model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-Coder-14B", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 32, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-Coder-32B", + "model_revision": "master", + "model_hub": "modelscope" } ] }, @@ -5947,6 +5995,18 @@ ], "model_description": "Qwen2.5-Coder is the latest series of Code-Specific Qwen large language models (formerly known as CodeQwen).", "model_specs": [ + { + "model_format": "pytorch", + "model_size_in_billions": "0_5", + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-Coder-0.5B-Instruct", + "model_revision": "master", + "model_hub": "modelscope" + }, { "model_format": "pytorch", "model_size_in_billions": "1_5", @@ -5958,6 +6018,17 @@ "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct", "model_revision": "master", "model_hub": "modelscope" + }, { + "model_format": "pytorch", + "model_size_in_billions": 3, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-Coder-3B-Instruct", + "model_revision": "master", + "model_hub": "modelscope" }, { "model_format": "pytorch", @@ -5971,6 +6042,63 @@ "model_revision": "master", "model_hub": "modelscope" }, + { + "model_format": "pytorch", + "model_size_in_billions": 14, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-Coder-14B-Instruct", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "pytorch", + "model_size_in_billions": 32, + "quantizations": [ + "4-bit", + "8-bit", + "none" + ], + "model_id": "qwen/Qwen2.5-Coder-32B-Instruct", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": 
"qwen/Qwen2.5-Coder-0.5B-Instruct-GPTQ-{quantization}", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": "1_5", + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct-GPTQ-{quantization}", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 3, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen2.5-Coder-3B-Instruct-GPTQ-{quantization}", + "model_revision": "master", + "model_hub": "modelscope" + }, { "model_format": "gptq", "model_size_in_billions": 7, @@ -5982,6 +6110,89 @@ "model_revision": "master", "model_hub": "modelscope" }, + { + "model_format": "gptq", + "model_size_in_billions": 14, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen2.5-Coder-14B-Instruct-GPTQ-{quantization}", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "gptq", + "model_size_in_billions": 32, + "quantizations": [ + "Int4", + "Int8" + ], + "model_id": "qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-{quantization}", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": "0_5", + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-Coder-0.5B-Instruct-AWQ", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": "1_5", + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-Coder-1.5B-Instruct-AWQ", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 3, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-Coder-3B-Instruct-AWQ", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 7, + "quantizations": [ + "Int4" + ], + "model_id": 
"qwen/Qwen2.5-Coder-7B-Instruct-AWQ", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 14, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-Coder-14B-Instruct-AWQ", + "model_revision": "master", + "model_hub": "modelscope" + }, + { + "model_format": "awq", + "model_size_in_billions": 32, + "quantizations": [ + "Int4" + ], + "model_id": "qwen/Qwen2.5-Coder-32B-Instruct-AWQ", + "model_revision": "master", + "model_hub": "modelscope" + }, + { "model_format": "ggufv2", "model_size_in_billions": "1_5",