diff --git a/.github/workflows/script/models/cpp_graph_inference.sh b/.github/workflows/script/models/cpp_graph_inference.sh
index 9f656cca002..26a2d3deb1b 100644
--- a/.github/workflows/script/models/cpp_graph_inference.sh
+++ b/.github/workflows/script/models/cpp_graph_inference.sh
@@ -44,7 +44,7 @@ function main() {
         infer_cmd="./build/bin/run_gptj"
         model_name="EleutherAI/gpt-j-6b"
         input_model="/tf_dataset2/models/pytorch/gpt-j-6B"
-        precision_list=("q4_j_b128")
+        precision_list=("q4_j_b128" "q4_j_b128_asym")
     elif [[ "${model}" == "starcoder-3b" ]]; then
         convert_script="${working_dir}/scripts/convert_starcoder.py"
         quant_script="./build/bin/quant_starcoder"
@@ -119,6 +119,8 @@ function main() {
             ${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 32 --scale_dtype fp32 --compute_type fp32 --alg sym
         elif [[ ${precision} == "q4_j_b128" ]]; then
             ${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 128 --scale_dtype fp32 --compute_type fp32 --alg sym
+        elif [[ ${precision} == "q4_j_b128_asym" ]]; then
+            ${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 128 --scale_dtype fp32 --compute_type fp32 --alg asym
         elif [[ ${precision} == "q4_0" ]]; then
             ${quant_script} --model_file ${working_dir}/${model}-fp32.bin --out_file ${working_dir}/${model}-${precision}.bin --bits 4 --block_size 32 --compute_type ggml --alg sym
         elif [[ ${precision} == "q4_1" ]]; then
diff --git a/intel_extension_for_transformers/llm/library/jblas/jblas/jit_blas_weight_compression.h b/intel_extension_for_transformers/llm/library/jblas/jblas/jit_blas_weight_compression.h
index a1669c839ad..a136833660a 100644
--- a/intel_extension_for_transformers/llm/library/jblas/jblas/jit_blas_weight_compression.h
+++ b/intel_extension_for_transformers/llm/library/jblas/jblas/jit_blas_weight_compression.h
@@ -1280,11 +1280,11 @@ class WeightF4ScaleFp32 : public WeightS4ScaleFp32<_GemmCore_T, ISA_T, S4_CLIP>
  public:
   using Param = typename WeightS8ScaleFp32<_GemmCore_T, ISA_T>::Param;
   using StorageWeight = StorageWeightF4ScaleFp32;
-  PackedWeight* createStorage(const int N, const int K, int blocksize, bool is_sym = true) override {
+  PackedWeight* createStorage(const int N, const int K, int blocksize) {
     int KPad = utils::padto(K, _GemmCore_T::KTILE);
     int NPad = utils::padto(N, _GemmCore_T::NTILE);
     auto ptr = new StorageWeight(_GemmCore_T::TYPE, F4_T);
-    ptr->resize(NPad, KPad, blocksize <= 0 ? K : blocksize, is_sym);
+    ptr->resize(NPad, KPad, blocksize <= 0 ? K : blocksize);
     return ptr;
   }
 
@@ -1334,6 +1334,26 @@ class WeightF4ScaleFp32 : public WeightS4ScaleFp32<_GemmCore_T, ISA_T, S4_CLIP>
     assert(false);
     return JblasInvalidParam;
   }
+  virtual void packQWeight(const int N, const int K, const int8_t* B, const int ldb, const float* scales,
+                           PackedWeight* ptr) {
+    auto stor = dynamic_cast<StorageWeight*>(ptr);
+    if (stor) {
+      int rawnk_scale = utils::updiv(K, stor->mBlockSize);
+      int nk_scale = utils::updiv(stor->mKPad, stor->mBlockSize);
+#pragma omp parallel for
+      for (int i = 0; i < nk_scale; i++) {  // padding copy
+        if (i < rawnk_scale) {
+          std::memcpy(stor->mSPtr + i * stor->mNPad, scales + i * N, N * sizeof(scales[0]));
+        } else {
+          std::memset(stor->mSPtr + i * stor->mNPad, 0, stor->mNPad * sizeof(stor->mSPtr[0]));
+        }
+      }
+      utils::avector<int8_t> reorded(stor->mKPad * stor->mNPad);
+      WeightS8ScaleFp32<_GemmCore_T, ISA_T>::reorderWeight(N, K, B, ldb, reorded.data());
+      WeightS4ScaleFp32<_GemmCore_T, ISA_T, S4_CLIP>::compressWeight(stor->mNPad, stor->mKPad, reorded.data(),
+                                                                     stor->mNPad, stor->mWPtr);
+    }
+  }
 
  protected:
   virtual void quantRowBlock(const float* srcptr, int8_t* dstptr, int row, int col, int ld_src, int ld_dst,
diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_files.h b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_files.h
index ba3c79464c3..4f28689cf49 100644
--- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_files.h
+++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_files.h
@@ -23,6 +23,12 @@
 #include
 #endif
 
+#if UINTPTR_MAX == 0xFFFFFFFF
+#define NE_MEM_ALIGN 4
+#else
+#define NE_MEM_ALIGN 16
+#endif
+
 #include "core/ne_layers.h"
 #include "models/model_utils/util.h"
 #include "models/models.h"
@@ -493,9 +499,15 @@ struct model_model_loader {
 
   void calc_sizes(size_t* ctx_size_p, size_t* mmapped_size_p) const {
     *ctx_size_p = *mmapped_size_p = 0;
+    size_t size_needed = 0;
     for (const model_load_tensor& lt : tensors_map.tensors) {
       *ctx_size_p += sizeof(struct ne_tensor) + NE_OBJECT_SIZE;
-      *(use_mmap ? mmapped_size_p : ctx_size_p) += lt.size;
+      if (lt.type == NE_TYPE_JBLAS) {
+        size_needed = lt.size;
+      } else {
+        size_needed = (lt.size + NE_MEM_ALIGN - 1) / NE_MEM_ALIGN * NE_MEM_ALIGN;
+      }
+      *(use_mmap ? mmapped_size_p : ctx_size_p) += size_needed;
     }
   }
 
diff --git a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp
index 584a1dfd994..4bbd3116d8b 100644
--- a/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp
+++ b/intel_extension_for_transformers/llm/runtime/graph/models/model_utils/model_utils.cpp
@@ -796,6 +796,9 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
   if (params.bits == quant_bits::q4) {
     if (params.scale_dtype == quant_sdtype::fp32) {
       if (params.compute_type == quant_comp::int8) {
+        if (params.alg != quant_alg::sym) {
+          printf("Current not support asymmetric int8 computation, reset to symmetric\n");
+        }
         if (params.block_size == -1) {
           using Kernel = WeiS4ClipFp32PerN;
           using KernelRef = WeiS4ClipFp32PerN;
@@ -824,7 +827,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
           using KernelRef = WeiS4ClipFp32;
           static Kernel kernel;
           static Kernel kernelref;
-          packedw = kernel.createStorage(n, k, params.block_size);
+          packedw = kernel.createStorage(n, k, params.block_size, params.alg == quant_alg::sym);
           if (cd->AVX512_FP16()) {
             kernel.packTransposeWeight(n, k, f32ptr, k, packedw);
           } else {
@@ -835,7 +838,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
           using KernelRef = WeiS4ClipFp32;
           static Kernel kernel;
           static Kernel kernelref;
-          packedw = kernel.createStorage(n, k, params.block_size);
+          packedw = kernel.createStorage(n, k, params.block_size, params.alg == quant_alg::sym);
           if (cd->AMX_BF16()) {
             kernel.packTransposeWeight(n, k, f32ptr, k, packedw);
           } else {
@@ -848,6 +851,9 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
     // TODO add 8bit quantization
     if (params.scale_dtype == quant_sdtype::fp32) {
       if (params.compute_type == quant_comp::int8) {
+        if (params.alg != quant_alg::sym) {
+          printf("Current not support asymmetric int8 computation, reset to symmetric\n");
+        }
         if (params.block_size == -1) {
           using Kernel = WeiS8Fp32PerN;
           using KernelRef = WeiS8Fp32PerN;
@@ -876,7 +882,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
           using KernelRef = WeiS8Fp32;
           static Kernel kernel;
           static Kernel kernelref;
-          packedw = kernel.createStorage(n, k, params.block_size);
+          packedw = kernel.createStorage(n, k, params.block_size, params.alg == quant_alg::sym);
           if (cd->AVX512_FP16()) {
             kernel.packTransposeWeight(n, k, f32ptr, k, packedw);
           } else {
@@ -887,7 +893,7 @@ size_t jblas_quantize(const float* f32ptr, void* dstpr, const quant_params_inter
           using KernelRef = WeiS8Fp32;
           static Kernel kernel;
           static Kernel kernelref;
-          packedw = kernel.createStorage(n, k, params.block_size);
+          packedw = kernel.createStorage(n, k, params.block_size, params.alg == quant_alg::sym);
           if (cd->AMX_BF16()) {
             kernel.packTransposeWeight(n, k, f32ptr, k, packedw);
           } else {