diff --git a/intel_extension_for_transformers/llm/runtime/graph/core/README.md b/intel_extension_for_transformers/llm/runtime/graph/core/README.md
index efbb4a28fb3..0b049d7060a 100644
--- a/intel_extension_for_transformers/llm/runtime/graph/core/README.md
+++ b/intel_extension_for_transformers/llm/runtime/graph/core/README.md
@@ -48,12 +48,12 @@ We support three kinds of kernel fusion for transformer models: QKV, MHA (multi-
         <tr>
             <td>QKV</td>
             <td>GPT-J<br>LLaMA</td>
-            <td>AMX_INT8, AVX512_VNNI</td>
+            <td>AMX_INT8, AVX512_VNNI, AVX_VNNI</td>
         </tr>
         <tr>
             <td>FFN</td>
             <td>GPT-J<br>LLaMA<br>BLOOM<br>ChatGLM<br>Falcon<br>MPT</td>
-            <td>AMX_INT8, AVX512_VNNI, AVX512F and AMX_BF16</td>
+            <td>AMX_INT8, AVX512_VNNI, AVX512F, AMX_BF16, AVX_VNNI, AVX2</td>
         </tr>
         <tr>
             <td>MHA</td>
@@ -71,4 +71,6 @@ codename | weight config | runtime ISA
 Sapphire Rapids | any int4<br>group size=-1<br>compute type=int8 | AMX_INT8
 Ice Lake<br>Cascade Lake<br>Cooper Lake<br>Tiger Lake<br>Rocket Lake | any int4<br>group size=-1<br>compute type=int8 | AVX512_VNNI
 Skylake | any 4bits<br>group size=-1<br>compute type=fp32 | AVX512F
+Alder Lake (12th Gen)<br>Raptor Lake (13th and 14th Gen) | any 4bits<br>group size=-1<br>compute type=int8 | AVX_VNNI
+Older architecture (before 12th Gen) | any 4bits<br>group size=-1<br>compute type=fp32 | AVX2
diff --git a/intel_extension_for_transformers/llm/runtime/graph/core/layers/inner_product.cpp b/intel_extension_for_transformers/llm/runtime/graph/core/layers/inner_product.cpp
index 065ad048df4..8e9b6b7a1f9 100644
--- a/intel_extension_for_transformers/llm/runtime/graph/core/layers/inner_product.cpp
+++ b/intel_extension_for_transformers/llm/runtime/graph/core/layers/inner_product.cpp
@@ -92,6 +92,24 @@ using PerNFp32Fp32 = jblas::wrapper::gemm_pack_weight::GemmInterfaceParallelAB<
     jblas::utils::parallel::Parallel2DGemm>;
 }  // namespace avx512_vnni
 
+namespace avx_vnni {
+JBLAS_ISA constexpr DefaultISA = JblasAVX_VNNI;
+
+template
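
The README table above defines a fallback order for the int4/int8 compute path: AMX_INT8 (Sapphire Rapids) > AVX512_VNNI (Ice Lake through Rocket Lake) > AVX_VNNI (Alder/Raptor Lake, new in this patch) > AVX2 (older parts). The sketch below is illustrative only, assuming CPUID feature detection on GCC/Clang; it is not the dispatch code in inner_product.cpp or jblas, the helper names (`cpu_has`, `select_int8_isa`) are hypothetical, and a production dispatcher would also verify OS-enabled AVX-512/AMX state via XGETBV/XCR0.

```cpp
// Hypothetical sketch: pick the best int8-compute ISA tier from the README table.
#include <cpuid.h>
#include <cstdio>

enum class Isa { AMX_INT8, AVX512_VNNI, AVX_VNNI, AVX2, NONE };

// Query one CPUID feature bit: reg selects EAX=0, EBX=1, ECX=2, EDX=3.
static bool cpu_has(unsigned leaf, unsigned subleaf, int reg, unsigned bit) {
  unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
  if (!__get_cpuid_count(leaf, subleaf, &eax, &ebx, &ecx, &edx)) return false;
  const unsigned regs[4] = {eax, ebx, ecx, edx};
  return (regs[reg] >> bit) & 1u;
}

static Isa select_int8_isa() {
  // Feature bits per the Intel SDM: AMX-INT8 = 7.0:EDX[25], AVX512-VNNI = 7.0:ECX[11],
  // AVX-VNNI = 7.1:EAX[4], AVX2 = 7.0:EBX[5]. OS state checks (XGETBV) are omitted here.
  if (cpu_has(7, 0, 3, 25)) return Isa::AMX_INT8;     // Sapphire Rapids
  if (cpu_has(7, 0, 2, 11)) return Isa::AVX512_VNNI;  // Ice Lake .. Rocket Lake
  if (cpu_has(7, 1, 0, 4))  return Isa::AVX_VNNI;     // Alder Lake / Raptor Lake
  if (cpu_has(7, 0, 1, 5))  return Isa::AVX2;         // older architectures
  return Isa::NONE;
}

int main() {
  switch (select_int8_isa()) {
    case Isa::AMX_INT8:    std::puts("dispatch: AMX_INT8");    break;
    case Isa::AVX512_VNNI: std::puts("dispatch: AVX512_VNNI"); break;
    case Isa::AVX_VNNI:    std::puts("dispatch: AVX_VNNI");    break;
    case Isa::AVX2:        std::puts("dispatch: AVX2");        break;
    default:               std::puts("dispatch: reference path"); break;
  }
}
```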