diff --git a/intel_extension_for_transformers/llm/runtime/graph/CMakeLists.txt b/intel_extension_for_transformers/llm/runtime/graph/CMakeLists.txt index 666d9845abd..80d4a800273 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/CMakeLists.txt +++ b/intel_extension_for_transformers/llm/runtime/graph/CMakeLists.txt @@ -54,9 +54,9 @@ option(NE_SANITIZE_UNDEFINED "neural_engine: enable undefined sanitizer" # instruction set specific option(NE_AVX "neural_engine: enable AVX" ON) option(NE_AVX2 "neural_engine: enable AVX2" ON) -option(NE_AVX512 "neural_engine: enable AVX512" ON) -option(NE_AVX512_VBMI "neural_engine: enable AVX512-VBMI" ON) -option(NE_AVX512_VNNI "neural_engine: enable AVX512-VNNI" ON) +option(NE_AVX512 "neural_engine: enable AVX512" OFF) +option(NE_AVX512_VBMI "neural_engine: enable AVX512-VBMI" OFF) +option(NE_AVX512_VNNI "neural_engine: enable AVX512-VNNI" OFF) option(NE_FMA "neural_engine: enable FMA" ON) option(NE_AMX "neural_engine: enable AMX" OFF) diff --git a/intel_extension_for_transformers/llm/runtime/graph/README.md b/intel_extension_for_transformers/llm/runtime/graph/README.md index 1f2c83f0224..4f5ca7839b5 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/README.md +++ b/intel_extension_for_transformers/llm/runtime/graph/README.md @@ -61,7 +61,7 @@ cd build cmake .. cmake --build . -j ``` -Note: add compile args ```-DNE_AVX512=OFF -DNE_AVX512_VBMI=OFF -DNE_AVX512_VNNI=OFF``` to ```cmake``` when compiling it on a CPU without AVX512 + ### 2. Run LLM with Python API You can use Python API to run Hugging Face model simply. Here is the sample code: diff --git a/intel_extension_for_transformers/llm/runtime/graph/vectors/cpu/vec_load.hpp b/intel_extension_for_transformers/llm/runtime/graph/vectors/cpu/vec_load.hpp index 435736eee87..3211ceac4e2 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/vectors/cpu/vec_load.hpp +++ b/intel_extension_for_transformers/llm/runtime/graph/vectors/cpu/vec_load.hpp @@ -26,7 +26,7 @@ inline fp32x16 load_fp32x16(void const* mem_addr) { #endif } template <> -fp32x16 load_kernel_t(const void* src) { +inline fp32x16 load_kernel_t(const void* src) { return load_fp32x16(src); } inline fp32x16 mask_load_fp32x16(fp32x16 src, int mask, void const* mem_addr) { @@ -43,7 +43,7 @@ inline bf16x16 load_bf16x16(void const* mem_addr) { return {_mm256_loadu_si256(mem_addr_bf16)}; } template <> -bf16x16 load_kernel_t(const void* src) { +inline bf16x16 load_kernel_t(const void* src) { return load_bf16x16(src); } diff --git a/intel_extension_for_transformers/llm/runtime/graph/vectors/cpu/vec_store.hpp b/intel_extension_for_transformers/llm/runtime/graph/vectors/cpu/vec_store.hpp index e096a0de96b..5522ebb2f81 100644 --- a/intel_extension_for_transformers/llm/runtime/graph/vectors/cpu/vec_store.hpp +++ b/intel_extension_for_transformers/llm/runtime/graph/vectors/cpu/vec_store.hpp @@ -20,7 +20,7 @@ inline void store_s8x16(void* mem_addr, s8x16 a) { _mm_storeu_si128(reinterpret_cast<__m128i*>(mem_addr), a.first); } inline void store_u8x16(void* mem_addr, u8x16 a) { _mm_storeu_si128(reinterpret_cast<__m128i*>(mem_addr), a.first); } template <> -void store_kernel_t(void* dst, s8x16 src) { +inline void store_kernel_t(void* dst, s8x16 src) { store_s8x16(dst, src); } @@ -57,7 +57,7 @@ inline void store_fp32x16(void* mem_addr, fp32x16 a) { } template <> -void store_kernel_t(void* dst, fp32x16 src) { +inline void store_kernel_t(void* dst, fp32x16 src) { store_fp32x16(dst, src); } @@ -66,7 +66,7 @@ inline void store_bf16x16(void* mem_addr, bf16x16 a) { } template <> -void store_kernel_t(void* dst, bf16x16 src) { +inline void store_kernel_t(void* dst, bf16x16 src) { store_bf16x16(dst, src); }