This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

improve Avx2 #511

Merged: 4 commits, Oct 21, 2023

```diff
@@ -54,9 +54,9 @@ option(NE_SANITIZE_UNDEFINED "neural_engine: enable undefined sanitizer"
 # instruction set specific
 option(NE_AVX "neural_engine: enable AVX" ON)
 option(NE_AVX2 "neural_engine: enable AVX2" ON)
-option(NE_AVX512 "neural_engine: enable AVX512" ON)
-option(NE_AVX512_VBMI "neural_engine: enable AVX512-VBMI" ON)
-option(NE_AVX512_VNNI "neural_engine: enable AVX512-VNNI" ON)
+option(NE_AVX512 "neural_engine: enable AVX512" OFF)
+option(NE_AVX512_VBMI "neural_engine: enable AVX512-VBMI" OFF)
+option(NE_AVX512_VNNI "neural_engine: enable AVX512-VNNI" OFF)
 option(NE_FMA "neural_engine: enable FMA" ON)
 option(NE_AMX "neural_engine: enable AMX" OFF)
```
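Turning the AVX512 defaults off makes the resulting binaries safe on AVX2-only machines: AVX512 instructions are emitted only when the corresponding compiler flags are set, and code guarded by the matching predefined macros falls back to AVX2. A minimal sketch of that guard pattern, using illustrative function names that are not from this repository:

```cpp
#include <immintrin.h>

// Illustrative compile-time dispatch: the AVX512 path exists only in
// builds configured with AVX512 enabled (e.g. -DNE_AVX512=ON), so an
// AVX2-only binary never contains AVX512 instructions.
inline float sum_f32(const float* x, int n) {
  int i = 0;
  float s = 0.0f;
#if defined(__AVX512F__)
  __m512 acc512 = _mm512_setzero_ps();
  for (; i + 16 <= n; i += 16) acc512 = _mm512_add_ps(acc512, _mm512_loadu_ps(x + i));
  s += _mm512_reduce_add_ps(acc512);
#elif defined(__AVX2__)
  __m256 acc256 = _mm256_setzero_ps();
  for (; i + 8 <= n; i += 8) acc256 = _mm256_add_ps(acc256, _mm256_loadu_ps(x + i));
  __m128 v = _mm_add_ps(_mm256_castps256_ps128(acc256), _mm256_extractf128_ps(acc256, 1));
  v = _mm_hadd_ps(v, v);
  v = _mm_hadd_ps(v, v);
  s += _mm_cvtss_f32(v);
#endif
  for (; i < n; ++i) s += x[i];  // scalar tail (and plain fallback)
  return s;
}
```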
````diff
@@ -61,7 +61,7 @@ cd build
 cmake ..
 cmake --build . -j
 ```
+Note: add the compile args `-DNE_AVX512=OFF -DNE_AVX512_VBMI=OFF -DNE_AVX512_VNNI=OFF` to `cmake` when compiling on a CPU without AVX512.
 
 ### 2. Run LLM with Python API
 
 You can use the Python API to run Hugging Face models simply. Here is the sample code:
````
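The note matters because a binary built with AVX512 enabled dies with an illegal-instruction fault on a CPU that lacks those extensions. One way to check the host before configuring the build, sketched with GCC/Clang's `__builtin_cpu_supports` builtin (this checker is not part of the repository):

```cpp
#include <cstdio>

// Prints whether the host CPU can run AVX512F code, so the user knows
// whether the AVX512-related cmake options must be turned off.
int main() {
  if (__builtin_cpu_supports("avx512f")) {
    std::printf("AVX512F supported: default cmake options are fine.\n");
  } else {
    std::printf("No AVX512F: pass -DNE_AVX512=OFF -DNE_AVX512_VBMI=OFF "
                "-DNE_AVX512_VNNI=OFF to cmake.\n");
  }
}
```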
```diff
@@ -26,7 +26,7 @@ inline fp32x16 load_fp32x16(void const* mem_addr) {
 #endif
 }
 template <>
-fp32x16 load_kernel_t<fp32x16>(const void* src) {
+inline fp32x16 load_kernel_t<fp32x16>(const void* src) {
   return load_fp32x16(src);
 }
 inline fp32x16 mask_load_fp32x16(fp32x16 src, int mask, void const* mem_addr) {
@@ -43,7 +43,7 @@ inline bf16x16 load_bf16x16(void const* mem_addr) {
   return {_mm256_loadu_si256(mem_addr_bf16)};
 }
 template <>
-bf16x16 load_kernel_t<bf16x16>(const void* src) {
+inline bf16x16 load_kernel_t<bf16x16>(const void* src) {
   return load_bf16x16(src);
 }
 
```
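The only change in these hunks is the added `inline`, and it fixes a real linker problem: an explicit full specialization of a function template is an ordinary function for linkage purposes, so defining one in a header pulled into several translation units breaks the one-definition rule and produces duplicate-symbol link errors. A minimal reproduction of the pattern, with hypothetical names rather than this repository's headers:

```cpp
// vec_kernels.hpp -- hypothetical header included by multiple .cpp files.
#pragma once
#include <cstring>

template <typename T>
T load_kernel(const void* src);  // primary template: declaration only

// Unlike the primary template, this full specialization is an ordinary
// function definition. Without `inline`, every translation unit that
// includes this header emits a strong symbol for it, and linking fails
// with "multiple definition of load_kernel<float>". With `inline`,
// identical definitions in each TU are permitted and merged.
template <>
inline float load_kernel<float>(const void* src) {
  float v;
  std::memcpy(&v, src, sizeof(v));
  return v;
}
```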
```diff
@@ -20,7 +20,7 @@
 inline void store_s8x16(void* mem_addr, s8x16 a) { _mm_storeu_si128(reinterpret_cast<__m128i*>(mem_addr), a.first); }
 inline void store_u8x16(void* mem_addr, u8x16 a) { _mm_storeu_si128(reinterpret_cast<__m128i*>(mem_addr), a.first); }
 template <>
-void store_kernel_t<s8x16>(void* dst, s8x16 src) {
+inline void store_kernel_t<s8x16>(void* dst, s8x16 src) {
   store_s8x16(dst, src);
 }
 
@@ -57,7 +57,7 @@ inline void store_fp32x16(void* mem_addr, fp32x16 a) {
 }
 
 template <>
-void store_kernel_t<fp32x16>(void* dst, fp32x16 src) {
+inline void store_kernel_t<fp32x16>(void* dst, fp32x16 src) {
   store_fp32x16(dst, src);
 }
 
@@ -66,7 +66,7 @@ inline void store_bf16x16(void* mem_addr, bf16x16 a) {
 }
 
 template <>
-void store_kernel_t<bf16x16>(void* dst, bf16x16 src) {
+inline void store_kernel_t<bf16x16>(void* dst, bf16x16 src) {
   store_bf16x16(dst, src);
 }
 
```
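The store-side specializations receive the same `inline` fix. Together with the load kernels, they let higher-level code stay generic over the wrapper vector types; a hedged sketch of how such kernels are typically consumed (the `copy_vectors` wrapper below is illustrative, not code from this repository):

```cpp
// Declarations matching the signatures visible in the diff.
template <typename T> T load_kernel_t(const void* src);
template <typename T> void store_kernel_t(void* dst, T src);

// Hypothetical generic copy: T is a SIMD wrapper (e.g. fp32x16, bf16x16)
// with load/store specializations, and kBytes is its width in bytes.
template <typename T, int kBytes>
void copy_vectors(void* dst, const void* src, int n_vec) {
  const char* s = static_cast<const char*>(src);
  char* d = static_cast<char*>(dst);
  for (int i = 0; i < n_vec; ++i) {
    T v = load_kernel_t<T>(s + i * kBytes);  // one vector load
    store_kernel_t<T>(d + i * kBytes, v);    // one vector store
  }
}
```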