diff --git a/CMakeLists.txt b/CMakeLists.txt
index d7165e8048f..5dc093d8943 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -416,21 +416,19 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)")
     if(CMAKE_SIZEOF_VOID_P EQUAL 8)
         set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv")
-        check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat32m1_t _s, _w; float _v; size_t vl; _s = __riscv_vfmacc_vf_f32m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_V)
+        check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat32m8_t _s, _w; float _v; size_t vl; _s = __riscv_vfmacc_vf_f32m8(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_V)

-        set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zvfh")
-        check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m1_t _s, _w; __fp16 _v; size_t vl; _s = __riscv_vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_ZVFH)
+        set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16")
+        check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m8_t _s, _w; __fp16 _v; size_t vl; _s = __riscv_vfmacc_vf_f16m8(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_ZVFH)

-        # if(NOT NCNN_COMPILER_SUPPORT_RVV_ZFH)
-        #     set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16")
-        #     check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m1_t _s, _w; __fp16 _v; size_t vl; _s = __riscv_vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV_ZVFH)
-        # endif()
+        set(CMAKE_REQUIRED_FLAGS "-march=rv64gc_zfh_xtheadvector -D__fp16=_Float16")
+        check_cxx_source_compiles("#include <riscv_vector.h>\nint main() { vfloat16m8_t _s, _w; __fp16 _v; size_t vl; _s = __riscv_vfmacc_vf_f16m8(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)

         unset(CMAKE_REQUIRED_FLAGS)

-        if(NCNN_COMPILER_SUPPORT_RISCV_V)
+        if(NCNN_COMPILER_SUPPORT_RISCV_V OR NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
             option(NCNN_RVV "optimize risc-v platform with v extension" ON)
-            if(NCNN_COMPILER_SUPPORT_RISCV_ZVFH)
+            if(NCNN_COMPILER_SUPPORT_RISCV_ZVFH OR NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
                 if(NCNN_RVV)
                     option(NCNN_ZVFH "optimize risc-v platform with zvfh extension" ON)
                 endif()
@@ -456,8 +454,15 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)")
         #     add_definitions(-D__rvv_tuple)
         # endif()
         else()
-            message(WARNING "The compiler does not support risc-v v extension. NCNN_RVV will be OFF.")
+            message(WARNING "The compiler does not support risc-v v or xtheadvector extension. NCNN_RVV will be OFF.")
         endif()
+
+        if(NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR)
+            option(NCNN_XTHEADVECTOR "optimize risc-v platform with xtheadvector extension" ON)
+        else()
+            message(WARNING "The compiler does not support risc-v xtheadvector extension. NCNN_XTHEADVECTOR will be OFF.")
+        endif()
+
     endif()
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc|ppc)")
     set(NCNN_TARGET_ARCH powerpc)
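For reference, the escaped probe string in the new `NCNN_COMPILER_SUPPORT_RISCV_XTHEADVECTOR` check above unfolds to the translation unit below. This is only a sketch of what `check_cxx_source_compiles` hands to the compiler under `-march=rv64gc_zfh_xtheadvector -D__fp16=_Float16`; the file name is illustrative.

```cpp
// probe_xtheadvector.cpp -- fails to compile on toolchains without
// xtheadvector + zfh support, which turns the CMake option OFF.
#include <riscv_vector.h>

int main()
{
    vfloat16m8_t _s, _w;
    __fp16 _v;
    size_t vl;
    // fp16 fused multiply-accumulate at LMUL=8: exercises both the vector
    // intrinsics and half-precision scalar support in one expression
    _s = __riscv_vfmacc_vf_f16m8(_s, _v, _w, vl);
    return 0;
}
```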
diff --git a/cmake/ncnn_add_layer.cmake b/cmake/ncnn_add_layer.cmake
index 8afe15546bc..462bf45afd0 100644
--- a/cmake/ncnn_add_layer.cmake
+++ b/cmake/ncnn_add_layer.cmake
@@ -363,17 +363,14 @@ macro(ncnn_add_layer class)
     endif()

     if(NCNN_TARGET_ARCH STREQUAL "riscv" AND CMAKE_SIZEOF_VOID_P EQUAL 8)
-        # if(NCNN_RUNTIME_CPU AND NCNN_RVV)
-        #     ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv")
-
-        #     if(NCNN_COMPILER_SUPPORT_RVV_ZVFH)
-        #         ncnn_add_arch_opt_layer(${class} zvfh "-march=rv64gcv_zvfh")
-        #     elseif(NCNN_COMPILER_SUPPORT_RVV_ZVFH)
-        #         ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16")
-        #     endif()
-        # endif()
+        if(NCNN_RUNTIME_CPU AND NCNN_RVV)
+            ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv")
+        endif()
+        if(NCNN_RUNTIME_CPU AND NCNN_XTHEADVECTOR)
+            ncnn_add_arch_opt_layer(${class} xtheadvector "-march=rv64gc_zfh_xtheadvector -D__fp16=_Float16")
+        endif()
         if(NCNN_ZVFH)
-            ncnn_add_arch_opt_source(${class} zvfh "-march=rv64gcv_zvfh -D__fp16=_Float16")
+            ncnn_add_arch_opt_source(${class} zvfh "-march=rv64gcv_zfh_zvfh -D__fp16=_Float16")
         endif()
     endif()

diff --git a/cmake/ncnn_generate_xtheadvector_source.cmake b/cmake/ncnn_generate_xtheadvector_source.cmake
new file mode 100644
index 00000000000..e9c7b00c818
--- /dev/null
+++ b/cmake/ncnn_generate_xtheadvector_source.cmake
@@ -0,0 +1,14 @@
+
+# must define SRC DST CLASS
+
+file(READ ${SRC} source_data)
+
+# replace
+string(TOUPPER ${CLASS} CLASS_UPPER)
+string(TOLOWER ${CLASS} CLASS_LOWER)
+
+string(REGEX REPLACE "LAYER_${CLASS_UPPER}_RISCV_H" "LAYER_${CLASS_UPPER}_RISCV_XTHEADVECTOR_H" source_data "${source_data}")
+string(REGEX REPLACE "${CLASS}_riscv" "${CLASS}_riscv_xtheadvector" source_data "${source_data}")
+string(REGEX REPLACE "#include \"${CLASS_LOWER}_riscv.h\"" "#include \"${CLASS_LOWER}_riscv_xtheadvector.h\"" source_data "${source_data}")
+
+file(WRITE ${DST} "${source_data}")
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index aec5291e48b..b6f28c55b50 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -631,24 +631,16 @@ if(NCNN_TARGET_ARCH STREQUAL "loongarch")
 endif()

 if(NCNN_TARGET_ARCH STREQUAL "riscv" AND CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT C906)
-    if(NCNN_RVV)
+    if(NOT NCNN_RUNTIME_CPU AND NCNN_RVV)
         set(RISCV_MARCH_FLAG "-march=rv64gcv")
-    else()
-        set(RISCV_MARCH_FLAG "-march=rv64gc")
-    endif()
-
-    if(NOT NCNN_RUNTIME_CPU AND NCNN_ZVFH)
-        set(RISCV_MARCH_FLAG "${RISCV_MARCH_FLAG}_zvfh")
+        if(NCNN_ZVFH)
+            set(RISCV_MARCH_FLAG "${RISCV_MARCH_FLAG}_zfh_zvfh")
+            target_compile_options(ncnn PRIVATE -D__fp16=_Float16)
+        endif()
+    elseif(NOT NCNN_RUNTIME_CPU AND NCNN_ZVFH)
+        set(RISCV_MARCH_FLAG "-march=rv64gc_zfh_xtheadvector")
         target_compile_options(ncnn PRIVATE -D__fp16=_Float16)
     endif()
-
-    # if(NCNN_COMPILER_SUPPORT_RVV_ZFH)
-    #     target_compile_options(ncnn PRIVATE -march=rv64gcv_zfh)
-    # elseif(NCNN_COMPILER_SUPPORT_RVV_ZVFH)
-    #     target_compile_options(ncnn PRIVATE -march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16)
-    # elseif(NCNN_COMPILER_SUPPORT_RVV)
-    #     target_compile_options(ncnn PRIVATE -march=rv64gcv)
-    # endif()
     target_compile_options(ncnn PRIVATE ${RISCV_MARCH_FLAG})
 endif()
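To make the generator script above concrete: for `CLASS=AbsVal` the three `REGEX REPLACE` rules rewrite the include guard, the class suffix, and the header include, so the generated header would look roughly like the sketch below. This is hypothetical output for illustration; the real result is the full copied source with only these identifiers rewritten.

```cpp
// absval_riscv_xtheadvector.h (sketch of generated output): identical to
// absval_riscv.h except for the rewritten guard and class name, so the same
// layer can be compiled a second time with -march=rv64gc_zfh_xtheadvector
// and registered as a separate runtime variant.
#ifndef LAYER_ABSVAL_RISCV_XTHEADVECTOR_H
#define LAYER_ABSVAL_RISCV_XTHEADVECTOR_H

#include "absval.h"

namespace ncnn {

class AbsVal_riscv_xtheadvector : public AbsVal
{
public:
    AbsVal_riscv_xtheadvector();
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_ABSVAL_RISCV_XTHEADVECTOR_H
```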
diff --git a/src/cpu.cpp b/src/cpu.cpp
index e18671a21d5..3b52d95b8e9 100644
--- a/src/cpu.cpp
+++ b/src/cpu.cpp
@@ -2580,6 +2580,22 @@ int cpu_support_riscv_zvfh()
 #endif
 }

+int cpu_support_riscv_xtheadvector()
+{
+    try_initialize_global_cpu_info();
+#if defined __ANDROID__ || defined __linux__
+#if __riscv
+    // v + f does not imply zfh, but how to discover zvfh properly ?
+    // upstream issue https://github.com/riscv/riscv-isa-manual/issues/414
+    return g_hwcaps & COMPAT_HWCAP_ISA_V && g_hwcaps & COMPAT_HWCAP_ISA_F;
+#else
+    return 0;
+#endif
+#else
+    return 0;
+#endif
+}
+
 int cpu_riscv_vlenb()
 {
     try_initialize_global_cpu_info();
diff --git a/src/cpu.h b/src/cpu.h
index 9530d4f7e7e..4c8e42c5712 100644
--- a/src/cpu.h
+++ b/src/cpu.h
@@ -118,6 +118,8 @@ NCNN_EXPORT int cpu_support_riscv_v();
 NCNN_EXPORT int cpu_support_riscv_zfh();
 // zvfh = riscv vector half-precision float
 NCNN_EXPORT int cpu_support_riscv_zvfh();
+// xtheadvector = riscv xtheadvector
+NCNN_EXPORT int cpu_support_riscv_xtheadvector();
 // vlenb = riscv vector length in bytes
 NCNN_EXPORT int cpu_riscv_vlenb();

diff --git a/src/layer.cpp b/src/layer.cpp
index 1f76d546060..d82595105ce 100644
--- a/src/layer.cpp
+++ b/src/layer.cpp
@@ -546,6 +546,20 @@ Layer* create_layer_cpu(int index)
     }
     else
#endif // NCNN_RUNTIME_CPU && NCNN_MSA
+#if NCNN_RUNTIME_CPU && NCNN_RVV
+    if (ncnn::cpu_support_riscv_v())
+    {
+        layer_creator = layer_registry_rvv[index].creator;
+    }
+    else
+#endif // NCNN_RUNTIME_CPU && NCNN_RVV
+#if NCNN_RUNTIME_CPU && NCNN_XTHEADVECTOR
+    if (ncnn::cpu_support_riscv_xtheadvector())
+    {
+        layer_creator = layer_registry_xtheadvector[index].creator;
+    }
+    else
+#endif // NCNN_RUNTIME_CPU && NCNN_XTHEADVECTOR
     {
         layer_creator = layer_registry_arch[index].creator;
     }
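A note on the detection heuristic: as the comment in `cpu_support_riscv_xtheadvector()` says, the kernel's hwcaps cannot express half-precision vector support directly, so the patch keys off the single-letter V and F bits. Below is a standalone sketch of the same logic using the Linux `AT_HWCAP` encoding (one bit per base ISA letter); the function name is ours, not part of the patch.

```cpp
// Heuristic capability check equivalent to the patch's hwcaps test.
// Caveat carried over from the patch: V + F alone does not prove zvfh or
// xtheadvector support; it is a best-effort runtime signal.
#include <sys/auxv.h>

#define COMPAT_HWCAP_ISA_F (1 << ('F' - 'A'))
#define COMPAT_HWCAP_ISA_V (1 << ('V' - 'A'))

static int has_riscv_vector_and_float()
{
    unsigned long hwcap = getauxval(AT_HWCAP);
    return (hwcap & COMPAT_HWCAP_ISA_V) && (hwcap & COMPAT_HWCAP_ISA_F);
}
```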
diff --git a/src/layer/riscv/absval_fp16.h b/src/layer/riscv/absval_fp16.h
new file mode 100755
index 00000000000..50cc5d3de17
--- /dev/null
+++ b/src/layer/riscv/absval_fp16.h
@@ -0,0 +1,67 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#if NCNN_RUNTIME_CPU && NCNN_ZVFH && __riscv_vector && !__riscv_zvfh
+void absval_fp16_zvfh(Mat& bottom_top_blob, const Option& opt);
+#endif
+
+#if __riscv_zvfh
+static inline vfloat16m8_t __riscv_vfabs_v_f16m8_absval(vfloat16m8_t op1, size_t vl)
+{
+    return __riscv_vfsgnjx_vv_f16m8(op1, op1, vl);
+}
+#endif // __riscv_zvfh
+
+static void absval_fp16(Mat& bottom_top_blob, const Option& opt)
+{
+#if NCNN_RUNTIME_CPU && NCNN_ZVFH && __riscv_vector && !__riscv_xtheadvector && !__riscv_zvfh
+    if (ncnn::cpu_support_riscv_zvfh())
+    {
+        absval_fp16_zvfh(bottom_top_blob, opt);
+        return;
+    }
+#endif
+
+#if __riscv_zvfh
+    const int w = bottom_top_blob.w;
+    const int h = bottom_top_blob.h;
+    const int d = bottom_top_blob.d;
+    const int channels = bottom_top_blob.c;
+    const int elempack = bottom_top_blob.elempack;
+    const int size = w * h * d * elempack;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < channels; q++)
+    {
+        __fp16* ptr = bottom_top_blob.channel(q);
+
+        int n = size;
+        while (n > 0)
+        {
+            size_t vl = __riscv_vsetvl_e16m8(n);
+
+            vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl);
+            _p = __riscv_vfabs_v_f16m8_absval(_p, vl);
+            __riscv_vse16_v_f16m8(ptr, _p, vl);
+
+            ptr += vl;
+            n -= vl;
+        }
+    }
+#else
+    (void)bottom_top_blob;
+    (void)opt;
+#endif // __riscv_zvfh
+}
diff --git a/src/layer/riscv/absval_riscv.cpp b/src/layer/riscv/absval_riscv.cpp
index b341f48a5d8..8c84af7284c 100644
--- a/src/layer/riscv/absval_riscv.cpp
+++ b/src/layer/riscv/absval_riscv.cpp
@@ -13,6 +13,7 @@
 // specific language governing permissions and limitations under the License.

 #include "absval_riscv.h"
+#include "cpu.h"

 #if __riscv_vector
 #include <riscv_vector.h>
@@ -20,12 +21,14 @@

 namespace ncnn {

+#include "absval_fp16.h"
+
 AbsVal_riscv::AbsVal_riscv()
 {
 #if __riscv_vector
     support_packing = true;
-#if NCNN_ZVFH
-    support_fp16_storage = cpu_support_riscv_zvfh();
+#if NCNN_ZVFH || NCNN_XTHEADVECTOR
+    support_fp16_storage = cpu_support_riscv_zvfh() || cpu_support_riscv_xtheadvector();
 #endif
 #endif // __riscv_vector
 }
@@ -39,7 +42,7 @@ static inline vfloat32m8_t __riscv_vfabs_v_f32m8_absval(vfloat32m8_t op1, size_t vl)

 int AbsVal_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
-#if NCNN_ZVFH
+#if __riscv_vector
     int elembits = bottom_top_blob.elembits();

     if (support_fp16_storage && opt.use_fp16_storage && elembits == 16)
@@ -86,4 +89,12 @@ int AbsVal_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
     return 0;
 }

+#if __riscv_vector
+int AbsVal_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
+{
+    absval_fp16(bottom_top_blob, opt);
+    return 0;
+}
+#endif // __riscv_vector
+
 } // namespace ncnn
diff --git a/src/layer/riscv/absval_riscv.h b/src/layer/riscv/absval_riscv.h
index 0acf42ce634..b08a8a7bb31 100644
--- a/src/layer/riscv/absval_riscv.h
+++ b/src/layer/riscv/absval_riscv.h
@@ -27,7 +27,7 @@ class AbsVal_riscv : public AbsVal
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;

 protected:
-#if NCNN_ZVFH
+#if __riscv_vector
     int forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const;
 #endif
 };
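The shared `absval_fp16.h` kernel above is compiled into two objects: once into the plain riscv object, where `__riscv_zvfh` is off and `absval_fp16()` forwards to the zvfh-built copy after a runtime `cpu_support_riscv_zvfh()` check, and once into the `_zvfh` object, where the vector loop is actually emitted. For readers unfamiliar with RVV strip-mining, the loop computes the following scalar reference; a sketch, assuming `__fp16` is usable as a scalar type.

```cpp
// Scalar equivalent of the vsetvl/vle16/vfsgnjx/vse16 loop in absval_fp16():
// vfsgnjx(x, x) XORs x's sign bit with itself, clearing it, i.e. |x|.
static void absval_fp16_scalar(__fp16* ptr, int size)
{
    for (int i = 0; i < size; i++)
    {
        ptr[i] = ptr[i] < (__fp16)0.f ? (__fp16)(-ptr[i]) : ptr[i];
    }
}
```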
-#include "absval_riscv.h" - -#if __riscv_vector -#include -#endif +#include "cpu.h" +#include "mat.h" namespace ncnn { -#if __riscv_zvfh -static inline vfloat16m8_t __riscv_vfabs_v_f16m8_absval(vfloat16m8_t op1, size_t vl) -{ - return __riscv_vfsgnjx_vv_f16m8(op1, op1, vl); -} +#include "absval_fp16.h" -int AbsVal_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const +void absval_fp16_zvfh(Mat& bottom_top_blob, const Option& opt) { - int w = bottom_top_blob.w; - int h = bottom_top_blob.h; - int d = bottom_top_blob.d; - int channels = bottom_top_blob.c; - int elempack = bottom_top_blob.elempack; - int size = w * h * d * elempack; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) - { - __fp16* ptr = bottom_top_blob.channel(q); - - int n = size; - while (n > 0) - { - size_t vl = __riscv_vsetvl_e16m8(n); - - vfloat16m8_t _p = __riscv_vle16_v_f16m8(ptr, vl); - _p = __riscv_vfabs_v_f16m8_absval(_p, vl); - __riscv_vse16_v_f16m8(ptr, _p, vl); - - ptr += vl; - n -= vl; - } - } - - return 0; + absval_fp16(bottom_top_blob, opt); } -#endif // __riscv_zvfh } // namespace ncnn diff --git a/src/layer/riscv/packing_riscv.cpp b/src/layer/riscv/packing_riscv.cpp index 788549bb5bb..15b206b063c 100644 --- a/src/layer/riscv/packing_riscv.cpp +++ b/src/layer/riscv/packing_riscv.cpp @@ -20,6 +20,8 @@ #include "riscv_usability.h" +#include "cpu.h" + namespace ncnn { Packing_riscv::Packing_riscv() diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index fd1806d8185..6966a7368c7 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -125,54 +125,54 @@ static inline vfloat16m8_t __riscv_vle16_v_f16m8_f16m1(const __fp16* ptr) #endif // __riscv_zvfh #endif // __riscv_vector -#if __riscv_vector && __rvv_tuple +#if 0//__riscv_vector && __rvv_tuple // f32m1, vsseg.v -static inline void __riscv_vsseg8e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl) +static inline void __riscv_vsseg8e32_v_f32m1(float* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl) { vfloat32m1x8_t _tmp = __riscv_vcreate_f32m1x8(v0, v1, v2, v3, v4, v5, v6, v7); __riscv_vsseg8e32_v_f32m1x8(base, _tmp, vl); } -static inline void __riscv_vsseg4e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl) +static inline void __riscv_vsseg4e32_v_f32m1(float* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl) { vfloat32m1x4_t _tmp = __riscv_vcreate_f32m1x4(v0, v1, v2, v3); __riscv_vsseg4e32_v_f32m1x4(base, _tmp, vl); } -static inline void __riscv_vsseg2e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl) +static inline void __riscv_vsseg2e32_v_f32m1(float* base, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl) { vfloat32m1x2_t _tmp = __riscv_vcreate_f32m1x2(v0, v1); __riscv_vsseg2e32_v_f32m1x2(base, _tmp, vl); } // f32m1, vssseg.v, 8/4/2 -static inline void __riscv_vssseg8e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl) +static inline void __riscv_vssseg8e32_v_f32m1(float* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, 
diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h
index fd1806d8185..6966a7368c7 100644
--- a/src/layer/riscv/riscv_usability.h
+++ b/src/layer/riscv/riscv_usability.h
@@ -125,54 +125,54 @@ static inline vfloat16m8_t __riscv_vle16_v_f16m8_f16m1(const __fp16* ptr)
 #endif // __riscv_zvfh
 #endif // __riscv_vector

-#if __riscv_vector && __rvv_tuple
+#if 0//__riscv_vector && __rvv_tuple
 // f32m1, vsseg.v
-static inline void __riscv_vsseg8e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl)
+static inline void __riscv_vsseg8e32_v_f32m1(float* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl)
 {
     vfloat32m1x8_t _tmp = __riscv_vcreate_f32m1x8(v0, v1, v2, v3, v4, v5, v6, v7);
     __riscv_vsseg8e32_v_f32m1x8(base, _tmp, vl);
 }

-static inline void __riscv_vsseg4e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl)
+static inline void __riscv_vsseg4e32_v_f32m1(float* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl)
 {
     vfloat32m1x4_t _tmp = __riscv_vcreate_f32m1x4(v0, v1, v2, v3);
     __riscv_vsseg4e32_v_f32m1x4(base, _tmp, vl);
 }

-static inline void __riscv_vsseg2e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl)
+static inline void __riscv_vsseg2e32_v_f32m1(float* base, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl)
 {
     vfloat32m1x2_t _tmp = __riscv_vcreate_f32m1x2(v0, v1);
     __riscv_vsseg2e32_v_f32m1x2(base, _tmp, vl);
 }

 // f32m1, vssseg.v, 8/4/2
-static inline void __riscv_vssseg8e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl)
+static inline void __riscv_vssseg8e32_v_f32m1(float* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl)
 {
     vfloat32m1x8_t _tmp = __riscv_vcreate_f32m1x8(v0, v1, v2, v3, v4, v5, v6, v7);
     __riscv_vssseg8e32_v_f32m1x8(base, bstride, _tmp, vl);
 }

-static inline void __riscv_vssseg4e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl)
+static inline void __riscv_vssseg4e32_v_f32m1(float* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl)
 {
     vfloat32m1x4_t _tmp = __riscv_vcreate_f32m1x4(v0, v1, v2, v3);
     __riscv_vssseg4e32_v_f32m1x4(base, bstride, _tmp, vl);
 }

-static inline void __riscv_vssseg2e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl)
+static inline void __riscv_vssseg2e32_v_f32m1(float* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl)
 {
     vfloat32m1x2_t _tmp = __riscv_vcreate_f32m1x2(v0, v1);
     __riscv_vssseg2e32_v_f32m1x2(base, bstride, _tmp, vl);
 }

 // f32m2, vsseg.v, 4/2
-static inline void __riscv_vsseg4e32_v_f32m2(float32_t* base, vfloat32m2_t v0, vfloat32m2_t v1, vfloat32m2_t v2, vfloat32m2_t v3, size_t vl)
+static inline void __riscv_vsseg4e32_v_f32m2(float* base, vfloat32m2_t v0, vfloat32m2_t v1, vfloat32m2_t v2, vfloat32m2_t v3, size_t vl)
 {
     vfloat32m2x4_t _tmp = __riscv_vcreate_f32m2x4(v0, v1, v2, v3);
     __riscv_vsseg4e32_v_f32m2x4(base, _tmp, vl);
 }

-static inline void __riscv_vsseg2e32_v_f32m2(float32_t* base, vfloat32m2_t v0, vfloat32m2_t v1, size_t vl)
+static inline void __riscv_vsseg2e32_v_f32m2(float* base, vfloat32m2_t v0, vfloat32m2_t v1, size_t vl)
 {
     vfloat32m2x2_t _tmp = __riscv_vcreate_f32m2x2(v0, v1);
     __riscv_vsseg2e32_v_f32m2x2(base, _tmp, vl);
@@ -205,7 +205,7 @@ static inline void __riscv_vsseg2e16_v_u16m2(uint16_t* base, vuint16m2_t v0, vuint16m2_t v1, size_t vl)
 }

 // f32m1, vlseg.v 8/4/2
-static inline void __riscv_vlseg8e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, vfloat32m1_t* v4, vfloat32m1_t* v5, vfloat32m1_t* v6, vfloat32m1_t* v7, const float32_t* base, size_t vl)
+static inline void __riscv_vlseg8e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, vfloat32m1_t* v4, vfloat32m1_t* v5, vfloat32m1_t* v6, vfloat32m1_t* v7, const float* base, size_t vl)
 {
     vfloat32m1x8_t _tmp = __riscv_vlseg8e32_v_f32m1x8(base, vl);
     *v0 = __riscv_vget_f32m1x8_f32m1(_tmp, 0);
@@ -218,7 +218,7 @@ static inline void __riscv_vlseg8e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, vfloat32m1_t* v4, vfloat32m1_t* v5, vfloat32m1_t* v6, vfloat32m1_t* v7, const float* base, size_t vl)
     *v7 = __riscv_vget_f32m1x8_f32m1(_tmp, 7);
 }

-static inline void __riscv_vlseg4e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, const float32_t* base, size_t vl)
+static inline void __riscv_vlseg4e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, const float* base, size_t vl)
 {
     vfloat32m1x4_t _tmp = __riscv_vlseg4e32_v_f32m1x4(base, vl);
     *v0 = __riscv_vget_f32m1x4_f32m1(_tmp, 0);
@@ -227,7 +227,7 @@ static inline void __riscv_vlseg4e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, const float* base, size_t vl)
     *v3 = __riscv_vget_f32m1x4_f32m1(_tmp, 3);
 }

-static inline void __riscv_vlseg2e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, const float32_t* base, size_t vl)
+static inline void __riscv_vlseg2e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, const float* base, size_t vl)
 {
     vfloat32m1x2_t _tmp = __riscv_vlseg2e32_v_f32m1x2(base, vl);
     *v0 = __riscv_vget_f32m1x2_f32m1(_tmp, 0);
@@ -235,7 +235,7 @@ static inline void __riscv_vlseg2e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, const float* base, size_t vl)
 }

 // f32m2, vlseg.v, 4
-static inline void __riscv_vlseg4e32_v_f32m2(vfloat32m2_t* v0, vfloat32m2_t* v1, vfloat32m2_t* v2, vfloat32m2_t* v3, const float32_t* base, size_t vl)
+static inline void __riscv_vlseg4e32_v_f32m2(vfloat32m2_t* v0, vfloat32m2_t* v1, vfloat32m2_t* v2, vfloat32m2_t* v3, const float* base, size_t vl)
 {
     vfloat32m2x4_t _tmp = __riscv_vlseg4e32_v_f32m2x4(base, vl);
     *v0 = __riscv_vget_f32m2x4_f32m2(_tmp, 0);
@@ -245,7 +245,7 @@ static inline void __riscv_vlseg4e32_v_f32m2(vfloat32m2_t* v0, vfloat32m2_t* v1, vfloat32m2_t* v2, vfloat32m2_t* v3, const float* base, size_t vl)
 }

 // f32m4, vlseg.v, 2
-static inline void __riscv_vlseg2e32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float32_t* base, size_t vl)
+static inline void __riscv_vlseg2e32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float* base, size_t vl)
 {
     vfloat32m4x2_t _tmp = __riscv_vlseg2e32_v_f32m4x2(base, vl);
     *v0 = __riscv_vget_f32m4x2_f32m4(_tmp, 0);
@@ -253,7 +253,7 @@ static inline void __riscv_vlseg2e32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float* base, size_t vl)
 }

 // f32m4, vloxseg.v
-static inline void __riscv_vloxseg2ei32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float32_t* base, vuint32m4_t bindex, size_t vl)
+static inline void __riscv_vloxseg2ei32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float* base, vuint32m4_t bindex, size_t vl)
 {
     vfloat32m4x2_t _tmp = __riscv_vloxseg2ei32_v_f32m4x2(base, bindex, vl);
     *v0 = __riscv_vget_f32m4x2_f32m4(_tmp, 0);
@@ -399,7 +399,7 @@ static inline void __riscv_vlseg2e16_v_f16m4(vfloat16m4_t* v0, vfloat16m4_t* v1, const __fp16* base, size_t vl)
 #endif // __riscv_zvfh
 #endif // __riscv_vector

-#ifdef __riscv_vector
+#if __riscv_vector

 static inline void transpose8x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
                                    vfloat32m1_t& _r1l, vfloat32m1_t& _r1h,
@@ -411,8 +411,57 @@ static inline void transpose8x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
                                    vfloat32m1_t& _r7l, vfloat32m1_t& _r7h, size_t vl)
 {
     float tmp[64];
-    __riscv_vsseg8e32_v_f32m1x8(&tmp[0], __riscv_vcreate_v_f32m1x8(_r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l), vl);
-    __riscv_vsseg8e32_v_f32m1x8(&tmp[32], __riscv_vcreate_v_f32m1x8(_r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h), vl);
+#if __riscv_vector
+#if __riscv_v_intrinsic > 12000
+    vfloat32m1x8_t _rl = __riscv_vcreate_v_f32m1x8(_r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l);
+    vfloat32m1x8_t _rh = __riscv_vcreate_v_f32m1x8(_r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h);
+#else
+    vfloat32m1x8_t _rl = vfloat32m1x8_t();
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 0, _r0l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 1, _r1l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 2, _r2l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 3, _r3l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 4, _r4l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 5, _r5l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 6, _r6l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 7, _r7l);
+    vfloat32m1x8_t _rh = vfloat32m1x8_t();
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 0, _r0h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 1, _r1h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 2, _r2h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 3, _r3h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 4, _r4h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 5, _r5h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 6, _r6h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 7, _r7h);
+#endif
+    __riscv_vsseg8e32_v_f32m1x8(&tmp[0], _rl, vl);
+    __riscv_vsseg8e32_v_f32m1x8(&tmp[32], _rh, vl);
+#elif __riscv_xtheadvector
+    vfloat32m1x8_t _rl = vfloat32m1x8_t();
+    // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 0, _r0l);
+    // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 1, _r1l);
+    // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 2, _r2l);
+    // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 3, _r3l);
+    // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 4, _r4l);
+    // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 5, _r5l);
+    // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 6, _r6l);
+    // _rl = __riscv_th_vset_v_f32m1_f32m1x8(_rl, 7, _r7l);
+    vfloat32m1x8_t _rh = vfloat32m1x8_t();
+    // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 0, _r0h);
+    // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 1, _r1h);
+    // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 2, _r2h);
+    // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 3, _r3h);
+    // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 4, _r4h);
+    // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 5, _r5h);
+    // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 6, _r6h);
+    // _rh = __riscv_th_vset_v_f32m1_f32m1x8(_rh, 7, _r7h);
+    __riscv_th_vsseg8e32_v_f32m1x8(&tmp[0], _rl, vl);
+    __riscv_th_vsseg8e32_v_f32m1x8(&tmp[32], _rh, vl);
+#endif
+
     float* ptr = (float*)tmp;
     _r0l = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl);
     _r0h = __riscv_vle32_v_f32m1(ptr + 1 * 4, vl);
@@ -435,7 +484,16 @@ static inline void transpose8x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
 static inline void transpose4x4_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, size_t vl)
 {
     float tmp[16];
-    __riscv_vsseg4e32_v_f32m1x4(&tmp[0], __riscv_vcreate_v_f32m1x4(_r0, _r1, _r2, _r3), vl);
+#if __riscv_vector && __riscv_v_intrinsic > 12000
+    vfloat32m1x4_t _r = __riscv_vcreate_v_f32m1x4(_r0, _r1, _r2, _r3);
+#else
+    vfloat32m1x4_t _r = vfloat32m1x4_t();
+    _r = __riscv_vset_v_f32m1_f32m1x4(_r, 0, _r0);
+    _r = __riscv_vset_v_f32m1_f32m1x4(_r, 1, _r1);
+    _r = __riscv_vset_v_f32m1_f32m1x4(_r, 2, _r2);
+    _r = __riscv_vset_v_f32m1_f32m1x4(_r, 3, _r3);
+#endif
+    __riscv_vsseg4e32_v_f32m1x4(&tmp[0], _r, vl);
     float* ptr = (float*)tmp;
     _r0 = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl);
     _r1 = __riscv_vle32_v_f32m1(ptr + 1 * 4, vl);
@@ -519,9 +577,42 @@ static inline void transpose12x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0m, vfloat32m1_t& _r0h,
                                     vfloat32m1_t& _r7l, vfloat32m1_t& _r7m, vfloat32m1_t& _r7h, size_t vl)
 {
     float tmp[96];
-    __riscv_vsseg8e32_v_f32m1x8(&tmp[0], __riscv_vcreate_v_f32m1x8(_r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l), vl);
-    __riscv_vsseg8e32_v_f32m1x8(&tmp[32], __riscv_vcreate_v_f32m1x8(_r0m, _r1m, _r2m, _r3m, _r4m, _r5m, _r6m, _r7m), vl);
-    __riscv_vsseg8e32_v_f32m1x8(&tmp[64], __riscv_vcreate_v_f32m1x8(_r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h), vl);
+#if __riscv_vector && __riscv_v_intrinsic > 12000
+    vfloat32m1x8_t _rl = __riscv_vcreate_v_f32m1x8(_r0l, _r1l, _r2l, _r3l, _r4l, _r5l, _r6l, _r7l);
+    vfloat32m1x8_t _rm = __riscv_vcreate_v_f32m1x8(_r0m, _r1m, _r2m, _r3m, _r4m, _r5m, _r6m, _r7m);
+    vfloat32m1x8_t _rh = __riscv_vcreate_v_f32m1x8(_r0h, _r1h, _r2h, _r3h, _r4h, _r5h, _r6h, _r7h);
+#else
+    vfloat32m1x8_t _rl = vfloat32m1x8_t();
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 0, _r0l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 1, _r1l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 2, _r2l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 3, _r3l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 4, _r4l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 5, _r5l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 6, _r6l);
+    _rl = __riscv_vset_v_f32m1_f32m1x8(_rl, 7, _r7l);
+    vfloat32m1x8_t _rm = vfloat32m1x8_t();
+    _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 0, _r0m);
+    _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 1, _r1m);
+    _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 2, _r2m);
+    _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 3, _r3m);
+    _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 4, _r4m);
+    _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 5, _r5m);
+    _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 6, _r6m);
+    _rm = __riscv_vset_v_f32m1_f32m1x8(_rm, 7, _r7m);
+    vfloat32m1x8_t _rh = vfloat32m1x8_t();
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 0, _r0h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 1, _r1h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 2, _r2h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 3, _r3h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 4, _r4h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 5, _r5h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 6, _r6h);
+    _rh = __riscv_vset_v_f32m1_f32m1x8(_rh, 7, _r7h);
+#endif
+    __riscv_vsseg8e32_v_f32m1x8(&tmp[0], _rl, vl);
+    __riscv_vsseg8e32_v_f32m1x8(&tmp[32], _rm, vl);
+    __riscv_vsseg8e32_v_f32m1x8(&tmp[64], _rh, vl);

     float* ptr = (float*)tmp;
     _r0l = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl);
@@ -553,7 +644,20 @@ static inline void transpose12x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0m, vfloat32m1_t& _r0h,
 static inline void transpose4x8_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, size_t vl)
 {
     float tmp[32];
-    __riscv_vsseg8e32_v_f32m1x8(&tmp[0], __riscv_vcreate_v_f32m1x8(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7), vl);
+#if __riscv_vector && __riscv_v_intrinsic > 12000
+    vfloat32m1x8_t _r = __riscv_vcreate_v_f32m1x8(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7);
+#else
+    vfloat32m1x8_t _r = vfloat32m1x8_t();
+    _r = __riscv_vset_v_f32m1_f32m1x8(_r, 0, _r0);
+    _r = __riscv_vset_v_f32m1_f32m1x8(_r, 1, _r1);
+    _r = __riscv_vset_v_f32m1_f32m1x8(_r, 2, _r2);
+    _r = __riscv_vset_v_f32m1_f32m1x8(_r, 3, _r3);
+    _r = __riscv_vset_v_f32m1_f32m1x8(_r, 4, _r4);
+    _r = __riscv_vset_v_f32m1_f32m1x8(_r, 5, _r5);
+    _r = __riscv_vset_v_f32m1_f32m1x8(_r, 6, _r6);
+    _r = __riscv_vset_v_f32m1_f32m1x8(_r, 7, _r7);
+#endif
+    __riscv_vsseg8e32_v_f32m1x8(&tmp[0], _r, vl);

     float* ptr = (float*)tmp;
     _r0 = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl);
@@ -602,8 +706,23 @@ static inline void transpose8x4_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h,
                                    vfloat32m1_t& _r3l, vfloat32m1_t& _r3h, size_t vl)
 {
     float tmp[32];
-    __riscv_vsseg4e32_v_f32m1x4(&tmp[0], __riscv_vcreate_v_f32m1x4(_r0l, _r1l, _r2l, _r3l), vl);
-    __riscv_vsseg4e32_v_f32m1x4(&tmp[16], __riscv_vcreate_v_f32m1x4(_r0h, _r1h, _r2h, _r3h), vl);
+#if __riscv_vector && __riscv_v_intrinsic > 12000
+    vfloat32m1x4_t _rl = __riscv_vcreate_v_f32m1x4(_r0l, _r1l, _r2l, _r3l);
+    vfloat32m1x4_t _rh = __riscv_vcreate_v_f32m1x4(_r0h, _r1h, _r2h, _r3h);
+#else
+    vfloat32m1x4_t _rl = vfloat32m1x4_t();
+    _rl = __riscv_vset_v_f32m1_f32m1x4(_rl, 0, _r0l);
+    _rl = __riscv_vset_v_f32m1_f32m1x4(_rl, 1, _r1l);
+    _rl = __riscv_vset_v_f32m1_f32m1x4(_rl, 2, _r2l);
+    _rl = __riscv_vset_v_f32m1_f32m1x4(_rl, 3, _r3l);
+    vfloat32m1x4_t _rh = vfloat32m1x4_t();
+    _rh = __riscv_vset_v_f32m1_f32m1x4(_rh, 0, _r0h);
+    _rh = __riscv_vset_v_f32m1_f32m1x4(_rh, 1, _r1h);
+    _rh = __riscv_vset_v_f32m1_f32m1x4(_rh, 2, _r2h);
+    _rh = __riscv_vset_v_f32m1_f32m1x4(_rh, 3, _r3h);
+#endif
+    __riscv_vsseg4e32_v_f32m1x4(&tmp[0], _rl, vl);
+    __riscv_vsseg4e32_v_f32m1x4(&tmp[16], _rh, vl);
     float* ptr = (float*)tmp;
     _r0l = __riscv_vle32_v_f32m1(ptr + 0 * 4, vl);
     _r0h = __riscv_vle32_v_f32m1(ptr + 1 * 4, vl);
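The `__riscv_vcreate_v_*` versus `__riscv_vset_v_*` pattern repeated through the transpose helpers above exists because `vcreate` for tuple types only appeared in later revisions of the RVV intrinsics spec, hence the `__riscv_v_intrinsic > 12000` gate. If more call sites accumulate it could be factored into one helper; a sketch under the same version gate, with a helper name of our choosing:

```cpp
// Version-portable construction of a vfloat32m1x4_t tuple: use vcreate on
// newer intrinsics, fall back to a vset chain on older ones.
static inline vfloat32m1x4_t ncnn_create_f32m1x4(vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3)
{
#if __riscv_v_intrinsic > 12000
    return __riscv_vcreate_v_f32m1x4(v0, v1, v2, v3);
#else
    vfloat32m1x4_t _r = vfloat32m1x4_t();
    _r = __riscv_vset_v_f32m1_f32m1x4(_r, 0, v0);
    _r = __riscv_vset_v_f32m1_f32m1x4(_r, 1, v1);
    _r = __riscv_vset_v_f32m1_f32m1x4(_r, 2, v2);
    _r = __riscv_vset_v_f32m1_f32m1x4(_r, 3, v3);
    return _r;
#endif
}
```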
a/src/layer/riscv/softmax_riscv.cpp
+++ b/src/layer/riscv/softmax_riscv.cpp
@@ -33,7 +33,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
     int elempack = bottom_top_blob.elempack;
     int positive_axis = axis < 0 ? dims + axis : axis;

-#ifdef __riscv_vector
+#if __riscv_vector
     if (dims == 1) // positive_axis == 0
     {
         int w = bottom_top_blob.w;
diff --git a/src/layer_registry.h.in b/src/layer_registry.h.in
index 97d2de37ef5..6f34f8cb0f0 100644
--- a/src/layer_registry.h.in
+++ b/src/layer_registry.h.in
@@ -46,6 +46,18 @@ static const layer_registry_entry layer_registry_lsx[] = {
 };
 #endif // NCNN_RUNTIME_CPU && NCNN_LSX

+#if NCNN_RUNTIME_CPU && NCNN_RVV
+static const layer_registry_entry layer_registry_rvv[] = {
+@layer_registry_rvv@
+};
+#endif // NCNN_RUNTIME_CPU && NCNN_RVV
+
+#if NCNN_RUNTIME_CPU && NCNN_XTHEADVECTOR
+static const layer_registry_entry layer_registry_xtheadvector[] = {
+@layer_registry_xtheadvector@
+};
+#endif // NCNN_RUNTIME_CPU && NCNN_XTHEADVECTOR
+
 #if NCNN_VULKAN
 static const layer_registry_entry layer_registry_vulkan[] = {
 @layer_registry_vulkan@
diff --git a/src/platform.h.in b/src/platform.h.in
index d28cbdc0014..57af8f2664c 100644
--- a/src/platform.h.in
+++ b/src/platform.h.in
@@ -60,6 +60,7 @@
 #cmakedefine01 NCNN_MMI
 #cmakedefine01 NCNN_RVV
 #cmakedefine01 NCNN_ZVFH
+#cmakedefine01 NCNN_XTHEADVECTOR
 #cmakedefine01 NCNN_INT8
 #cmakedefine01 NCNN_BF16
 #cmakedefine01 NCNN_FORCE_INLINE
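Taken together, the `NCNN_XTHEADVECTOR` platform macro, the per-arch registry tables, and the `cpu_support_*` queries make the xtheadvector path selectable at runtime. A quick probe program that prints which RISC-V paths a build would dispatch to; illustrative only, not part of the patch.

```cpp
// Build against ncnn and run on the target board; all five queries are
// exported from src/cpu.h as shown in the diff above.
#include <cstdio>
#include "cpu.h"

int main()
{
    printf("riscv_v:            %d\n", ncnn::cpu_support_riscv_v());
    printf("riscv_zfh:          %d\n", ncnn::cpu_support_riscv_zfh());
    printf("riscv_zvfh:         %d\n", ncnn::cpu_support_riscv_zvfh());
    printf("riscv_xtheadvector: %d\n", ncnn::cpu_support_riscv_xtheadvector());
    printf("riscv_vlenb:        %d\n", ncnn::cpu_riscv_vlenb());
    return 0;
}
```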