diff --git a/src/layer/arm/gemm_arm.cpp b/src/layer/arm/gemm_arm.cpp index 7607d8f523e5..09f25869f43d 100644 --- a/src/layer/arm/gemm_arm.cpp +++ b/src/layer/arm/gemm_arm.cpp @@ -4404,7 +4404,6 @@ int Gemm_arm::forward(const std::vector& bottom_blobs, std::vector& to if (int8_scale_term) { return forward_int8(bottom_blobs, top_blobs, opt); - // return Gemm::forward_int8(bottom_blobs, top_blobs, opt); } #endif diff --git a/src/layer/arm/gemm_int8.h b/src/layer/arm/gemm_int8.h index 68688c863102..652f300b4fd5 100644 --- a/src/layer/arm/gemm_int8.h +++ b/src/layer/arm/gemm_int8.h @@ -1724,8 +1724,8 @@ static void compute_A_tile_fp32_int8_scales(const Mat& A, Mat& scales, float B_s const float v127_B_scale = 127.f * B_scale; - float* ps = scales; - float* pods = out_descales; + float* ps = (float*)scales + i; + float* pods = (float*)out_descales + i; #if __ARM_NEON if (elempack == 4) @@ -1897,8 +1897,8 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i { const float* p0 = (const float*)A + (i + ii) * A_hstep + k * elempack; - float32x4_t _scale0 = vld1q_f32((const float*)scales + ii); - float32x4_t _scale1 = vld1q_f32((const float*)scales + ii + 4); + float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii); + float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4); if (elempack == 4) { @@ -2314,7 +2314,7 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i { const float* p0 = (const float*)A + (i + ii) * A_hstep + k * elempack; - float32x4_t _scale = vld1q_f32((const float*)scales + ii); + float32x4_t _scale = vld1q_f32((const float*)scales + i + ii); if (elempack == 4) { @@ -2592,8 +2592,8 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i { const float* p0 = (const float*)A + (i + ii) * A_hstep + k; - const float scale0 = scales[ii]; - const float scale1 = scales[ii + 1]; + const float scale0 = scales[i + ii]; + const float scale1 = scales[i + ii + 1]; // if (elempack == 1) { @@ -2680,7 +2680,7 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i { const float* p0 = (const float*)A + (i + ii) * A_hstep + k; - const float scale = scales[ii]; + const float scale = scales[i + ii]; // if (elempack == 1) { @@ -2750,8 +2750,8 @@ static void transpose_compute_A_tile_fp32_int8_scales(const Mat& A, Mat& scales, #endif #endif - float* ps = scales; - float* pods = out_descales; + float* ps = (float*)scales + i; + float* pods = (float*)out_descales + i; #if __ARM_NEON if (elempack == 4) @@ -3055,8 +3055,8 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int { const float* p0 = (const float*)A + k * A_hstep + (i + ii) * elempack; - float32x4_t _scale0 = vld1q_f32((const float*)scales + ii); - float32x4_t _scale1 = vld1q_f32((const float*)scales + ii + 4); + float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii); + float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4); if (elempack == 4) { @@ -3396,7 +3396,7 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int { const float* p0 = (const float*)A + k * A_hstep + (i + ii) * elempack; - float32x4_t _scale = vld1q_f32((const float*)scales + ii); + float32x4_t _scale = vld1q_f32((const float*)scales + i + ii); if (elempack == 4) { @@ -3622,8 +3622,8 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int { const float* p0 = (const float*)A + k * A_hstep + (i + ii) * elempack; - const float scale0 = scales[ii]; - const float scale1 = scales[ii + 1]; + const float scale0 = scales[i + ii]; + const float scale1 = scales[i + ii + 1]; #if __ARM_NEON float32x4_t _scale0 = vdupq_n_f32(scale0); @@ -3805,7 +3805,7 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int { const float* p0 = (const float*)A + k * A_hstep + (i + ii) * elempack; - const float scale = scales[ii]; + const float scale = scales[i + ii]; #if __ARM_NEON float32x4_t _scale = vdupq_n_f32(scale); @@ -5646,8 +5646,8 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat& { float* p0 = (float*)top_blob + (i + ii) * out_hstep + j * out_elempack; - float32x4_t _descale0 = vld1q_f32((const float*)descales + ii); - float32x4_t _descale1 = vld1q_f32((const float*)descales + ii + 4); + float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii); + float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4); float32x4_t _c0; float32x4_t _c1; @@ -6593,7 +6593,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat& { float* p0 = (float*)top_blob + (i + ii) * out_hstep + j * out_elempack; - float32x4_t _descale = vld1q_f32((const float*)descales + ii); + float32x4_t _descale = vld1q_f32((const float*)descales + i + ii); float32x4_t _c0; if (pC) @@ -7181,10 +7181,10 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat& // out_elempack == 1 float* p0 = (float*)top_blob + (i + ii) * out_hstep + j; - const float descale0 = descales[ii]; - const float descale1 = descales[ii + 1]; + const float descale0 = descales[i + ii]; + const float descale1 = descales[i + ii + 1]; #if __ARM_NEON - float32x2_t _descale = vld1_f32((const float*)descales + ii); + float32x2_t _descale = vld1_f32((const float*)descales + i + ii); #endif float c0; @@ -7467,7 +7467,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat& // out_elempack == 1 float* p0 = (float*)top_blob + (i + ii) * out_hstep + j; - const float descale = descales[ii]; + const float descale = descales[i + ii]; #if __ARM_NEON float32x4_t _descale = vdupq_n_f32(descale); #endif @@ -7726,8 +7726,8 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma { float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * out_elempack; - float32x4_t _descale0 = vld1q_f32((const float*)descales + ii); - float32x4_t _descale1 = vld1q_f32((const float*)descales + ii + 4); + float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii); + float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4); float32x4_t _c0; float32x4_t _c1; @@ -8673,7 +8673,7 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma { float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * out_elempack; - float32x4_t _descale = vld1q_f32((const float*)descales + ii); + float32x4_t _descale = vld1q_f32((const float*)descales + i + ii); float32x4_t _c0; if (pC) @@ -9237,10 +9237,10 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma { float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * out_elempack; - const float descale0 = descales[ii]; - const float descale1 = descales[ii + 1]; + const float descale0 = descales[i + ii]; + const float descale1 = descales[i + ii + 1]; #if __ARM_NEON - float32x2_t _descale01 = vld1_f32((const float*)descales + ii); + float32x2_t _descale01 = vld1_f32((const float*)descales + i + ii); #endif float c0; @@ -9556,7 +9556,7 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma { float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * out_elempack; - const float descale = descales[ii]; + const float descale = descales[i + ii]; #if __ARM_NEON float32x4_t _descale = vdupq_n_f32(descale); #endif diff --git a/src/layer/arm/gemm_int8_bf16s.h b/src/layer/arm/gemm_int8_bf16s.h index 350f20ab4c0f..a1ad87d51229 100644 --- a/src/layer/arm/gemm_int8_bf16s.h +++ b/src/layer/arm/gemm_int8_bf16s.h @@ -38,8 +38,8 @@ static void compute_A_tile_bf16_int8_scales(const Mat& A, Mat& scales, float B_s const float v127_B_scale = 127.f * B_scale; - float* ps = scales; - float* pods = out_descales; + float* ps = (float*)scales + i; + float* pods = (float*)out_descales + i; #if __ARM_NEON if (elempack == 4) @@ -217,8 +217,8 @@ static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i { const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k * elempack; - float32x4_t _scale0 = vld1q_f32((const float*)scales + ii); - float32x4_t _scale1 = vld1q_f32((const float*)scales + ii + 4); + float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii); + float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4); if (elempack == 4) { @@ -665,7 +665,7 @@ static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i { const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k * elempack; - float32x4_t _scale = vld1q_f32((const float*)scales + ii); + float32x4_t _scale = vld1q_f32((const float*)scales + i + ii); if (elempack == 4) { @@ -958,8 +958,8 @@ static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i { const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k; - const float scale0 = scales[ii]; - const float scale1 = scales[ii + 1]; + const float scale0 = scales[i + ii]; + const float scale1 = scales[i + ii + 1]; // if (elempack == 1) { @@ -1048,7 +1048,7 @@ static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i { const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k; - const float scale = scales[ii]; + const float scale = scales[i + ii]; // if (elempack == 1) { @@ -1121,8 +1121,8 @@ static void transpose_compute_A_tile_bf16_int8_scales(const Mat& A, Mat& scales, #endif #endif - float* ps = scales; - float* pods = out_descales; + float* ps = (float*)scales + i; + float* pods = (float*)out_descales + i; #if __ARM_NEON if (elempack == 4) @@ -1362,8 +1362,8 @@ static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int { const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack; - float32x4_t _scale0 = vld1q_f32((const float*)scales + ii); - float32x4_t _scale1 = vld1q_f32((const float*)scales + ii + 4); + float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii); + float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4); if (elempack == 4) { @@ -1731,7 +1731,7 @@ static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int { const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack; - float32x4_t _scale = vld1q_f32((const float*)scales + ii); + float32x4_t _scale = vld1q_f32((const float*)scales + i + ii); if (elempack == 4) { @@ -1963,8 +1963,8 @@ static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int { const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack; - const float scale0 = scales[ii]; - const float scale1 = scales[ii + 1]; + const float scale0 = scales[i + ii]; + const float scale1 = scales[i + ii + 1]; #if __ARM_NEON float32x4_t _scale0 = vdupq_n_f32(scale0); @@ -2187,7 +2187,7 @@ static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int { const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack; - const float scale = scales[ii]; + const float scale = scales[i + ii]; #if __ARM_NEON float32x4_t _scale = vdupq_n_f32(scale); @@ -4169,8 +4169,8 @@ static void unpack_output_tile_int32_to_bf16(const Mat& topT, const Mat& C, Mat& { unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j * out_elempack; - float32x4_t _descale0 = vld1q_f32((const float*)descales + ii); - float32x4_t _descale1 = vld1q_f32((const float*)descales + ii + 4); + float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii); + float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4); float32x4_t _c0; float32x4_t _c1; @@ -5189,7 +5189,7 @@ static void unpack_output_tile_int32_to_bf16(const Mat& topT, const Mat& C, Mat& { unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j * out_elempack; - float32x4_t _descale = vld1q_f32((const float*)descales + ii); + float32x4_t _descale = vld1q_f32((const float*)descales + i + ii); float32x4_t _c0; if (pC) @@ -5794,10 +5794,10 @@ static void unpack_output_tile_int32_to_bf16(const Mat& topT, const Mat& C, Mat& // out_elempack == 1 unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j; - const float descale0 = descales[ii]; - const float descale1 = descales[ii + 1]; + const float descale0 = descales[i + ii]; + const float descale1 = descales[i + ii + 1]; #if __ARM_NEON - float32x2_t _descale = vld1_f32((const float*)descales + ii); + float32x2_t _descale = vld1_f32((const float*)descales + i + ii); #endif float c0; @@ -6097,7 +6097,7 @@ static void unpack_output_tile_int32_to_bf16(const Mat& topT, const Mat& C, Mat& // out_elempack == 1 unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j; - const float descale = descales[ii]; + const float descale = descales[i + ii]; #if __ARM_NEON float32x4_t _descale = vdupq_n_f32(descale); #endif @@ -6359,8 +6359,8 @@ static void transpose_unpack_output_tile_int32_to_bf16(const Mat& topT, const Ma { unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack; - float32x4_t _descale0 = vld1q_f32((const float*)descales + ii); - float32x4_t _descale1 = vld1q_f32((const float*)descales + ii + 4); + float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii); + float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4); float32x4_t _c0; float32x4_t _c1; @@ -7318,7 +7318,7 @@ static void transpose_unpack_output_tile_int32_to_bf16(const Mat& topT, const Ma { unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack; - float32x4_t _descale = vld1q_f32((const float*)descales + ii); + float32x4_t _descale = vld1q_f32((const float*)descales + i + ii); float32x4_t _c0; if (pC) @@ -7902,10 +7902,10 @@ static void transpose_unpack_output_tile_int32_to_bf16(const Mat& topT, const Ma { unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack; - const float descale0 = descales[ii]; - const float descale1 = descales[ii + 1]; + const float descale0 = descales[i + ii]; + const float descale1 = descales[i + ii + 1]; #if __ARM_NEON - float32x2_t _descale01 = vld1_f32((const float*)descales + ii); + float32x2_t _descale01 = vld1_f32((const float*)descales + i + ii); #endif float c0; @@ -8250,7 +8250,7 @@ static void transpose_unpack_output_tile_int32_to_bf16(const Mat& topT, const Ma { unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack; - const float descale = descales[ii]; + const float descale = descales[i + ii]; #if __ARM_NEON float32x4_t _descale = vdupq_n_f32(descale); #endif diff --git a/src/layer/arm/gemm_int8_fp16s.h b/src/layer/arm/gemm_int8_fp16s.h index e096a6caf6f6..0ea6c389c8b2 100644 --- a/src/layer/arm/gemm_int8_fp16s.h +++ b/src/layer/arm/gemm_int8_fp16s.h @@ -52,8 +52,8 @@ static void compute_A_tile_fp16_int8_scales(const Mat& A, Mat& scales, float B_s const float v127_B_scale = 127.f * B_scale; - float* ps = scales; - float* pods = out_descales; + float* ps = (float*)scales + i; + float* pods = (float*)out_descales + i; #if __ARM_NEON #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC @@ -390,8 +390,8 @@ static void pack_A_tile_fp16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i { const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k * elempack; - float32x4_t _scale0 = vld1q_f32((const float*)scales + ii); - float32x4_t _scale1 = vld1q_f32((const float*)scales + ii + 4); + float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii); + float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4); #if __aarch64__ if (elempack == 8) @@ -1007,7 +1007,7 @@ static void pack_A_tile_fp16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i { const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k * elempack; - float32x4_t _scale = vld1q_f32((const float*)scales + ii); + float32x4_t _scale = vld1q_f32((const float*)scales + i + ii); if (elempack == 4) { @@ -1300,8 +1300,8 @@ static void pack_A_tile_fp16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i { const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k; - const float scale0 = scales[ii]; - const float scale1 = scales[ii + 1]; + const float scale0 = scales[i + ii]; + const float scale1 = scales[i + ii + 1]; // if (elempack == 1) { @@ -1390,7 +1390,7 @@ static void pack_A_tile_fp16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i { const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k; - const float scale = scales[ii]; + const float scale = scales[i + ii]; // if (elempack == 1) { @@ -1471,8 +1471,8 @@ static void transpose_compute_A_tile_fp16_int8_scales(const Mat& A, Mat& scales, #endif #endif - float* ps = scales; - float* pods = out_descales; + float* ps = (float*)scales + i; + float* pods = (float*)out_descales + i; #if __ARM_NEON #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC @@ -2035,8 +2035,8 @@ static void transpose_pack_A_tile_fp16_to_int8(const Mat& A, Mat& AT, int i, int { const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack; - float32x4_t _scale0 = vld1q_f32((const float*)scales + ii); - float32x4_t _scale1 = vld1q_f32((const float*)scales + ii + 4); + float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii); + float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4); #if __aarch64__ if (elempack == 8) @@ -2510,7 +2510,7 @@ static void transpose_pack_A_tile_fp16_to_int8(const Mat& A, Mat& AT, int i, int { const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack; - float32x4_t _scale = vld1q_f32((const float*)scales + ii); + float32x4_t _scale = vld1q_f32((const float*)scales + i + ii); #if __aarch64__ if (elempack == 8) @@ -2803,8 +2803,8 @@ static void transpose_pack_A_tile_fp16_to_int8(const Mat& A, Mat& AT, int i, int { const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack; - const float scale0 = scales[ii]; - const float scale1 = scales[ii + 1]; + const float scale0 = scales[i + ii]; + const float scale1 = scales[i + ii + 1]; #if __ARM_NEON float32x4_t _scale0 = vdupq_n_f32(scale0); @@ -3068,7 +3068,7 @@ static void transpose_pack_A_tile_fp16_to_int8(const Mat& A, Mat& AT, int i, int { const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack; - const float scale = scales[ii]; + const float scale = scales[i + ii]; #if __ARM_NEON float32x4_t _scale = vdupq_n_f32(scale); @@ -5605,8 +5605,8 @@ static void unpack_output_tile_int32_to_fp16(const Mat& topT, const Mat& C, Mat& { unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j * out_elempack; - float32x4_t _descale0 = vld1q_f32((const float*)descales + ii); - float32x4_t _descale1 = vld1q_f32((const float*)descales + ii + 4); + float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii); + float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4); float32x4_t _c0; float32x4_t _c1; @@ -6813,7 +6813,7 @@ static void unpack_output_tile_int32_to_fp16(const Mat& topT, const Mat& C, Mat& { unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j * out_elempack; - float32x4_t _descale = vld1q_f32((const float*)descales + ii); + float32x4_t _descale = vld1q_f32((const float*)descales + i + ii); float32x4_t _c0; if (pC) @@ -7418,10 +7418,10 @@ static void unpack_output_tile_int32_to_fp16(const Mat& topT, const Mat& C, Mat& // out_elempack == 1 unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j; - const float descale0 = descales[ii]; - const float descale1 = descales[ii + 1]; + const float descale0 = descales[i + ii]; + const float descale1 = descales[i + ii + 1]; #if __ARM_NEON - float32x2_t _descale = vld1_f32((const float*)descales + ii); + float32x2_t _descale = vld1_f32((const float*)descales + i + ii); #endif float c0; @@ -7721,7 +7721,7 @@ static void unpack_output_tile_int32_to_fp16(const Mat& topT, const Mat& C, Mat& // out_elempack == 1 unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j; - const float descale = descales[ii]; + const float descale = descales[i + ii]; #if __ARM_NEON float32x4_t _descale = vdupq_n_f32(descale); #endif @@ -7983,8 +7983,8 @@ static void transpose_unpack_output_tile_int32_to_fp16(const Mat& topT, const Ma { unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack; - float32x4_t _descale0 = vld1q_f32((const float*)descales + ii); - float32x4_t _descale1 = vld1q_f32((const float*)descales + ii + 4); + float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii); + float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4); float32x4_t _c0; float32x4_t _c1; @@ -9088,7 +9088,7 @@ static void transpose_unpack_output_tile_int32_to_fp16(const Mat& topT, const Ma { unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack; - float32x4_t _descale = vld1q_f32((const float*)descales + ii); + float32x4_t _descale = vld1q_f32((const float*)descales + i + ii); float32x4_t _c0; if (pC) @@ -9683,10 +9683,10 @@ static void transpose_unpack_output_tile_int32_to_fp16(const Mat& topT, const Ma { unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack; - const float descale0 = descales[ii]; - const float descale1 = descales[ii + 1]; + const float descale0 = descales[i + ii]; + const float descale1 = descales[i + ii + 1]; #if __ARM_NEON - float32x2_t _descale01 = vld1_f32((const float*)descales + ii); + float32x2_t _descale01 = vld1_f32((const float*)descales + i + ii); #endif float c0; @@ -10038,7 +10038,7 @@ static void transpose_unpack_output_tile_int32_to_fp16(const Mat& topT, const Ma { unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack; - const float descale = descales[ii]; + const float descale = descales[i + ii]; #if __ARM_NEON float32x4_t _descale = vdupq_n_f32(descale); #endif diff --git a/tests/test_multiheadattention_1.cpp b/tests/test_multiheadattention_1.cpp index c29930a0be8c..7039b19cc3cb 100644 --- a/tests/test_multiheadattention_1.cpp +++ b/tests/test_multiheadattention_1.cpp @@ -55,7 +55,7 @@ static int test_multiheadattention_int8(const ncnn::Mat& q, const ncnn::Mat& k, as.push_back(RandomMat(k.h, q.h)); } - float epsilon = 0.15; + float epsilon = 0.1; int ret = test_layer("MultiHeadAttention", pd, weights, as, 1, epsilon); if (ret != 0) @@ -98,7 +98,7 @@ static int test_multiheadattention_int8_samekv(const ncnn::Mat& q, const ncnn::M as[0] = q; as[1] = kv; - float epsilon = 0.15; + float epsilon = 0.1; int ret = test_layer("MultiHeadAttention", pd, weights, as, 1, epsilon); if (ret != 0) @@ -139,7 +139,7 @@ static int test_multiheadattention_int8_sameqkv(const ncnn::Mat& a, int embed_di std::vector as(1); as[0] = a; - float epsilon = 0.15; + float epsilon = 0.1; int ret = test_layer("MultiHeadAttention", pd, weights, as, 1, epsilon); if (ret != 0)