Skip to content

Commit

Permalink
fix gemm arm int8 scales descales offset (#5750)
Browse files Browse the repository at this point in the history
  • Loading branch information
nihui authored Oct 21, 2024
1 parent c1f9e95 commit e7602a2
Show file tree
Hide file tree
Showing 5 changed files with 93 additions and 94 deletions.
1 change: 0 additions & 1 deletion src/layer/arm/gemm_arm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4404,7 +4404,6 @@ int Gemm_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
if (int8_scale_term)
{
return forward_int8(bottom_blobs, top_blobs, opt);
// return Gemm::forward_int8(bottom_blobs, top_blobs, opt);
}
#endif

Expand Down
60 changes: 30 additions & 30 deletions src/layer/arm/gemm_int8.h
Original file line number Diff line number Diff line change
Expand Up @@ -1724,8 +1724,8 @@ static void compute_A_tile_fp32_int8_scales(const Mat& A, Mat& scales, float B_s

const float v127_B_scale = 127.f * B_scale;

float* ps = scales;
float* pods = out_descales;
float* ps = (float*)scales + i;
float* pods = (float*)out_descales + i;

#if __ARM_NEON
if (elempack == 4)
Expand Down Expand Up @@ -1897,8 +1897,8 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
{
const float* p0 = (const float*)A + (i + ii) * A_hstep + k * elempack;

float32x4_t _scale0 = vld1q_f32((const float*)scales + ii);
float32x4_t _scale1 = vld1q_f32((const float*)scales + ii + 4);
float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii);
float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4);

if (elempack == 4)
{
Expand Down Expand Up @@ -2314,7 +2314,7 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
{
const float* p0 = (const float*)A + (i + ii) * A_hstep + k * elempack;

float32x4_t _scale = vld1q_f32((const float*)scales + ii);
float32x4_t _scale = vld1q_f32((const float*)scales + i + ii);

if (elempack == 4)
{
Expand Down Expand Up @@ -2592,8 +2592,8 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
{
const float* p0 = (const float*)A + (i + ii) * A_hstep + k;

const float scale0 = scales[ii];
const float scale1 = scales[ii + 1];
const float scale0 = scales[i + ii];
const float scale1 = scales[i + ii + 1];

// if (elempack == 1)
{
Expand Down Expand Up @@ -2680,7 +2680,7 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
{
const float* p0 = (const float*)A + (i + ii) * A_hstep + k;

const float scale = scales[ii];
const float scale = scales[i + ii];

// if (elempack == 1)
{
Expand Down Expand Up @@ -2750,8 +2750,8 @@ static void transpose_compute_A_tile_fp32_int8_scales(const Mat& A, Mat& scales,
#endif
#endif

float* ps = scales;
float* pods = out_descales;
float* ps = (float*)scales + i;
float* pods = (float*)out_descales + i;

#if __ARM_NEON
if (elempack == 4)
Expand Down Expand Up @@ -3055,8 +3055,8 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
{
const float* p0 = (const float*)A + k * A_hstep + (i + ii) * elempack;

float32x4_t _scale0 = vld1q_f32((const float*)scales + ii);
float32x4_t _scale1 = vld1q_f32((const float*)scales + ii + 4);
float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii);
float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4);

if (elempack == 4)
{
Expand Down Expand Up @@ -3396,7 +3396,7 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
{
const float* p0 = (const float*)A + k * A_hstep + (i + ii) * elempack;

float32x4_t _scale = vld1q_f32((const float*)scales + ii);
float32x4_t _scale = vld1q_f32((const float*)scales + i + ii);

if (elempack == 4)
{
Expand Down Expand Up @@ -3622,8 +3622,8 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
{
const float* p0 = (const float*)A + k * A_hstep + (i + ii) * elempack;

const float scale0 = scales[ii];
const float scale1 = scales[ii + 1];
const float scale0 = scales[i + ii];
const float scale1 = scales[i + ii + 1];

#if __ARM_NEON
float32x4_t _scale0 = vdupq_n_f32(scale0);
Expand Down Expand Up @@ -3805,7 +3805,7 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
{
const float* p0 = (const float*)A + k * A_hstep + (i + ii) * elempack;

const float scale = scales[ii];
const float scale = scales[i + ii];

#if __ARM_NEON
float32x4_t _scale = vdupq_n_f32(scale);
Expand Down Expand Up @@ -5646,8 +5646,8 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
{
float* p0 = (float*)top_blob + (i + ii) * out_hstep + j * out_elempack;

float32x4_t _descale0 = vld1q_f32((const float*)descales + ii);
float32x4_t _descale1 = vld1q_f32((const float*)descales + ii + 4);
float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii);
float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4);

float32x4_t _c0;
float32x4_t _c1;
Expand Down Expand Up @@ -6593,7 +6593,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
{
float* p0 = (float*)top_blob + (i + ii) * out_hstep + j * out_elempack;

float32x4_t _descale = vld1q_f32((const float*)descales + ii);
float32x4_t _descale = vld1q_f32((const float*)descales + i + ii);

float32x4_t _c0;
if (pC)
Expand Down Expand Up @@ -7181,10 +7181,10 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
// out_elempack == 1
float* p0 = (float*)top_blob + (i + ii) * out_hstep + j;

const float descale0 = descales[ii];
const float descale1 = descales[ii + 1];
const float descale0 = descales[i + ii];
const float descale1 = descales[i + ii + 1];
#if __ARM_NEON
float32x2_t _descale = vld1_f32((const float*)descales + ii);
float32x2_t _descale = vld1_f32((const float*)descales + i + ii);
#endif

float c0;
Expand Down Expand Up @@ -7467,7 +7467,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
// out_elempack == 1
float* p0 = (float*)top_blob + (i + ii) * out_hstep + j;

const float descale = descales[ii];
const float descale = descales[i + ii];
#if __ARM_NEON
float32x4_t _descale = vdupq_n_f32(descale);
#endif
Expand Down Expand Up @@ -7726,8 +7726,8 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma
{
float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * out_elempack;

float32x4_t _descale0 = vld1q_f32((const float*)descales + ii);
float32x4_t _descale1 = vld1q_f32((const float*)descales + ii + 4);
float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii);
float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4);

float32x4_t _c0;
float32x4_t _c1;
Expand Down Expand Up @@ -8673,7 +8673,7 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma
{
float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * out_elempack;

float32x4_t _descale = vld1q_f32((const float*)descales + ii);
float32x4_t _descale = vld1q_f32((const float*)descales + i + ii);

float32x4_t _c0;
if (pC)
Expand Down Expand Up @@ -9237,10 +9237,10 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma
{
float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * out_elempack;

const float descale0 = descales[ii];
const float descale1 = descales[ii + 1];
const float descale0 = descales[i + ii];
const float descale1 = descales[i + ii + 1];
#if __ARM_NEON
float32x2_t _descale01 = vld1_f32((const float*)descales + ii);
float32x2_t _descale01 = vld1_f32((const float*)descales + i + ii);
#endif

float c0;
Expand Down Expand Up @@ -9556,7 +9556,7 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma
{
float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * out_elempack;

const float descale = descales[ii];
const float descale = descales[i + ii];
#if __ARM_NEON
float32x4_t _descale = vdupq_n_f32(descale);
#endif
Expand Down
60 changes: 30 additions & 30 deletions src/layer/arm/gemm_int8_bf16s.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ static void compute_A_tile_bf16_int8_scales(const Mat& A, Mat& scales, float B_s

const float v127_B_scale = 127.f * B_scale;

float* ps = scales;
float* pods = out_descales;
float* ps = (float*)scales + i;
float* pods = (float*)out_descales + i;

#if __ARM_NEON
if (elempack == 4)
Expand Down Expand Up @@ -217,8 +217,8 @@ static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
{
const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k * elempack;

float32x4_t _scale0 = vld1q_f32((const float*)scales + ii);
float32x4_t _scale1 = vld1q_f32((const float*)scales + ii + 4);
float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii);
float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4);

if (elempack == 4)
{
Expand Down Expand Up @@ -665,7 +665,7 @@ static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
{
const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k * elempack;

float32x4_t _scale = vld1q_f32((const float*)scales + ii);
float32x4_t _scale = vld1q_f32((const float*)scales + i + ii);

if (elempack == 4)
{
Expand Down Expand Up @@ -958,8 +958,8 @@ static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
{
const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k;

const float scale0 = scales[ii];
const float scale1 = scales[ii + 1];
const float scale0 = scales[i + ii];
const float scale1 = scales[i + ii + 1];

// if (elempack == 1)
{
Expand Down Expand Up @@ -1048,7 +1048,7 @@ static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
{
const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k;

const float scale = scales[ii];
const float scale = scales[i + ii];

// if (elempack == 1)
{
Expand Down Expand Up @@ -1121,8 +1121,8 @@ static void transpose_compute_A_tile_bf16_int8_scales(const Mat& A, Mat& scales,
#endif
#endif

float* ps = scales;
float* pods = out_descales;
float* ps = (float*)scales + i;
float* pods = (float*)out_descales + i;

#if __ARM_NEON
if (elempack == 4)
Expand Down Expand Up @@ -1362,8 +1362,8 @@ static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int
{
const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack;

float32x4_t _scale0 = vld1q_f32((const float*)scales + ii);
float32x4_t _scale1 = vld1q_f32((const float*)scales + ii + 4);
float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii);
float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4);

if (elempack == 4)
{
Expand Down Expand Up @@ -1731,7 +1731,7 @@ static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int
{
const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack;

float32x4_t _scale = vld1q_f32((const float*)scales + ii);
float32x4_t _scale = vld1q_f32((const float*)scales + i + ii);

if (elempack == 4)
{
Expand Down Expand Up @@ -1963,8 +1963,8 @@ static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int
{
const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack;

const float scale0 = scales[ii];
const float scale1 = scales[ii + 1];
const float scale0 = scales[i + ii];
const float scale1 = scales[i + ii + 1];

#if __ARM_NEON
float32x4_t _scale0 = vdupq_n_f32(scale0);
Expand Down Expand Up @@ -2187,7 +2187,7 @@ static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int
{
const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack;

const float scale = scales[ii];
const float scale = scales[i + ii];

#if __ARM_NEON
float32x4_t _scale = vdupq_n_f32(scale);
Expand Down Expand Up @@ -4169,8 +4169,8 @@ static void unpack_output_tile_int32_to_bf16(const Mat& topT, const Mat& C, Mat&
{
unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j * out_elempack;

float32x4_t _descale0 = vld1q_f32((const float*)descales + ii);
float32x4_t _descale1 = vld1q_f32((const float*)descales + ii + 4);
float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii);
float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4);

float32x4_t _c0;
float32x4_t _c1;
Expand Down Expand Up @@ -5189,7 +5189,7 @@ static void unpack_output_tile_int32_to_bf16(const Mat& topT, const Mat& C, Mat&
{
unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j * out_elempack;

float32x4_t _descale = vld1q_f32((const float*)descales + ii);
float32x4_t _descale = vld1q_f32((const float*)descales + i + ii);

float32x4_t _c0;
if (pC)
Expand Down Expand Up @@ -5794,10 +5794,10 @@ static void unpack_output_tile_int32_to_bf16(const Mat& topT, const Mat& C, Mat&
// out_elempack == 1
unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j;

const float descale0 = descales[ii];
const float descale1 = descales[ii + 1];
const float descale0 = descales[i + ii];
const float descale1 = descales[i + ii + 1];
#if __ARM_NEON
float32x2_t _descale = vld1_f32((const float*)descales + ii);
float32x2_t _descale = vld1_f32((const float*)descales + i + ii);
#endif

float c0;
Expand Down Expand Up @@ -6097,7 +6097,7 @@ static void unpack_output_tile_int32_to_bf16(const Mat& topT, const Mat& C, Mat&
// out_elempack == 1
unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j;

const float descale = descales[ii];
const float descale = descales[i + ii];
#if __ARM_NEON
float32x4_t _descale = vdupq_n_f32(descale);
#endif
Expand Down Expand Up @@ -6359,8 +6359,8 @@ static void transpose_unpack_output_tile_int32_to_bf16(const Mat& topT, const Ma
{
unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack;

float32x4_t _descale0 = vld1q_f32((const float*)descales + ii);
float32x4_t _descale1 = vld1q_f32((const float*)descales + ii + 4);
float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii);
float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4);

float32x4_t _c0;
float32x4_t _c1;
Expand Down Expand Up @@ -7318,7 +7318,7 @@ static void transpose_unpack_output_tile_int32_to_bf16(const Mat& topT, const Ma
{
unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack;

float32x4_t _descale = vld1q_f32((const float*)descales + ii);
float32x4_t _descale = vld1q_f32((const float*)descales + i + ii);

float32x4_t _c0;
if (pC)
Expand Down Expand Up @@ -7902,10 +7902,10 @@ static void transpose_unpack_output_tile_int32_to_bf16(const Mat& topT, const Ma
{
unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack;

const float descale0 = descales[ii];
const float descale1 = descales[ii + 1];
const float descale0 = descales[i + ii];
const float descale1 = descales[i + ii + 1];
#if __ARM_NEON
float32x2_t _descale01 = vld1_f32((const float*)descales + ii);
float32x2_t _descale01 = vld1_f32((const float*)descales + i + ii);
#endif

float c0;
Expand Down Expand Up @@ -8250,7 +8250,7 @@ static void transpose_unpack_output_tile_int32_to_bf16(const Mat& topT, const Ma
{
unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack;

const float descale = descales[ii];
const float descale = descales[i + ii];
#if __ARM_NEON
float32x4_t _descale = vdupq_n_f32(descale);
#endif
Expand Down
Loading

0 comments on commit e7602a2

Please sign in to comment.