Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix gemm arm int8 scales descales offset #5750

Merged
merged 2 commits into from
Oct 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion src/layer/arm/gemm_arm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4404,7 +4404,6 @@ int Gemm_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& to
if (int8_scale_term)
{
return forward_int8(bottom_blobs, top_blobs, opt);
// return Gemm::forward_int8(bottom_blobs, top_blobs, opt);
}
#endif

Expand Down
60 changes: 30 additions & 30 deletions src/layer/arm/gemm_int8.h
Original file line number Diff line number Diff line change
Expand Up @@ -1724,8 +1724,8 @@ static void compute_A_tile_fp32_int8_scales(const Mat& A, Mat& scales, float B_s

const float v127_B_scale = 127.f * B_scale;

float* ps = scales;
float* pods = out_descales;
float* ps = (float*)scales + i;
float* pods = (float*)out_descales + i;

#if __ARM_NEON
if (elempack == 4)
Expand Down Expand Up @@ -1897,8 +1897,8 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
{
const float* p0 = (const float*)A + (i + ii) * A_hstep + k * elempack;

float32x4_t _scale0 = vld1q_f32((const float*)scales + ii);
float32x4_t _scale1 = vld1q_f32((const float*)scales + ii + 4);
float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii);
float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4);

if (elempack == 4)
{
Expand Down Expand Up @@ -2314,7 +2314,7 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
{
const float* p0 = (const float*)A + (i + ii) * A_hstep + k * elempack;

float32x4_t _scale = vld1q_f32((const float*)scales + ii);
float32x4_t _scale = vld1q_f32((const float*)scales + i + ii);

if (elempack == 4)
{
Expand Down Expand Up @@ -2592,8 +2592,8 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
{
const float* p0 = (const float*)A + (i + ii) * A_hstep + k;

const float scale0 = scales[ii];
const float scale1 = scales[ii + 1];
const float scale0 = scales[i + ii];
const float scale1 = scales[i + ii + 1];

// if (elempack == 1)
{
Expand Down Expand Up @@ -2680,7 +2680,7 @@ static void pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
{
const float* p0 = (const float*)A + (i + ii) * A_hstep + k;

const float scale = scales[ii];
const float scale = scales[i + ii];

// if (elempack == 1)
{
Expand Down Expand Up @@ -2750,8 +2750,8 @@ static void transpose_compute_A_tile_fp32_int8_scales(const Mat& A, Mat& scales,
#endif
#endif

float* ps = scales;
float* pods = out_descales;
float* ps = (float*)scales + i;
float* pods = (float*)out_descales + i;

#if __ARM_NEON
if (elempack == 4)
Expand Down Expand Up @@ -3055,8 +3055,8 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
{
const float* p0 = (const float*)A + k * A_hstep + (i + ii) * elempack;

float32x4_t _scale0 = vld1q_f32((const float*)scales + ii);
float32x4_t _scale1 = vld1q_f32((const float*)scales + ii + 4);
float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii);
float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4);

if (elempack == 4)
{
Expand Down Expand Up @@ -3396,7 +3396,7 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
{
const float* p0 = (const float*)A + k * A_hstep + (i + ii) * elempack;

float32x4_t _scale = vld1q_f32((const float*)scales + ii);
float32x4_t _scale = vld1q_f32((const float*)scales + i + ii);

if (elempack == 4)
{
Expand Down Expand Up @@ -3622,8 +3622,8 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
{
const float* p0 = (const float*)A + k * A_hstep + (i + ii) * elempack;

const float scale0 = scales[ii];
const float scale1 = scales[ii + 1];
const float scale0 = scales[i + ii];
const float scale1 = scales[i + ii + 1];

#if __ARM_NEON
float32x4_t _scale0 = vdupq_n_f32(scale0);
Expand Down Expand Up @@ -3805,7 +3805,7 @@ static void transpose_pack_A_tile_fp32_to_int8(const Mat& A, Mat& AT, int i, int
{
const float* p0 = (const float*)A + k * A_hstep + (i + ii) * elempack;

const float scale = scales[ii];
const float scale = scales[i + ii];

#if __ARM_NEON
float32x4_t _scale = vdupq_n_f32(scale);
Expand Down Expand Up @@ -5646,8 +5646,8 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
{
float* p0 = (float*)top_blob + (i + ii) * out_hstep + j * out_elempack;

float32x4_t _descale0 = vld1q_f32((const float*)descales + ii);
float32x4_t _descale1 = vld1q_f32((const float*)descales + ii + 4);
float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii);
float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4);

float32x4_t _c0;
float32x4_t _c1;
Expand Down Expand Up @@ -6593,7 +6593,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
{
float* p0 = (float*)top_blob + (i + ii) * out_hstep + j * out_elempack;

float32x4_t _descale = vld1q_f32((const float*)descales + ii);
float32x4_t _descale = vld1q_f32((const float*)descales + i + ii);

float32x4_t _c0;
if (pC)
Expand Down Expand Up @@ -7181,10 +7181,10 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
// out_elempack == 1
float* p0 = (float*)top_blob + (i + ii) * out_hstep + j;

const float descale0 = descales[ii];
const float descale1 = descales[ii + 1];
const float descale0 = descales[i + ii];
const float descale1 = descales[i + ii + 1];
#if __ARM_NEON
float32x2_t _descale = vld1_f32((const float*)descales + ii);
float32x2_t _descale = vld1_f32((const float*)descales + i + ii);
#endif

float c0;
Expand Down Expand Up @@ -7467,7 +7467,7 @@ static void unpack_output_tile_int32_to_fp32(const Mat& topT, const Mat& C, Mat&
// out_elempack == 1
float* p0 = (float*)top_blob + (i + ii) * out_hstep + j;

const float descale = descales[ii];
const float descale = descales[i + ii];
#if __ARM_NEON
float32x4_t _descale = vdupq_n_f32(descale);
#endif
Expand Down Expand Up @@ -7726,8 +7726,8 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma
{
float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * out_elempack;

float32x4_t _descale0 = vld1q_f32((const float*)descales + ii);
float32x4_t _descale1 = vld1q_f32((const float*)descales + ii + 4);
float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii);
float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4);

float32x4_t _c0;
float32x4_t _c1;
Expand Down Expand Up @@ -8673,7 +8673,7 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma
{
float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * out_elempack;

float32x4_t _descale = vld1q_f32((const float*)descales + ii);
float32x4_t _descale = vld1q_f32((const float*)descales + i + ii);

float32x4_t _c0;
if (pC)
Expand Down Expand Up @@ -9237,10 +9237,10 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma
{
float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * out_elempack;

const float descale0 = descales[ii];
const float descale1 = descales[ii + 1];
const float descale0 = descales[i + ii];
const float descale1 = descales[i + ii + 1];
#if __ARM_NEON
float32x2_t _descale01 = vld1_f32((const float*)descales + ii);
float32x2_t _descale01 = vld1_f32((const float*)descales + i + ii);
#endif

float c0;
Expand Down Expand Up @@ -9556,7 +9556,7 @@ static void transpose_unpack_output_tile_int32_to_fp32(const Mat& topT, const Ma
{
float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * out_elempack;

const float descale = descales[ii];
const float descale = descales[i + ii];
#if __ARM_NEON
float32x4_t _descale = vdupq_n_f32(descale);
#endif
Expand Down
60 changes: 30 additions & 30 deletions src/layer/arm/gemm_int8_bf16s.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ static void compute_A_tile_bf16_int8_scales(const Mat& A, Mat& scales, float B_s

const float v127_B_scale = 127.f * B_scale;

float* ps = scales;
float* pods = out_descales;
float* ps = (float*)scales + i;
float* pods = (float*)out_descales + i;

#if __ARM_NEON
if (elempack == 4)
Expand Down Expand Up @@ -217,8 +217,8 @@ static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
{
const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k * elempack;

float32x4_t _scale0 = vld1q_f32((const float*)scales + ii);
float32x4_t _scale1 = vld1q_f32((const float*)scales + ii + 4);
float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii);
float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4);

if (elempack == 4)
{
Expand Down Expand Up @@ -665,7 +665,7 @@ static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
{
const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k * elempack;

float32x4_t _scale = vld1q_f32((const float*)scales + ii);
float32x4_t _scale = vld1q_f32((const float*)scales + i + ii);

if (elempack == 4)
{
Expand Down Expand Up @@ -958,8 +958,8 @@ static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
{
const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k;

const float scale0 = scales[ii];
const float scale1 = scales[ii + 1];
const float scale0 = scales[i + ii];
const float scale1 = scales[i + ii + 1];

// if (elempack == 1)
{
Expand Down Expand Up @@ -1048,7 +1048,7 @@ static void pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int max_ii, i
{
const unsigned short* p0 = (const unsigned short*)A + (i + ii) * A_hstep + k;

const float scale = scales[ii];
const float scale = scales[i + ii];

// if (elempack == 1)
{
Expand Down Expand Up @@ -1121,8 +1121,8 @@ static void transpose_compute_A_tile_bf16_int8_scales(const Mat& A, Mat& scales,
#endif
#endif

float* ps = scales;
float* pods = out_descales;
float* ps = (float*)scales + i;
float* pods = (float*)out_descales + i;

#if __ARM_NEON
if (elempack == 4)
Expand Down Expand Up @@ -1362,8 +1362,8 @@ static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int
{
const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack;

float32x4_t _scale0 = vld1q_f32((const float*)scales + ii);
float32x4_t _scale1 = vld1q_f32((const float*)scales + ii + 4);
float32x4_t _scale0 = vld1q_f32((const float*)scales + i + ii);
float32x4_t _scale1 = vld1q_f32((const float*)scales + i + ii + 4);

if (elempack == 4)
{
Expand Down Expand Up @@ -1731,7 +1731,7 @@ static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int
{
const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack;

float32x4_t _scale = vld1q_f32((const float*)scales + ii);
float32x4_t _scale = vld1q_f32((const float*)scales + i + ii);

if (elempack == 4)
{
Expand Down Expand Up @@ -1963,8 +1963,8 @@ static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int
{
const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack;

const float scale0 = scales[ii];
const float scale1 = scales[ii + 1];
const float scale0 = scales[i + ii];
const float scale1 = scales[i + ii + 1];

#if __ARM_NEON
float32x4_t _scale0 = vdupq_n_f32(scale0);
Expand Down Expand Up @@ -2187,7 +2187,7 @@ static void transpose_pack_A_tile_bf16_to_int8(const Mat& A, Mat& AT, int i, int
{
const unsigned short* p0 = (const unsigned short*)A + k * A_hstep + (i + ii) * elempack;

const float scale = scales[ii];
const float scale = scales[i + ii];

#if __ARM_NEON
float32x4_t _scale = vdupq_n_f32(scale);
Expand Down Expand Up @@ -4169,8 +4169,8 @@ static void unpack_output_tile_int32_to_bf16(const Mat& topT, const Mat& C, Mat&
{
unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j * out_elempack;

float32x4_t _descale0 = vld1q_f32((const float*)descales + ii);
float32x4_t _descale1 = vld1q_f32((const float*)descales + ii + 4);
float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii);
float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4);

float32x4_t _c0;
float32x4_t _c1;
Expand Down Expand Up @@ -5189,7 +5189,7 @@ static void unpack_output_tile_int32_to_bf16(const Mat& topT, const Mat& C, Mat&
{
unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j * out_elempack;

float32x4_t _descale = vld1q_f32((const float*)descales + ii);
float32x4_t _descale = vld1q_f32((const float*)descales + i + ii);

float32x4_t _c0;
if (pC)
Expand Down Expand Up @@ -5794,10 +5794,10 @@ static void unpack_output_tile_int32_to_bf16(const Mat& topT, const Mat& C, Mat&
// out_elempack == 1
unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j;

const float descale0 = descales[ii];
const float descale1 = descales[ii + 1];
const float descale0 = descales[i + ii];
const float descale1 = descales[i + ii + 1];
#if __ARM_NEON
float32x2_t _descale = vld1_f32((const float*)descales + ii);
float32x2_t _descale = vld1_f32((const float*)descales + i + ii);
#endif

float c0;
Expand Down Expand Up @@ -6097,7 +6097,7 @@ static void unpack_output_tile_int32_to_bf16(const Mat& topT, const Mat& C, Mat&
// out_elempack == 1
unsigned short* p0 = (unsigned short*)top_blob + (i + ii) * out_hstep + j;

const float descale = descales[ii];
const float descale = descales[i + ii];
#if __ARM_NEON
float32x4_t _descale = vdupq_n_f32(descale);
#endif
Expand Down Expand Up @@ -6359,8 +6359,8 @@ static void transpose_unpack_output_tile_int32_to_bf16(const Mat& topT, const Ma
{
unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack;

float32x4_t _descale0 = vld1q_f32((const float*)descales + ii);
float32x4_t _descale1 = vld1q_f32((const float*)descales + ii + 4);
float32x4_t _descale0 = vld1q_f32((const float*)descales + i + ii);
float32x4_t _descale1 = vld1q_f32((const float*)descales + i + ii + 4);

float32x4_t _c0;
float32x4_t _c1;
Expand Down Expand Up @@ -7318,7 +7318,7 @@ static void transpose_unpack_output_tile_int32_to_bf16(const Mat& topT, const Ma
{
unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack;

float32x4_t _descale = vld1q_f32((const float*)descales + ii);
float32x4_t _descale = vld1q_f32((const float*)descales + i + ii);

float32x4_t _c0;
if (pC)
Expand Down Expand Up @@ -7902,10 +7902,10 @@ static void transpose_unpack_output_tile_int32_to_bf16(const Mat& topT, const Ma
{
unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack;

const float descale0 = descales[ii];
const float descale1 = descales[ii + 1];
const float descale0 = descales[i + ii];
const float descale1 = descales[i + ii + 1];
#if __ARM_NEON
float32x2_t _descale01 = vld1_f32((const float*)descales + ii);
float32x2_t _descale01 = vld1_f32((const float*)descales + i + ii);
#endif

float c0;
Expand Down Expand Up @@ -8250,7 +8250,7 @@ static void transpose_unpack_output_tile_int32_to_bf16(const Mat& topT, const Ma
{
unsigned short* p0 = (unsigned short*)top_blob + j * out_hstep + (i + ii) * out_elempack;

const float descale = descales[ii];
const float descale = descales[i + ii];
#if __ARM_NEON
float32x4_t _descale = vdupq_n_f32(descale);
#endif
Expand Down
Loading
Loading