Skip to content

Commit

Permalink
Conv 1xN use im2col for padding (ARM-software#123)
Browse files Browse the repository at this point in the history
Co-authored-by: Felix Johnny Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>
  • Loading branch information
mansnils and felix-johnny authored Mar 28, 2024
1 parent 7d69e99 commit 2ef559d
Show file tree
Hide file tree
Showing 21 changed files with 2,032 additions and 61 deletions.
132 changes: 74 additions & 58 deletions Source/ConvolutionFunctions/arm_convolve_1_x_n_s8.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_convolve_1_x_n_s8.c
* Description: s8 version of 1xN convolution using symmetric quantization.
*
* $Date: 20 February 2024
* $Revision: V.3.5.1
* $Date: 19 March 2024
* $Revision: V.3.6.0
*
* Target : Arm(R) M-Profile Architecture
*
Expand All @@ -45,7 +45,6 @@
* Refer header file for details.
*
*/

arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
const cmsis_nn_conv_params *conv_params,
const cmsis_nn_per_channel_quant_params *quant_params,
Expand All @@ -59,24 +58,23 @@ arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
int8_t *output_data)
{
arm_cmsis_nn_status status = ARM_CMSIS_NN_SUCCESS;
int32_t buffer_size = arm_convolve_1_x_n_s8_get_buffer_size(conv_params, input_dims, filter_dims, output_dims);

/* The wrapper API is the ultimate reference for argument check */
if ((input_dims->h != 1) || conv_params->dilation.w != 1 || (buffer_size != 0 && ctx->buf == NULL) ||
conv_params->stride.w == 0 || (conv_params->stride.w * input_dims->c % 4 != 0))
if ((input_dims->h != 1) || conv_params->dilation.w != 1 || ctx->buf == NULL || conv_params->stride.w == 0 ||
(conv_params->stride.w * input_dims->c % 4 != 0))
{
status = ARM_CMSIS_NN_ARG_ERROR;
goto out;
return ARM_CMSIS_NN_ARG_ERROR;
}

#if defined(ARM_MATH_MVEI)
(void)bias_dims;
const uint16_t input_x = input_dims->w;
const uint16_t kernel_x = filter_dims->w;
const uint16_t output_x = output_dims->w;
const uint16_t output_ch = output_dims->c;
const uint16_t input_ch = input_dims->c;
const uint16_t pad_x = conv_params->padding.w;
const uint16_t stride_x = conv_params->stride.w;

const int32_t input_x = input_dims->w;
const int32_t kernel_x = filter_dims->w;
const int32_t output_x = output_dims->w;
const int32_t input_ch = input_dims->c;
const int32_t pad_x = conv_params->padding.w;
const int32_t stride_x = conv_params->stride.w;

// Total pad for dilation of 1
const int32_t total_pad = ((output_x - 1) * stride_x + kernel_x - input_x);
Expand All @@ -91,6 +89,13 @@ arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
const int32_t left_pad_num = pad_x != 0 ? MAX(1, (pad_x + stride_x - 1) / stride_x) : 0;
const int32_t no_pad_num = MAX(output_x - (right_pad_num + left_pad_num), 0);

const int32_t pad_size_left = pad_x * input_ch;
const int32_t pad_size_right = asym_pad ? right_pad_num * input_ch : pad_size_left;

const int32_t rhs_cols = kernel_x * input_ch;
const int32_t rhs_rows = output_dims->c;
const int32_t lhs_offset = input_ch * stride_x;

if (right_pad_num + no_pad_num + left_pad_num != output_x)
{
return arm_convolve_s8(ctx,
Expand All @@ -106,43 +111,46 @@ arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
output_data);
}

const uint32_t num_elem_left = kernel_x * input_ch;
const uint32_t num_elem_right = num_elem_left - input_ch;

for (int i_batch = 0; i_batch < input_dims->n; i_batch++)
{
// Handle left padded sections
/* Handle left padded sections */
int32_t lhs_rows = left_pad_num;
const int32_t rhs_cols = kernel_x * input_dims->c;
const int32_t rhs_rows = output_dims->c;
const int32_t lhs_offset = input_ch * stride_x;
int8_t *im2col = ctx->buf;

int32_t out_idx = 0;
arm_memset_s8(im2col, (int8_t)-conv_params->input_offset, sizeof(int8_t) * (uint32_t)pad_size_left);
im2col += pad_size_left;
arm_memcpy_s8(im2col, input_data, sizeof(int8_t) * num_elem_left);

for (int i = 0; i < lhs_rows; i++)
{
const int32_t est_input_x_idx = stride_x * i - pad_x;
const int32_t ker_begin_idx = -est_input_x_idx;

const int32_t actual_kernel_len = kernel_x - ker_begin_idx;

status = arm_nn_mat_mul_core_1x_s8(actual_kernel_len * input_ch,
ker_begin_idx * input_ch,
input_data,
filter_data + (ker_begin_idx * input_ch),
output_ch,
conv_params,
quant_params,
bias_data,
output_data);
output_data += output_ch;
}
arm_nn_mat_mult_nt_t_s8((int8_t *)ctx->buf,
filter_data,
bias_data,
output_data,
quant_params->multiplier,
quant_params->shift,
lhs_rows,
rhs_rows,
rhs_cols,
conv_params->input_offset,
conv_params->output_offset,
conv_params->activation.min,
conv_params->activation.max,
rhs_rows,
lhs_offset);

out_idx += lhs_rows;
output_data += lhs_rows * rhs_rows;

/* Non padded elements */
int32_t out_idx = lhs_rows;
int32_t input_start = stride_x * lhs_rows - pad_x;

if (input_start < 0)
{
return ARM_CMSIS_NN_FAILURE;
}
/* Non padded elements */

input_start *= input_ch;
lhs_rows = no_pad_num;

Expand All @@ -163,32 +171,41 @@ arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,
lhs_offset);

output_data += lhs_rows * rhs_rows;
out_idx += lhs_rows;

/* Right padded elements */
out_idx += lhs_rows;
lhs_rows = output_x - out_idx;

if (lhs_rows < 0)
{
return ARM_CMSIS_NN_FAILURE;
}

for (int i = out_idx; i < output_x; i++)
{
const int32_t est_input_x_idx = stride_x * i - pad_x;
const int32_t ker_end_idx = MIN(kernel_x, input_x - est_input_x_idx);

status = arm_nn_mat_mul_core_1x_s8(ker_end_idx * input_ch,
(kernel_x - ker_end_idx) * input_ch,
input_data + est_input_x_idx * input_ch,
filter_data,
output_ch,
conv_params,
quant_params,
bias_data,
output_data);
output_data += output_ch;
}
im2col = ctx->buf;
input_start = (stride_x * (left_pad_num + no_pad_num) - pad_x) * input_ch;

arm_memcpy_s8(im2col, input_data + input_start, sizeof(int8_t) * num_elem_right);
im2col += num_elem_right;
arm_memset_s8(im2col, (int8_t)-conv_params->input_offset, sizeof(int8_t) * (uint32_t)pad_size_right);

arm_nn_mat_mult_nt_t_s8((int8_t *)ctx->buf,
filter_data,
bias_data,
output_data,
quant_params->multiplier,
quant_params->shift,
lhs_rows,
rhs_rows,
rhs_cols,
conv_params->input_offset,
conv_params->output_offset,
conv_params->activation.min,
conv_params->activation.max,
rhs_rows,
lhs_offset);

output_data += lhs_rows * rhs_rows;

/* Advance to the next batch */
input_data += (input_x * input_ch);
}
Expand All @@ -207,7 +224,6 @@ arm_cmsis_nn_status arm_convolve_1_x_n_s8(const cmsis_nn_context *ctx,

#endif

out:
/* Return to application */
return status;
}
Expand Down
12 changes: 9 additions & 3 deletions Source/ConvolutionFunctions/arm_convolve_get_buffer_sizes_s8.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_convolve_get_buffer_sizes_s8.c
* Description: Collection of get buffer size functions for the various s8 convolution layer functions.
*
* $Date: 27 February 2024
* $Revision: V.2.0.1
* $Date: 14 March 2024
* $Revision: V.2.1.0
*
* Target : Arm(R) M-Profile Architecture
*
Expand Down Expand Up @@ -74,7 +74,13 @@ __STATIC_INLINE int32_t arm_convolve_1_x_n_s8_get_buffer_size_mve(const cmsis_nn
return arm_convolve_s8_get_buffer_size_mve(input_dims, filter_dims);
}

return 0;
const int32_t pad_size_left = pad_x * input_dims->c;
const int32_t pad_size_right = asym_pad ? right_pad_num * input_dims->c : pad_size_left;
const int32_t num_elem_left = kernel_x * input_dims->c;
const int32_t num_elem_right = num_elem_left - input_dims->c;
const int32_t size_1_x_n = MAX(num_elem_left + pad_size_left, num_elem_right + pad_size_right);

return size_1_x_n;
}

int32_t arm_convolve_s8_get_buffer_size(const cmsis_nn_dims *input_dims, const cmsis_nn_dims *filter_dims)
Expand Down
7 changes: 7 additions & 0 deletions Tests/UnitTest/TestCases/TestData/conv_1_x_n_7/biases_data.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
// Generated by test_settings.py using tensorflow version 2.15.0 (Keras version 2.15.0).
// Interpreter from tensorflow version 2.15.0 and revision v2.15.0-2-g0b15fdfcb3f.
#pragma once
#include <stdint.h>

// Bias table for the conv_1_x_n_7 test case: 15 signed 32-bit values.
// NOTE(review): the element count equals CONV_1_X_N_7_OUT_CH (15) in the companion
// config_data.h, so this is presumably one bias per output channel - confirm against
// the test that consumes it.
// NOTE(review): this is a definition with external linkage placed in a header, so it
// looks like the header is meant to be included from a single translation unit - confirm.
const int32_t conv_1_x_n_7_biases[15] =
{16695, -19618, -5834, 17091, -10157, 12429, -21448, 22243, -25922, 21908, -424, -16337, -19201, -1489, -24054};
24 changes: 24 additions & 0 deletions Tests/UnitTest/TestCases/TestData/conv_1_x_n_7/config_data.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Generated by test_settings.py using tensorflow version 2.15.0 (Keras version 2.15.0).
// Interpreter from tensorflow version 2.15.0 and revision v2.15.0-2-g0b15fdfcb3f.
#pragma once
// Configuration constants for the conv_1_x_n_7 test case.
// Channel counts and input extent (the input is a single row: INPUT_H is 1).
#define CONV_1_X_N_7_OUT_CH 15
#define CONV_1_X_N_7_IN_CH 20
#define CONV_1_X_N_7_INPUT_W 148
#define CONV_1_X_N_7_INPUT_H 1
// Element counts. DST_SIZE equals INPUT_BATCHES * OUTPUT_W * OUTPUT_H * OUT_CH
// (2 * 148 * 1 * 15 = 4440), whereas INPUT_SIZE equals INPUT_W * INPUT_H * IN_CH
// (148 * 1 * 20 = 2960) with no batch factor.
// NOTE(review): INPUT_SIZE therefore looks like a per-batch count while DST_SIZE
// covers all batches - confirm how the test uses each before relying on them.
#define CONV_1_X_N_7_DST_SIZE 4440
#define CONV_1_X_N_7_INPUT_SIZE 2960
// Bounds applied to the output activation values.
#define CONV_1_X_N_7_OUT_ACTIVATION_MIN -127
#define CONV_1_X_N_7_OUT_ACTIVATION_MAX 127
#define CONV_1_X_N_7_INPUT_BATCHES 2
// Filter extent (1 row, 32 columns), strides and padding.
// With these values (OUTPUT_W - 1) * STRIDE_X + FILTER_X - INPUT_W = 31, so a left
// pad of 15 leaves 16 for the right side.
// NOTE(review): presumably this case is meant to exercise asymmetric padding - confirm.
#define CONV_1_X_N_7_FILTER_X 32
#define CONV_1_X_N_7_FILTER_Y 1
#define CONV_1_X_N_7_STRIDE_X 1
#define CONV_1_X_N_7_STRIDE_Y 1
#define CONV_1_X_N_7_PAD_X 15
#define CONV_1_X_N_7_PAD_Y 0
// Output extent; the width matches INPUT_W (148).
#define CONV_1_X_N_7_OUTPUT_W 148
#define CONV_1_X_N_7_OUTPUT_H 1
// NOTE(review): presumably the quantization zero-point offsets handed to the kernel
// through its convolution parameters - confirm against the test harness.
#define CONV_1_X_N_7_INPUT_OFFSET 128
#define CONV_1_X_N_7_OUTPUT_OFFSET -4
// Dilation is 1 in both dimensions, i.e. no dilation.
#define CONV_1_X_N_7_DILATION_X 1
#define CONV_1_X_N_7_DILATION_Y 1
Loading

0 comments on commit 2ef559d

Please sign in to comment.