Commit

wip
nihui committed Oct 31, 2023
1 parent c78bb2b commit 5002853
Showing 16 changed files with 996 additions and 0 deletions.
143 changes: 143 additions & 0 deletions src/layer/arm/deconvolution_arm.cpp
@@ -49,6 +49,9 @@ Deconvolution_arm::Deconvolution_arm()

int Deconvolution_arm::create_pipeline(const Option& opt)
{
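// with dynamic weight the kernel arrives as an extra bottom blob at inference time, so there is nothing to pre-pack here (see the multi-blob forward below)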
if (dynamic_weight)
return 0;

activation = create_activation_layer(activation_type, activation_params, opt);

#if NCNN_ARM82
@@ -750,6 +753,146 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
return 0;
}

int Deconvolution_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
const Mat& bottom_blob = bottom_blobs[0];
const Mat& _weight_data = bottom_blobs[1];
Mat& top_blob = top_blobs[0];

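// kernel geometry and channel counts come from the runtime blobs: w/h of the weight blob give the kernel size, d gives the number of output channels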
const int _num_input = bottom_blob.c * bottom_blob.elempack;
const int _kernel_w = _weight_data.w;
const int _kernel_h = _weight_data.h;
const int _num_output = _weight_data.d * 1;

Mat weight_data_flattened;
flatten(_weight_data, weight_data_flattened, opt);
if (weight_data_flattened.empty())
return -100;

#if NCNN_ARM82
if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && weight_data_flattened.elembits() == 16)
{
Mat weight_data_flattened_fp32;
cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
weight_data_flattened = weight_data_flattened_fp32;
}
#endif // NCNN_ARM82
#if NCNN_BF16
if (opt.use_bf16_storage && weight_data_flattened.elembits() == 16)
{
Mat weight_data_flattened_fp32;
cast_bfloat16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
weight_data_flattened = weight_data_flattened_fp32;
}
#endif // NCNN_BF16

// weight_data_flattened as pack1
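// (fold the SIMD packing factor back into w so the buffer reads as plain unpacked fp32)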
weight_data_flattened.w *= weight_data_flattened.elempack;
weight_data_flattened.elemsize /= weight_data_flattened.elempack;
weight_data_flattened.elempack = 1;

// transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
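// plain Deconvolution has no grouping, so the literal 1s below are the single-group case (the grouped variant lives in deconvolutiondepthwise_arm.cpp)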
Mat weight_data_transposed;
{
weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / 1, 4u, opt.workspace_allocator);
if (weight_data_transposed.empty())
return -100;

const int outch_g = _num_output / 1;
const int inch_g = _num_input / 1;
const int maxk = _kernel_h * _kernel_w;

for (int g = 0; g < 1; g++)
{
// reorder weight from inch-outch to outch-inch
float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
for (int i = 0; i < outch_g; i++)
{
for (int j = 0; j < inch_g; j++)
{
for (int k = 0; k < maxk; k++)
{
wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
}
}
}
}
}

Mat bias_data_flattened;
if (bias_term)
{
const Mat& _bias_data = bottom_blobs[2];
flatten(_bias_data, bias_data_flattened, opt);
if (bias_data_flattened.empty())
return -100;

#if NCNN_ARM82
if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && bias_data_flattened.elembits() == 16)
{
Mat bias_data_flattened_fp32;
cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
bias_data_flattened = bias_data_flattened_fp32;
}
#endif // NCNN_ARM82
#if NCNN_BF16
if (opt.use_bf16_storage && bias_data_flattened.elembits() == 16)
{
Mat bias_data_flattened_fp32;
cast_bfloat16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
bias_data_flattened = bias_data_flattened_fp32;
}
#endif // NCNN_BF16

// bias_data_flattened as pack1
bias_data_flattened.w *= bias_data_flattened.elempack;
bias_data_flattened.elemsize /= bias_data_flattened.elempack;
bias_data_flattened.elempack = 1;
}

ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);

ncnn::ParamDict pd;
pd.set(0, _num_output);
pd.set(1, _kernel_w);
pd.set(11, _kernel_h);
pd.set(2, dilation_w);
pd.set(12, dilation_h);
pd.set(3, stride_w);
pd.set(13, stride_h);
pd.set(4, pad_left);
pd.set(15, pad_right);
pd.set(14, pad_top);
pd.set(16, pad_bottom);
pd.set(18, output_pad_right);
pd.set(19, output_pad_bottom);
pd.set(20, output_w);
pd.set(21, output_h);
pd.set(5, bias_term);
pd.set(6, weight_data_transposed.w);
pd.set(9, activation_type);
pd.set(10, activation_params);

op->load_param(pd);

ncnn::Mat weights[2];
weights[0] = weight_data_transposed;
weights[1] = bias_data_flattened;

op->load_model(ncnn::ModelBinFromMatArray(weights));

op->create_pipeline(opt);

op->forward(bottom_blob, top_blob, opt);

op->destroy_pipeline(opt);

delete op;

return 0;
}

#if NCNN_BF16
int Deconvolution_arm::create_pipeline_bf16s(const Option& opt)
{
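
Aside: the inch-outch to outch-inch reorder above is the core of the dynamic-weight path. A minimal standalone sketch of the same index mapping, with made-up toy sizes (illustrative only, not part of this commit):

#include <cstdio>

int main()
{
    // made-up sizes: 2 input channels, 3 output channels, 2x2 kernel
    const int inch = 2, outch = 3, maxk = 4;
    float src[inch * outch * maxk]; // inch-major source order: [j][i][k]
    float dst[outch * inch * maxk]; // outch-major destination order: [i][j][k]

    for (int n = 0; n < inch * outch * maxk; n++)
        src[n] = (float)n;

    // same index mapping as the wg2[...] = wg[...] assignment in the commit
    for (int i = 0; i < outch; i++)
        for (int j = 0; j < inch; j++)
            for (int k = 0; k < maxk; k++)
                dst[(i * inch + j) * maxk + k] = src[(j * outch + i) * maxk + k];

    // element (i=0, j=1, k=0) of dst comes from src offset (1*3+0)*4 = 12
    printf("dst[4] = %g (expected 12)\n", dst[4]);
    return 0;
}

Only the sizes are toy values; the inner assignment is identical to the reorder loop above.
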
2 changes: 2 additions & 0 deletions src/layer/arm/deconvolution_arm.h
@@ -29,6 +29,8 @@ class Deconvolution_arm : virtual public Deconvolution

virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
#if NCNN_ARM82
int create_pipeline_fp16s(const Option& opt);
144 changes: 144 additions & 0 deletions src/layer/arm/deconvolutiondepthwise_arm.cpp
@@ -43,6 +43,9 @@ DeconvolutionDepthWise_arm::DeconvolutionDepthWise_arm()

int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
{
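// as in deconvolution_arm.cpp: dynamic weights are handled entirely in the multi-blob forward, so no static pipeline is built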
if (dynamic_weight)
return 0;

#if NCNN_ARM82
if (support_fp16_storage && opt.use_fp16_storage)
{
@@ -482,6 +485,147 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
return 0;
}

int DeconvolutionDepthWise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
const Mat& bottom_blob = bottom_blobs[0];
const Mat& _weight_data = bottom_blobs[1];
Mat& top_blob = top_blobs[0];

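// kernel geometry comes from the runtime weight blob: w/h give the kernel size, d gives the output channels per group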
const int _num_input = bottom_blob.c * bottom_blob.elempack;
const int _kernel_w = _weight_data.w;
const int _kernel_h = _weight_data.h;
const int _num_output = _weight_data.d * group;

Mat weight_data_flattened;
flatten(_weight_data, weight_data_flattened, opt);
if (weight_data_flattened.empty())
return -100;

#if NCNN_ARM82
if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && weight_data_flattened.elembits() == 16)
{
Mat weight_data_flattened_fp32;
cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
weight_data_flattened = weight_data_flattened_fp32;
}
#endif // NCNN_ARM82
#if NCNN_BF16
if (opt.use_bf16_storage && weight_data_flattened.elembits() == 16)
{
Mat weight_data_flattened_fp32;
cast_bfloat16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
weight_data_flattened = weight_data_flattened_fp32;
}
#endif // NCNN_BF16

// weight_data_flattened as pack1
weight_data_flattened.w *= weight_data_flattened.elempack;
weight_data_flattened.elemsize /= weight_data_flattened.elempack;
weight_data_flattened.elempack = 1;

// transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
Mat weight_data_transposed;
{
weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / group, 4u, opt.workspace_allocator);
if (weight_data_transposed.empty())
return -100;

const int outch_g = _num_output / group;
const int inch_g = _num_input / group;
const int maxk = _kernel_h * _kernel_w;

for (int g = 0; g < group; g++)
{
// reorder weight from inch-outch to outch-inch
float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
for (int i = 0; i < outch_g; i++)
{
for (int j = 0; j < inch_g; j++)
{
for (int k = 0; k < maxk; k++)
{
wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
}
}
}
}
}

Mat bias_data_flattened;
if (bias_term)
{
const Mat& _bias_data = bottom_blobs[2];
flatten(_bias_data, bias_data_flattened, opt);
if (bias_data_flattened.empty())
return -100;

#if NCNN_ARM82
if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && bias_data_flattened.elembits() == 16)
{
Mat bias_data_flattened_fp32;
cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
bias_data_flattened = bias_data_flattened_fp32;
}
#endif // NCNN_ARM82
#if NCNN_BF16
if (opt.use_bf16_storage && bias_data_flattened.elembits() == 16)
{
Mat bias_data_flattened_fp32;
cast_bfloat16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
bias_data_flattened = bias_data_flattened_fp32;
}
#endif // NCNN_BF16

// bias_data_flattened as pack1
bias_data_flattened.w *= bias_data_flattened.elempack;
bias_data_flattened.elemsize /= bias_data_flattened.elempack;
bias_data_flattened.elempack = 1;
}

ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise);

ncnn::ParamDict pd;
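// same parameter layout as the plain Deconvolution path, plus 7 = group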
pd.set(0, _num_output);
pd.set(1, _kernel_w);
pd.set(11, _kernel_h);
pd.set(2, dilation_w);
pd.set(12, dilation_h);
pd.set(3, stride_w);
pd.set(13, stride_h);
pd.set(4, pad_left);
pd.set(15, pad_right);
pd.set(14, pad_top);
pd.set(16, pad_bottom);
pd.set(18, output_pad_right);
pd.set(19, output_pad_bottom);
pd.set(20, output_w);
pd.set(21, output_h);
pd.set(5, bias_term);
pd.set(6, weight_data_transposed.w);
pd.set(7, group);
pd.set(9, activation_type);
pd.set(10, activation_params);

op->load_param(pd);

ncnn::Mat weights[2];
weights[0] = weight_data_transposed;
weights[1] = bias_data_flattened;

op->load_model(ncnn::ModelBinFromMatArray(weights));

op->create_pipeline(opt);

op->forward(bottom_blob, top_blob, opt);

op->destroy_pipeline(opt);

delete op;

return 0;
}

#if NCNN_BF16
int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
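
Aside: the depthwise variant runs the same reorder once per group, with each group offset by outch_g * inch_g * maxk elements. A small sketch of that offset arithmetic under made-up sizes (illustrative only, not part of this commit):

#include <cstdio>

int main()
{
    // made-up sizes: 2 groups, 2 output and 3 input channels per group, 2x2 kernel
    const int group = 2, outch_g = 2, inch_g = 3, maxk = 4;
    const int per_group = outch_g * inch_g * maxk;
    float src[group * per_group]; // per group, stored inch-major
    float dst[group * per_group]; // per group, reordered to outch-major

    for (int n = 0; n < group * per_group; n++)
        src[n] = (float)n;

    // per-group base offset and reorder, mirroring the g loop above
    for (int g = 0; g < group; g++)
    {
        const float* wg = src + g * per_group;
        float* wg2 = dst + g * per_group;
        for (int i = 0; i < outch_g; i++)
            for (int j = 0; j < inch_g; j++)
                for (int k = 0; k < maxk; k++)
                    wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
    }

    // the second group's data stays inside its own 24-element block
    printf("dst[24] = %g (expected 24)\n", dst[24]);
    return 0;
}
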
2 changes: 2 additions & 0 deletions src/layer/arm/deconvolutiondepthwise_arm.h
@@ -29,6 +29,8 @@ class DeconvolutionDepthWise_arm : virtual public DeconvolutionDepthWise

virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

protected:
#if NCNN_ARM82
int create_pipeline_fp16s(const Option& opt);