From 50028533ec7c46c53b3ea260da7f2f6d30a30bc8 Mon Sep 17 00:00:00 2001
From: nihuini
Date: Tue, 31 Oct 2023 11:03:27 +0800
Subject: [PATCH] wip

---
 src/layer/arm/deconvolution_arm.cpp           | 143 +++++++++++++++++
 src/layer/arm/deconvolution_arm.h             |   2 +
 src/layer/arm/deconvolutiondepthwise_arm.cpp  | 144 ++++++++++++++++++
 src/layer/arm/deconvolutiondepthwise_arm.h    |   2 +
 .../loongarch/deconvolution_loongarch.cpp     | 109 +++++++++++++
 src/layer/loongarch/deconvolution_loongarch.h |   2 +
 .../deconvolutiondepthwise_loongarch.cpp      | 110 +++++++++++++
 .../deconvolutiondepthwise_loongarch.h        |   2 +
 src/layer/mips/deconvolution_mips.cpp         | 109 +++++++++++++
 src/layer/mips/deconvolution_mips.h           |   2 +
 .../mips/deconvolutiondepthwise_mips.cpp      | 110 +++++++++++++
 src/layer/mips/deconvolutiondepthwise_mips.h  |   2 +
 src/layer/riscv/deconvolution_riscv.cpp       | 127 +++++++++++++++
 src/layer/riscv/deconvolution_riscv.h         |   2 +
 .../riscv/deconvolutiondepthwise_riscv.cpp    | 128 ++++++++++++++++
 .../riscv/deconvolutiondepthwise_riscv.h      |   2 +
 16 files changed, 996 insertions(+)

diff --git a/src/layer/arm/deconvolution_arm.cpp b/src/layer/arm/deconvolution_arm.cpp
index 7744624b1c6..97034ac3c6e 100644
--- a/src/layer/arm/deconvolution_arm.cpp
+++ b/src/layer/arm/deconvolution_arm.cpp
@@ -49,6 +49,9 @@ Deconvolution_arm::Deconvolution_arm()
 
 int Deconvolution_arm::create_pipeline(const Option& opt)
 {
+    if (dynamic_weight)
+        return 0;
+
     activation = create_activation_layer(activation_type, activation_params, opt);
 
 #if NCNN_ARM82
@@ -750,6 +753,146 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
     return 0;
 }
 
+int Deconvolution_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& _weight_data = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    const int _num_input = bottom_blob.c * bottom_blob.elempack;
+    const int _kernel_w = _weight_data.w;
+    const int _kernel_h = _weight_data.h;
+    const int _num_output = _weight_data.d * 1;
+
+    Mat weight_data_flattened;
+    flatten(_weight_data, weight_data_flattened, opt);
+    if (weight_data_flattened.empty())
+        return -100;
+
+#if NCNN_ARM82
+    if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && weight_data_flattened.elembits() == 16)
+    {
+        Mat weight_data_flattened_fp32;
+        cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
+        weight_data_flattened = weight_data_flattened_fp32;
+    }
+#endif // NCNN_ARM82
+#if NCNN_BF16
+    if (opt.use_bf16_storage && weight_data_flattened.elembits() == 16)
+    {
+        Mat weight_data_flattened_fp32;
+        cast_bfloat16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
+        weight_data_flattened = weight_data_flattened_fp32;
+    }
+#endif // NCNN_BF16
+
+    // weight_data_flattened as pack1
+    weight_data_flattened.w *= weight_data_flattened.elempack;
+    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
+    weight_data_flattened.elempack = 1;
+
+    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
+    Mat weight_data_transposed;
+    {
+        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / 1, 4u, opt.workspace_allocator);
+        if (weight_data_transposed.empty())
+            return -100;
+
+        const int outch_g = _num_output / 1;
+        const int inch_g = _num_input / 1;
+        const int maxk = _kernel_h * _kernel_w;
+
+        for (int g = 0; g < 1; g++)
+        {
+            // reorder weight from inch-outch to outch-inch
+            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
+            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
+            for (int i = 0; i < outch_g; i++)
+            {
+                for (int j = 0; j < inch_g; j++)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
+                    }
+                }
+            }
+        }
+    }
+
+    Mat bias_data_flattened;
+    if (bias_term)
+    {
+        const Mat& _bias_data = bottom_blobs[2];
+        flatten(_bias_data, bias_data_flattened, opt);
+        if (bias_data_flattened.empty())
+            return -100;
+
+#if NCNN_ARM82
+        if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && bias_data_flattened.elembits() == 16)
+        {
+            Mat bias_data_flattened_fp32;
+            cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
+            bias_data_flattened = bias_data_flattened_fp32;
+        }
+#endif // NCNN_ARM82
+#if NCNN_BF16
+        if (opt.use_bf16_storage && bias_data_flattened.elembits() == 16)
+        {
+            Mat bias_data_flattened_fp32;
+            cast_bfloat16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
+            bias_data_flattened = bias_data_flattened_fp32;
+        }
+#endif // NCNN_BF16
+
+        // bias_data_flattened as pack1
+        bias_data_flattened.w *= bias_data_flattened.elempack;
+        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
+        bias_data_flattened.elempack = 1;
+    }
+
+    ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+
+    ncnn::ParamDict pd;
+    pd.set(0, _num_output);
+    pd.set(1, _kernel_w);
+    pd.set(11, _kernel_h);
+    pd.set(2, dilation_w);
+    pd.set(12, dilation_h);
+    pd.set(3, stride_w);
+    pd.set(13, stride_h);
+    pd.set(4, pad_left);
+    pd.set(15, pad_right);
+    pd.set(14, pad_top);
+    pd.set(16, pad_bottom);
+    pd.set(18, output_pad_right);
+    pd.set(19, output_pad_bottom);
+    pd.set(20, output_w);
+    pd.set(21, output_h);
+    pd.set(5, bias_term);
+    pd.set(6, weight_data_transposed.w);
+    pd.set(9, activation_type);
+    pd.set(10, activation_params);
+
+    op->load_param(pd);
+
+    ncnn::Mat weights[2];
+    weights[0] = weight_data_transposed;
+    weights[1] = bias_data_flattened;
+
+    op->load_model(ncnn::ModelBinFromMatArray(weights));
+
+    op->create_pipeline(opt);
+
+    op->forward(bottom_blob, top_blob, opt);
+
+    op->destroy_pipeline(opt);
+
+    delete op;
+
+    return 0;
+}
+
 #if NCNN_BF16
 int Deconvolution_arm::create_pipeline_bf16s(const Option& opt)
 {
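The reorder loop in the forward above converts the incoming weight blob from inch-outch-kh-kw order into the outch-inch-kh-kw order that the reference Deconvolution op expects at load time. A minimal standalone sketch of the same index transform, using toy sizes and group fixed at 1 (illustrative only, not part of the patch):

    #include <cstdio>
    #include <vector>

    int main()
    {
        const int inch = 2, outch = 3, maxk = 2; // toy dimensions
        std::vector<float> src(inch * outch * maxk), dst(src.size());
        for (size_t x = 0; x < src.size(); x++)
            src[x] = (float)x; // inch-outch order: src[(j * outch + i) * maxk + k]

        // same transform as the patch: (j, i, k) -> (i, j, k)
        for (int i = 0; i < outch; i++)
            for (int j = 0; j < inch; j++)
                for (int k = 0; k < maxk; k++)
                    dst[(i * inch + j) * maxk + k] = src[(j * outch + i) * maxk + k];

        // dst now holds outch-inch order: dst[(i * inch + j) * maxk + k]
        printf("dst[0] = %g, dst[1] = %g\n", dst[0], dst[1]);
        return 0;
    }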
diff --git a/src/layer/arm/deconvolution_arm.h b/src/layer/arm/deconvolution_arm.h
index bbc1e6faa53..3c7979687cb 100644
--- a/src/layer/arm/deconvolution_arm.h
+++ b/src/layer/arm/deconvolution_arm.h
@@ -29,6 +29,8 @@ class Deconvolution_arm : virtual public Deconvolution
 
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
 protected:
 #if NCNN_ARM82
     int create_pipeline_fp16s(const Option& opt);
diff --git a/src/layer/arm/deconvolutiondepthwise_arm.cpp b/src/layer/arm/deconvolutiondepthwise_arm.cpp
index dd3b4f8bd0e..712ef4f7970 100644
--- a/src/layer/arm/deconvolutiondepthwise_arm.cpp
+++ b/src/layer/arm/deconvolutiondepthwise_arm.cpp
@@ -43,6 +43,9 @@ DeconvolutionDepthWise_arm::DeconvolutionDepthWise_arm()
 
 int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
 {
+    if (dynamic_weight)
+        return 0;
+
 #if NCNN_ARM82
     if (support_fp16_storage && opt.use_fp16_storage)
     {
@@ -482,6 +485,147 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, c
     return 0;
 }
 
+int DeconvolutionDepthWise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& _weight_data = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    const int _num_input = bottom_blob.c * bottom_blob.elempack;
+    const int _kernel_w = _weight_data.w;
+    const int _kernel_h = _weight_data.h;
+    const int _num_output = _weight_data.d * group;
+
+    Mat weight_data_flattened;
+    flatten(_weight_data, weight_data_flattened, opt);
+    if (weight_data_flattened.empty())
+        return -100;
+
+#if NCNN_ARM82
+    if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && weight_data_flattened.elembits() == 16)
+    {
+        Mat weight_data_flattened_fp32;
+        cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
+        weight_data_flattened = weight_data_flattened_fp32;
+    }
+#endif // NCNN_ARM82
+#if NCNN_BF16
+    if (opt.use_bf16_storage && weight_data_flattened.elembits() == 16)
+    {
+        Mat weight_data_flattened_fp32;
+        cast_bfloat16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
+        weight_data_flattened = weight_data_flattened_fp32;
+    }
+#endif // NCNN_BF16
+
+    // weight_data_flattened as pack1
+    weight_data_flattened.w *= weight_data_flattened.elempack;
+    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
+    weight_data_flattened.elempack = 1;
+
+    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
+    Mat weight_data_transposed;
+    {
+        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / group, 4u, opt.workspace_allocator);
+        if (weight_data_transposed.empty())
+            return -100;
+
+        const int outch_g = _num_output / group;
+        const int inch_g = _num_input / group;
+        const int maxk = _kernel_h * _kernel_w;
+
+        for (int g = 0; g < group; g++)
+        {
+            // reorder weight from inch-outch to outch-inch
+            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
+            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
+            for (int i = 0; i < outch_g; i++)
+            {
+                for (int j = 0; j < inch_g; j++)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
+                    }
+                }
+            }
+        }
+    }
+
+    Mat bias_data_flattened;
+    if (bias_term)
+    {
+        const Mat& _bias_data = bottom_blobs[2];
+        flatten(_bias_data, bias_data_flattened, opt);
+        if (bias_data_flattened.empty())
+            return -100;
+
+#if NCNN_ARM82
+        if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && bias_data_flattened.elembits() == 16)
+        {
+            Mat bias_data_flattened_fp32;
+            cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
+            bias_data_flattened = bias_data_flattened_fp32;
+        }
+#endif // NCNN_ARM82
+#if NCNN_BF16
+        if (opt.use_bf16_storage && bias_data_flattened.elembits() == 16)
+        {
+            Mat bias_data_flattened_fp32;
+            cast_bfloat16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
+            bias_data_flattened = bias_data_flattened_fp32;
+        }
+#endif // NCNN_BF16
+
+        // bias_data_flattened as pack1
+        bias_data_flattened.w *= bias_data_flattened.elempack;
+        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
+        bias_data_flattened.elempack = 1;
+    }
+
+    ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise);
+
+    ncnn::ParamDict pd;
+    pd.set(0, _num_output);
+    pd.set(1, _kernel_w);
+    pd.set(11, _kernel_h);
+    pd.set(2, dilation_w);
+    pd.set(12, dilation_h);
+    pd.set(3, stride_w);
+    pd.set(13, stride_h);
+    pd.set(4, pad_left);
+    pd.set(15, pad_right);
+    pd.set(14, pad_top);
+    pd.set(16, pad_bottom);
+    pd.set(18, output_pad_right);
+    pd.set(19, output_pad_bottom);
+    pd.set(20, output_w);
+    pd.set(21, output_h);
+    pd.set(5, bias_term);
+    pd.set(6, weight_data_transposed.w);
+    pd.set(7, group);
+    pd.set(9, activation_type);
+    pd.set(10, activation_params);
+
+    op->load_param(pd);
+
+    ncnn::Mat weights[2];
+    weights[0] = weight_data_transposed;
+    weights[1] = bias_data_flattened;
+
+    op->load_model(ncnn::ModelBinFromMatArray(weights));
+
+    op->create_pipeline(opt);
+
+    op->forward(bottom_blob, top_blob, opt);
+
+    op->destroy_pipeline(opt);
+
+    delete op;
+
+    return 0;
+}
+
 #if NCNN_BF16
 int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
diff --git a/src/layer/arm/deconvolutiondepthwise_arm.h b/src/layer/arm/deconvolutiondepthwise_arm.h
index 2022d2a54cb..6eff45ede3a 100644
--- a/src/layer/arm/deconvolutiondepthwise_arm.h
+++ b/src/layer/arm/deconvolutiondepthwise_arm.h
@@ -29,6 +29,8 @@ class DeconvolutionDepthWise_arm : virtual public DeconvolutionDepthWise
 
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
 protected:
 #if NCNN_ARM82
     int create_pipeline_fp16s(const Option& opt);
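Taken together, the ARM changes above define the dynamic-weight calling convention: the weight (and optional bias) arrive as extra bottom blobs on every forward. From the code, the weight blob carries w = kernel_w, h = kernel_h, d = num_output / group in inch-outch-kh-kw memory order, and the bias blob holds num_output values. A hedged sketch of driving the new multi-blob forward through the layer interface follows; `layer`, `bottom`, `weight`, `bias` and `opt` are assumed to be prepared by the caller and are not names from the patch:

    // layer is a deconvolution layer created with dynamic weight enabled
    std::vector<ncnn::Mat> bottom_blobs(3);
    bottom_blobs[0] = bottom; // input feature map, channels == num_input
    bottom_blobs[1] = weight; // w=kernel_w, h=kernel_h, d=num_output/group
    bottom_blobs[2] = bias;   // num_output values, read only when bias_term != 0

    std::vector<ncnn::Mat> top_blobs(1);
    int ret = layer->forward(bottom_blobs, top_blobs, opt);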
diff --git a/src/layer/loongarch/deconvolution_loongarch.cpp b/src/layer/loongarch/deconvolution_loongarch.cpp
index bb913909b55..2d934bccb06 100644
--- a/src/layer/loongarch/deconvolution_loongarch.cpp
+++ b/src/layer/loongarch/deconvolution_loongarch.cpp
@@ -40,6 +40,9 @@ Deconvolution_loongarch::Deconvolution_loongarch()
 
 int Deconvolution_loongarch::create_pipeline(const Option& opt)
 {
+    if (dynamic_weight)
+        return 0;
+
     const int maxk = kernel_w * kernel_h;
     int num_input = weight_data_size / maxk / num_output;
 
@@ -281,4 +284,110 @@ int Deconvolution_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, cons
     return 0;
 }
 
+int Deconvolution_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& _weight_data = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    const int _num_input = bottom_blob.c * bottom_blob.elempack;
+    const int _kernel_w = _weight_data.w;
+    const int _kernel_h = _weight_data.h;
+    const int _num_output = _weight_data.d * 1;
+
+    Mat weight_data_flattened;
+    flatten(_weight_data, weight_data_flattened, opt);
+    if (weight_data_flattened.empty())
+        return -100;
+
+    // weight_data_flattened as pack1
+    weight_data_flattened.w *= weight_data_flattened.elempack;
+    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
+    weight_data_flattened.elempack = 1;
+
+    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
+    Mat weight_data_transposed;
+    {
+        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / 1, 4u, opt.workspace_allocator);
+        if (weight_data_transposed.empty())
+            return -100;
+
+        const int outch_g = _num_output / 1;
+        const int inch_g = _num_input / 1;
+        const int maxk = _kernel_h * _kernel_w;
+
+        for (int g = 0; g < 1; g++)
+        {
+            // reorder weight from inch-outch to outch-inch
+            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
+            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
+            for (int i = 0; i < outch_g; i++)
+            {
+                for (int j = 0; j < inch_g; j++)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
+                    }
+                }
+            }
+        }
+    }
+
+    Mat bias_data_flattened;
+    if (bias_term)
+    {
+        const Mat& _bias_data = bottom_blobs[2];
+        flatten(_bias_data, bias_data_flattened, opt);
+        if (bias_data_flattened.empty())
+            return -100;
+
+        // bias_data_flattened as pack1
+        bias_data_flattened.w *= bias_data_flattened.elempack;
+        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
+        bias_data_flattened.elempack = 1;
+    }
+
+    ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+
+    ncnn::ParamDict pd;
+    pd.set(0, _num_output);
+    pd.set(1, _kernel_w);
+    pd.set(11, _kernel_h);
+    pd.set(2, dilation_w);
+    pd.set(12, dilation_h);
+    pd.set(3, stride_w);
+    pd.set(13, stride_h);
+    pd.set(4, pad_left);
+    pd.set(15, pad_right);
+    pd.set(14, pad_top);
+    pd.set(16, pad_bottom);
+    pd.set(18, output_pad_right);
+    pd.set(19, output_pad_bottom);
+    pd.set(20, output_w);
+    pd.set(21, output_h);
+    pd.set(5, bias_term);
+    pd.set(6, weight_data_transposed.w);
+    pd.set(9, activation_type);
+    pd.set(10, activation_params);
+
+    op->load_param(pd);
+
+    ncnn::Mat weights[2];
+    weights[0] = weight_data_transposed;
+    weights[1] = bias_data_flattened;
+
+    op->load_model(ncnn::ModelBinFromMatArray(weights));
+
+    op->create_pipeline(opt);
+
+    op->forward(bottom_blob, top_blob, opt);
+
+    op->destroy_pipeline(opt);
+
+    delete op;
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/loongarch/deconvolution_loongarch.h b/src/layer/loongarch/deconvolution_loongarch.h
index bb7653b563f..f67b5d7e4e1 100644
--- a/src/layer/loongarch/deconvolution_loongarch.h
+++ b/src/layer/loongarch/deconvolution_loongarch.h
@@ -29,6 +29,8 @@ class Deconvolution_loongarch : virtual public Deconvolution
 
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
 public:
     Mat weight_data_tm;
 };
diff --git a/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp
index a141dd70360..f4f4d37bf7f 100644
--- a/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp
+++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp
@@ -34,6 +34,9 @@ DeconvolutionDepthWise_loongarch::DeconvolutionDepthWise_loongarch()
 
 int DeconvolutionDepthWise_loongarch::create_pipeline(const Option& opt)
 {
+    if (dynamic_weight)
+        return 0;
+
     const int maxk = kernel_w * kernel_h;
     int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
 
@@ -409,4 +412,111 @@ int DeconvolutionDepthWise_loongarch::forward(const Mat& bottom_blob, Mat& top_b
     return 0;
 }
 
+int DeconvolutionDepthWise_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& _weight_data = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    const int _num_input = bottom_blob.c * bottom_blob.elempack;
+    const int _kernel_w = _weight_data.w;
+    const int _kernel_h = _weight_data.h;
+    const int _num_output = _weight_data.d * group;
+
+    Mat weight_data_flattened;
+    flatten(_weight_data, weight_data_flattened, opt);
+    if (weight_data_flattened.empty())
+        return -100;
+
+    // weight_data_flattened as pack1
+    weight_data_flattened.w *= weight_data_flattened.elempack;
+    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
+    weight_data_flattened.elempack = 1;
+
+    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
+    Mat weight_data_transposed;
+    {
+        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / group, 4u, opt.workspace_allocator);
+        if (weight_data_transposed.empty())
+            return -100;
+
+        const int outch_g = _num_output / group;
+        const int inch_g = _num_input / group;
+        const int maxk = _kernel_h * _kernel_w;
+
+        for (int g = 0; g < group; g++)
+        {
+            // reorder weight from inch-outch to outch-inch
+            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
+            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
+            for (int i = 0; i < outch_g; i++)
+            {
+                for (int j = 0; j < inch_g; j++)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
+                    }
+                }
+            }
+        }
+    }
+
+    Mat bias_data_flattened;
+    if (bias_term)
+    {
+        const Mat& _bias_data = bottom_blobs[2];
+        flatten(_bias_data, bias_data_flattened, opt);
+        if (bias_data_flattened.empty())
+            return -100;
+
+        // bias_data_flattened as pack1
+        bias_data_flattened.w *= bias_data_flattened.elempack;
+        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
+        bias_data_flattened.elempack = 1;
+    }
+
+    ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise);
+
+    ncnn::ParamDict pd;
+    pd.set(0, _num_output);
+    pd.set(1, _kernel_w);
+    pd.set(11, _kernel_h);
+    pd.set(2, dilation_w);
+    pd.set(12, dilation_h);
+    pd.set(3, stride_w);
+    pd.set(13, stride_h);
+    pd.set(4, pad_left);
+    pd.set(15, pad_right);
+    pd.set(14, pad_top);
+    pd.set(16, pad_bottom);
+    pd.set(18, output_pad_right);
+    pd.set(19, output_pad_bottom);
+    pd.set(20, output_w);
+    pd.set(21, output_h);
+    pd.set(5, bias_term);
+    pd.set(6, weight_data_transposed.w);
+    pd.set(7, group);
+    pd.set(9, activation_type);
+    pd.set(10, activation_params);
+
+    op->load_param(pd);
+
+    ncnn::Mat weights[2];
+    weights[0] = weight_data_transposed;
+    weights[1] = bias_data_flattened;
+
+    op->load_model(ncnn::ModelBinFromMatArray(weights));
+
+    op->create_pipeline(opt);
+
+    op->forward(bottom_blob, top_blob, opt);
+
+    op->destroy_pipeline(opt);
+
+    delete op;
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/loongarch/deconvolutiondepthwise_loongarch.h b/src/layer/loongarch/deconvolutiondepthwise_loongarch.h
index e41e7cac9e1..b710f07ecf3 100644
--- a/src/layer/loongarch/deconvolutiondepthwise_loongarch.h
+++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.h
@@ -29,6 +29,8 @@ class DeconvolutionDepthWise_loongarch : virtual public DeconvolutionDepthWise
 
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
 protected:
     int create_group_ops(const Option& opt);
 
diff --git a/src/layer/mips/deconvolution_mips.cpp b/src/layer/mips/deconvolution_mips.cpp
index c3838f3ea69..506d3072096 100644
--- a/src/layer/mips/deconvolution_mips.cpp
+++ b/src/layer/mips/deconvolution_mips.cpp
@@ -40,6 +40,9 @@ Deconvolution_mips::Deconvolution_mips()
 
 int Deconvolution_mips::create_pipeline(const Option& opt)
 {
+    if (dynamic_weight)
+        return 0;
+
     const int maxk = kernel_w * kernel_h;
     int num_input = weight_data_size / maxk / num_output;
 
@@ -281,4 +284,110 @@ int Deconvolution_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Opt
     return 0;
 }
 
+int Deconvolution_mips::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& _weight_data = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    const int _num_input = bottom_blob.c * bottom_blob.elempack;
+    const int _kernel_w = _weight_data.w;
+    const int _kernel_h = _weight_data.h;
+    const int _num_output = _weight_data.d * 1;
+
+    Mat weight_data_flattened;
+    flatten(_weight_data, weight_data_flattened, opt);
+    if (weight_data_flattened.empty())
+        return -100;
+
+    // weight_data_flattened as pack1
+    weight_data_flattened.w *= weight_data_flattened.elempack;
+    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
+    weight_data_flattened.elempack = 1;
+
+    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
+    Mat weight_data_transposed;
+    {
+        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / 1, 4u, opt.workspace_allocator);
+        if (weight_data_transposed.empty())
+            return -100;
+
+        const int outch_g = _num_output / 1;
+        const int inch_g = _num_input / 1;
+        const int maxk = _kernel_h * _kernel_w;
+
+        for (int g = 0; g < 1; g++)
+        {
+            // reorder weight from inch-outch to outch-inch
+            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
+            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
+            for (int i = 0; i < outch_g; i++)
+            {
+                for (int j = 0; j < inch_g; j++)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
+                    }
+                }
+            }
+        }
+    }
+
+    Mat bias_data_flattened;
+    if (bias_term)
+    {
+        const Mat& _bias_data = bottom_blobs[2];
+        flatten(_bias_data, bias_data_flattened, opt);
+        if (bias_data_flattened.empty())
+            return -100;
+
+        // bias_data_flattened as pack1
+        bias_data_flattened.w *= bias_data_flattened.elempack;
+        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
+        bias_data_flattened.elempack = 1;
+    }
+
+    ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+
+    ncnn::ParamDict pd;
+    pd.set(0, _num_output);
+    pd.set(1, _kernel_w);
+    pd.set(11, _kernel_h);
+    pd.set(2, dilation_w);
+    pd.set(12, dilation_h);
+    pd.set(3, stride_w);
+    pd.set(13, stride_h);
+    pd.set(4, pad_left);
+    pd.set(15, pad_right);
+    pd.set(14, pad_top);
+    pd.set(16, pad_bottom);
+    pd.set(18, output_pad_right);
+    pd.set(19, output_pad_bottom);
+    pd.set(20, output_w);
+    pd.set(21, output_h);
+    pd.set(5, bias_term);
+    pd.set(6, weight_data_transposed.w);
+    pd.set(9, activation_type);
+    pd.set(10, activation_params);
+
+    op->load_param(pd);
+
+    ncnn::Mat weights[2];
+    weights[0] = weight_data_transposed;
+    weights[1] = bias_data_flattened;
+
+    op->load_model(ncnn::ModelBinFromMatArray(weights));
+
+    op->create_pipeline(opt);
+
+    op->forward(bottom_blob, top_blob, opt);
+
+    op->destroy_pipeline(opt);
+
+    delete op;
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/mips/deconvolution_mips.h b/src/layer/mips/deconvolution_mips.h
index 4d5e1ad985a..218bd812672 100644
--- a/src/layer/mips/deconvolution_mips.h
+++ b/src/layer/mips/deconvolution_mips.h
@@ -29,6 +29,8 @@ class Deconvolution_mips : virtual public Deconvolution
 
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
 public:
     Mat weight_data_tm;
 };
diff --git a/src/layer/mips/deconvolutiondepthwise_mips.cpp b/src/layer/mips/deconvolutiondepthwise_mips.cpp
index da76b5801cd..533bf522ad9 100644
--- a/src/layer/mips/deconvolutiondepthwise_mips.cpp
+++ b/src/layer/mips/deconvolutiondepthwise_mips.cpp
@@ -34,6 +34,9 @@ DeconvolutionDepthWise_mips::DeconvolutionDepthWise_mips()
 
 int DeconvolutionDepthWise_mips::create_pipeline(const Option& opt)
 {
+    if (dynamic_weight)
+        return 0;
+
     const int maxk = kernel_w * kernel_h;
     int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
 
@@ -409,4 +412,111 @@ int DeconvolutionDepthWise_mips::forward(const Mat& bottom_blob, Mat& top_blob,
     return 0;
 }
 
+int DeconvolutionDepthWise_mips::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& _weight_data = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    const int _num_input = bottom_blob.c * bottom_blob.elempack;
+    const int _kernel_w = _weight_data.w;
+    const int _kernel_h = _weight_data.h;
+    const int _num_output = _weight_data.d * group;
+
+    Mat weight_data_flattened;
+    flatten(_weight_data, weight_data_flattened, opt);
+    if (weight_data_flattened.empty())
+        return -100;
+
+    // weight_data_flattened as pack1
+    weight_data_flattened.w *= weight_data_flattened.elempack;
+    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
+    weight_data_flattened.elempack = 1;
+
+    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
+    Mat weight_data_transposed;
+    {
+        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / group, 4u, opt.workspace_allocator);
+        if (weight_data_transposed.empty())
+            return -100;
+
+        const int outch_g = _num_output / group;
+        const int inch_g = _num_input / group;
+        const int maxk = _kernel_h * _kernel_w;
+
+        for (int g = 0; g < group; g++)
+        {
+            // reorder weight from inch-outch to outch-inch
+            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
+            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
+            for (int i = 0; i < outch_g; i++)
+            {
+                for (int j = 0; j < inch_g; j++)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
+                    }
+                }
+            }
+        }
+    }
+
+    Mat bias_data_flattened;
+    if (bias_term)
+    {
+        const Mat& _bias_data = bottom_blobs[2];
+        flatten(_bias_data, bias_data_flattened, opt);
+        if (bias_data_flattened.empty())
+            return -100;
+
+        // bias_data_flattened as pack1
+        bias_data_flattened.w *= bias_data_flattened.elempack;
+        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
+        bias_data_flattened.elempack = 1;
+    }
+
+    ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise);
+
+    ncnn::ParamDict pd;
+    pd.set(0, _num_output);
+    pd.set(1, _kernel_w);
+    pd.set(11, _kernel_h);
+    pd.set(2, dilation_w);
+    pd.set(12, dilation_h);
+    pd.set(3, stride_w);
+    pd.set(13, stride_h);
+    pd.set(4, pad_left);
+    pd.set(15, pad_right);
+    pd.set(14, pad_top);
+    pd.set(16, pad_bottom);
+    pd.set(18, output_pad_right);
+    pd.set(19, output_pad_bottom);
+    pd.set(20, output_w);
+    pd.set(21, output_h);
+    pd.set(5, bias_term);
+    pd.set(6, weight_data_transposed.w);
+    pd.set(7, group);
+    pd.set(9, activation_type);
+    pd.set(10, activation_params);
+
+    op->load_param(pd);
+
+    ncnn::Mat weights[2];
+    weights[0] = weight_data_transposed;
+    weights[1] = bias_data_flattened;
+
+    op->load_model(ncnn::ModelBinFromMatArray(weights));
+
+    op->create_pipeline(opt);
+
+    op->forward(bottom_blob, top_blob, opt);
+
+    op->destroy_pipeline(opt);
+
+    delete op;
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/mips/deconvolutiondepthwise_mips.h b/src/layer/mips/deconvolutiondepthwise_mips.h
index 90cb7c3acbb..a033d7c11c3 100644
--- a/src/layer/mips/deconvolutiondepthwise_mips.h
+++ b/src/layer/mips/deconvolutiondepthwise_mips.h
@@ -29,6 +29,8 @@ class DeconvolutionDepthWise_mips : virtual public DeconvolutionDepthWise
 
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
 protected:
     int create_group_ops(const Option& opt);
 
diff --git a/src/layer/riscv/deconvolution_riscv.cpp b/src/layer/riscv/deconvolution_riscv.cpp
index 936b563f646..b3a85fe946a 100644
--- a/src/layer/riscv/deconvolution_riscv.cpp
+++ b/src/layer/riscv/deconvolution_riscv.cpp
@@ -50,6 +50,9 @@ Deconvolution_riscv::Deconvolution_riscv()
 
 int Deconvolution_riscv::create_pipeline(const Option& opt)
 {
+    if (dynamic_weight)
+        return 0;
+
 #if __riscv_vector && __riscv_zfh
     if (opt.use_fp16_storage)
     {
@@ -318,6 +321,130 @@ int Deconvolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op
     return 0;
 }
 
+int Deconvolution_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& _weight_data = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    const int _num_input = bottom_blob.c * bottom_blob.elempack;
+    const int _kernel_w = _weight_data.w;
+    const int _kernel_h = _weight_data.h;
+    const int _num_output = _weight_data.d * 1;
+
+    Mat weight_data_flattened;
+    flatten(_weight_data, weight_data_flattened, opt);
+    if (weight_data_flattened.empty())
+        return -100;
+
+#if NCNN_RVV
+    if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && weight_data_flattened.elembits() == 16)
+    {
+        Mat weight_data_flattened_fp32;
+        cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
+        weight_data_flattened = weight_data_flattened_fp32;
+    }
+#endif // NCNN_RVV
+
+    // weight_data_flattened as pack1
+    weight_data_flattened.w *= weight_data_flattened.elempack;
+    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
+    weight_data_flattened.elempack = 1;
+
+    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
+    Mat weight_data_transposed;
+    {
+        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / 1, 4u, opt.workspace_allocator);
+        if (weight_data_transposed.empty())
+            return -100;
+
+        const int outch_g = _num_output / 1;
+        const int inch_g = _num_input / 1;
+        const int maxk = _kernel_h * _kernel_w;
+
+        for (int g = 0; g < 1; g++)
+        {
+            // reorder weight from inch-outch to outch-inch
+            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
+            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
+            for (int i = 0; i < outch_g; i++)
+            {
+                for (int j = 0; j < inch_g; j++)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
+                    }
+                }
+            }
+        }
+    }
+
+    Mat bias_data_flattened;
+    if (bias_term)
+    {
+        const Mat& _bias_data = bottom_blobs[2];
+        flatten(_bias_data, bias_data_flattened, opt);
+        if (bias_data_flattened.empty())
+            return -100;
+
+#if NCNN_RVV
+        if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && bias_data_flattened.elembits() == 16)
+        {
+            Mat bias_data_flattened_fp32;
+            cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
+            bias_data_flattened = bias_data_flattened_fp32;
+        }
+#endif // NCNN_RVV
+
+        // bias_data_flattened as pack1
+        bias_data_flattened.w *= bias_data_flattened.elempack;
+        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
+        bias_data_flattened.elempack = 1;
+    }
+
+    ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+
+    ncnn::ParamDict pd;
+    pd.set(0, _num_output);
+    pd.set(1, _kernel_w);
+    pd.set(11, _kernel_h);
+    pd.set(2, dilation_w);
+    pd.set(12, dilation_h);
+    pd.set(3, stride_w);
+    pd.set(13, stride_h);
+    pd.set(4, pad_left);
+    pd.set(15, pad_right);
+    pd.set(14, pad_top);
+    pd.set(16, pad_bottom);
+    pd.set(18, output_pad_right);
+    pd.set(19, output_pad_bottom);
+    pd.set(20, output_w);
+    pd.set(21, output_h);
+    pd.set(5, bias_term);
+    pd.set(6, weight_data_transposed.w);
+    pd.set(9, activation_type);
+    pd.set(10, activation_params);
+
+    op->load_param(pd);
+
+    ncnn::Mat weights[2];
+    weights[0] = weight_data_transposed;
+    weights[1] = bias_data_flattened;
+
+    op->load_model(ncnn::ModelBinFromMatArray(weights));
+
+    op->create_pipeline(opt);
+
+    op->forward(bottom_blob, top_blob, opt);
+
+    op->destroy_pipeline(opt);
+
+    delete op;
+
+    return 0;
+}
+
 #if __riscv_vector && __riscv_zfh
 int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt)
 {
diff --git a/src/layer/riscv/deconvolution_riscv.h b/src/layer/riscv/deconvolution_riscv.h
index 3574c09d10e..903a420427a 100644
--- a/src/layer/riscv/deconvolution_riscv.h
+++ b/src/layer/riscv/deconvolution_riscv.h
@@ -29,6 +29,8 @@ class Deconvolution_riscv : virtual public Deconvolution
 
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
 protected:
 #if __riscv_vector && __riscv_zfh
     int create_pipeline_fp16s(const Option& opt);
diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp
index b53e8962fd2..4dced8e658b 100644
--- a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp
+++ b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp
@@ -37,6 +37,9 @@ DeconvolutionDepthWise_riscv::DeconvolutionDepthWise_riscv()
 
 int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt)
 {
+    if (dynamic_weight)
+        return 0;
+
 #if __riscv_vector && __riscv_zfh
     if (opt.use_fp16_storage)
     {
@@ -445,6 +448,131 @@ int DeconvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob,
     return 0;
 }
 
+int DeconvolutionDepthWise_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& _weight_data = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    const int _num_input = bottom_blob.c * bottom_blob.elempack;
+    const int _kernel_w = _weight_data.w;
+    const int _kernel_h = _weight_data.h;
+    const int _num_output = _weight_data.d * group;
+
+    Mat weight_data_flattened;
+    flatten(_weight_data, weight_data_flattened, opt);
+    if (weight_data_flattened.empty())
+        return -100;
+
+#if NCNN_RVV
+    if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && weight_data_flattened.elembits() == 16)
+    {
+        Mat weight_data_flattened_fp32;
+        cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
+        weight_data_flattened = weight_data_flattened_fp32;
+    }
+#endif // NCNN_RVV
+
+    // weight_data_flattened as pack1
+    weight_data_flattened.w *= weight_data_flattened.elempack;
+    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
+    weight_data_flattened.elempack = 1;
+
+    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
+    Mat weight_data_transposed;
+    {
+        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / group, 4u, opt.workspace_allocator);
+        if (weight_data_transposed.empty())
+            return -100;
+
+        const int outch_g = _num_output / group;
+        const int inch_g = _num_input / group;
+        const int maxk = _kernel_h * _kernel_w;
+
+        for (int g = 0; g < group; g++)
+        {
+            // reorder weight from inch-outch to outch-inch
+            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
+            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
+            for (int i = 0; i < outch_g; i++)
+            {
+                for (int j = 0; j < inch_g; j++)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
+                    }
+                }
+            }
+        }
+    }
+
+    Mat bias_data_flattened;
+    if (bias_term)
+    {
+        const Mat& _bias_data = bottom_blobs[2];
+        flatten(_bias_data, bias_data_flattened, opt);
+        if (bias_data_flattened.empty())
+            return -100;
+
+#if NCNN_RVV
+        if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && bias_data_flattened.elembits() == 16)
+        {
+            Mat bias_data_flattened_fp32;
+            cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
+            bias_data_flattened = bias_data_flattened_fp32;
+        }
+#endif // NCNN_RVV
+
+        // bias_data_flattened as pack1
+        bias_data_flattened.w *= bias_data_flattened.elempack;
+        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
+        bias_data_flattened.elempack = 1;
+    }
+
+    ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise);
+
+    ncnn::ParamDict pd;
+    pd.set(0, _num_output);
+    pd.set(1, _kernel_w);
+    pd.set(11, _kernel_h);
+    pd.set(2, dilation_w);
+    pd.set(12, dilation_h);
+    pd.set(3, stride_w);
+    pd.set(13, stride_h);
+    pd.set(4, pad_left);
+    pd.set(15, pad_right);
+    pd.set(14, pad_top);
+    pd.set(16, pad_bottom);
+    pd.set(18, output_pad_right);
+    pd.set(19, output_pad_bottom);
+    pd.set(20, output_w);
+    pd.set(21, output_h);
+    pd.set(5, bias_term);
+    pd.set(6, weight_data_transposed.w);
+    pd.set(7, group);
+    pd.set(9, activation_type);
+    pd.set(10, activation_params);
+
+    op->load_param(pd);
+
+    ncnn::Mat weights[2];
+    weights[0] = weight_data_transposed;
+    weights[1] = bias_data_flattened;
+
+    op->load_model(ncnn::ModelBinFromMatArray(weights));
+
+    op->create_pipeline(opt);
+
+    op->forward(bottom_blob, top_blob, opt);
+
+    op->destroy_pipeline(opt);
+
+    delete op;
+
+    return 0;
+}
+
 #if __riscv_vector && __riscv_zfh
 int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
 {
diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.h b/src/layer/riscv/deconvolutiondepthwise_riscv.h
index ccda5f248e3..5cdbd0d0676 100644
--- a/src/layer/riscv/deconvolutiondepthwise_riscv.h
+++ b/src/layer/riscv/deconvolutiondepthwise_riscv.h
@@ -29,6 +29,8 @@ class DeconvolutionDepthWise_riscv : virtual public DeconvolutionDepthWise
 
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
 protected:
     int create_group_ops(const Option& opt);
 #if __riscv_vector && __riscv_zfh
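Every port follows the same pattern: create_pipeline() becomes a no-op when dynamic_weight is set, since there is nothing to pre-pack at load time, and each forward() builds a short-lived reference Deconvolution / DeconvolutionDepthWise op around the freshly transposed weights. As written, the inner op's return codes are discarded; a stricter tail for these forward() bodies could propagate them, sketched below as a possible refinement rather than what the patch does:

    // drop-in replacement for the tail of the new forward() bodies (sketch)
    op->load_param(pd);
    op->load_model(ncnn::ModelBinFromMatArray(weights));
    op->create_pipeline(opt);

    int ret = op->forward(bottom_blob, top_blob, opt);

    op->destroy_pipeline(opt);
    delete op;

    return ret; // propagate the inner op's status instead of a fixed 0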