From 50028533ec7c46c53b3ea260da7f2f6d30a30bc8 Mon Sep 17 00:00:00 2001
From: nihuini
Date: Tue, 31 Oct 2023 11:03:27 +0800
Subject: [PATCH] wip

---
 src/layer/arm/deconvolution_arm.cpp           | 143 +++++++++++++++++
 src/layer/arm/deconvolution_arm.h             |   2 +
 src/layer/arm/deconvolutiondepthwise_arm.cpp  | 144 ++++++++++++++++++
 src/layer/arm/deconvolutiondepthwise_arm.h    |   2 +
 .../loongarch/deconvolution_loongarch.cpp     | 109 +++++++++++++
 src/layer/loongarch/deconvolution_loongarch.h |   2 +
 .../deconvolutiondepthwise_loongarch.cpp      | 110 +++++++++++++
 .../deconvolutiondepthwise_loongarch.h        |   2 +
 src/layer/mips/deconvolution_mips.cpp         | 109 +++++++++++++
 src/layer/mips/deconvolution_mips.h           |   2 +
 .../mips/deconvolutiondepthwise_mips.cpp      | 110 +++++++++++++
 src/layer/mips/deconvolutiondepthwise_mips.h  |   2 +
 src/layer/riscv/deconvolution_riscv.cpp       | 127 +++++++++++++++
 src/layer/riscv/deconvolution_riscv.h         |   2 +
 .../riscv/deconvolutiondepthwise_riscv.cpp    | 128 ++++++++++++++++
 .../riscv/deconvolutiondepthwise_riscv.h      |   2 +
 16 files changed, 996 insertions(+)

diff --git a/src/layer/arm/deconvolution_arm.cpp b/src/layer/arm/deconvolution_arm.cpp
index 7744624b1c6..97034ac3c6e 100644
--- a/src/layer/arm/deconvolution_arm.cpp
+++ b/src/layer/arm/deconvolution_arm.cpp
@@ -49,6 +49,9 @@ Deconvolution_arm::Deconvolution_arm()
 
 int Deconvolution_arm::create_pipeline(const Option& opt)
 {
+    if (dynamic_weight)
+        return 0;
+
     activation = create_activation_layer(activation_type, activation_params, opt);
 
 #if NCNN_ARM82
@@ -750,6 +753,146 @@ int Deconvolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
     return 0;
 }
 
+int Deconvolution_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& _weight_data = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    const int _num_input = bottom_blob.c * bottom_blob.elempack;
+    const int _kernel_w = _weight_data.w;
+    const int _kernel_h = _weight_data.h;
+    const int _num_output = _weight_data.d * 1;
+
+    Mat weight_data_flattened;
+    flatten(_weight_data, weight_data_flattened, opt);
+    if (weight_data_flattened.empty())
+        return -100;
+
+#if NCNN_ARM82
+    if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && weight_data_flattened.elembits() == 16)
+    {
+        Mat weight_data_flattened_fp32;
+        cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
+        weight_data_flattened = weight_data_flattened_fp32;
+    }
+#endif // NCNN_ARM82
+#if NCNN_BF16
+    if (opt.use_bf16_storage && weight_data_flattened.elembits() == 16)
+    {
+        Mat weight_data_flattened_fp32;
+        cast_bfloat16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
+        weight_data_flattened = weight_data_flattened_fp32;
+    }
+#endif // NCNN_BF16
+
+    // weight_data_flattened as pack1
+    weight_data_flattened.w *= weight_data_flattened.elempack;
+    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
+    weight_data_flattened.elempack = 1;
+
+    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
+    Mat weight_data_transposed;
+    {
+        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / 1, 4u, opt.workspace_allocator);
+        if (weight_data_transposed.empty())
+            return -100;
+
+        const int outch_g = _num_output / 1;
+        const int inch_g = _num_input / 1;
+        const int maxk = _kernel_h * _kernel_w;
+
+        for (int g = 0; g < 1; g++)
+        {
+            // reorder weight from inch-outch to outch-inch
+            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
+            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
+            for (int i = 0; i < outch_g; i++)
+            {
+                for (int j = 0; j < inch_g; j++)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
+                    }
+                }
+            }
+        }
+    }
+
+    Mat bias_data_flattened;
+    if (bias_term)
+    {
+        const Mat& _bias_data = bottom_blobs[2];
+        flatten(_bias_data, bias_data_flattened, opt);
+        if (bias_data_flattened.empty())
+            return -100;
+
+#if NCNN_ARM82
+        if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && bias_data_flattened.elembits() == 16)
+        {
+            Mat bias_data_flattened_fp32;
+            cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
+            bias_data_flattened = bias_data_flattened_fp32;
+        }
+#endif // NCNN_ARM82
+#if NCNN_BF16
+        if (opt.use_bf16_storage && bias_data_flattened.elembits() == 16)
+        {
+            Mat bias_data_flattened_fp32;
+            cast_bfloat16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
+            bias_data_flattened = bias_data_flattened_fp32;
+        }
+#endif // NCNN_BF16
+
+        // bias_data_flattened as pack1
+        bias_data_flattened.w *= bias_data_flattened.elempack;
+        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
+        bias_data_flattened.elempack = 1;
+    }
+
+    ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+
+    ncnn::ParamDict pd;
+    pd.set(0, _num_output);
+    pd.set(1, _kernel_w);
+    pd.set(11, _kernel_h);
+    pd.set(2, dilation_w);
+    pd.set(12, dilation_h);
+    pd.set(3, stride_w);
+    pd.set(13, stride_h);
+    pd.set(4, pad_left);
+    pd.set(15, pad_right);
+    pd.set(14, pad_top);
+    pd.set(16, pad_bottom);
+    pd.set(18, output_pad_right);
+    pd.set(19, output_pad_bottom);
+    pd.set(20, output_w);
+    pd.set(21, output_h);
+    pd.set(5, bias_term);
+    pd.set(6, weight_data_transposed.w);
+    pd.set(9, activation_type);
+    pd.set(10, activation_params);
+
+    op->load_param(pd);
+
+    ncnn::Mat weights[2];
+    weights[0] = weight_data_transposed;
+    weights[1] = bias_data_flattened;
+
+    op->load_model(ncnn::ModelBinFromMatArray(weights));
+
+    op->create_pipeline(opt);
+
+    op->forward(bottom_blob, top_blob, opt);
+
+    op->destroy_pipeline(opt);
+
+    delete op;
+
+    return 0;
+}
+
 #if NCNN_BF16
 int Deconvolution_arm::create_pipeline_bf16s(const Option& opt)
 {
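The reorder loop in the forward above converts the incoming weight blob from inch-outch-kh-kw order into the outch-inch-kh-kw order that the reference Deconvolution op expects at load time. A minimal standalone sketch of the same index transform, using toy sizes and group fixed at 1 (illustrative only, not part of the patch):

    #include <cstdio>
    #include <vector>

    int main()
    {
        const int inch = 2, outch = 3, maxk = 2; // toy dimensions
        std::vector<float> src(inch * outch * maxk), dst(src.size());
        for (size_t x = 0; x < src.size(); x++)
            src[x] = (float)x; // inch-outch order: src[(j * outch + i) * maxk + k]

        // same transform as the patch: (j, i, k) -> (i, j, k)
        for (int i = 0; i < outch; i++)
            for (int j = 0; j < inch; j++)
                for (int k = 0; k < maxk; k++)
                    dst[(i * inch + j) * maxk + k] = src[(j * outch + i) * maxk + k];

        // dst now holds outch-inch order: dst[(i * inch + j) * maxk + k]
        printf("dst[0] = %g, dst[1] = %g\n", dst[0], dst[1]);
        return 0;
    }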
diff --git a/src/layer/arm/deconvolution_arm.h b/src/layer/arm/deconvolution_arm.h
index bbc1e6faa53..3c7979687cb 100644
--- a/src/layer/arm/deconvolution_arm.h
+++ b/src/layer/arm/deconvolution_arm.h
@@ -29,6 +29,8 @@ class Deconvolution_arm : virtual public Deconvolution
 
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
 protected:
 #if NCNN_ARM82
     int create_pipeline_fp16s(const Option& opt);
diff --git a/src/layer/arm/deconvolutiondepthwise_arm.cpp b/src/layer/arm/deconvolutiondepthwise_arm.cpp
index dd3b4f8bd0e..712ef4f7970 100644
--- a/src/layer/arm/deconvolutiondepthwise_arm.cpp
+++ b/src/layer/arm/deconvolutiondepthwise_arm.cpp
@@ -43,6 +43,9 @@ DeconvolutionDepthWise_arm::DeconvolutionDepthWise_arm()
 
 int DeconvolutionDepthWise_arm::create_pipeline(const Option& opt)
 {
+    if (dynamic_weight)
+        return 0;
+
 #if NCNN_ARM82
     if (support_fp16_storage && opt.use_fp16_storage)
     {
@@ -482,6 +485,147 @@ int DeconvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, c
     return 0;
 }
 
+int DeconvolutionDepthWise_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& _weight_data = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    const int _num_input = bottom_blob.c * bottom_blob.elempack;
+    const int _kernel_w = _weight_data.w;
+    const int _kernel_h = _weight_data.h;
+    const int _num_output = _weight_data.d * group;
+
+    Mat weight_data_flattened;
+    flatten(_weight_data, weight_data_flattened, opt);
+    if (weight_data_flattened.empty())
+        return -100;
+
+#if NCNN_ARM82
+    if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && weight_data_flattened.elembits() == 16)
+    {
+        Mat weight_data_flattened_fp32;
+        cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
+        weight_data_flattened = weight_data_flattened_fp32;
+    }
+#endif // NCNN_ARM82
+#if NCNN_BF16
+    if (opt.use_bf16_storage && weight_data_flattened.elembits() == 16)
+    {
+        Mat weight_data_flattened_fp32;
+        cast_bfloat16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
+        weight_data_flattened = weight_data_flattened_fp32;
+    }
+#endif // NCNN_BF16
+
+    // weight_data_flattened as pack1
+    weight_data_flattened.w *= weight_data_flattened.elempack;
+    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
+    weight_data_flattened.elempack = 1;
+
+    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
+    Mat weight_data_transposed;
+    {
+        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / group, 4u, opt.workspace_allocator);
+        if (weight_data_transposed.empty())
+            return -100;
+
+        const int outch_g = _num_output / group;
+        const int inch_g = _num_input / group;
+        const int maxk = _kernel_h * _kernel_w;
+
+        for (int g = 0; g < group; g++)
+        {
+            // reorder weight from inch-outch to outch-inch
+            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
+            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
+            for (int i = 0; i < outch_g; i++)
+            {
+                for (int j = 0; j < inch_g; j++)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
+                    }
+                }
+            }
+        }
+    }
+
+    Mat bias_data_flattened;
+    if (bias_term)
+    {
+        const Mat& _bias_data = bottom_blobs[2];
+        flatten(_bias_data, bias_data_flattened, opt);
+        if (bias_data_flattened.empty())
+            return -100;
+
+#if NCNN_ARM82
+        if (opt.use_fp16_storage && cpu_support_arm_asimdhp() && bias_data_flattened.elembits() == 16)
+        {
+            Mat bias_data_flattened_fp32;
+            cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
+            bias_data_flattened = bias_data_flattened_fp32;
+        }
+#endif // NCNN_ARM82
+#if NCNN_BF16
+        if (opt.use_bf16_storage && bias_data_flattened.elembits() == 16)
+        {
+            Mat bias_data_flattened_fp32;
+            cast_bfloat16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
+            bias_data_flattened = bias_data_flattened_fp32;
+        }
+#endif // NCNN_BF16
+
+        // bias_data_flattened as pack1
+        bias_data_flattened.w *= bias_data_flattened.elempack;
+        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
+        bias_data_flattened.elempack = 1;
+    }
+
+    ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise);
+
+    ncnn::ParamDict pd;
+    pd.set(0, _num_output);
+    pd.set(1, _kernel_w);
+    pd.set(11, _kernel_h);
+    pd.set(2, dilation_w);
+    pd.set(12, dilation_h);
+    pd.set(3, stride_w);
+    pd.set(13, stride_h);
+    pd.set(4, pad_left);
+    pd.set(15, pad_right);
+    pd.set(14, pad_top);
+    pd.set(16, pad_bottom);
+    pd.set(18, output_pad_right);
+    pd.set(19, output_pad_bottom);
+    pd.set(20, output_w);
+    pd.set(21, output_h);
+    pd.set(5, bias_term);
+    pd.set(6, weight_data_transposed.w);
+    pd.set(7, group);
+    pd.set(9, activation_type);
+    pd.set(10, activation_params);
+
+    op->load_param(pd);
+
+    ncnn::Mat weights[2];
+    weights[0] = weight_data_transposed;
+    weights[1] = bias_data_flattened;
+
+    op->load_model(ncnn::ModelBinFromMatArray(weights));
+
+    op->create_pipeline(opt);
+
+    op->forward(bottom_blob, top_blob, opt);
+
+    op->destroy_pipeline(opt);
+
+    delete op;
+
+    return 0;
+}
+
 #if NCNN_BF16
 int DeconvolutionDepthWise_arm::forward_bf16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
diff --git a/src/layer/arm/deconvolutiondepthwise_arm.h b/src/layer/arm/deconvolutiondepthwise_arm.h
index 2022d2a54cb..6eff45ede3a 100644
--- a/src/layer/arm/deconvolutiondepthwise_arm.h
+++ b/src/layer/arm/deconvolutiondepthwise_arm.h
@@ -29,6 +29,8 @@ class DeconvolutionDepthWise_arm : virtual public DeconvolutionDepthWise
 
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
 protected:
 #if NCNN_ARM82
     int create_pipeline_fp16s(const Option& opt);
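Taken together, the ARM changes above define the dynamic-weight calling convention: the weight (and optional bias) arrive as extra bottom blobs on every forward. From the code, the weight blob carries w = kernel_w, h = kernel_h, d = num_output / group in inch-outch-kh-kw memory order, and the bias blob holds num_output values. A hedged sketch of driving the new multi-blob forward through the layer interface follows; `layer`, `bottom`, `weight`, `bias` and `opt` are assumed to be prepared by the caller and are not names from the patch:

    // layer is a deconvolution layer created with dynamic weight enabled
    std::vector<ncnn::Mat> bottom_blobs(3);
    bottom_blobs[0] = bottom; // input feature map, channels == num_input
    bottom_blobs[1] = weight; // w=kernel_w, h=kernel_h, d=num_output/group
    bottom_blobs[2] = bias;   // num_output values, read only when bias_term != 0

    std::vector<ncnn::Mat> top_blobs(1);
    int ret = layer->forward(bottom_blobs, top_blobs, opt);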
diff --git a/src/layer/loongarch/deconvolution_loongarch.cpp b/src/layer/loongarch/deconvolution_loongarch.cpp
index bb913909b55..2d934bccb06 100644
--- a/src/layer/loongarch/deconvolution_loongarch.cpp
+++ b/src/layer/loongarch/deconvolution_loongarch.cpp
@@ -40,6 +40,9 @@ Deconvolution_loongarch::Deconvolution_loongarch()
 
 int Deconvolution_loongarch::create_pipeline(const Option& opt)
 {
+    if (dynamic_weight)
+        return 0;
+
     const int maxk = kernel_w * kernel_h;
     int num_input = weight_data_size / maxk / num_output;
 
@@ -281,4 +284,110 @@ int Deconvolution_loongarch::forward(const Mat& bottom_blob, Mat& top_blob, cons
     return 0;
 }
 
+int Deconvolution_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& _weight_data = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    const int _num_input = bottom_blob.c * bottom_blob.elempack;
+    const int _kernel_w = _weight_data.w;
+    const int _kernel_h = _weight_data.h;
+    const int _num_output = _weight_data.d * 1;
+
+    Mat weight_data_flattened;
+    flatten(_weight_data, weight_data_flattened, opt);
+    if (weight_data_flattened.empty())
+        return -100;
+
+    // weight_data_flattened as pack1
+    weight_data_flattened.w *= weight_data_flattened.elempack;
+    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
+    weight_data_flattened.elempack = 1;
+
+    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
+    Mat weight_data_transposed;
+    {
+        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / 1, 4u, opt.workspace_allocator);
+        if (weight_data_transposed.empty())
+            return -100;
+
+        const int outch_g = _num_output / 1;
+        const int inch_g = _num_input / 1;
+        const int maxk = _kernel_h * _kernel_w;
+
+        for (int g = 0; g < 1; g++)
+        {
+            // reorder weight from inch-outch to outch-inch
+            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
+            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
+            for (int i = 0; i < outch_g; i++)
+            {
+                for (int j = 0; j < inch_g; j++)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
+                    }
+                }
+            }
+        }
+    }
+
+    Mat bias_data_flattened;
+    if (bias_term)
+    {
+        const Mat& _bias_data = bottom_blobs[2];
+        flatten(_bias_data, bias_data_flattened, opt);
+        if (bias_data_flattened.empty())
+            return -100;
+
+        // bias_data_flattened as pack1
+        bias_data_flattened.w *= bias_data_flattened.elempack;
+        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
+        bias_data_flattened.elempack = 1;
+    }
+
+    ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+
+    ncnn::ParamDict pd;
+    pd.set(0, _num_output);
+    pd.set(1, _kernel_w);
+    pd.set(11, _kernel_h);
+    pd.set(2, dilation_w);
+    pd.set(12, dilation_h);
+    pd.set(3, stride_w);
+    pd.set(13, stride_h);
+    pd.set(4, pad_left);
+    pd.set(15, pad_right);
+    pd.set(14, pad_top);
+    pd.set(16, pad_bottom);
+    pd.set(18, output_pad_right);
+    pd.set(19, output_pad_bottom);
+    pd.set(20, output_w);
+    pd.set(21, output_h);
+    pd.set(5, bias_term);
+    pd.set(6, weight_data_transposed.w);
+    pd.set(9, activation_type);
+    pd.set(10, activation_params);
+
+    op->load_param(pd);
+
+    ncnn::Mat weights[2];
+    weights[0] = weight_data_transposed;
+    weights[1] = bias_data_flattened;
+
+    op->load_model(ncnn::ModelBinFromMatArray(weights));
+
+    op->create_pipeline(opt);
+
+    op->forward(bottom_blob, top_blob, opt);
+
+    op->destroy_pipeline(opt);
+
+    delete op;
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/loongarch/deconvolution_loongarch.h b/src/layer/loongarch/deconvolution_loongarch.h
index bb7653b563f..f67b5d7e4e1 100644
--- a/src/layer/loongarch/deconvolution_loongarch.h
+++ b/src/layer/loongarch/deconvolution_loongarch.h
@@ -29,6 +29,8 @@ class Deconvolution_loongarch : virtual public Deconvolution
 
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
 public:
     Mat weight_data_tm;
 };
diff --git a/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp
index a141dd70360..f4f4d37bf7f 100644
--- a/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp
+++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.cpp
@@ -34,6 +34,9 @@ DeconvolutionDepthWise_loongarch::DeconvolutionDepthWise_loongarch()
 
 int DeconvolutionDepthWise_loongarch::create_pipeline(const Option& opt)
 {
+    if (dynamic_weight)
+        return 0;
+
     const int maxk = kernel_w * kernel_h;
     int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
 
@@ -409,4 +412,111 @@ int DeconvolutionDepthWise_loongarch::forward(const Mat& bottom_blob, Mat& top_b
     return 0;
 }
 
+int DeconvolutionDepthWise_loongarch::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& _weight_data = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    const int _num_input = bottom_blob.c * bottom_blob.elempack;
+    const int _kernel_w = _weight_data.w;
+    const int _kernel_h = _weight_data.h;
+    const int _num_output = _weight_data.d * group;
+
+    Mat weight_data_flattened;
+    flatten(_weight_data, weight_data_flattened, opt);
+    if (weight_data_flattened.empty())
+        return -100;
+
+    // weight_data_flattened as pack1
+    weight_data_flattened.w *= weight_data_flattened.elempack;
+    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
+    weight_data_flattened.elempack = 1;
+
+    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
+    Mat weight_data_transposed;
+    {
+        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / group, 4u, opt.workspace_allocator);
+        if (weight_data_transposed.empty())
+            return -100;
+
+        const int outch_g = _num_output / group;
+        const int inch_g = _num_input / group;
+        const int maxk = _kernel_h * _kernel_w;
+
+        for (int g = 0; g < group; g++)
+        {
+            // reorder weight from inch-outch to outch-inch
+            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
+            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
+            for (int i = 0; i < outch_g; i++)
+            {
+                for (int j = 0; j < inch_g; j++)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
+                    }
+                }
+            }
+        }
+    }
+
+    Mat bias_data_flattened;
+    if (bias_term)
+    {
+        const Mat& _bias_data = bottom_blobs[2];
+        flatten(_bias_data, bias_data_flattened, opt);
+        if (bias_data_flattened.empty())
+            return -100;
+
+        // bias_data_flattened as pack1
+        bias_data_flattened.w *= bias_data_flattened.elempack;
+        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
+        bias_data_flattened.elempack = 1;
+    }
+
+    ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise);
+
+    ncnn::ParamDict pd;
+    pd.set(0, _num_output);
+    pd.set(1, _kernel_w);
+    pd.set(11, _kernel_h);
+    pd.set(2, dilation_w);
+    pd.set(12, dilation_h);
+    pd.set(3, stride_w);
+    pd.set(13, stride_h);
+    pd.set(4, pad_left);
+    pd.set(15, pad_right);
+    pd.set(14, pad_top);
+    pd.set(16, pad_bottom);
+    pd.set(18, output_pad_right);
+    pd.set(19, output_pad_bottom);
+    pd.set(20, output_w);
+    pd.set(21, output_h);
+    pd.set(5, bias_term);
+    pd.set(6, weight_data_transposed.w);
+    pd.set(7, group);
+    pd.set(9, activation_type);
+    pd.set(10, activation_params);
+
+    op->load_param(pd);
+
+    ncnn::Mat weights[2];
+    weights[0] = weight_data_transposed;
+    weights[1] = bias_data_flattened;
+
+    op->load_model(ncnn::ModelBinFromMatArray(weights));
+
+    op->create_pipeline(opt);
+
+    op->forward(bottom_blob, top_blob, opt);
+
+    op->destroy_pipeline(opt);
+
+    delete op;
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/loongarch/deconvolutiondepthwise_loongarch.h b/src/layer/loongarch/deconvolutiondepthwise_loongarch.h
index e41e7cac9e1..b710f07ecf3 100644
--- a/src/layer/loongarch/deconvolutiondepthwise_loongarch.h
+++ b/src/layer/loongarch/deconvolutiondepthwise_loongarch.h
@@ -29,6 +29,8 @@ class DeconvolutionDepthWise_loongarch : virtual public DeconvolutionDepthWise
 
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
 protected:
     int create_group_ops(const Option& opt);
 
diff --git a/src/layer/mips/deconvolution_mips.cpp b/src/layer/mips/deconvolution_mips.cpp
index c3838f3ea69..506d3072096 100644
--- a/src/layer/mips/deconvolution_mips.cpp
+++ b/src/layer/mips/deconvolution_mips.cpp
@@ -40,6 +40,9 @@ Deconvolution_mips::Deconvolution_mips()
 
 int Deconvolution_mips::create_pipeline(const Option& opt)
 {
+    if (dynamic_weight)
+        return 0;
+
     const int maxk = kernel_w * kernel_h;
     int num_input = weight_data_size / maxk / num_output;
 
@@ -281,4 +284,110 @@ int Deconvolution_mips::forward(const Mat& bottom_blob, Mat& top_blob, const Opt
     return 0;
 }
 
+int Deconvolution_mips::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& _weight_data = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    const int _num_input = bottom_blob.c * bottom_blob.elempack;
+    const int _kernel_w = _weight_data.w;
+    const int _kernel_h = _weight_data.h;
+    const int _num_output = _weight_data.d * 1;
+
+    Mat weight_data_flattened;
+    flatten(_weight_data, weight_data_flattened, opt);
+    if (weight_data_flattened.empty())
+        return -100;
+
+    // weight_data_flattened as pack1
+    weight_data_flattened.w *= weight_data_flattened.elempack;
+    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
+    weight_data_flattened.elempack = 1;
+
+    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
+    Mat weight_data_transposed;
+    {
+        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / 1, 4u, opt.workspace_allocator);
+        if (weight_data_transposed.empty())
+            return -100;
+
+        const int outch_g = _num_output / 1;
+        const int inch_g = _num_input / 1;
+        const int maxk = _kernel_h * _kernel_w;
+
+        for (int g = 0; g < 1; g++)
+        {
+            // reorder weight from inch-outch to outch-inch
+            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
+            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
+            for (int i = 0; i < outch_g; i++)
+            {
+                for (int j = 0; j < inch_g; j++)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
+                    }
+                }
+            }
+        }
+    }
+
+    Mat bias_data_flattened;
+    if (bias_term)
+    {
+        const Mat& _bias_data = bottom_blobs[2];
+        flatten(_bias_data, bias_data_flattened, opt);
+        if (bias_data_flattened.empty())
+            return -100;
+
+        // bias_data_flattened as pack1
+        bias_data_flattened.w *= bias_data_flattened.elempack;
+        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
+        bias_data_flattened.elempack = 1;
+    }
+
+    ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+
+    ncnn::ParamDict pd;
+    pd.set(0, _num_output);
+    pd.set(1, _kernel_w);
+    pd.set(11, _kernel_h);
+    pd.set(2, dilation_w);
+    pd.set(12, dilation_h);
+    pd.set(3, stride_w);
+    pd.set(13, stride_h);
+    pd.set(4, pad_left);
+    pd.set(15, pad_right);
+    pd.set(14, pad_top);
+    pd.set(16, pad_bottom);
+    pd.set(18, output_pad_right);
+    pd.set(19, output_pad_bottom);
+    pd.set(20, output_w);
+    pd.set(21, output_h);
+    pd.set(5, bias_term);
+    pd.set(6, weight_data_transposed.w);
+    pd.set(9, activation_type);
+    pd.set(10, activation_params);
+
+    op->load_param(pd);
+
+    ncnn::Mat weights[2];
+    weights[0] = weight_data_transposed;
+    weights[1] = bias_data_flattened;
+
+    op->load_model(ncnn::ModelBinFromMatArray(weights));
+
+    op->create_pipeline(opt);
+
+    op->forward(bottom_blob, top_blob, opt);
+
+    op->destroy_pipeline(opt);
+
+    delete op;
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/mips/deconvolution_mips.h b/src/layer/mips/deconvolution_mips.h
index 4d5e1ad985a..218bd812672 100644
--- a/src/layer/mips/deconvolution_mips.h
+++ b/src/layer/mips/deconvolution_mips.h
@@ -29,6 +29,8 @@ class Deconvolution_mips : virtual public Deconvolution
 
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
 public:
     Mat weight_data_tm;
 };
diff --git a/src/layer/mips/deconvolutiondepthwise_mips.cpp b/src/layer/mips/deconvolutiondepthwise_mips.cpp
index da76b5801cd..533bf522ad9 100644
--- a/src/layer/mips/deconvolutiondepthwise_mips.cpp
+++ b/src/layer/mips/deconvolutiondepthwise_mips.cpp
@@ -34,6 +34,9 @@ DeconvolutionDepthWise_mips::DeconvolutionDepthWise_mips()
 
 int DeconvolutionDepthWise_mips::create_pipeline(const Option& opt)
 {
+    if (dynamic_weight)
+        return 0;
+
     const int maxk = kernel_w * kernel_h;
     int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
 
@@ -409,4 +412,111 @@ int DeconvolutionDepthWise_mips::forward(const Mat& bottom_blob, Mat& top_blob,
     return 0;
 }
 
+int DeconvolutionDepthWise_mips::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& _weight_data = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    const int _num_input = bottom_blob.c * bottom_blob.elempack;
+    const int _kernel_w = _weight_data.w;
+    const int _kernel_h = _weight_data.h;
+    const int _num_output = _weight_data.d * group;
+
+    Mat weight_data_flattened;
+    flatten(_weight_data, weight_data_flattened, opt);
+    if (weight_data_flattened.empty())
+        return -100;
+
+    // weight_data_flattened as pack1
+    weight_data_flattened.w *= weight_data_flattened.elempack;
+    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
+    weight_data_flattened.elempack = 1;
+
+    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
+    Mat weight_data_transposed;
+    {
+        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / group, 4u, opt.workspace_allocator);
+        if (weight_data_transposed.empty())
+            return -100;
+
+        const int outch_g = _num_output / group;
+        const int inch_g = _num_input / group;
+        const int maxk = _kernel_h * _kernel_w;
+
+        for (int g = 0; g < group; g++)
+        {
+            // reorder weight from inch-outch to outch-inch
+            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
+            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
+            for (int i = 0; i < outch_g; i++)
+            {
+                for (int j = 0; j < inch_g; j++)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
+                    }
+                }
+            }
+        }
+    }
+
+    Mat bias_data_flattened;
+    if (bias_term)
+    {
+        const Mat& _bias_data = bottom_blobs[2];
+        flatten(_bias_data, bias_data_flattened, opt);
+        if (bias_data_flattened.empty())
+            return -100;
+
+        // bias_data_flattened as pack1
+        bias_data_flattened.w *= bias_data_flattened.elempack;
+        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
+        bias_data_flattened.elempack = 1;
+    }
+
+    ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise);
+
+    ncnn::ParamDict pd;
+    pd.set(0, _num_output);
+    pd.set(1, _kernel_w);
+    pd.set(11, _kernel_h);
+    pd.set(2, dilation_w);
+    pd.set(12, dilation_h);
+    pd.set(3, stride_w);
+    pd.set(13, stride_h);
+    pd.set(4, pad_left);
+    pd.set(15, pad_right);
+    pd.set(14, pad_top);
+    pd.set(16, pad_bottom);
+    pd.set(18, output_pad_right);
+    pd.set(19, output_pad_bottom);
+    pd.set(20, output_w);
+    pd.set(21, output_h);
+    pd.set(5, bias_term);
+    pd.set(6, weight_data_transposed.w);
+    pd.set(7, group);
+    pd.set(9, activation_type);
+    pd.set(10, activation_params);
+
+    op->load_param(pd);
+
+    ncnn::Mat weights[2];
+    weights[0] = weight_data_transposed;
+    weights[1] = bias_data_flattened;
+
+    op->load_model(ncnn::ModelBinFromMatArray(weights));
+
+    op->create_pipeline(opt);
+
+    op->forward(bottom_blob, top_blob, opt);
+
+    op->destroy_pipeline(opt);
+
+    delete op;
+
+    return 0;
+}
+
 } // namespace ncnn
diff --git a/src/layer/mips/deconvolutiondepthwise_mips.h b/src/layer/mips/deconvolutiondepthwise_mips.h
index 90cb7c3acbb..a033d7c11c3 100644
--- a/src/layer/mips/deconvolutiondepthwise_mips.h
+++ b/src/layer/mips/deconvolutiondepthwise_mips.h
@@ -29,6 +29,8 @@ class DeconvolutionDepthWise_mips : virtual public DeconvolutionDepthWise
 
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
 protected:
     int create_group_ops(const Option& opt);
 
diff --git a/src/layer/riscv/deconvolution_riscv.cpp b/src/layer/riscv/deconvolution_riscv.cpp
index 936b563f646..b3a85fe946a 100644
--- a/src/layer/riscv/deconvolution_riscv.cpp
+++ b/src/layer/riscv/deconvolution_riscv.cpp
@@ -50,6 +50,9 @@ Deconvolution_riscv::Deconvolution_riscv()
 
 int Deconvolution_riscv::create_pipeline(const Option& opt)
 {
+    if (dynamic_weight)
+        return 0;
+
 #if __riscv_vector && __riscv_zfh
     if (opt.use_fp16_storage)
     {
@@ -318,6 +321,130 @@ int Deconvolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op
     return 0;
 }
 
+int Deconvolution_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& _weight_data = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    const int _num_input = bottom_blob.c * bottom_blob.elempack;
+    const int _kernel_w = _weight_data.w;
+    const int _kernel_h = _weight_data.h;
+    const int _num_output = _weight_data.d * 1;
+
+    Mat weight_data_flattened;
+    flatten(_weight_data, weight_data_flattened, opt);
+    if (weight_data_flattened.empty())
+        return -100;
+
+#if NCNN_RVV
+    if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && weight_data_flattened.elembits() == 16)
+    {
+        Mat weight_data_flattened_fp32;
+        cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
+        weight_data_flattened = weight_data_flattened_fp32;
+    }
+#endif // NCNN_RVV
+
+    // weight_data_flattened as pack1
+    weight_data_flattened.w *= weight_data_flattened.elempack;
+    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
+    weight_data_flattened.elempack = 1;
+
+    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
+    Mat weight_data_transposed;
+    {
+        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / 1, 4u, opt.workspace_allocator);
+        if (weight_data_transposed.empty())
+            return -100;
+
+        const int outch_g = _num_output / 1;
+        const int inch_g = _num_input / 1;
+        const int maxk = _kernel_h * _kernel_w;
+
+        for (int g = 0; g < 1; g++)
+        {
+            // reorder weight from inch-outch to outch-inch
+            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
+            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
+            for (int i = 0; i < outch_g; i++)
+            {
+                for (int j = 0; j < inch_g; j++)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
+                    }
+                }
+            }
+        }
+    }
+
+    Mat bias_data_flattened;
+    if (bias_term)
+    {
+        const Mat& _bias_data = bottom_blobs[2];
+        flatten(_bias_data, bias_data_flattened, opt);
+        if (bias_data_flattened.empty())
+            return -100;
+
+#if NCNN_RVV
+        if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && bias_data_flattened.elembits() == 16)
+        {
+            Mat bias_data_flattened_fp32;
+            cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
+            bias_data_flattened = bias_data_flattened_fp32;
+        }
+#endif // NCNN_RVV
+
+        // bias_data_flattened as pack1
+        bias_data_flattened.w *= bias_data_flattened.elempack;
+        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
+        bias_data_flattened.elempack = 1;
+    }
+
+    ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Deconvolution);
+
+    ncnn::ParamDict pd;
+    pd.set(0, _num_output);
+    pd.set(1, _kernel_w);
+    pd.set(11, _kernel_h);
+    pd.set(2, dilation_w);
+    pd.set(12, dilation_h);
+    pd.set(3, stride_w);
+    pd.set(13, stride_h);
+    pd.set(4, pad_left);
+    pd.set(15, pad_right);
+    pd.set(14, pad_top);
+    pd.set(16, pad_bottom);
+    pd.set(18, output_pad_right);
+    pd.set(19, output_pad_bottom);
+    pd.set(20, output_w);
+    pd.set(21, output_h);
+    pd.set(5, bias_term);
+    pd.set(6, weight_data_transposed.w);
+    pd.set(9, activation_type);
+    pd.set(10, activation_params);
+
+    op->load_param(pd);
+
+    ncnn::Mat weights[2];
+    weights[0] = weight_data_transposed;
+    weights[1] = bias_data_flattened;
+
+    op->load_model(ncnn::ModelBinFromMatArray(weights));
+
+    op->create_pipeline(opt);
+
+    op->forward(bottom_blob, top_blob, opt);
+
+    op->destroy_pipeline(opt);
+
+    delete op;
+
+    return 0;
+}
+
 #if __riscv_vector && __riscv_zfh
 int Deconvolution_riscv::create_pipeline_fp16s(const Option& opt)
 {
diff --git a/src/layer/riscv/deconvolution_riscv.h b/src/layer/riscv/deconvolution_riscv.h
index 3574c09d10e..903a420427a 100644
--- a/src/layer/riscv/deconvolution_riscv.h
+++ b/src/layer/riscv/deconvolution_riscv.h
@@ -29,6 +29,8 @@ class Deconvolution_riscv : virtual public Deconvolution
 
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
 protected:
 #if __riscv_vector && __riscv_zfh
     int create_pipeline_fp16s(const Option& opt);
diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp
index b53e8962fd2..4dced8e658b 100644
--- a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp
+++ b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp
@@ -37,6 +37,9 @@ DeconvolutionDepthWise_riscv::DeconvolutionDepthWise_riscv()
 
 int DeconvolutionDepthWise_riscv::create_pipeline(const Option& opt)
 {
+    if (dynamic_weight)
+        return 0;
+
 #if __riscv_vector && __riscv_zfh
     if (opt.use_fp16_storage)
     {
@@ -445,6 +448,131 @@ int DeconvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob,
     return 0;
 }
 
+int DeconvolutionDepthWise_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    const Mat& bottom_blob = bottom_blobs[0];
+    const Mat& _weight_data = bottom_blobs[1];
+    Mat& top_blob = top_blobs[0];
+
+    const int _num_input = bottom_blob.c * bottom_blob.elempack;
+    const int _kernel_w = _weight_data.w;
+    const int _kernel_h = _weight_data.h;
+    const int _num_output = _weight_data.d * group;
+
+    Mat weight_data_flattened;
+    flatten(_weight_data, weight_data_flattened, opt);
+    if (weight_data_flattened.empty())
+        return -100;
+
+#if NCNN_RVV
+    if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && weight_data_flattened.elembits() == 16)
+    {
+        Mat weight_data_flattened_fp32;
+        cast_float16_to_float32(weight_data_flattened, weight_data_flattened_fp32, opt);
+        weight_data_flattened = weight_data_flattened_fp32;
+    }
+#endif // NCNN_RVV
+
+    // weight_data_flattened as pack1
+    weight_data_flattened.w *= weight_data_flattened.elempack;
+    weight_data_flattened.elemsize /= weight_data_flattened.elempack;
+    weight_data_flattened.elempack = 1;
+
+    // transpose group-inch/group-outch/group-kh-kw to group-outch/group-inch/group-kh-kw
+    Mat weight_data_transposed;
+    {
+        weight_data_transposed.create(_kernel_w * _kernel_h * _num_output * _num_input / group, 4u, opt.workspace_allocator);
+        if (weight_data_transposed.empty())
+            return -100;
+
+        const int outch_g = _num_output / group;
+        const int inch_g = _num_input / group;
+        const int maxk = _kernel_h * _kernel_w;
+
+        for (int g = 0; g < group; g++)
+        {
+            // reorder weight from inch-outch to outch-inch
+            float* wg2 = (float*)weight_data_transposed + g * outch_g * inch_g * maxk;
+            const float* wg = (const float*)weight_data_flattened + g * inch_g * outch_g * maxk;
+            for (int i = 0; i < outch_g; i++)
+            {
+                for (int j = 0; j < inch_g; j++)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        wg2[(i * inch_g + j) * maxk + k] = wg[(j * outch_g + i) * maxk + k];
+                    }
+                }
+            }
+        }
+    }
+
+    Mat bias_data_flattened;
+    if (bias_term)
+    {
+        const Mat& _bias_data = bottom_blobs[2];
+        flatten(_bias_data, bias_data_flattened, opt);
+        if (bias_data_flattened.empty())
+            return -100;
+
+#if NCNN_RVV
+        if (opt.use_fp16_storage && cpu_support_riscv_v() && cpu_support_riscv_zfh() && bias_data_flattened.elembits() == 16)
+        {
+            Mat bias_data_flattened_fp32;
+            cast_float16_to_float32(bias_data_flattened, bias_data_flattened_fp32, opt);
+            bias_data_flattened = bias_data_flattened_fp32;
+        }
+#endif // NCNN_RVV
+
+        // bias_data_flattened as pack1
+        bias_data_flattened.w *= bias_data_flattened.elempack;
+        bias_data_flattened.elemsize /= bias_data_flattened.elempack;
+        bias_data_flattened.elempack = 1;
+    }
+
+    ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::DeconvolutionDepthWise);
+
+    ncnn::ParamDict pd;
+    pd.set(0, _num_output);
+    pd.set(1, _kernel_w);
+    pd.set(11, _kernel_h);
+    pd.set(2, dilation_w);
+    pd.set(12, dilation_h);
+    pd.set(3, stride_w);
+    pd.set(13, stride_h);
+    pd.set(4, pad_left);
+    pd.set(15, pad_right);
+    pd.set(14, pad_top);
+    pd.set(16, pad_bottom);
+    pd.set(18, output_pad_right);
+    pd.set(19, output_pad_bottom);
+    pd.set(20, output_w);
+    pd.set(21, output_h);
+    pd.set(5, bias_term);
+    pd.set(6, weight_data_transposed.w);
+    pd.set(7, group);
+    pd.set(9, activation_type);
+    pd.set(10, activation_params);
+
+    op->load_param(pd);
+
+    ncnn::Mat weights[2];
+    weights[0] = weight_data_transposed;
+    weights[1] = bias_data_flattened;
+
+    op->load_model(ncnn::ModelBinFromMatArray(weights));
+
+    op->create_pipeline(opt);
+
+    op->forward(bottom_blob, top_blob, opt);
+
+    op->destroy_pipeline(opt);
+
+    delete op;
+
+    return 0;
+}
+
 #if __riscv_vector && __riscv_zfh
 int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt)
 {
diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.h b/src/layer/riscv/deconvolutiondepthwise_riscv.h
index ccda5f248e3..5cdbd0d0676 100644
--- a/src/layer/riscv/deconvolutiondepthwise_riscv.h
+++ b/src/layer/riscv/deconvolutiondepthwise_riscv.h
@@ -29,6 +29,8 @@ class DeconvolutionDepthWise_riscv : virtual public DeconvolutionDepthWise
 
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
 protected:
     int create_group_ops(const Option& opt);
 #if __riscv_vector && __riscv_zfh
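Every port follows the same pattern: create_pipeline() becomes a no-op when dynamic_weight is set, since there is nothing to pre-pack at load time, and each forward() builds a short-lived reference Deconvolution / DeconvolutionDepthWise op around the freshly transposed weights. As written, the inner op's return codes are discarded; a stricter tail for these forward() bodies could propagate them, sketched below as a possible refinement rather than what the patch does:

    // drop-in replacement for the tail of the new forward() bodies (sketch)
    op->load_param(pd);
    op->load_model(ncnn::ModelBinFromMatArray(weights));
    op->create_pipeline(opt);

    int ret = op->forward(bottom_blob, top_blob, opt);

    op->destroy_pipeline(opt);
    delete op;

    return ret; // propagate the inner op's status instead of a fixed 0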