diff --git a/VERSION b/VERSION index 3c43790f..fd9d1a5a 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -1.2.6 +1.2.14 diff --git a/src/tim/vx/internal/include/interface/ops.def b/src/tim/vx/internal/include/interface/ops.def index fe42d453..23d3f746 100644 --- a/src/tim/vx/internal/include/interface/ops.def +++ b/src/tim/vx/internal/include/interface/ops.def @@ -199,3 +199,7 @@ DEF_OP(CROP_AND_RESIZE) DEF_OP(TAN) DEF_OP(RMSNORM) DEF_OP(SHAPE) +DEF_OP(BITCAST) +DEF_OP(GROUPED_CONV3D) +DEF_OP(COL2IM) +DEF_OP(L1_LAYER_NORM) diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_bitcast.h b/src/tim/vx/internal/include/ops/vsi_nn_op_bitcast.h new file mode 100644 index 00000000..9592e6a0 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_bitcast.h @@ -0,0 +1,44 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_BITCAST_H +#define _VSI_NN_OP_BITCAST_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_bitcast_param +{ + struct _bitcast_local_data_t* local; +} vsi_nn_bitcast_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_col2im.h b/src/tim/vx/internal/include/ops/vsi_nn_op_col2im.h new file mode 100644 index 00000000..0cbadb72 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_col2im.h @@ -0,0 +1,49 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_COL2IM_H +#define _VSI_NN_OP_COL2IM_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_col2im_param +{ + const int32_t* image_shape; + const int32_t* block_shape; + int32_t strides[3]; + int32_t pads[6]; + int32_t dilations[3]; + int32_t dim_num; +} vsi_nn_col2im_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv3d.h b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv3d.h new file mode 100644 index 00000000..87de1e79 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_grouped_conv3d.h @@ -0,0 +1,55 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_GROUPED_CONV3D_H +#define _VSI_NN_OP_GROUPED_CONV3D_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_grouped_conv3d_param +{ + void* local; + uint32_t ksize[3]; + uint32_t stride[3]; + /* Pad left, right, top, bottom, front, rear */ + uint32_t pad[6]; + /* Pad type default value shall be AUTO */ + vsi_nn_pad_e pad_type; + uint32_t weights; + uint32_t group; + uint32_t dilation[3]; + int32_t multiplier; + vsi_nn_pad_mode_e pad_mode; +} vsi_nn_grouped_conv3d_param; + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/ops/vsi_nn_op_l1_layer_norm.h b/src/tim/vx/internal/include/ops/vsi_nn_op_l1_layer_norm.h new file mode 100644 index 00000000..80de07e7 --- /dev/null +++ b/src/tim/vx/internal/include/ops/vsi_nn_op_l1_layer_norm.h @@ -0,0 +1,47 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#ifndef _VSI_NN_OP_L1_LAYER_NORM_H +#define _VSI_NN_OP_L1_LAYER_NORM_H + +#include "vsi_nn_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct _vsi_nn_l1_layer_norm_param +{ + struct _l1_layer_norm_local_data_t * local; + float eps; + int32_t axis; +} vsi_nn_l1_layer_norm_param; + + +#ifdef __cplusplus +} +#endif + +#endif + diff --git a/src/tim/vx/internal/include/utils/vsi_nn_util.h b/src/tim/vx/internal/include/utils/vsi_nn_util.h index 007983c6..010b52c7 100644 --- a/src/tim/vx/internal/include/utils/vsi_nn_util.h +++ b/src/tim/vx/internal/include/utils/vsi_nn_util.h @@ -349,7 +349,7 @@ vsi_bool vsi_nn_IsEVISFeatureAvaiable vsi_nn_context_t context ); -int32_t vsi_nn_compareVersion +OVXLIB_API int32_t vsi_nn_compareVersion ( vsi_nn_graph_t * graph, uint32_t version_major, diff --git a/src/tim/vx/internal/include/vsi_nn/vsi_nn.h b/src/tim/vx/internal/include/vsi_nn/vsi_nn.h new file mode 100644 index 00000000..115a2e81 --- /dev/null +++ b/src/tim/vx/internal/include/vsi_nn/vsi_nn.h @@ -0,0 +1,2034 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ +/** + * @file vsi_nn.h + */ +#ifndef _VSI_NN_INTERFACE_H +#define _VSI_NN_INTERFACE_H + +#if defined(_MSC_VER) +#define EXPORT __declspec(dllexport) +#elif defined(__linux__) +#define EXPORT __attribute__((visibility("default"))) +#else +#define EXPORT +#endif + +#if !defined(_IN) +#define _IN +#endif +#if !defined(_OUT) +#define _OUT +#endif +#if !defined(_INOUT) +#define _INOUT +#endif +#if !defined(_OPTIONAL) +#define _OPTIONAL +#endif + +#include +#include + +#if defined(__cplusplus) +#define __BEGIN_DECLS extern "C" { +#define __END_DECLS } +#else +#define __BEGIN_DECLS +#define __END_DECLS +#endif + +__BEGIN_DECLS + + +#ifndef TRUE +#define TRUE (1) +#endif +#ifndef FALSE +#define FALSE (0) +#endif + + +/** + * Return codes. + */ +typedef enum +{ + /** + * Operation was succesful. + */ + VSI_NN_ERROR_OK = 0, + + /** + * Failure caused by vsi_nn api fail. + */ + VSI_NN_ERROR_API_FAIL = 1, + + /** + * Failure caused by not enough available memory. + */ + VSI_NN_ERROR_OUT_OF_MEMORY = 2, + + /** + * Failure caused by unexpected null argument. 
+ */ + VSI_NN_ERROR_UNEXPECTED_NULL = 3, + + /** + * Failure caused by invalid function arguments, invalid model definition, + * invalid execution definition or invalid data at execution time. + */ + VSI_NN_ERROR_VALUED_ERROR = 4, + + /** + * Failure caused by operations that need completed graph. + */ + VSI_NN_ERROR_UNCOMPLETE_GRAPH = 5, + + /** + * Failure caused by insearting a keyword argument repeatly. + */ + VSI_NN_ERROR_KWARGS_REPEAT = 6, +} VSI_NN_error_e; + +/** + * Implicit padding algorithms. + */ +typedef enum +{ + /** + * Pad with const value which are specific by others parameters. + */ + VSI_NN_IMPLICIT_PADDING_NONE = 0, + + /** + * Implicit(VALID) padding. + * No padding. + */ + VSI_NN_IMPLICIT_PADDING_VALID = 1, + + /** + * Implicit(SAME) padding. + * Padding on both ends are the "same". + */ + VSI_NN_IMPLICIT_PADDING_SAME = 2, +} VSI_NN_implicit_padding_e; + +/** + * Padding mode. + */ +typedef enum +{ + /** + * Pad with const value which are specific by others parameters, default 0. + */ + VSI_NN_PADDING_MODE_CONSTANT = 0, + + /** + * Reflect padding mode + */ + VSI_NN_PADDING_MODE_REFLECT = 1, + + /** + * Symmetric padding mode + */ + VSI_NN_PADDING_MODE_SYMMETRIC = 2, + + /** + * Replicate padding mode + */ + VSI_NN_PADDING_MODE_REPLICATE = 3, +} VSI_NN_padding_mode_e; + +/** + * Rounding methods + */ +typedef enum +{ + /** + * Floor rounding + */ + VSI_NN_ROUNDING_FLOOR = 0, + /** + * Ceiling rounding + */ + VSI_NN_ROUNDING_CEIL = 1, +} VSI_NN_rounding_e; + +/** + * LSH Projection supported types. + */ +typedef enum +{ + /** + * Computed bit vector is considered to be sparse. + */ + VSI_NN_LSH_PROJECTION_SPARSE = 1, + /** + * Computed bit vector is considered to be dense. + */ + VSI_NN_LSH_PROJECTION_DENSE = 2, +} VSI_NN_lsh_projection_type_e; + +/** + * Supported activation function types. + */ +typedef enum +{ + /** No activation */ + VSI_NN_ACTIVATION_NONE = 0, + /** ReLU activation */ + VSI_NN_ACTIVATION_RELU = 1, + /** ReLU1 activation */ + VSI_NN_ACTIVATION_RELU1 = 2, + /** ReLU6 activation */ + VSI_NN_ACTIVATION_RELU6 = 3, + /** TanH activation */ + VSI_NN_ACTIVATION_TANH = 4, + /** Sigmoid activation */ + VSI_NN_ACTIVATION_SIGMOID = 5, +} VSI_NN_activation_e; + +/** + * Tensor types. + * + * The type of tensors that can be added to a graph. + */ +typedef enum +{ + /** A tensor of IEEE 754 16 bit floating point values */ + VSI_NN_TENSOR_FLOAT16 = 0, + /** A tensor of 32 bit floating point values */ + VSI_NN_TENSOR_FLOAT32 = 1, + /** A tensor of 64 bit floating point values */ + VSI_NN_TENSOR_FLOAT64 = 2, + /** + * A tensor of 8 bit boolean values. + * + * Values of this operand type are either true or false. A zero value + * represents false; any other value represents true. 
+ */ + VSI_NN_TENSOR_BOOL8 = 3, + /** A tensor of 8 bit integer values */ + VSI_NN_TENSOR_INT8 = 4, + /** A tensor of 16 bit integer values */ + VSI_NN_TENSOR_INT16 = 5, + /** A tensor of 32 bit integer values */ + VSI_NN_TENSOR_INT32 = 6, + /** A tensor of 64 bit integer values */ + VSI_NN_TENSOR_INT64 = 7, + /** A tensor of 8 bit unsigned integer values */ + VSI_NN_TENSOR_UINT8 = 8, + /** A tensor of 16 bit unsigned integer values */ + VSI_NN_TENSOR_UINT16 = 9, + /** A tensor of 32 bit unsigned integer values */ + VSI_NN_TENSOR_UINT32 = 10, + /** A tensor of 64 bit unsigned integer values */ + VSI_NN_TENSOR_UINT64 = 11, + /** A tensor of 16 bit truncate floating point values */ + VSI_NN_TENSOR_BFLOAT16 = 12, +} VSI_NN_tensor_type_e; + +typedef enum { + /** Not a quantized tensor */ + VSI_NN_TENSOR_QUANT_NONE = 0, + /** + * A tensor of 8 bit signed integer values that represent real numbers + * + * Attached to this tensor is a number that can be used to convert + * the 8 bit integer to the real value. + * + * fraction_length: a 32 bit signed integer, in range [-128, 127]. + * + * The formula is: + * real_value = integer_value / pow(2, fraction_length). + */ + VSI_NN_TENSOR_QUANT8_DFP = 1, + /** + * A tensor of 16 bit signed integer values that represent real numbers + * + * Attached to this tensor is a number that can be used to convert + * the 16 bit integer to the real value. + * + * fraction_length: a 32 bit signed integer, in range [-128, 127]. + * + * The formula is: + * real_value = integer_value / pow(2, fraction_length). + */ + VSI_NN_TENSOR_QUANT16_DFP = 2, + /** + * A tensor of 32 bit signed integer values that represent real numbers + * + * Attached to this tensor is a number that can be used to convert + * the 16 bit integer to the real value. + * + * fraction_length: a 32 bit signed integer, in range [-128, 127]. + * + * The formula is: + * real_value = integer_value / pow(2, fraction_length). + */ + VSI_NN_TENSOR_QUANT32_DFP = 3, + /** + * A tensor of 64 bit signed integer values that represent real numbers + * + * Attached to this tensor is a number that can be used to convert + * the 16 bit integer to the real value. + * + * fraction_length: a 32 bit signed integer, in range [-128, 127]. + * + * The formula is: + * real_value = integer_value / pow(2, fraction_length). + */ + VSI_NN_TENSOR_QUANT64_DFP = 4, + /** + * A tensor of 8 bit signed integer values that represent real numbers + * + * Attached to this tensor is a numbers that can be used to convert + * the 8 bit integer to the real value. + * + * scale: a 32 bit floating point value greater than zero. + * + * The formula is: + * real_value = integer_value * scale. + */ + VSI_NN_TENSOR_QUANT8_SYMM = 5, + /** + * A tensor of 32 bit signed integer values that represent real numbers + * + * Attached to this tensor is a numbers that can be used to convert + * the 8 bit integer to the real value. + * + * scale: a 32 bit floating point value greater than zero. + * + * The formula is: + * real_value = integer_value * scale. + */ + VSI_NN_TENSOR_QUANT32_SYMM = 6, + /** + * A tensor of 8 bit unsigned integer values that represent real numbers + * + * Attached to this tensor are two numbers that can be used to convert + * the 8 bit integer to the real value. + * + * scale: a 32 bit floating point value greater than zero. + * zero_point: a 32 bit signed integer, in range [0, 255]. + * + * The formula is: + * real_value = (integer_value - zero_point) * scale. 
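+ * For example (illustrative numbers, not part of the original comment): with scale = 0.5 and zero_point = 128, a stored 8 bit value of 130 decodes to (130 - 128) * 0.5 = 1.0.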
+ */ + VSI_NN_TENSOR_QUANT8_ASYMM = 7, + /** + * A tensor of 8 bit signed integers that represent real numbers. + * + * Attached to this tensor are two numbers that can be used to convert + * the 8 bit integer to the real value. + * + * channel_dim: a 32 bit unsigned integer indicating channel dimension. + * scales: an array of positive 32 bit floating point values. + * The size of the scales array must be equal to shape[channel_dim]. + * + * The formula is: + * realValue[..., C, ...] = integerValue[..., C, ...] * scales[C] + * where C is an index in the Channel dimension. + */ + VSI_NN_TENSOR_QUANT8_PERCHANNEL_SYMM = 8, + /** + * A tensor of 32 bit signed integers that represent real numbers. + * + * Attached to this tensor are two numbers that can be used to convert + * the 8 bit integer to the real value. + * + * channel_dim: a 32 bit unsigned integer indicating channel dimension. + * scales: an array of positive 32 bit floating point values. + * The size of the scales array must be equal to shape[channel_dim]. + * + * The formula is: + * realValue[..., C, ...] = integerValue[..., C, ...] * scales[C] + * where C is an index in the Channel dimension. + */ + VSI_NN_TENSOR_QUANT32_PERCHANNEL_SYMM = 9, +} VSI_NN_tensor_quant_type_e; + +/** Parameters for VSI_NN_TENSOR_QUANT8_ASYMM */ +typedef struct +{ + float scale; + int32_t zero_point; +} VSI_NN_quant_param_asymm; + +/** Parameters for VSI_NN_TENSOR_QUANT8_SYMM */ +typedef struct +{ + float scale; +} VSI_NN_quant_param_symm; + +/** Parameters for VSI_NN_TENSOR_QUANT8_DFP */ +typedef struct +{ + int32_t fraction_length; +} VSI_NN_quant_param_dfp; + +/** Parameters for VSI_NN_TENSOR_QUANT8_PERCHANNEL_SYMM */ +typedef struct +{ + /** The index of the channel dimension. */ + int32_t channel_dim; + + /** + * The array of scaling values for each channel. + * Each value must be greater than zero. + */ + const float* scales; + + /** + * The size of the scale array. + * Should be equal to shape[channel_dim] of the tensor. + * */ + int32_t scale_count; +} VSI_NN_quant_param_perchannel_symm; + +/** Parameters for quantization */ +typedef struct +{ + /** Tensor quantize type */ + VSI_NN_tensor_quant_type_e type; + union + { + /** Dynamic fixed point quantization */ + VSI_NN_quant_param_dfp dfp; + /** Asymmetric affine quantization */ + VSI_NN_quant_param_asymm asymm; + /** Symmetric affine quantization */ + VSI_NN_quant_param_symm symm; + /** Perchannel symmetric affine quantization */ + VSI_NN_quant_param_perchannel_symm perchannel_symm; + } param; +} VSI_NN_tensor_quant_param; + +/** + * NN Runtime context + */ +typedef struct _vsi_nn_context_t VSI_NN_context; + +/** + * VSI_NN_graph is an opaque type that contains a description of the network operations. + * + * Create graph by calling VSI_NN_graph_create. + * A graph is completed by calling VSI_NN_graph_verify. + * A graph is destroyed by calling VSI_NN_graph_release. + * + */ +typedef struct _vsi_nn_graph VSI_NN_graph; + +/** + * VSI_NN_tensor is an opaque type that can be used to describe a tensor. + * + * Create tensor by calling VSI_NN_tensor_create. + * + */ +typedef struct _vsi_nn_tensor VSI_NN_tensor; + +/** + * Create context + * + * @return Context handle on success or NULL otherwise. + */ +EXPORT VSI_NN_context* VSI_NN_context_create(); + +/** + * Release context + * + * @param[in] ctx_ptr The pointer to context to release, and reset point to null. + */ +EXPORT void VSI_NN_context_release + ( + _IN VSI_NN_context** ctx_ptr + ); + +/** + * Create graph + * Create a net graph. 
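+ * A graph created here is typically released with VSI_NN_graph_release when it is no longer needed.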
+ * + * @param[in] ctx The context used to create graph. + * @return The graph on success, or NULL otherwise. + */ +EXPORT VSI_NN_graph* VSI_NN_graph_create + ( + VSI_NN_context* ctx + ); + +/** + * Release graph + * Release a graph and free its resource. + * + * @param[in] graph_ptr The graph to be release. + */ +EXPORT void VSI_NN_graph_release + ( + _IN VSI_NN_graph** graph_ptr + ); + +/** + * Identify graph inputs and outputs + * Identify the input and output tensors of a graph. User should call this to + * specific the inputs and outputs, they are used to exchange data between application + * level and VSI_NN level. + * + * @param[in] graph The graph to be identify. + * @param[in] input_tensors Input tensors. + * @param[in] input_tensors_num Number of input tensors. + * @param[in] output_tensors Output tensors. + * @param[in] output_tensors_num Number of output tensors. + * @return VSI_NN_ERROR_OK on success + */ +EXPORT VSI_NN_error_e VSI_NN_graph_identify_input_output + ( + _IN VSI_NN_graph* graph, + _IN const VSI_NN_tensor** input_tensors, + _IN const int32_t input_tensors_num, + _IN const VSI_NN_tensor** output_tensors, + _IN const int32_t output_tensors_num + ); + +/** + * To freeze a graph with verifying and compiling. + * + * This function may take a long time to compile the graph, and it must only be called + * once for a given graph. + * + * A frozen graph cannot be modified. + * + * @param[in] graph The graph to be finished. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_graph_verify + ( + _IN VSI_NN_graph* graph + ); + +/** + * Compute a frozen graph. + * + * @param[in] graph The graph to be executed. + * + * @return VSI_NN_ERROR_OK on success. VSI_NN_ERROR_UNCOMPLETE_GRAPH if + * the graph is not finished. + */ +EXPORT VSI_NN_error_e VSI_NN_graph_compute + ( + _IN const VSI_NN_graph* graph + ); + +//EXPORT VSI_NN_error_e VSI_NN_GRPAH_profile(_IN const VSI_NN_graph* graph); + +/** + * Add a tensor to a graph. + * + * @param[in] graph The graph to be added. + * @param[in] dtype The data type. + * @param[in] shape The shape for the tensor. + * @param[in] ndim The rank for the tensor. + * @param[in] memory The memory address to the data, the memory address + * must be 64-byte align. If it's set to null, vsi_nn can + * optimize the memory allocation and this is default behavior. + * @param[in] memory_size The size of memory. + * @param[in] quant_param The quantization parameters for the tensor, set + * null if it's not quantized tensor. + * + * @return Tensor handle on success, or NULL if get failure. + */ +EXPORT VSI_NN_tensor* VSI_NN_tensor_create + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor_type_e dtype, + _IN const int32_t* shape, + _IN int32_t ndim, + _IN const VSI_NN_tensor_quant_param* quant_param, + _IN void* memory, + _IN size_t memory_size, + _IN int32_t is_constant + ); + +/** + * Add a virtual tensor to a graph. + * + * @param[in] graph The graph to be added. + * @param[in] dtype The data type. + * + * @return Tensor handle on success, or NULL if get failure. + */ +EXPORT VSI_NN_tensor* VSI_NN_tensor_create_virtual + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor_type_e dtype, + _IN const VSI_NN_tensor_quant_param* quant_param + ); + +/** + * Get element size of a tensor. + * + * @param[in] tensor Tensor to query element size. + * + * @return Element size of the tensor. + */ +EXPORT int32_t VSI_NN_tensor_get_size + ( + _IN const VSI_NN_tensor* tensor + ); + +/** + * Get bytes of a tensor. 
+ * + * @param[in] tensor Tensor to query element size. + * + * @return Bytes of the tensor. + */ +EXPORT int32_t VSI_NN_tensor_get_bytes + ( + _IN const VSI_NN_tensor* tensor + ); + +/** + * Read tensor data. + * + * @param[in] tensor Tensor to read. + * @param[in] memory Memory to fill the data. + * @param[in] memory_size Element size of the read data, + * must be equal to tensor size. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_tensor_read + ( + _IN VSI_NN_tensor* tensor, + _IN void* memory, + _IN size_t memory_size + ); + +/** + * Write data to tensor. + * + * @param[in] tensor Tensor to write. + * @param[in] memory Memory with the data. + * @param[in] memory_size Element size of the write data, + * must be equal to tensor size. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_tensor_write + ( + _IN VSI_NN_tensor* tensor, + _IN void* memory, + _IN size_t memory_size + ); + +/** + * Swap tensors' memories. + * + * @param[in] tensor1 Tensor to swap the memory. + * @param[in] tensor2 Tensor to swap the memory. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_tensor_swap + ( + _IN VSI_NN_tensor* tensor1, + _IN VSI_NN_tensor* tensor2 + ); + +/** + * Swap tensor memories. + * User can use this api to get tensor's original memory. + * + * @param[in] tensor Tensor to swap the memory. + * @param[in] new_memory The new memory for the tensor, + * if NULL, there is no memory swapped. + * @param[in] old_memory Pointer for the tensor's original memory. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_tensor_swap_memory + ( + _IN VSI_NN_tensor* tensor, + _IN _OPTIONAL void* new_memory, + _INOUT void** old_memory + ); + +/** + * Flush tensor memory + * Once a tensor's memory is dirty, user should call this api to sync NPU memory. + * + * @param[in] tensor Tensor to flush memory + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_tensor_flush_memory + ( + _IN const VSI_NN_tensor* tensor + ); + +/** Convolutional */ +/** + * Convolution 1D node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] kernel Kernel with a 3D tensor. + * @param[in] bias Bias with a 1D tensor. + * @param[in] output Node output tensor. + * @param[in] stride Convolution stride. + * @param[in] dilation Convolution dilation rate. + * @param[in] pad_front Padding front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_end Padding end value. + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] implicit_padding Implicit_padding with value VSI_NN_implicit_padding_e. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_conv_1d + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* kernel, + _IN _OPTIONAL VSI_NN_tensor* bias, + _IN VSI_NN_tensor* output, + _IN int32_t stride, + _IN int32_t dilation, + _IN int32_t pad_front, _IN int32_t pad_end, + _IN VSI_NN_implicit_padding_e implicit_padding + ); + +/** + * Convolution 2D node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] kernel Kernel with a 4D tensor. + * @param[in] bias Bias with a 1D tensor. + * @param[in] output Node output tensor. + * @param[in] stride_h Convolution stride height. + * @param[in] stride_w Convolution stride width. 
+ * @param[in] dilation_h Convolution height dilation rate. + * @param[in] dilation_w Convolution width dilation rate. + * @param[in] pad_h_front Padding height front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_h_end Padding height front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_w_front Padding width front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_w_end Padding widht front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] implicit_padding Implicit_padding with value VSI_NN_implicit_padding_e. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_conv_2d + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* kernel, + _IN _OPTIONAL VSI_NN_tensor* bias, + _IN VSI_NN_tensor* output, + _IN int32_t stride_h, _IN int32_t stride_w, + _IN int32_t dilation_h, _IN int32_t dilation_w, + _IN int32_t pad_h_front, _IN int32_t pad_h_end, + _IN int32_t pad_w_front, _IN int32_t pad_w_end, + _IN VSI_NN_implicit_padding_e implicit_padding + ); + +/** + * Depthwise Convolution 2D node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] kernel Kernel with a 4D tensor. + * @param[in] bias Bias with a 1D tensor. + * @param[in] output Node output tensor. + * @param[in] multiplier Depthwise convolution multiplier. + * @param[in] stride_h Convolution stride height. + * @param[in] stride_w Convolution stride width. + * @param[in] dilation_h Convolution height dilation rate. + * @param[in] dilation_w Convolution width dilation rate. + * @param[in] pad_h_front Padding height front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_h_end Padding height front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_w_front Padding width front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_w_end Padding widht front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] implicit_padding Implicit_padding with value VSI_NN_implicit_padding_e. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_depthwise_conv_2d + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* kernel, + _IN _OPTIONAL VSI_NN_tensor* bias, + _IN VSI_NN_tensor* output, + _IN int32_t multiplier, + _IN int32_t stride_h, _IN int32_t stride_w, + _IN int32_t dilation_h, _IN int32_t dilation_w, + _IN int32_t pad_h_front, _IN int32_t pad_h_end, + _IN int32_t pad_w_front, _IN int32_t pad_w_end, + _IN VSI_NN_implicit_padding_e implicit_padding + ); + +/** + * Grouped Convolution 2D node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] kernel Kernel with a 4D tensor. + * @param[in] bias Bias with a 1D tensor. + * @param[in] output Node output tensor. + * @param[in] group_number Group number for the convolution. + * @param[in] stride_h Convolution stride height. + * @param[in] stride_w Convolution stride width. + * @param[in] dilation_h Convolution height dilation rate. + * @param[in] dilation_w Convolution width dilation rate. 
+ * @param[in] pad_h_front Padding height front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_h_end Padding height front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_w_front Padding width front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] pad_w_end Padding widht front value, + * this field only effect when implicit + * padding is VSI_NN_IMPLICIT_PADDING_NONE. + * @param[in] implicit_padding Implicit_padding with value VSI_NN_implicit_padding_e. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_grouped_conv_2d + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* kernel, + _IN _OPTIONAL VSI_NN_tensor* bias, + _IN VSI_NN_tensor* output, + _IN int32_t group_number, + _IN int32_t stride_h, _IN int32_t stride_w, + _IN int32_t dilation_h, _IN int32_t dilation_w, + _IN int32_t pad_h_front, _IN int32_t pad_h_end, + _IN int32_t pad_w_front, _IN int32_t pad_w_end, + _IN VSI_NN_implicit_padding_e implicit_padding + ); + +EXPORT VSI_NN_error_e VSI_NN_node_transposed_conv_2d + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* kernel, + _IN _OPTIONAL VSI_NN_tensor* bias, + _IN VSI_NN_tensor* output, + _IN int32_t stride_h, _IN int32_t stride_w, + _IN int32_t dilation_h, _IN int32_t dilation_w, + _IN int32_t pad_h_front, _IN int32_t pad_h_end, + _IN int32_t pad_w_front, _IN int32_t pad_w_end, + _IN int32_t output_pad_h, _IN int32_t output_pad_w + ); + +/** Pooling */ +EXPORT VSI_NN_error_e VSI_NN_node_average_pool_2d + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t ksize_h, _IN int32_t ksize_w, + _IN int32_t stride_h, _IN int32_t stride_w, + _IN int32_t pad_h_front, _IN int32_t pad_h_end, + _IN int32_t pad_w_front, _IN int32_t pad_w_end, + _IN VSI_NN_implicit_padding_e implicit_padding, + _IN VSI_NN_rounding_e size_rounding + ); + +EXPORT VSI_NN_error_e VSI_NN_node_max_pool_2d + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t ksize_h, _IN int32_t ksize_w, + _IN int32_t stride_h, _IN int32_t stride_w, + _IN int32_t pad_h_front, _IN int32_t pad_h_end, + _IN int32_t pad_w_front, _IN int32_t pad_w_end, + _IN VSI_NN_implicit_padding_e implicit_padding, + _IN VSI_NN_rounding_e size_rounding + ); + +EXPORT VSI_NN_error_e VSI_NN_node_l2_pool_2d + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t ksize_h, _IN int32_t ksize_w, + _IN int32_t stride_h, _IN int32_t stride_w, + _IN int32_t pad_h_front, _IN int32_t pad_h_end, + _IN int32_t pad_w_front, _IN int32_t pad_w_end, + _IN VSI_NN_implicit_padding_e implicit_padding, + _IN VSI_NN_rounding_e size_rounding + ); + +EXPORT VSI_NN_error_e VSI_NN_node_unpool_2d(); + +/** Normalization */ +EXPORT VSI_NN_error_e VSI_NN_node_batch_normalization + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* mean, + _IN VSI_NN_tensor* variance, + _IN VSI_NN_tensor* offset, + _IN VSI_NN_tensor* scale, + _IN VSI_NN_tensor* output, + _IN float variance_epsilon + ); + +/** + * L2 Normalization node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * @param[in] axis Normalize axis. 
+ * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_l2_normalization + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t axis + ); + +EXPORT VSI_NN_error_e VSI_NN_node_local_response_normalization + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t depth_radius, + _IN float bias, + _IN float alpha, + _IN float beta, + _IN int32_t axis + ); + +EXPORT VSI_NN_error_e VSI_NN_node_instance_normalization + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* offset, + _IN VSI_NN_tensor* scale, + _IN VSI_NN_tensor* output, + _IN float variance_epsilon + ); + +/** Math */ +/** + * Add node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_add + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Multiply node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_mul + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Divide node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_div + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Subtract node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_sub + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Floor node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_floor + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Square node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_square + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Sqrt node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_sqrt + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Rsqrt node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. 
+ * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_rsqrt + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Matmul node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * @param[in] transpose_input1 Whether to do transpose on input1. + * @param[in] transpose_input2 Whether to do transpose on input2. + * @param[in] transpose_output Whether to do transpose on output. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_matmul + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output, + _IN int transpose_input1, + _IN int transpose_input2, + _IN int transpose_output + ); + +/** + * Abs node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_abs + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Pow node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_pow + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Maximum node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_maximum + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Minimum node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_minimum + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Exp node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_exp + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Reverse node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * @param[in] axes Axes to reverse. + * @param[in] axes_size Number of axis to reverse. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_reverse + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* axes, + _IN int32_t axes_size + ); + +/** + * Transpose node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * @param[in] perm Transpose order. + * + * @return VSI_NN_ERROR_OK on success. 
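+ * For example (illustrative), perm = {1, 0, 2} on a 3-D input swaps the first two dimensions; perm is expected to hold one entry per input dimension.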
+ */ +EXPORT VSI_NN_error_e VSI_NN_node_transpose + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* perm + ); + +EXPORT VSI_NN_error_e VSI_NN_node_gather + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* indices, + _IN VSI_NN_tensor* output, + _IN int32_t axis + ); + +/** + * Neg node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_neg + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Reduce max node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * @param[in] axes Axes to reduce. + * @param[in] axes_size Number of axis to reduce. + * @param[in] keep_dim Whether to keep dims on output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_reduce_max + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* axes, + _IN int32_t axes_size, + _IN int32_t keep_dim + ); + +/** + * Reduce min node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * @param[in] axes Axes to reduce. + * @param[in] axes_size Number of axis to reduce. + * @param[in] keep_dim Whether to keep dims on output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_reduce_min + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* axes, + _IN int32_t axes_size, + _IN int32_t keep_dim + ); + +/** + * Reduce sum node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * @param[in] axes Axes to reduce. + * @param[in] axes_size Number of axis to reduce. + * @param[in] keep_dim Whether to keep dims on output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_reduce_sum + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* axes, + _IN int32_t axes_size, + _IN int32_t keep_dim + ); + +/** + * Reduce mean node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * @param[in] axes Axes to reduce. + * @param[in] axes_size Number of axis to reduce. + * @param[in] keep_dim Whether to keep dims on output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_reduce_mean + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* axes, + _IN int32_t axes_size, + _IN int32_t keep_dim + ); + +/** + * Sin node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. 
+ */ +EXPORT VSI_NN_error_e VSI_NN_node_sin + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +EXPORT VSI_NN_error_e VSI_NN_node_tile + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* multiples, + _IN int32_t multiples_size + ); + +EXPORT VSI_NN_error_e VSI_NN_node_topk + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN VSI_NN_tensor* output_indices, + _IN int32_t k + ); + +/** Logical */ +/** + * Equal node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_equal + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Greater node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_greater + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Greater equal node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_greater_equal + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Less node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_less + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Less equal node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_less_equal + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Logical and node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_logical_and + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Logical or node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_logical_or + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Logical not node. + * + * @param[in] graph Graph to create the node. 
+ * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_logical_not + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Not equal node. + * + * @param[in] graph Graph to create the node. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_not_equal + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** + * Select node. + * If conditon is true, then output input1 tensor, + * else output input2 tensor. + * + * @param[in] graph Graph to create the node. + * @param[in] condition Conditon tensor.. + * @param[in] input1 Node input tensor. + * @param[in] input2 Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_select + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* condition, + _IN VSI_NN_tensor* input1, + _IN VSI_NN_tensor* input2, + _IN VSI_NN_tensor* output + ); + +/** Activation */ +/** + * relu node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_relu + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * ReLU1 node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_relu1 + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * ReLU6 node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_relu6 + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +EXPORT VSI_NN_error_e VSI_NN_node_tanh + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN float scale_a, + _IN float scale_b + ); + +/** + * Sigmoid node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_sigmoid + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Hard sigmoid node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_hard_sigmoid + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Mish node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. 
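+ * (For reference, Mish is commonly defined as x * tanh(softplus(x)).)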
+ */ +EXPORT VSI_NN_error_e VSI_NN_node_mish + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +EXPORT VSI_NN_error_e VSI_NN_node_leaky_relu + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN float ratio + ); + +EXPORT VSI_NN_error_e VSI_NN_node_prelu + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* alpha, + _IN VSI_NN_tensor* output + ); + +/** + * Soft relu node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_soft_relu + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Elu node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_elu + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** Misc */ +EXPORT VSI_NN_error_e VSI_NN_node_pad + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN VSI_NN_padding_mode_e mode, + _IN const int32_t* pad_front, + _IN const int32_t* pad_end, + _IN int32_t pad_value + ); + +EXPORT VSI_NN_error_e VSI_NN_node_fully_connected + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* kernel, + _IN _OPTIONAL VSI_NN_tensor* bias, + _IN VSI_NN_tensor* output, + _IN int32_t axis + ); + +EXPORT VSI_NN_error_e VSI_NN_node_concate + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* const inputs[], + _IN int32_t input_num, + _IN VSI_NN_tensor* output, + _IN int32_t axis + ); + +EXPORT VSI_NN_error_e VSI_NN_node_split + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* const outputs[], + _IN int32_t output_num, + _IN const int32_t* slices, + _IN int32_t slices_size, + _IN int32_t axis + ); + +/** + * Cast node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_cast + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Quantize node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. + */ +EXPORT VSI_NN_error_e VSI_NN_node_quantize + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +/** + * Dequantize node. + * + * @param[in] graph Graph to create the node. + * @param[in] input Node input tensor. + * @param[in] output Node output tensor. + * + * @return VSI_NN_ERROR_OK on success. 
+ */ +EXPORT VSI_NN_error_e VSI_NN_node_dequantize + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output + ); + +EXPORT VSI_NN_error_e VSI_NN_node_space_to_batch + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* block_size, + _IN int32_t block_size_num, + _IN const int32_t* pad_front, + _IN const int32_t* pad_end + ); + +EXPORT VSI_NN_error_e VSI_NN_node_batch_to_space + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* block_size, + _IN int32_t block_size_num, + _IN const int32_t* crop_front, + _IN const int32_t* crop_end + ); + +EXPORT VSI_NN_error_e VSI_NN_node_space_to_depth + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* block_size, + _IN int32_t block_size_num, + _IN const int32_t* pad_front, + _IN const int32_t* pad_end + ); + +EXPORT VSI_NN_error_e VSI_NN_node_depth_to_space + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* block_size, + _IN int32_t block_size_num, + _IN const int32_t* crop_front, + _IN const int32_t* crop_end + ); + +EXPORT VSI_NN_error_e VSI_NN_node_channel_shuffle + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t group_number, + _IN int32_t axis + ); + +EXPORT VSI_NN_error_e VSI_NN_node_expand_dims + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t axis + ); + +EXPORT VSI_NN_error_e VSI_NN_node_hashtable_lookup + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* lookups, + _IN VSI_NN_tensor* keys, + _IN VSI_NN_tensor* values, + _IN VSI_NN_tensor* output, + _IN VSI_NN_tensor* output_hits + ); + +EXPORT VSI_NN_error_e VSI_NN_node_embedding_lookup + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* lookups, + _IN VSI_NN_tensor* values, + _IN VSI_NN_tensor* output + ); + +EXPORT VSI_NN_error_e VSI_NN_node_lsh_projection + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* hash_func, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* weight, + _IN VSI_NN_tensor* output, + _IN VSI_NN_lsh_projection_type_e type + ); + +EXPORT VSI_NN_error_e VSI_NN_node_slice + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* begin, + _IN const int32_t* size + ); + +EXPORT VSI_NN_error_e VSI_NN_node_strided_slice + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN const int32_t* begin, + _IN const int32_t* end, + _IN const int32_t* strides, + _IN int32_t begin_mask, + _IN int32_t end_mask, + _IN int32_t shrink_axis_mask + ); + +EXPORT VSI_NN_error_e VSI_NN_node_argmax + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t axis + ); + +EXPORT VSI_NN_error_e VSI_NN_node_argmin + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t axis + ); + +/** Detection */ +EXPORT VSI_NN_error_e VSI_NN_node_roi_pool + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* feature_map, + _IN VSI_NN_tensor* loc, + _IN VSI_NN_tensor* batch_index, + _IN VSI_NN_tensor* output, + _IN int32_t output_h, + _IN int32_t output_w, + _IN float ratio_h, + _IN float ratio_w + ); + +EXPORT VSI_NN_error_e VSI_NN_node_roi_align + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* feature_map, + _IN VSI_NN_tensor* loc, + _IN VSI_NN_tensor* batch_index, + _IN VSI_NN_tensor* 
output, + _IN int32_t output_h, + _IN int32_t output_w, + _IN float ratio_h, + _IN float ratio_w, + _IN int32_t sample_num_h, + _IN int32_t sample_num_w + ); + +/** Image transform */ +EXPORT VSI_NN_error_e VSI_NN_node_resize_bilinear + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t scale_h, + _IN int32_t scale_w + ); + +EXPORT VSI_NN_error_e VSI_NN_node_resize_nearest + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* output, + _IN int32_t scale_h, + _IN int32_t scale_w + ); + +/** RNN */ +EXPORT VSI_NN_error_e VSI_NN_node_svdf + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* weights_feature, + _IN VSI_NN_tensor* weights_time, + _IN VSI_NN_tensor* bias, + _IN VSI_NN_tensor* input_state, + _IN VSI_NN_tensor* output, + _IN VSI_NN_tensor* output_state, + _IN int32_t rank + ); + +//EXPORT VSI_NN_error_e VSI_NN_node_rnn(); + +EXPORT VSI_NN_error_e VSI_NN_node_rnn_unit + ( + _IN VSI_NN_graph* graph, + _IN VSI_NN_tensor* input, + _IN VSI_NN_tensor* input_state, + _IN VSI_NN_tensor* weight, _IN VSI_NN_tensor* recrrent_weight, + _IN VSI_NN_tensor* bias, + _IN VSI_NN_tensor* output, + _IN VSI_NN_tensor* output_state, + _IN VSI_NN_activation_e activation + ); + +EXPORT VSI_NN_error_e VSI_NN_node_lstm_unit + ( + _IN VSI_NN_graph* graph + ); + +__END_DECLS +#endif diff --git a/src/tim/vx/internal/include/vsi_nn_context.h b/src/tim/vx/internal/include/vsi_nn_context.h index 4ac9f611..b426e4bd 100644 --- a/src/tim/vx/internal/include/vsi_nn_context.h +++ b/src/tim/vx/internal/include/vsi_nn_context.h @@ -26,6 +26,7 @@ #define _VSI_NN_CONTEXT_H #include "vsi_nn_platform.h" +#include "vsi_nn_types.h" #ifdef __cplusplus extern "C" { @@ -75,12 +76,19 @@ typedef struct _vsi_nn_runtime_option_t int32_t enable_shader; int32_t enable_opcheck; int32_t enable_concat_optimize; - int32_t enable_asymi8_to_u8; + /* 0: disable convert int8 to uint8 + * 1: enable convert asymm int8 to asymm uint8 + * 2: enable convert both asymm and sym int8 to asymm uint8 + */ + int32_t enable_i8_to_u8; int32_t enable_dataconvert_optimize; int32_t enable_stream_processor; int32_t enable_rgb88_planar_nhwc; int32_t enable_slice_optimize; int32_t enable_batch_opt; + int32_t enable_save_file_type; + int32_t enable_use_image_process; + int32_t enable_use_from_handle; } vsi_nn_runtime_option_t; /** @@ -101,6 +109,10 @@ typedef struct _vsi_nn_context_t OVXLIB_API vsi_nn_context_t vsi_nn_CreateContext ( void ); +OVXLIB_API vsi_status vsi_nn_initOptions + ( + vsi_nn_runtime_option_t *options + ); /** * Release context * Release ovxlib NN runtime resource and reset context handle to NULL. 
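The context hunk above replaces enable_asymi8_to_u8 with the three-state enable_i8_to_u8 switch and adds vsi_nn_initOptions() for filling a vsi_nn_runtime_option_t with defaults. Below is a minimal sketch of how these two additions might be used together; the function and field names come from the hunk above, while the surrounding flow (and the step that hands the populated structure back to a context or graph, which this diff does not show) are assumptions for illustration only.

#include "vsi_nn_context.h"

/* Sketch under assumptions: vsi_nn_initOptions() and enable_i8_to_u8 are taken
 * from the hunk above; everything else is illustrative scaffolding. */
static vsi_status configure_runtime_options( void )
{
    vsi_nn_runtime_option_t options;
    vsi_status status = vsi_nn_initOptions( &options );

    if ( VSI_SUCCESS != status )
    {
        return status;
    }

    /* 0: disable int8 -> uint8 conversion
     * 1: convert asymmetric int8 to asymmetric uint8
     * 2: convert both asymmetric and symmetric int8 to asymmetric uint8 */
    options.enable_i8_to_u8 = 2;

    /* enable_save_file_type, enable_use_image_process and enable_use_from_handle
     * are the other new switches; they keep their initialized values here. */

    /* How the populated structure is attached to a context/graph is not part
     * of this hunk, so that step is omitted. */
    return VSI_SUCCESS;
}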
diff --git a/src/tim/vx/internal/include/vsi_nn_feature_config.h b/src/tim/vx/internal/include/vsi_nn_feature_config.h index 7918ae3e..b70b1dca 100644 --- a/src/tim/vx/internal/include/vsi_nn_feature_config.h +++ b/src/tim/vx/internal/include/vsi_nn_feature_config.h @@ -53,5 +53,9 @@ #if defined(VX_13_NN_COMPATIBLITY) #define VSI_MAP_TENSOR_PATCH_SUPPORT #endif +#if defined (VX_QUANT_PER_GROUP_SUPPORT) +#define VSI_PER_GROUP_QUANTIZATION_SUPPORT +#endif +#define VSI_GRAPH_RUNTIME_ENV_SUPPORT #endif
diff --git a/src/tim/vx/internal/include/vsi_nn_graph.h b/src/tim/vx/internal/include/vsi_nn_graph.h index 89786c42..c074cd5d 100644 --- a/src/tim/vx/internal/include/vsi_nn_graph.h +++ b/src/tim/vx/internal/include/vsi_nn_graph.h @@ -814,11 +814,77 @@ OVXLIB_API vsi_status vsi_nn_ExecuteGraphLoop vsi_nn_tensor_t *max_iteration_tensor ); -OVXLIB_API vsi_status vsi_nn_SetGraphTransformOption
+/** + * Set runtime variable + * Set a runtime variable for ovxlib and the driver. + * + * @param[in] graph Graph handle + * @param[in] key Ovxlib or driver environment variable name + * Ovxlib supported keys: + * VSI_NN_ENABLE_I8TOU8 + * VSI_NN_ENABLE_OPCHECK + * VSI_SAVE_FILE_TYPE + * VSI_USE_IMAGE_PROCESS + * VSI_NN_ENABLE_CONCAT_OPTIMIZE + * VSI_NN_ENABLE_DATACONVERT_OPTIMIZE + * VSI_VX_ENABLE_STREAM_PROCESSOR + * VSI_NN_FORCE_RGB888_OUT_NHWC + * VSI_NN_ENABLE_SLICE_OPTIMIZE + * VSI_VX_ENABLE_BATCH_OPT + * VSI_USE_FROM_HANDLE + * Driver keys: + * VIV_VX_ENABLE_GRAPH_TRANSFORM + * VIV_VX_ENABLE_SHADER + * Any key other than the ovxlib keys listed above is treated as a driver environment variable. + * @return VSI_SUCCESS on success, or appropriate error code otherwise + */ +OVXLIB_API vsi_status vsi_nn_SetRunTimeVariable ( vsi_nn_graph_t* graph, - const char* ctrl_str, - size_t size + const char* key, + const char* value + ); +
+/** + * Get runtime variable + * Get a runtime variable of ovxlib. + * + * @param[in] graph Graph handle + * @param[in] key Environment variable name + * Supported keys: + * VSI_NN_ENABLE_I8TOU8 + * VSI_NN_ENABLE_OPCHECK + * VSI_SAVE_FILE_TYPE + * VSI_USE_IMAGE_PROCESS + * VSI_NN_ENABLE_CONCAT_OPTIMIZE + * VSI_NN_ENABLE_DATACONVERT_OPTIMIZE + * VSI_VX_ENABLE_STREAM_PROCESSOR + * VSI_NN_FORCE_RGB888_OUT_NHWC + * VSI_NN_ENABLE_SLICE_OPTIMIZE + * VSI_VX_ENABLE_BATCH_OPT + * VSI_USE_FROM_HANDLE + * VIV_VX_ENABLE_GRAPH_TRANSFORM + * VIV_VX_ENABLE_SHADER + * Only the keys listed above are supported. + * @return The variable's value on success, or NULL otherwise. Note: on success, + * the caller must release the returned memory after using the value. + */ +OVXLIB_API char* vsi_nn_GetRunTimeVariable + ( + const vsi_nn_graph_t* graph, + const char* key + ); + +int32_t vsi_nn_GetVariable(const char* variableKey); + +OVXLIB_API char* vsi_nn_GenerateGraphJson + ( + vsi_nn_graph_t* graph + ); + +OVXLIB_API vsi_status vsi_nn_ReleaseGraphJson + ( + char* json );
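Read together, the runtime-variable and graph-JSON declarations above suggest the usage pattern sketched below. Only the prototypes and key names are taken from this diff; the value strings, the use of free() to release the string returned by vsi_nn_GetRunTimeVariable (the comment only states that the caller must release it), the choice of vsi_nn_graph.h as the include, and the surrounding flow are assumptions.

#include <stdio.h>
#include <stdlib.h>
#include "vsi_nn_graph.h"  /* the new prototypes are declared here per the hunk above */

/* Sketch under assumptions: the graph is created elsewhere and passed in. */
static void tune_graph( vsi_nn_graph_t* graph )
{
    char* value = NULL;
    char* json = NULL;

    /* Ovxlib key: int8 -> uint8 conversion mode, mirroring enable_i8_to_u8. */
    vsi_nn_SetRunTimeVariable( graph, "VSI_NN_ENABLE_I8TOU8", "1" );

    /* Keys outside the ovxlib list are passed through as driver environment
     * variables, e.g. the shader switch. */
    vsi_nn_SetRunTimeVariable( graph, "VIV_VX_ENABLE_SHADER", "1" );

    value = vsi_nn_GetRunTimeVariable( graph, "VSI_NN_ENABLE_I8TOU8" );
    if ( value )
    {
        printf( "VSI_NN_ENABLE_I8TOU8 = %s\n", value );
        free( value );  /* assumption: the header only says the caller must release it */
    }

    /* Dump the graph as JSON and release it with the paired API. */
    json = vsi_nn_GenerateGraphJson( graph );
    if ( json )
    {
        printf( "%s\n", json );
        vsi_nn_ReleaseGraphJson( json );
    }
}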
/** diff --git a/src/tim/vx/internal/include/vsi_nn_node_type.h b/src/tim/vx/internal/include/vsi_nn_node_type.h index a18e8949..dc82aeb5 100644 --- a/src/tim/vx/internal/include/vsi_nn_node_type.h +++ b/src/tim/vx/internal/include/vsi_nn_node_type.h @@ -212,6 +212,10 @@ #include "ops/vsi_nn_op_crop_and_resize.h" #include "ops/vsi_nn_op_rmsnorm.h" #include "ops/vsi_nn_op_shape.h" +#include "ops/vsi_nn_op_bitcast.h" +#include "ops/vsi_nn_op_grouped_conv3d.h" +#include "ops/vsi_nn_op_col2im.h" +#include "ops/vsi_nn_op_l1_layer_norm.h" /* custom node head define define */ #include "custom/vsi_nn_custom_node_type.h" #include "ops/vsi_nn_op_inverse_sigmoid.h" @@ -412,6 +416,10 @@ typedef union _vsi_nn_nn_param vsi_nn_crop_and_resize_param crop_and_resize; vsi_nn_rmsnorm_param rmsnorm; vsi_nn_shape_param shape; + vsi_nn_bitcast_param bitcast; + vsi_nn_grouped_conv3d_param grouped_conv3d; + vsi_nn_col2im_param col2im; + vsi_nn_l1_layer_norm_param l1_layer_norm; void* client_param; /* custom node data struct define */
diff --git a/src/tim/vx/internal/include/vsi_nn_tensor.h b/src/tim/vx/internal/include/vsi_nn_tensor.h index d6ed0904..90dcb224 100644 --- a/src/tim/vx/internal/include/vsi_nn_tensor.h +++ b/src/tim/vx/internal/include/vsi_nn_tensor.h @@ -86,6 +86,8 @@ typedef enum VSI_NN_QNT_TYPE_SYMMETRIC_FLOAT8 = 0x6, /** perchannel float8 */ VSI_NN_QNT_TYPE_PERCHANNEL_SYMMETRIC_FLOAT8 = 0x7, + /** GPTQ */ + VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC = 0x8, /** undefined type */ VSI_NN_QNT_TYPE_NA = 0xff, } vsi_nn_qnt_type_e; @@ -126,6 +128,16 @@ typedef struct vsi_nn_dtype const int32_t * zero_points; int32_t zero_points_dim; }; +#endif +#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT + /** Meaningful in GPTQ_SYMMETRIC */ + struct { + const float* group_scales; + int32_t group_channel_dim; + int32_t group_size; + const int32_t* group_zero_points; + int32_t group_count; + }; #endif }; };
diff --git a/src/tim/vx/internal/include/vsi_nn_version.h b/src/tim/vx/internal/include/vsi_nn_version.h index 92f83491..37368a49 100644 --- a/src/tim/vx/internal/include/vsi_nn_version.h +++ b/src/tim/vx/internal/include/vsi_nn_version.h @@ -33,7 +33,7 @@ extern "C"{ #define VSI_NN_VERSION_MAJOR 1 #define VSI_NN_VERSION_MINOR 2 -#define VSI_NN_VERSION_PATCH 5 +#define VSI_NN_VERSION_PATCH 14 #define VSI_NN_VERSION \ (VSI_NN_VERSION_MAJOR * 10000 + VSI_NN_VERSION_MINOR * 100 + VSI_NN_VERSION_PATCH)
diff --git a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c index bc7d36ef..9d3ead3d 100644 --- a/src/tim/vx/internal/src/kernel/cl/argmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/argmax_cl.c @@ -35,6 +35,8 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#if (!VX_ARGMAX_VX_SUPPORT) + __BEGIN_DECLS @@ -289,3 +291,5 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( argmax, _setup ) + +#endif \ No newline at end of file
diff --git a/src/tim/vx/internal/src/kernel/cl/col2im_cl.c b/src/tim/vx/internal/src/kernel/cl/col2im_cl.c new file mode 100644 index 00000000..4daf9d48 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/cl/col2im_cl.c @@ -0,0 +1,432 @@ +/****************************************************************************
+* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include +#include +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_log.h" +#include "vsi_nn_error.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" + +__BEGIN_DECLS + +#define _COL2IM_KERNEL_SOURCE_NAME "col2im" + +// Add kernel hashtable here +#define COL2IM_HASH_KEY( IN_DTYPE, OUT_DTYPE, _image_2d) \ + (( IN_DTYPE << 16 ) | ( OUT_DTYPE << 8 | (_image_2d))) +#define COL2IM_KERNELS( IN_DTYPE, OUT_DTYPE ) \ + { COL2IM_HASH_KEY( IN_DTYPE, OUT_DTYPE , 0), \ + CVIVANTE_NAMESPACE("cl.col2im_"#IN_DTYPE"to"#OUT_DTYPE), \ + _COL2IM_KERNEL_SOURCE_NAME } + +#define COL2IM_KERNELS_2D( IN_DTYPE, OUT_DTYPE ) \ + { COL2IM_HASH_KEY( IN_DTYPE, OUT_DTYPE , 1), \ + CVIVANTE_NAMESPACE("cl.col2im_"#IN_DTYPE"to"#OUT_DTYPE"_2D"), \ + _COL2IM_KERNEL_SOURCE_NAME } + +typedef struct +{ + uint32_t key; + char * function_name; + const char * source_name; +} _kernel_map_type; + +static const _kernel_map_type _col2im_kernel_map[] = +{ + // Register kernel here + COL2IM_KERNELS( F32, F32 ), + COL2IM_KERNELS( F32, U32 ), + COL2IM_KERNELS( F32, I32 ), + COL2IM_KERNELS( U32, U32 ), + COL2IM_KERNELS( U32, F32 ), + COL2IM_KERNELS( U32, I32 ), + COL2IM_KERNELS( I32, I32 ), + COL2IM_KERNELS( I32, U32 ), + COL2IM_KERNELS( I32, F32 ), + + COL2IM_KERNELS_2D( F32, F32 ), + COL2IM_KERNELS_2D( F32, U32 ), + COL2IM_KERNELS_2D( F32, I32 ), + COL2IM_KERNELS_2D( U32, U32 ), + COL2IM_KERNELS_2D( U32, F32 ), + COL2IM_KERNELS_2D( U32, I32 ), + COL2IM_KERNELS_2D( I32, I32 ), + COL2IM_KERNELS_2D( I32, U32 ), + COL2IM_KERNELS_2D( I32, F32 ), +}; + + +/* + * Kernel params + */ +static vx_param_description_t _col2im_kernel_param_def[] = +{ + {VX_INPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_OUTPUT, VX_TYPE_TENSOR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + 
{VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, + {VX_INPUT, VX_TYPE_SCALAR, VX_PARAMETER_STATE_REQUIRED}, +}; +#define _COL2IM_PARAM_NUM _cnt_of_array( _col2im_kernel_param_def ) + +/* + * Kernel initializer + */ +DEF_KERNEL_INITIALIZER(_col2im_initializer) + ( + vsi_nn_kernel_node_t node, + const vsi_nn_kernel_node_param_t * param, + size_t param_size + ) +{ + vsi_status status = VSI_FAILURE; + gpu_param_t gpu_param = { + 3, // workdim + {0, 0, 0}, // globalWorkOffset: control the start location be processed in the image + {0, 0, 0}, // globalWorkScale: how many pixels could be processed by a single thread + {0, 0, 0}, // localWorkSize: local group size in thread + {0, 0, 0} // globalWorkSize: image size in thread + }; + vsi_nn_kernel_tensor_attr_t * attr[2] = { NULL }; + vsi_size_array_t * in_shape = NULL; + int32_t stride_w = 1, stride_h = 1; + int32_t dilation_w = 1, dilation_h = 1, dilation_d = 1; + int32_t pad_w_front = 0, pad_w_end = 0, pad_h_front = 0, pad_h_end = 0, pad_d_front = 0, pad_d_end = 0; + int32_t kernel_w = 1, kernel_h = 1, kernel_d = 1; + int32_t move_time_x = 0; + int32_t move_time_y = 0; + int32_t width_pad = 0; + int32_t height_pad = 0; + int32_t depth_pad = 0; + int32_t kernel_x_new = 1; + int32_t kernel_y_new = 1; + int32_t kernel_z_new = 1; + int32_t batch = 1; + int32_t width = 1; + int32_t height = 1; + int32_t depth = 1; + + VSI_UNREFERENCED(param_size); + attr[0] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[0] ); + CHECK_PTR_FAIL_GOTO( attr[0], "Create tensor attr buffer fail.", final ); + attr[1] = vsi_nn_kernel_tensor_attr_create( (vsi_nn_kernel_tensor_t)param[1] ); + CHECK_PTR_FAIL_GOTO( attr[1], "Create tensor attr buffer fail.", final ); + + status = vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[2], &stride_w); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[3], &stride_h); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[5], &dilation_w); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[6], &dilation_h); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[7], &dilation_d); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[8], &pad_w_front); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[9], &pad_w_end); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[10], &pad_h_front); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[11], &pad_h_end); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[12], &pad_d_front); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[13], &pad_d_end); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[14], &kernel_w); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[15], &kernel_h); + status |= vsi_nn_kernel_scalar_read_int32((vsi_nn_kernel_scalar_t)param[16], &kernel_d); + CHECK_STATUS_FAIL_GOTO(status, final ); + + batch = 
(int32_t)(attr[0]->shape->data[2]); + width = (int32_t)(attr[1]->shape->data[0]); + height = (int32_t)(attr[1]->shape->data[1]); + depth = (int32_t)(attr[1]->shape->data[2]) / batch; + width_pad = width + pad_w_front + pad_w_end; + height_pad = height + pad_h_front + pad_h_end; + depth_pad = depth + pad_d_front + pad_d_end; + move_time_x = (width_pad - ((kernel_w - 1) * dilation_w + 1) + stride_w) / stride_w; + move_time_y = (height_pad - ((kernel_h - 1) * dilation_h + 1) + stride_h) / stride_h; + kernel_x_new = (kernel_w - 1) * dilation_w + 1; + kernel_y_new = (kernel_h - 1) * dilation_h + 1; + kernel_z_new = (kernel_d - 1) * dilation_d + 1; + + status = vsi_nn_kernel_gpu_add_param( node, "width_pad", &width_pad ); + status |= vsi_nn_kernel_gpu_add_param( node, "height_pad", &height_pad ); + status |= vsi_nn_kernel_gpu_add_param( node, "depth_pad", &depth_pad ); + status |= vsi_nn_kernel_gpu_add_param( node, "move_time_x", &move_time_x ); + status |= vsi_nn_kernel_gpu_add_param( node, "move_time_y", &move_time_y ); + status |= vsi_nn_kernel_gpu_add_param( node, "kernel_x_new", &kernel_x_new ); + status |= vsi_nn_kernel_gpu_add_param( node, "kernel_y_new", &kernel_y_new ); + status |= vsi_nn_kernel_gpu_add_param( node, "kernel_z_new", &kernel_z_new ); + status |= vsi_nn_kernel_gpu_add_param( node, "depth", &depth ); + CHECK_STATUS_FAIL_GOTO(status, final ); + + in_shape = attr[1]->shape; + + gpu_param.global_scale[0] = 1; + gpu_param.global_scale[1] = 1; + gpu_param.global_scale[2] = 1; + gpu_param.global_size[0] = in_shape->data[0]; + gpu_param.global_size[1] = in_shape->data[1]; + gpu_param.global_size[2] = in_shape->data[2]; + + status = vsi_nn_kernel_gpu_config( node, &gpu_param ); + +final: + if (attr[0]) + { + vsi_nn_kernel_tensor_attr_release( &attr[0] ); + } + if (attr[1]) + { + vsi_nn_kernel_tensor_attr_release( &attr[1] ); + } + return status; +} /* _col2im_initializer() */ + +/* + * Query kernel + */ +static vsi_status _query_kernel + ( + vsi_nn_kernel_t * kernel, + vsi_nn_tensor_t * const * const inputs, + vsi_nn_tensor_t * const * const outputs, + vsi_bool image_2d + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_dtype_e in_dtype; + vsi_nn_kernel_dtype_e out_dtype; + const _kernel_map_type * kernel_map = _col2im_kernel_map; + size_t kernel_map_size = _cnt_of_array( _col2im_kernel_map ); + vx_param_description_t * param_def = _col2im_kernel_param_def; + vx_kernel_initialize_f initializer = _col2im_initializer; + + uint32_t key; + uint32_t i; + + in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + + if (F16 == in_dtype) + { + in_dtype = F32; + } + else if (U8 == in_dtype) + { + in_dtype = U32; + } + else if (I8 == in_dtype || I16 == in_dtype) + { + in_dtype = I32; + } + + if (F16 == out_dtype) + { + out_dtype = F32; + } + else if (U8 == out_dtype) + { + out_dtype = U32; + } + else if (I8 == out_dtype || I16 == out_dtype) + { + out_dtype = I32; + } + + key = COL2IM_HASH_KEY( in_dtype, out_dtype ,image_2d); + + for ( i = 0; i < (uint32_t)kernel_map_size; i ++ ) + { + if ( kernel_map[i].key == key ) + { + break; + } + } + if ( i < (uint32_t)kernel_map_size ) + { + snprintf( kernel->info.name, VX_MAX_KERNEL_NAME, "%s", kernel_map[i].function_name ); + kernel->info.parameters = param_def; + kernel->info.numParams = _cnt_of_array( _col2im_kernel_param_def ); + kernel->info.initialize = initializer; + // Register code source + vsi_nn_kernel_add_source( kernel, 
VSI_NN_GPU_SOURCE_FMT_CODE, 2, + "eltwise_ops_helper", + kernel_map[i].source_name ); + // Register binary source + vsi_nn_kernel_add_source( kernel, VSI_NN_GPU_SOURCE_FMT_EXECUTABLE, 1, + kernel_map[i].source_name ); + status = VSI_SUCCESS; + } + return status; +} /* _query_kernel() */ + + +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_param_t node_params[_COL2IM_PARAM_NUM]; + vsi_nn_kernel_node_t node = NULL; + vsi_bool image_2d = FALSE; + vsi_nn_kernel_tensor_t rs_input = NULL, rs_output = NULL; + vsi_size_t shapes[2][VSI_NN_MAX_DIM_NUM] = {{0}}; + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); + float outputScale = vsi_nn_get_tensor_scale(outputs[0]); + float inputZp = (float)vsi_nn_get_tensor_zero_point(inputs[0]); + float outputZp = (float)vsi_nn_get_tensor_zero_point(outputs[0]); + float inOutScale = inputScale / outputScale; + float inOutTile = outputZp - inOutScale * inputZp; + int32_t stride_w = vsi_nn_kernel_param_get_int32( params, "stride_w" ); + int32_t stride_h = vsi_nn_kernel_param_get_int32( params, "stride_h" ); + int32_t stride_d = vsi_nn_kernel_param_get_int32( params, "stride_d" ); + int32_t dilation_w = vsi_nn_kernel_param_get_int32( params, "dilation_w" ); + int32_t dilation_h = vsi_nn_kernel_param_get_int32( params, "dilation_h" ); + int32_t dilation_d = vsi_nn_kernel_param_get_int32( params, "dilation_d" ); + int32_t pad_w_front = vsi_nn_kernel_param_get_int32( params, "pad_w_front" ); + int32_t pad_w_end = vsi_nn_kernel_param_get_int32( params, "pad_w_end" ); + int32_t pad_h_front = vsi_nn_kernel_param_get_int32( params, "pad_h_front" ); + int32_t pad_h_end = vsi_nn_kernel_param_get_int32( params, "pad_h_end" ); + int32_t pad_d_front = vsi_nn_kernel_param_get_int32( params, "pad_d_front" ); + int32_t pad_d_end = vsi_nn_kernel_param_get_int32( params, "pad_d_end" ); + size_t dim_num = 0; + int32_t* block_shape = (int32_t *) vsi_nn_kernel_param_get_buffer( params, "block_shape", &dim_num); + int32_t kernel_w = block_shape[0]; + int32_t kernel_h = dim_num > 1 ? block_shape[1] : 1; + int32_t kernel_d = dim_num > 2 ? block_shape[2] : 1; + + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + image_2d = dim_num > 2 ? 
FALSE : TRUE; + + shapes[0][0] = inputs[0]->attr.size[0]; + shapes[0][1] = inputs[0]->attr.size[1] / outputs[0]->attr.size[dim_num]; + shapes[0][2] = inputs[0]->attr.size[2] * outputs[0]->attr.size[dim_num]; + + shapes[1][0] = outputs[0]->attr.size[0]; + shapes[1][1] = outputs[0]->attr.size[1]; + if (image_2d) + { + shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3]; + } + else + { + shapes[1][2] = outputs[0]->attr.size[2] * outputs[0]->attr.size[3] * outputs[0]->attr.size[4]; + } + + rs_input = vsi_nn_kernel_tensor_reshape( inputs[0]->t, shapes[0], 3 ); + rs_output = vsi_nn_kernel_tensor_reshape( outputs[0]->t, shapes[1], 3 ); + + if (rs_input == NULL || rs_output == NULL) + { + goto final; + } + + status = _query_kernel( kernel, inputs, outputs, image_2d ); + if ( VSI_SUCCESS == status) + { + node = vsi_nn_kernel_create_node( graph, kernel ); + if ( node ) + { + node_params[0] = rs_input; + node_params[1] = rs_output; + node_params[2] = vsi_nn_kernel_scalar_create( graph, I32, &stride_w ); + node_params[3] = vsi_nn_kernel_scalar_create( graph, I32, &stride_h ); + node_params[4] = vsi_nn_kernel_scalar_create( graph, I32, &stride_d ); + node_params[5] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_w ); + node_params[6] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_h ); + node_params[7] = vsi_nn_kernel_scalar_create( graph, I32, &dilation_d ); + node_params[8] = vsi_nn_kernel_scalar_create( graph, I32, &pad_w_front ); + node_params[9] = vsi_nn_kernel_scalar_create( graph, I32, &pad_w_end ); + node_params[10] = vsi_nn_kernel_scalar_create( graph, I32, &pad_h_front ); + node_params[11] = vsi_nn_kernel_scalar_create( graph, I32, &pad_h_end ); + node_params[12] = vsi_nn_kernel_scalar_create( graph, I32, &pad_d_front ); + node_params[13] = vsi_nn_kernel_scalar_create( graph, I32, &pad_d_end ); + node_params[14] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_w ); + node_params[15] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_h ); + node_params[16] = vsi_nn_kernel_scalar_create( graph, I32, &kernel_d ); + node_params[17] = vsi_nn_kernel_scalar_create( graph, F32, &inOutScale ); + node_params[18] = vsi_nn_kernel_scalar_create( graph, F32, &inOutTile ); + + status = vsi_nn_kernel_node_pass_param( node, node_params, _COL2IM_PARAM_NUM ); + CHECK_STATUS(status); + vsi_nn_kernel_scalar_release( &node_params[2] ); + vsi_nn_kernel_scalar_release( &node_params[3] ); + vsi_nn_kernel_scalar_release( &node_params[4] ); + vsi_nn_kernel_scalar_release( &node_params[5] ); + vsi_nn_kernel_scalar_release( &node_params[6] ); + vsi_nn_kernel_scalar_release( &node_params[7] ); + vsi_nn_kernel_scalar_release( &node_params[8] ); + vsi_nn_kernel_scalar_release( &node_params[9] ); + vsi_nn_kernel_scalar_release( &node_params[10] ); + vsi_nn_kernel_scalar_release( &node_params[11] ); + vsi_nn_kernel_scalar_release( &node_params[12] ); + vsi_nn_kernel_scalar_release( &node_params[13] ); + vsi_nn_kernel_scalar_release( &node_params[14] ); + vsi_nn_kernel_scalar_release( &node_params[15] ); + vsi_nn_kernel_scalar_release( &node_params[16] ); + vsi_nn_kernel_scalar_release( &node_params[17] ); + } + } +final: + if (rs_input) + { + vsi_nn_kernel_tensor_release( &rs_input ); + } + if (rs_output) + { + vsi_nn_kernel_tensor_release( &rs_output ); + } + return node; +} /* _setup() */ + +__END_DECLS + +REGISTER_BACKEND_CL( col2im, _setup ) + diff --git a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c index 3a5e0d7b..50c435ba 100644 --- 
a/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/cumsum_cl.c @@ -46,21 +46,36 @@ __BEGIN_DECLS #define KERNEL_SOURCE_1 "cumsum" #define KERNEL_SOURCE_2 "cumsum_2d" +#define KERNEL_SOURCE_3 "cumsum_array_axis0" +#define KERNEL_SOURCE_4 "cumsum_array_axis1" +#define KERNEL_SOURCE_5 "cumsum_array_axis2" +#define KERNEL_SOURCE_6 "cumsum_array_2d_axis0" +#define KERNEL_SOURCE_7 "cumsum_array_2d_axis1" // Add kernel hashtable here -#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ - ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) +#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d, is_array) \ + ((AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 8) | (_image_2d << 4) | (is_array)) #define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE) \ - { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0), \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0), \ CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ KERNEL_SOURCE_1 }, #define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE) \ - { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1), \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0), \ CVIVANTE_NAMESPACE("cl.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \ KERNEL_SOURCE_2 }, +#define HASH_CUMSUM_ARRAY_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1), \ + CVIVANTE_NAMESPACE("cl.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ + SOURCE }, + +#define HASH_CUMSUM_ARRAY_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 1), \ + CVIVANTE_NAMESPACE("cl.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -82,6 +97,22 @@ static const struct { HASH_CUMSUM_KERNELS_2D(1, U8, U8) HASH_CUMSUM_KERNELS_2D(1, F32, F32) HASH_CUMSUM_KERNELS_2D(1, F32, U8) + + HASH_CUMSUM_ARRAY_KERNELS(0, U8, U8, KERNEL_SOURCE_3) + HASH_CUMSUM_ARRAY_KERNELS(0, F32, F32, KERNEL_SOURCE_3) + HASH_CUMSUM_ARRAY_KERNELS(0, F32, U8, KERNEL_SOURCE_3) + HASH_CUMSUM_ARRAY_KERNELS(1, U8, U8, KERNEL_SOURCE_4) + HASH_CUMSUM_ARRAY_KERNELS(1, F32, F32, KERNEL_SOURCE_4) + HASH_CUMSUM_ARRAY_KERNELS(1, F32, U8, KERNEL_SOURCE_4) + HASH_CUMSUM_ARRAY_KERNELS(2, U8, U8, KERNEL_SOURCE_5) + HASH_CUMSUM_ARRAY_KERNELS(2, F32, F32, KERNEL_SOURCE_5) + HASH_CUMSUM_ARRAY_KERNELS(2, F32, U8, KERNEL_SOURCE_5) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, U8, U8, KERNEL_SOURCE_6) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, F32, F32, KERNEL_SOURCE_6) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, F32, U8, KERNEL_SOURCE_6) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, U8, U8, KERNEL_SOURCE_7) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, F32, F32, KERNEL_SOURCE_7) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, F32, U8, KERNEL_SOURCE_7) }; /* @@ -197,7 +228,8 @@ static vsi_status _query_kernel vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, int32_t axis, - int32_t is_2d + int32_t is_2d, + int32_t is_array /* Add extra params */ ) { @@ -230,7 +262,7 @@ static vsi_status _query_kernel output_dtype = F32; } - key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d); + key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_2d, is_array); for ( i = 0; i < _cnt_of_array(cumsum_map); i ++ ) { @@ -270,6 +302,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_t * kernel ) { +#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH vsi_status status = VSI_FAILURE; 
vsi_nn_kernel_node_param_t node_params[_CUMSUM_PARAM_NUM] = {NULL}; vsi_nn_kernel_node_t node = NULL; @@ -291,6 +324,7 @@ static vsi_nn_kernel_node_t _setup int32_t height = 0; int32_t channel = 1; uint32_t i = 0; + int32_t is_array = 0; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); @@ -326,13 +360,16 @@ static vsi_nn_kernel_node_t _setup reshape_tensors[1] = vsi_nn_reshape_tensor( graph, outputs[0], shapes[0], (vsi_size_t)rs_dim ); - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + for (i = 0; i < rs_dim; i++) { - return NULL; + if (shapes[0][i] > VSI_NN_MAX_BLOCK_SIZE) + { + is_array = 1; + } } +#undef VSI_NN_MAX_BLOCK_SIZE - status = _query_kernel( kernel, inputs, outputs, axis_new, is_2d ); + status = _query_kernel( kernel, inputs, outputs, axis_new, is_2d, is_array); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); diff --git a/src/tim/vx/internal/src/kernel/cl/gather_cl.c b/src/tim/vx/internal/src/kernel/cl/gather_cl.c index 66943314..a3ff29e4 100644 --- a/src/tim/vx/internal/src/kernel/cl/gather_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/gather_cl.c @@ -22,7 +22,7 @@ * *****************************************************************************/ -#if !(VX_TENSOR_GATHER_API_SUPPORT) + #include #include #include @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" - +#if !(VX_TENSOR_GATHER_API_SUPPORT) __BEGIN_DECLS /* diff --git a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c index cecb25ae..d439b4d8 100644 --- a/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/layer_normalization_cl.c @@ -22,7 +22,7 @@ * *****************************************************************************/ -#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) + #include #include #include @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" - +#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) __BEGIN_DECLS /* diff --git a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c index a7bcaae3..99936150 100644 --- a/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/log_softmax_cl.c @@ -22,7 +22,7 @@ * *****************************************************************************/ -#if !(VX_LOGSOFTMAX_VX_SUPPORT) + #include #include #include @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" - +#if !(VX_LOGSOFTMAX_VX_SUPPORT) __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c b/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c index 8eee2c47..05cc3034 100644 --- a/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/nearest_grid_sample_cl.c @@ -36,6 +36,8 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#if (!VX_NEAREST_GRID_SAMPLE_VX_SUPPORT) + __BEGIN_DECLS /* @@ -412,3 +414,4 @@ __END_DECLS REGISTER_BACKEND_CL( nearest_grid_sample, _setup ) +#endif diff --git a/src/tim/vx/internal/src/kernel/cl/pow_cl.c b/src/tim/vx/internal/src/kernel/cl/pow_cl.c index 06e3652b..fbce08af 100644 --- a/src/tim/vx/internal/src/kernel/cl/pow_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/pow_cl.c @@ -22,7 +22,7 
@@ * *****************************************************************************/ -#if !(VX_TENSOR_POW_API_SUPPORT) + #include #include #include @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" - +#if !(VX_TENSOR_POW_API_SUPPORT) __BEGIN_DECLS /* diff --git a/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c b/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c index 60fbda3e..21cd7100 100644 --- a/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/resize_bilinear_cl.c @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "libnnext/vx_lib_nnext.h" - +#if (!VX_RESIZE_BILINEAR_SH_SUPPORT) __BEGIN_DECLS #define _RESIZE_BILINEAR_KERNEL_SOURCE() "resize_bilinear" @@ -319,3 +319,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_CL( resize_bilinear, _setup ) +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/cl/tile_cl.c b/src/tim/vx/internal/src/kernel/cl/tile_cl.c index 8227a365..a672e21d 100644 --- a/src/tim/vx/internal/src/kernel/cl/tile_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/tile_cl.c @@ -22,7 +22,7 @@ * *****************************************************************************/ -#if !(VX_TENSOR_TILE_API_SUPPORT) + #include #include #include @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" - +#if !(VX_TENSOR_TILE_API_SUPPORT) __BEGIN_DECLS diff --git a/src/tim/vx/internal/src/kernel/cl/topk_cl.c b/src/tim/vx/internal/src/kernel/cl/topk_cl.c index b8cdfd08..78b9a9bb 100644 --- a/src/tim/vx/internal/src/kernel/cl/topk_cl.c +++ b/src/tim/vx/internal/src/kernel/cl/topk_cl.c @@ -34,20 +34,24 @@ #include "vsi_nn_tensor_util.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" -#include "libnnext/vx_lib_nnext.h" __BEGIN_DECLS #define _TOPK_KERNEL_SOURCE "topk" #define STR(a) #a // Add kernel hashtable here -#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ) \ - ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) ) +#define TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, SECTION ) \ + ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) | (STAGES << 16) | (SECTION << 26)) #define PACK_KERNEL_MAP( IN_DTYPE, OUT_DTYPE, STAGES ) \ - { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES ), \ + { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, STAGES, 0 ), \ CVIVANTE_NAMESPACE("cl.topk_stage"STR(STAGES)"_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \ _TOPK_KERNEL_SOURCE } +#define PACK_MERGE_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ + { TOPK_HASH_KEY( IN_DTYPE, OUT_DTYPE, 0, 1 ), \ + CVIVANTE_NAMESPACE("cl.topk_stage_"STR(IN_DTYPE)"to"STR(OUT_DTYPE)"_I32"), \ + "topk2" } + #define TOPK_ODD_EVEN_SORT_HASH_KEY( IN_DTYPE, OUT_DTYPE ) \ ( ( IN_DTYPE ) | ( OUT_DTYPE << 8 ) ) #define PACK_ODD_EVEN_SORT_KERNEL_MAP( IN_DTYPE, OUT_DTYPE ) \ @@ -79,6 +83,7 @@ static const _kernel_map_type _topk_kernel_map[] = PACK_KERNEL_MAP( F32, F32, 4 ), PACK_KERNEL_MAP( F32, F32, 5 ), PACK_KERNEL_MAP( F32, F32, 6 ), + PACK_KERNEL_MAP( F32, F32, 9 ), PACK_KERNEL_MAP( U32, U32, 0 ), PACK_KERNEL_MAP( U32, U32, 1 ), @@ -87,6 +92,7 @@ static const _kernel_map_type _topk_kernel_map[] = PACK_KERNEL_MAP( U32, U32, 4 ), PACK_KERNEL_MAP( U32, U32, 5 ), PACK_KERNEL_MAP( U32, U32, 6 ), + PACK_KERNEL_MAP( U32, U32, 9 ), PACK_KERNEL_MAP( I32, I32, 0 ), PACK_KERNEL_MAP( I32, I32, 1 ), @@ -95,6 +101,7 @@ static const _kernel_map_type _topk_kernel_map[] = 
PACK_KERNEL_MAP( I32, I32, 4 ), PACK_KERNEL_MAP( I32, I32, 5 ), PACK_KERNEL_MAP( I32, I32, 6 ), + PACK_KERNEL_MAP( I32, I32, 9 ), PACK_KERNEL_MAP( F32, U32, 0 ), PACK_KERNEL_MAP( F32, U32, 1 ), @@ -103,6 +110,7 @@ static const _kernel_map_type _topk_kernel_map[] = PACK_KERNEL_MAP( F32, U32, 4 ), PACK_KERNEL_MAP( F32, U32, 5 ), PACK_KERNEL_MAP( F32, U32, 6 ), + PACK_KERNEL_MAP( F32, U32, 9 ), PACK_KERNEL_MAP( F32, I32, 0 ), PACK_KERNEL_MAP( F32, I32, 1 ), @@ -111,6 +119,10 @@ static const _kernel_map_type _topk_kernel_map[] = PACK_KERNEL_MAP( F32, I32, 4 ), PACK_KERNEL_MAP( F32, I32, 5 ), PACK_KERNEL_MAP( F32, I32, 6 ), + PACK_KERNEL_MAP( F32, I32, 9 ), + + PACK_MERGE_KERNEL_MAP(U32, U32), + PACK_MERGE_KERNEL_MAP(I32, I32), }; static const _kernel_map_type _topk_odd_even_sort_kernel_map[] = @@ -254,7 +266,8 @@ static vsi_status _query_kernel vsi_nn_kernel_t * kernel, vsi_nn_tensor_t * const * const inputs, vsi_nn_tensor_t * const * const outputs, - int32_t num_stages + int32_t num_stages, + vsi_bool is_bitnoic_segment ) { vsi_status status = VSI_FAILURE; @@ -272,21 +285,23 @@ static vsi_status _query_kernel in_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); out_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + num_stages = is_bitnoic_segment ? 0 : num_stages; + switch (_PACK_SELECT_KEY(in_dtype, out_dtype)) { case _PACK_SELECT_KEY(F32, F32): case _PACK_SELECT_KEY(F16, F16): - key = TOPK_HASH_KEY( F32, F32, num_stages ); + key = TOPK_HASH_KEY( F32, F32, num_stages, is_bitnoic_segment ); break; case _PACK_SELECT_KEY(U32, U32): case _PACK_SELECT_KEY(U16, U16): case _PACK_SELECT_KEY(U8, U8): - key = TOPK_HASH_KEY( U32, U32, num_stages ); + key = TOPK_HASH_KEY( U32, U32, num_stages, is_bitnoic_segment ); break; case _PACK_SELECT_KEY(I32, I32): case _PACK_SELECT_KEY(I16, I16): case _PACK_SELECT_KEY(I8, I8): - key = TOPK_HASH_KEY( I32, I32, num_stages ); + key = TOPK_HASH_KEY( I32, I32, num_stages, is_bitnoic_segment ); break; case _PACK_SELECT_KEY(F32, U32): case _PACK_SELECT_KEY(F16, U32): @@ -294,7 +309,7 @@ static vsi_status _query_kernel case _PACK_SELECT_KEY(F16, U16): case _PACK_SELECT_KEY(F32, U8): case _PACK_SELECT_KEY(F16, U8): - key = TOPK_HASH_KEY( F32, U32, num_stages ); + key = TOPK_HASH_KEY( F32, U32, num_stages, is_bitnoic_segment ); break; case _PACK_SELECT_KEY(F32, I32): case _PACK_SELECT_KEY(F16, I32): @@ -302,7 +317,7 @@ static vsi_status _query_kernel case _PACK_SELECT_KEY(F16, I16): case _PACK_SELECT_KEY(F32, I8): case _PACK_SELECT_KEY(F16, I8): - key = TOPK_HASH_KEY( F32, I32, num_stages ); + key = TOPK_HASH_KEY( F32, I32, num_stages, is_bitnoic_segment ); break; default: break; @@ -440,7 +455,12 @@ static vsi_nn_kernel_node_t _setup int32_t top_k = vsi_nn_kernel_param_get_int32(params, "top_k"); int32_t num_stages = (int32_t)vsi_nn_max(ceil(log10(block_size / 2.0f) / log10(2.0f)), 0); vsi_bool is_odd_even_sort = FALSE; + vsi_bool is_bitnoic_segment = FALSE; size_t param_num = _TOPK_PARAM_NUM; + int32_t max_stages = 7 + (int32_t)log2(graph->ctx->config.subGroupSize >> 2); + vsi_nn_kernel_dtype_e type0 = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + vsi_nn_kernel_dtype_e type1 = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); + float inputScale = vsi_nn_get_tensor_scale(inputs[0]); float inputTail = (float)vsi_nn_get_tensor_zero_point(inputs[0]); float outputScale = vsi_nn_get_tensor_scale(outputs[0]); @@ -471,9 +491,22 @@ static vsi_nn_kernel_node_t _setup rs_tensors[0] = vsi_nn_reshape_tensor( graph, inputs[0], 
shape[0], 2 ); - if (num_stages < 7) + is_bitnoic_segment = (num_stages >= 9) && (top_k <= 512 && max_stages > 9) && + type0 == type1 && (type0 == U8 || type0 == I8 || type0 == I16 || type0 == U16 || type0 == I32 || type0 == U32); + + if (is_bitnoic_segment && num_stages == 9) + { + is_bitnoic_segment = FALSE; + } + else + { + num_stages = is_bitnoic_segment ? 9 : num_stages; + max_stages = is_bitnoic_segment ? max_stages : 7; + } + + if (num_stages < max_stages || is_bitnoic_segment) { - status = _query_kernel( kernel, inputs, outputs, num_stages ); + status = _query_kernel( kernel, inputs, outputs, num_stages, is_bitnoic_segment ); rs_tensors[1] = vsi_nn_reshape_tensor( graph, outputs[0], shape[1], 2 ); diff --git a/src/tim/vx/internal/src/kernel/evis/argmax_evis.c b/src/tim/vx/internal/src/kernel/evis/argmax_evis.c index f5010111..1ebde506 100644 --- a/src/tim/vx/internal/src/kernel/evis/argmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/argmax_evis.c @@ -35,6 +35,8 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#if (!VX_ARGMAX_VX_SUPPORT) + __BEGIN_DECLS #define HASH_ARGMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ @@ -510,3 +512,4 @@ __END_DECLS REGISTER_BACKEND_EVIS( argmax, _setup ) +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c index 4660e894..c7589ffb 100644 --- a/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/cumsum_evis.c @@ -51,26 +51,49 @@ __BEGIN_DECLS #define KERNEL_SOURCE_5 "cumsum_ex_rev_axis0" #define KERNEL_SOURCE_6 "cumsum_ex_rev_axis1" #define KERNEL_SOURCE_7 "cumsum_ex_rev_axis2" +#define KERNEL_SOURCE_8 "cumsum_array" +#define KERNEL_SOURCE_9 "cumsum_array_2d" +#define KERNEL_SOURCE_10 "cumsum_array_bf16" +#define KERNEL_SOURCE_11 "cumsum_array_f16_u8" +#define KERNEL_SOURCE_12 "cumsum_array_ex_rev_axis0" +#define KERNEL_SOURCE_13 "cumsum_array_ex_rev_axis1" +#define KERNEL_SOURCE_14 "cumsum_array_ex_rev_axis2" +#define KERNEL_SOURCE_15 "cumsum_array_f16_u8_2d" // Add kernel hashtable here -#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, EX_REV, _image_2d) \ - ((EX_REV << 24) | (AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 4) | (_image_2d)) +#define HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, EX_REV, _image_2d, is_array) \ + ((EX_REV << 24) | (AXIS << 20) | (IN_DTYPE << 12) | (OUT_DTYPE << 8) | (_image_2d << 4) | (is_array)) #define HASH_CUMSUM_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ - { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0), \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0, 0), \ CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ SOURCE }, #define HASH_CUMSUM_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ - { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1), \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1, 0), \ CVIVANTE_NAMESPACE("evis.cumsum_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \ SOURCE }, #define HASH_CUMSUM_EX_REV_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ - { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0), \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0, 0), \ CVIVANTE_NAMESPACE("evis.cumsum_ex_rev_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ SOURCE }, +#define HASH_CUMSUM_ARRAY_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 0, 1), \ + CVIVANTE_NAMESPACE("evis.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ + 
SOURCE }, + +#define HASH_CUMSUM_ARRAY_KERNELS_2D( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 0, 1, 1), \ + CVIVANTE_NAMESPACE("evis.cumsum_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS"_2D"), \ + SOURCE }, + +#define HASH_CUMSUM_ARRAY_EX_REV_KERNELS( AXIS, IN_DTYPE, OUT_DTYPE, SOURCE) \ + { HASH_CUMSUM_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, 1, 0, 1), \ + CVIVANTE_NAMESPACE("evis.cumsum_ex_rev_array_"#IN_DTYPE"to"#OUT_DTYPE"_axis"#AXIS), \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -135,6 +158,65 @@ static const struct { HASH_CUMSUM_EX_REV_KERNELS(2, F16, U8, KERNEL_SOURCE_4) HASH_CUMSUM_EX_REV_KERNELS(2, F16, I8, KERNEL_SOURCE_4) HASH_CUMSUM_EX_REV_KERNELS(2, F16, I16, KERNEL_SOURCE_4) + + HASH_CUMSUM_ARRAY_KERNELS(0, U8, U8, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(0, I8, I8, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(0, I16, I16, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(0, F16, F16, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(0, BF16, BF16, KERNEL_SOURCE_10) + HASH_CUMSUM_ARRAY_KERNELS(1, U8, U8, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(1, I8, I8, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(1, I16, I16, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(1, F16, F16, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(1, BF16, BF16, KERNEL_SOURCE_10) + HASH_CUMSUM_ARRAY_KERNELS(2, U8, U8, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(2, I8, I8, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(2, I16, I16, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(2, F16, F16, KERNEL_SOURCE_8) + HASH_CUMSUM_ARRAY_KERNELS(2, BF16, BF16, KERNEL_SOURCE_10) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, U8, U8, KERNEL_SOURCE_9) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, I8, I8, KERNEL_SOURCE_9) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, I16, I16, KERNEL_SOURCE_9) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16, F16, KERNEL_SOURCE_9) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, BF16, BF16, KERNEL_SOURCE_10) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, U8, U8, KERNEL_SOURCE_9) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, I8, I8, KERNEL_SOURCE_9) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, I16, I16, KERNEL_SOURCE_9) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16, F16, KERNEL_SOURCE_9) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, BF16, BF16, KERNEL_SOURCE_10) + HASH_CUMSUM_ARRAY_KERNELS(0, F16, U8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_KERNELS(0, F16, I8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_KERNELS(0, F16, I16, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_KERNELS(1, F16, U8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_KERNELS(1, F16, I8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_KERNELS(1, F16, I16, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_KERNELS(2, F16, U8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_KERNELS(2, F16, I8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_KERNELS(2, F16, I16, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16, U8, KERNEL_SOURCE_15) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16, I8, KERNEL_SOURCE_15) + HASH_CUMSUM_ARRAY_KERNELS_2D(0, F16, I16, KERNEL_SOURCE_15) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16, U8, KERNEL_SOURCE_15) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16, I8, KERNEL_SOURCE_15) + HASH_CUMSUM_ARRAY_KERNELS_2D(1, F16, I16, KERNEL_SOURCE_15) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, U8, U8, KERNEL_SOURCE_12) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, I8, I8, KERNEL_SOURCE_12) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, I16, I16, KERNEL_SOURCE_12) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(0, F16, F16, KERNEL_SOURCE_12) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, U8, U8, KERNEL_SOURCE_13) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, I8, I8, KERNEL_SOURCE_13) + 
HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, I16, I16, KERNEL_SOURCE_13) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16, F16, KERNEL_SOURCE_13) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, U8, U8, KERNEL_SOURCE_14) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, I8, I8, KERNEL_SOURCE_14) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, I16, I16, KERNEL_SOURCE_14) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16, F16, KERNEL_SOURCE_14) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16, U8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16, I8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(1, F16, I16, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16, U8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16, I8, KERNEL_SOURCE_11) + HASH_CUMSUM_ARRAY_EX_REV_KERNELS(2, F16, I16, KERNEL_SOURCE_11) }; /* @@ -161,6 +243,7 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) size_t param_size ) { +#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH vsi_status status = VSI_FAILURE; gpu_param_t shaderParam = { 3, // workdim @@ -188,6 +271,9 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) float in_out_zp_scale = 1.0f; float in_out_scale = 1.0f; + int32_t is_array = 0; + int32_t remainder = 0; + uint32_t pack_key = 0; VSI_UNREFERENCED(param_size); @@ -219,7 +305,15 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) height = (int32_t)(input_shape->data[1]); channel = (int32_t)(dim > 2 ? input_shape->data[2] : 1); + if (width > VSI_NN_MAX_BLOCK_SIZE || + height > VSI_NN_MAX_BLOCK_SIZE || + channel > VSI_NN_MAX_BLOCK_SIZE) + { + is_array = 1; + } + +#undef VSI_NN_MAX_BLOCK_SIZE if (axis == 0) { w = 1; @@ -245,6 +339,7 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) { shaderParam.global_scale[0] = 16; } + remainder = w % shaderParam.global_scale[0]; shaderParam.global_scale[1] = 1; shaderParam.global_scale[2] = 1; shaderParam.global_size[0] = (w + shaderParam.global_scale[0] - 1) / shaderParam.global_scale[0]; @@ -253,6 +348,12 @@ DEF_KERNEL_INITIALIZER(_cumsum_initializer) status = vsi_nn_kernel_gpu_config( node, &shaderParam ); CHECK_STATUS_FAIL_GOTO(status, OnError); + if (is_array) + { + status = vsi_nn_kernel_gpu_add_param(node, "remainder", &remainder); + status |= vsi_nn_kernel_gpu_add_param(node, "w_size", &w); + CHECK_STATUS_FAIL_GOTO(status, OnError); + } #define _PACK_SELECT_KEY( IN0_TYPE, OUT_TYPE, AXIS, DIM) \ (IN0_TYPE | (OUT_TYPE << 8) | (AXIS << 16) | (DIM << 24)) @@ -767,7 +868,8 @@ static vsi_status _query_kernel const vsi_nn_kernel_param_t * params, int32_t axis, int32_t is_2d, - int32_t is_ex_rev + int32_t is_ex_rev, + int32_t is_array ) { vsi_status status = VSI_FAILURE; @@ -781,7 +883,7 @@ static vsi_status _query_kernel input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); output_dtype = vsi_nn_kernel_map_dtype( outputs[0]->attr.dtype.vx_type ); - key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_ex_rev, is_2d); + key = HASH_CUMSUM_HASH_KEY( axis, input0_dtype, output_dtype, is_ex_rev, is_2d, is_array); for ( i = 0; i < _cnt_of_array(cumsum_map); i ++ ) { @@ -819,6 +921,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_t * kernel ) { +#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_CUMSUM_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; @@ -831,7 +934,10 @@ static vsi_nn_kernel_node_t _setup int32_t is_2d = 0; uint32_t rs_dim = 2; uint32_t i = 0; + int32_t is_array = 0; int32_t is_ex_or_rev = exclusive || reverse; + vsi_nn_kernel_dtype_e input0_dtype = U8; + int32_t width = 0; 
VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); @@ -860,7 +966,30 @@ static vsi_nn_kernel_node_t _setup reshape_tensors[1] = vsi_nn_reshape_tensor( graph, outputs[0], shapes[0], (vsi_size_t)rs_dim ); - status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d, is_ex_or_rev); + width = (int32_t)shapes[0][0]; + + for (i = 0; i < rs_dim; i++) + { + if (shapes[0][i] > VSI_NN_MAX_BLOCK_SIZE) + { + is_array = 1; + } + } + +#undef VSI_NN_MAX_BLOCK_SIZE + + input0_dtype = vsi_nn_kernel_map_dtype( inputs[0]->attr.dtype.vx_type ); + + if (is_array && + ((axis_new == 0 && width < 8) || + (axis_new > 0 && (((input0_dtype == U8 || input0_dtype == I8) && width < 16) || + ((input0_dtype != U8 && input0_dtype != I8) && width < 8))) + )) + { + return NULL; + } + + status = _query_kernel( inputs, outputs, kernel, params, axis_new, is_2d, is_ex_or_rev, is_array); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); diff --git a/src/tim/vx/internal/src/kernel/evis/gather_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_evis.c index c61565c0..b005f5f8 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_evis.c @@ -22,7 +22,7 @@ * *****************************************************************************/ -#if !(VX_TENSOR_GATHER_API_SUPPORT) + #include #include #include @@ -35,7 +35,7 @@ #include "vsi_nn_error.h" #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" - +#if !(VX_TENSOR_GATHER_API_SUPPORT) __BEGIN_DECLS /* diff --git a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c index 1d829fdd..cdab7d77 100644 --- a/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/gather_nd_evis.c @@ -58,14 +58,14 @@ __BEGIN_DECLS _3D } vsi_nn_kernel_coord_type_e; -#define HASH_GATHER_ND_KEY(_input0_type, _output_type, _coord_dim, _batch_dim) \ - ((_input0_type << 24) | (_output_type << 16) | (_coord_dim << 8) | (_batch_dim)) +#define HASH_GATHER_ND_KEY(_input0_type, _output_type, _coord_dim, _batch_dim, is_array) \ + ((_input0_type << 24) | (_output_type << 16) | (_coord_dim << 8) | (_batch_dim << 4) | (is_array)) #define HASH_GATHER_ND_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \ CVIVANTE_NAMESPACE("evis.gather_nd_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE) #define TENSOR_GATHER_ND_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \ - { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 0), \ + { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 0, 0), \ HASH_GATHER_ND_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \ SOURCE }, @@ -73,10 +73,26 @@ __BEGIN_DECLS CVIVANTE_NAMESPACE("evis.gather_nd_batch_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE) #define TENSOR_GATHER_ND_BATCH_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \ - { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 1), \ + { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 1, 0), \ HASH_GATHER_ND_BATCH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \ SOURCE }, +#define HASH_GATHER_ND_ARRAY_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \ + CVIVANTE_NAMESPACE("evis.gather_nd_array_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE) + +#define TENSOR_GATHER_ND_ARRAY_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \ + { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 0, 1), \ + HASH_GATHER_ND_ARRAY_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \ + SOURCE }, + +#define 
HASH_GATHER_ND_ARRAY_BATCH_SH_KERNEL_NAME(SRC0_TYPE, DST_TYPE, COORD_TYPE) \ + CVIVANTE_NAMESPACE("evis.gather_nd_array_batch_"#SRC0_TYPE"to"#DST_TYPE#COORD_TYPE) + +#define TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(IN0_TYPE, IN1_TYPE, OUT_TYPE, COORD_TYPE, SOURCE) \ + { HASH_GATHER_ND_KEY(IN0_TYPE, OUT_TYPE, COORD_TYPE, 1, 1), \ + HASH_GATHER_ND_ARRAY_BATCH_SH_KERNEL_NAME(IN0_TYPE, OUT_TYPE, COORD_TYPE), \ + SOURCE }, + static const struct { uint32_t key; char* function_name; @@ -125,6 +141,50 @@ static const struct { TENSOR_GATHER_ND_BATCH_KERNELS(U8, I32, U8, _2D, KERNEL_SOURCE_8) TENSOR_GATHER_ND_BATCH_KERNELS(I16, I32, I16, _2D, KERNEL_SOURCE_8) TENSOR_GATHER_ND_BATCH_KERNELS(F16, I32, F16, _2D, KERNEL_SOURCE_8) + + TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, I8, _1D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, U8, _1D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, I16, _1D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, F16, _1D, KERNEL_SOURCE_1) + TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, I8, _2D, KERNEL_SOURCE_2) + TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, U8, _2D, KERNEL_SOURCE_2) + TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, I16, _2D, KERNEL_SOURCE_2) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, F16, _2D, KERNEL_SOURCE_2) + TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, I8, _3D, KERNEL_SOURCE_3) + TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, U8, _3D, KERNEL_SOURCE_3) + TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, I16, _3D, KERNEL_SOURCE_3) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, F16, _3D, KERNEL_SOURCE_3) + + TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, F16, _1D, KERNEL_SOURCE_4) + TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, F16, _1D, KERNEL_SOURCE_4) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I8, _1D, KERNEL_SOURCE_4) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I16, _1D, KERNEL_SOURCE_4) + TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, F16, _1D, KERNEL_SOURCE_4) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, U8, _1D, KERNEL_SOURCE_4) + + TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, F16, _2D, KERNEL_SOURCE_5) + TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, F16, _2D, KERNEL_SOURCE_5) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I8, _2D, KERNEL_SOURCE_5) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I16, _2D, KERNEL_SOURCE_5) + TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, F16, _2D, KERNEL_SOURCE_5) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, U8, _2D, KERNEL_SOURCE_5) + + TENSOR_GATHER_ND_ARRAY_KERNELS(I8, I32, F16, _3D, KERNEL_SOURCE_6) + TENSOR_GATHER_ND_ARRAY_KERNELS(I16, I32, F16, _3D, KERNEL_SOURCE_6) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I8, _3D, KERNEL_SOURCE_6) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, I16, _3D, KERNEL_SOURCE_6) + TENSOR_GATHER_ND_ARRAY_KERNELS(U8, I32, F16, _3D, KERNEL_SOURCE_6) + TENSOR_GATHER_ND_ARRAY_KERNELS(F16, I32, U8, _3D, KERNEL_SOURCE_6) + + TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I8, I32, I8, _1D, KERNEL_SOURCE_7) + TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(U8, I32, U8, _1D, KERNEL_SOURCE_7) + TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I16, I32, I16, _1D, KERNEL_SOURCE_7) + TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(F16, I32, F16, _1D, KERNEL_SOURCE_7) + TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I8, I32, I8, _2D, KERNEL_SOURCE_8) + TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(U8, I32, U8, _2D, KERNEL_SOURCE_8) + TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(I16, I32, I16, _2D, KERNEL_SOURCE_8) + TENSOR_GATHER_ND_ARRAY_BATCH_KERNELS(F16, I32, F16, _2D, KERNEL_SOURCE_8) + }; /* @@ -148,7 +208,8 @@ static vsi_status get_gather_nd_tensor_reshape_size vsi_size_t block_size, uint32_t coordDim, int32_t* newDim, - 
uint32_t batch_dims + uint32_t batch_dims, + int32_t* arrayFlg ) { vsi_status status = VSI_FAILURE; @@ -184,12 +245,20 @@ static vsi_status get_gather_nd_tensor_reshape_size for (i = 0; i < coordDim - 1; i++) { sizes[rank++] = input_size[i + offset]; + if (sizes[i] >= VSI_NN_MAX_IMAGE_WIDTH) + { + arrayFlg[0] = 1; + } } for (i = 0; i < batch_dims; i++) { sizes[rank] *= input_size[dims_num - i - 1]; } + if (sizes[rank] >= VSI_NN_MAX_IMAGE_WIDTH) + { + arrayFlg[0] = 1; + } newDim[0] = rank + 1; } @@ -198,6 +267,10 @@ static vsi_status get_gather_nd_tensor_reshape_size for (i = coordDim-1; i > 0; i--) { sizes[i] = input_size[i + offset - 1]; + if (sizes[i] >= VSI_NN_MAX_IMAGE_WIDTH) + { + arrayFlg[0] = 1; + } } for (i = 0; i < offset; i++) { @@ -210,6 +283,10 @@ static vsi_status get_gather_nd_tensor_reshape_size newDim[0] = 2; sizes[0] = block_size; sizes[1] = elementCnt / block_size; + if ((elementCnt / block_size) >= VSI_NN_MAX_IMAGE_WIDTH) + { + arrayFlg[0] = 1; + } } else if (coordDim == 4) { @@ -242,6 +319,14 @@ static vsi_status get_gather_nd_tensor_reshape_size status = VSI_SUCCESS; newDim[0] = 3; } + else + { + sizes[0] = block_size; + sizes[1] = elementCnt / block_size; + status = VSI_SUCCESS; + newDim[0] = 2; + arrayFlg[0] = 1; + } } #undef VSI_NN_MAX_IMAGE_WIDTH @@ -409,7 +494,8 @@ static vsi_status _query_kernel vsi_nn_tensor_t* const* const outputs, vsi_nn_kernel_t* kernel, int32_t coord_dim, - int32_t batch_dims + int32_t batch_dims, + int32_t is_array ) { vsi_status status = VSI_FAILURE; @@ -444,7 +530,7 @@ static vsi_status _query_kernel coord_type = _3D; } - key = HASH_GATHER_ND_KEY( input0_dtype, output_dtype, coord_type, batch_flg ); + key = HASH_GATHER_ND_KEY( input0_dtype, output_dtype, coord_type, batch_flg, is_array); for ( i = 0; i < _cnt_of_array(gather_nd_map); i ++ ) { @@ -482,6 +568,7 @@ static vsi_nn_kernel_node_t _setup vsi_nn_kernel_t * kernel ) { +#define VSI_NN_MAX_BLOCK_SIZE GPU_TENSOR_MAX_WIDTH vsi_status status = VSI_FAILURE; vsi_nn_kernel_node_param_t tmp_params[_GATHER_ND_PARAM_NUM] = { NULL }; vsi_nn_kernel_node_t node = NULL; @@ -489,26 +576,41 @@ static vsi_nn_kernel_node_t _setup int32_t batch_dims = vsi_nn_kernel_param_get_int32( params, "batch_dims" ); int32_t block_size = vsi_nn_kernel_param_get_int32( params, "block_size" ); int32_t coord_dim = vsi_nn_kernel_param_get_int32( params, "coord_dim" ); + int32_t input_size = 1; + int32_t no_block_batch_size = 1; int32_t rs_in_dim = 0, rs_idx_dim = 0, rs_out_dim = 0; + int32_t is_array = 0; + int32_t i = 0; VSI_UNREFERENCED(input_num); VSI_UNREFERENCED(output_num); - status = get_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], block_size, coord_dim, &rs_in_dim, batch_dims); - status |= get_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], coord_dim, 0, &rs_idx_dim, batch_dims); - status |= get_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], block_size, 0, &rs_out_dim, batch_dims); - if (status != VSI_SUCCESS) + for (i = 0; i < (int32_t)inputs[0]->attr.dim_num; i++) { - return NULL; + input_size = input_size * (int32_t)inputs[0]->attr.size[i]; } - - if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, - outputs[0]->attr.dim_num ) ) + no_block_batch_size = input_size / block_size; + is_array = no_block_batch_size > VSI_NN_MAX_BLOCK_SIZE ? 
1 : 0; + + status = get_gather_nd_tensor_reshape_size(&inputs[0], shapes[0], + block_size, coord_dim, &rs_in_dim, batch_dims, &is_array); + status |= get_gather_nd_tensor_reshape_size(&inputs[1], shapes[1], + coord_dim, 0, &rs_idx_dim, batch_dims, &is_array); + status |= get_gather_nd_tensor_reshape_size(&outputs[0], shapes[2], + block_size, 0, &rs_out_dim, batch_dims, &is_array); +#undef VSI_NN_MAX_BLOCK_SIZE + if (status != VSI_SUCCESS) { return NULL; } - status = _query_kernel( inputs, outputs, kernel, coord_dim, batch_dims ); + //if ( !vsi_nn_kernel_gpu_check_shape( outputs[0]->attr.size, + // outputs[0]->attr.dim_num ) ) + //{ + // return NULL; + //} + + status = _query_kernel( inputs, outputs, kernel, coord_dim, batch_dims, is_array); if ( VSI_SUCCESS == status) { node = vsi_nn_kernel_create_node( graph, kernel ); diff --git a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c index 5ecb4b77..ad515d9a 100644 --- a/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/layer_normalization_evis.c @@ -22,7 +22,7 @@ * *****************************************************************************/ -#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) + #include #include #include @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" - +#if !(VX_LAYER_NORMALIZATION_VX_SUPPORT_EXT) __BEGIN_DECLS #define SOURCE_AXIS0_0 "layer_normalization_0" diff --git a/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c b/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c index 37fddeaf..d2d3e203 100644 --- a/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/log_softmax_evis.c @@ -22,7 +22,7 @@ * *****************************************************************************/ -#if !(VX_LOGSOFTMAX_VX_SUPPORT) + #include #include #include @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" - +#if !(VX_LOGSOFTMAX_VX_SUPPORT) __BEGIN_DECLS #define HASH_LOG_SOFTMAX_HASH_KEY(AXIS, IN_DTYPE, OUT_DTYPE, _image_2d) \ diff --git a/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c b/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c index 6554c74a..ab42eec8 100644 --- a/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/nearest_grid_sample_evis.c @@ -36,6 +36,8 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" +#if (!VX_NEAREST_GRID_SAMPLE_VX_SUPPORT) + __BEGIN_DECLS /* @@ -625,3 +627,4 @@ __END_DECLS REGISTER_BACKEND_EVIS( nearest_grid_sample, _setup ) +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/evis/pow_evis.c b/src/tim/vx/internal/src/kernel/evis/pow_evis.c index 8492528c..767ab83d 100644 --- a/src/tim/vx/internal/src/kernel/evis/pow_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pow_evis.c @@ -22,7 +22,7 @@ * *****************************************************************************/ -#if !(VX_TENSOR_POW_API_SUPPORT) + #include #include #include @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_eltwise.h" - +#if !(VX_TENSOR_POW_API_SUPPORT) __BEGIN_DECLS #define KERNEL_SOURCE "pow", diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c 
b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c index 167db3e9..ea840776 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_evis.c @@ -750,6 +750,7 @@ static vsi_nn_kernel_node_t _setup shape[2] = 1; reshape_tensor = vsi_nn_reshape_tensor( graph, outputs[0], shape, outputs[0]->attr.dim_num ); + CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create tensor fail.", final); if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size, outputs[0]->attr.dim_num ) ) @@ -819,6 +820,7 @@ static vsi_nn_kernel_node_t _setup final: vsi_nn_safe_free(node_params); + vsi_safe_release_tensor(reshape_tensor); return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c index 17f3bc52..f3d89392 100644 --- a/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/pre_process_rgb888_planar_nhwc_evis.c @@ -911,6 +911,7 @@ static vsi_nn_kernel_node_t _setup shape[2] = 1; reshape_tensor = vsi_nn_reshape_tensor( graph, outputs[0], shape, outputs[0]->attr.dim_num ); + CHECK_PTR_FAIL_GOTO(reshape_tensor, "Create tensor fail.", final); if ( !vsi_nn_kernel_gpu_check_shape( reshape_tensor->attr.size, outputs[0]->attr.dim_num ) ) @@ -978,6 +979,7 @@ static vsi_nn_kernel_node_t _setup final: vsi_nn_safe_free(node_params); + vsi_safe_release_tensor(reshape_tensor); return node; } /* _setup() */ diff --git a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c index d3d33755..a63fc3a8 100644 --- a/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/resize_bilinear_evis.c @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "utils/vsi_nn_dtype_util_prv.h" - +#if (!VX_RESIZE_BILINEAR_SH_SUPPORT) __BEGIN_DECLS /* @@ -1515,3 +1515,4 @@ static vsi_nn_kernel_node_t _setup __END_DECLS REGISTER_BACKEND_EVIS( resize_bilinear, _setup ) +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/src/kernel/evis/tile_evis.c b/src/tim/vx/internal/src/kernel/evis/tile_evis.c index 4fc76f92..4d3070bf 100644 --- a/src/tim/vx/internal/src/kernel/evis/tile_evis.c +++ b/src/tim/vx/internal/src/kernel/evis/tile_evis.c @@ -22,7 +22,7 @@ * *****************************************************************************/ -#if !(VX_TENSOR_TILE_API_SUPPORT) + #include #include #include @@ -36,7 +36,7 @@ #include "utils/vsi_nn_util.h" #include "kernel/vsi_nn_kernel.h" #include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" - +#if !(VX_TENSOR_TILE_API_SUPPORT) __BEGIN_DECLS /* diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c index 331f2629..8ff82f54 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel.c @@ -29,6 +29,7 @@ #include "vsi_nn_context.h" #include "vsi_nn_prv.h" #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" #include "vsi_nn_error.h" @@ -1673,7 +1674,7 @@ vsi_status vsi_nn_KernelGpuConfig static vsi_bool _check_shader_support(vsi_nn_graph_t* graph) { - int32_t enableShader = graph->ctx->options.enable_shader; + int32_t enableShader = ((vsi_nn_graph_prv_t*)graph)->options->enable_shader; #if 
VX_HARDWARE_CAPS_PARAMS_EXT_SUPPORT if ( graph->ctx->config.subGroupSize == 0 ) diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c index b837e663..92e94f65 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_selector.c @@ -181,6 +181,9 @@ REGISTER_VX_FIRST_KERNEL_SELECTOR(cos) #if (VX_LOGSOFTMAX_VX_SUPPORT) REGISTER_VX_FIRST_KERNEL_SELECTOR(log_softmax) #endif +#if (VX_BITCAST_VX_SUPPORT) +REGISTER_VX_FIRST_KERNEL_SELECTOR(bitcast) +#endif __END_DECLS diff --git a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c index 0e0b2141..ca5ce158 100644 --- a/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c +++ b/src/tim/vx/internal/src/kernel/vsi_nn_kernel_util.c @@ -916,11 +916,21 @@ vsi_nn_tensor_t * vsi_nn_kernel_insert_reshape_node { input = in_tensor; output = tensor; + /* Create an OpenVX tensor if it does not exist */ + if (NULL == input->t) + { + vsi_nn_TensorReinit(graph, input); + } } else { input = tensor; output = in_tensor; + /* Create an OpenVX tensor if it does not exist */ + if (NULL == output->t) + { + vsi_nn_TensorReinit(graph, output); + } } vxTensorReshapeNode(graph->g, input->t, &reshape_param, sizeof(reshape_param), output->t); diff --git a/src/tim/vx/internal/src/kernel/vx/argmax_vx.c b/src/tim/vx/internal/src/kernel/vx/argmax_vx.c new file mode 100644 index 00000000..75482d28 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/argmax_vx.c @@ -0,0 +1,79 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if (VX_ARGMAX_VX_SUPPORT) + +#define REGISTER_ARGMAXOPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_ARGMAXOPENVX_KERNEL( argmax ) +{ + vx_node node = NULL; + int32_t axis = vsi_nn_kernel_param_get_int32(params, "axis"); + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + node = vxArgmaxLayer(graph->g, + inputs[0]->t, + axis, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* argmax() */ + +#undef REGISTER_ARGMAXOPENVX_KERNEL + +#endif diff --git a/src/tim/vx/internal/src/kernel/vx/bitcast_vx.c b/src/tim/vx/internal/src/kernel/vx/bitcast_vx.c new file mode 100644 index 00000000..85a72996 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/bitcast_vx.c @@ -0,0 +1,77 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if (VX_BITCAST_VX_SUPPORT) + +#define REGISTER_BITCASTOPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_BITCASTOPENVX_KERNEL( bitcast ) +{ + vx_node node = NULL; + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(params); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + node = vxBitCastLayer(graph->g, + inputs[0]->t, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* bitcast() */ + +#undef REGISTER_BITCASTOPENVX_KERNEL + +#endif diff --git a/src/tim/vx/internal/src/kernel/vx/grid_sample_vx.c b/src/tim/vx/internal/src/kernel/vx/grid_sample_vx.c new file mode 100644 index 00000000..fd6217b2 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/grid_sample_vx.c @@ -0,0 +1,91 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if (VX_NEAREST_GRID_SAMPLE_VX_SUPPORT) +static vsi_nn_kernel_node_t _setup + ( + vsi_nn_graph_t * graph, + vsi_nn_tensor_t ** inputs, + size_t input_num, + vsi_nn_tensor_t ** outputs, + size_t output_num, + const vsi_nn_kernel_param_t * params, + vsi_nn_kernel_t * kernel + ) +{ + vx_node node = NULL; + int32_t mode = + vsi_nn_kernel_param_get_int32(params, "mode"); + int32_t align_corners = + vsi_nn_kernel_param_get_int32(params, "align_corners"); + int32_t pad_mode = + vsi_nn_kernel_param_get_int32(params, "padding_mode"); + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(output_num); + VSI_UNREFERENCED(input_num); + + node = vxGridSampleLayer( + graph->g, + inputs[0]->t, + inputs[1]->t, + mode, + align_corners, + pad_mode, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* _setup() */ + +#define REGISTER_NEAREST_GRID_SAMPLE_OPENVX_KERNEL(KERNEL_NAME) \ + static vsi_nn_kernel_node_t _##KERNEL_NAME##_setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num, \ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) \ + { \ + return _setup(graph, inputs, input_num, outputs, output_num, \ + params, kernel); \ + } \ + REGISTER_BACKEND_OPENVX( KERNEL_NAME, _##KERNEL_NAME##_setup ) + +REGISTER_NEAREST_GRID_SAMPLE_OPENVX_KERNEL( nearest_grid_sample ) + +#undef REGISTER_NEAREST_GRID_SAMPLE_OPENVX_KERNEL + +#endif diff --git a/src/tim/vx/internal/src/kernel/vx/l1_layer_norm_vx.c b/src/tim/vx/internal/src/kernel/vx/l1_layer_norm_vx.c new file mode 100644 index 00000000..25c42629 --- /dev/null +++ b/src/tim/vx/internal/src/kernel/vx/l1_layer_norm_vx.c @@ -0,0 +1,82 @@ +/**************************************************************************** +* +* Copyright (c) 2021 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. 
+* +*****************************************************************************/ + +#include "vsi_nn_types.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_node.h" +#include "vsi_nn_log.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_tensor_util.h" +#include "kernel/vsi_nn_kernel.h" + +#if (VX_L1_LAYER_NORM_VX_SUPPORT) +#define REGISTER_L1_LAYER_NORM_OPENVX_KERNEL( kernel_name ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ); \ + REGISTER_BACKEND_OPENVX( kernel_name, _##kernel_name##setup ) \ + static vsi_nn_kernel_node_t _##kernel_name##setup \ + ( \ + vsi_nn_graph_t * graph, \ + vsi_nn_tensor_t ** inputs, \ + size_t input_num, \ + vsi_nn_tensor_t ** outputs, \ + size_t output_num,\ + const vsi_nn_kernel_param_t * params, \ + vsi_nn_kernel_t * kernel \ + ) + +REGISTER_L1_LAYER_NORM_OPENVX_KERNEL( l1_layer_norm ) +{ + vx_node node = NULL; + float eps = vsi_nn_kernel_param_get_float32( params, "eps" ); + int32_t axis = vsi_nn_kernel_param_get_int32( params, "axis" ); + + VSI_UNREFERENCED(kernel); + VSI_UNREFERENCED(input_num); + VSI_UNREFERENCED(output_num); + + node = vxL1LayerNormalizationLayer( + graph->g, + eps, + axis, + inputs[0]->t, + inputs[1]->t, + inputs[2]->t, + inputs[3]->t, + outputs[0]->t + ); + + return (vsi_nn_kernel_node_t)node; +} /* l1_layer_norm() */ + +#undef REGISTER_L1_LAYER_NORM_OPENVX_KERNEL +#endif \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/col2im.cl b/src/tim/vx/internal/src/libnnext/ops/cl/col2im.cl new file mode 100644 index 00000000..38ac9e3c --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/col2im.cl @@ -0,0 +1,162 @@ +#pragma OPENCL EXTENSION cl_viv_vx_extension : enable +#include "cl_viv_vx_ext.h" + +_viv_uniform int width_pad; +_viv_uniform int height_pad; +_viv_uniform int depth_pad; +_viv_uniform int move_time_x; +_viv_uniform int move_time_y; +_viv_uniform int kernel_x_new; +_viv_uniform int kernel_y_new; +_viv_uniform int kernel_z_new; +_viv_uniform int depth; + +#define COL2IM(name, read_type, dst_type ,convert_type, write_type) \ +__kernel void col2im_##name \ +( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int stride_w, \ + int stride_h, \ + int stride_d, \ + int dilation_w, \ + int dilation_h, \ + int dilation_d, \ + int pad_w_front, \ + int pad_w_end, \ + int pad_h_front, \ + int pad_h_end, \ + int pad_d_front, \ + int pad_d_end, \ + int kernel_x, \ + int kernel_y, \ + int kernel_z, \ + float inOutScale, \ + float inOutTile \ +) \ +{ \ + int x = get_global_id(0); \ + int y = get_global_id(1); \ + int z = get_global_id(2); \ + int4 coord_out = (int4)(x,y,z,0); \ + int b = z / depth; \ + z = z % depth; \ + int4 coord_in = (int4)(0,0,b,0); \ + \ + float sum = 0.0f; \ + x = x + pad_w_front; \ + y = y + pad_h_front; \ + z = z + pad_d_front; \ + int offset_x = x % stride_w; \ + int offset_y = y % stride_h; \ + int offset_z = z % stride_d; \ + int i,j,k; \ + for (k = offset_z; k < kernel_z_new; k += stride_d) \ + { \ + if ((z - k) < 0 || (z + (kernel_z_new - k)) > depth_pad || k % dilation_d != 0) \ + { \ + continue; \ + } \ + for (j = offset_y; j < kernel_y_new; j = j + stride_h) \ + { \ + if ((y - j) < 0 || (y + (kernel_y_new - j)) > height_pad || j % dilation_h != 0) \ + { \ + continue; \ + } \ + for (i = offset_x; i < kernel_x_new; i = i + 
stride_w) \ + { \ + if ((x - i) < 0 || (x + (kernel_x_new - i)) > width_pad || i % dilation_w != 0) \ + { \ + continue; \ + } \ + coord_in.x = (x - i + stride_w - 1) / stride_w + \ + (y - j + stride_h - 1) / stride_h * move_time_x + \ + (z - k + stride_d - 1) / stride_d * move_time_y * move_time_x; \ + coord_in.y = i / dilation_w + j * kernel_x / dilation_h + k * kernel_x * kernel_y / dilation_d; \ + sum = sum + convert_float(read_type(input, coord_in).x); \ + } \ + } \ + } \ + sum = sum * inOutScale + inOutTile; \ + dst_type dst = 0; \ + dst.x = convert_type(sum); \ + write_type(output, coord_out, dst); \ +} +COL2IM(U32toU32, read_imageui, uint4, convert_uint, write_imageui) +COL2IM(U32toI32, read_imageui, int4, convert_int, write_imagei) +COL2IM(U32toF32, read_imageui, float4, convert_float, write_imagef) +COL2IM(I32toU32, read_imagei, uint4, convert_uint, write_imageui) +COL2IM(I32toI32, read_imagei, int4, convert_int, write_imagei) +COL2IM(I32toF32, read_imagei, float4, convert_float, write_imagef) +COL2IM(F32toU32, read_imagef, uint4, convert_uint, write_imageui) +COL2IM(F32toI32, read_imagef, int4, convert_int, write_imagei) +COL2IM(F32toF32, read_imagef, float4, convert_float, write_imagef) + +#define COL2IM_2D(name, read_type, dst_type ,convert_type, write_type) \ +__kernel void col2im_##name##_2D \ +( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int stride_w, \ + int stride_h, \ + int stride_d, \ + int dilation_w, \ + int dilation_h, \ + int dilation_d, \ + int pad_w_front, \ + int pad_w_end, \ + int pad_h_front, \ + int pad_h_end, \ + int pad_d_front, \ + int pad_d_end, \ + int kernel_x, \ + int kernel_y, \ + int kernel_z, \ + float inOutScale, \ + float inOutTile \ +) \ +{ \ + int x = get_global_id(0); \ + int y = get_global_id(1); \ + int z = get_global_id(2); \ + int4 coord_out = (int4)(x,y,z,0); \ + int4 coord_in = (int4)(0,0,z,0); \ + \ + float sum = 0.0f; \ + x = x + pad_w_front; \ + y = y + pad_h_front; \ + int offset_x = x % stride_w; \ + int offset_y = y % stride_h; \ + int i,j; \ + for (j = offset_y; j < kernel_y_new; j = j + stride_h) \ + { \ + if ((y - j) < 0 || (y + (kernel_y_new - j)) > height_pad || j % dilation_h != 0) \ + { \ + continue; \ + } \ + for (i = offset_x; i < kernel_x_new; i = i + stride_w) \ + { \ + if ((x - i) < 0 || (x + (kernel_x_new - i)) > width_pad || i % dilation_w != 0) \ + { \ + continue; \ + } \ + coord_in.x = (x - i + stride_w - 1) / stride_w + \ + (y - j + stride_h - 1) / stride_h * move_time_x; \ + coord_in.y = i / dilation_w + j * kernel_x / dilation_h; \ + sum = sum + convert_float(read_type(input, coord_in).x); \ + } \ + } \ + sum = sum * inOutScale + inOutTile; \ + dst_type dst = 0; \ + dst.x = convert_type(sum); \ + write_type(output, coord_out, dst); \ +} +COL2IM_2D(U32toU32, read_imageui, uint4, convert_uint, write_imageui) +COL2IM_2D(U32toI32, read_imageui, int4, convert_int, write_imagei) +COL2IM_2D(U32toF32, read_imageui, float4, convert_float, write_imagef) +COL2IM_2D(I32toU32, read_imagei, uint4, convert_uint, write_imageui) +COL2IM_2D(I32toI32, read_imagei, int4, convert_int, write_imagei) +COL2IM_2D(I32toF32, read_imagei, float4, convert_float, write_imagef) +COL2IM_2D(F32toU32, read_imagef, uint4, convert_uint, write_imageui) +COL2IM_2D(F32toI32, read_imagef, int4, convert_int, write_imagei) +COL2IM_2D(F32toF32, read_imagef, float4, convert_float, write_imagef) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_2d_axis0.cl 
b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_2d_axis0.cl new file mode 100644 index 00000000..98938459 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_2d_axis0.cl @@ -0,0 +1,332 @@ + +__kernel void cumsum_array_F32toF32_axis0_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + float sum = (float)(0); + Image img1 = create_image_from_image2d(input, 4); + Image img2 = create_image_from_image2d(output, 4); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global float* in_ptr = (__global float*)input_ptr; + __global float* out_ptr = (__global float*)output_ptr; + if(exclusive && rev) + { + coord.x = width - 1; + coord.z = coord.x; + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + + for(; coord.x > 0; coord.x--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord.z--; + sum += data; + + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(exclusive) + { + coord.z = 0; + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + for(coord.x = 0; coord.x < width - 1; coord.x++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord.z++; + sum += data; + + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(rev) + { + for(coord.x = width - 1; coord.x >= 0; coord.x--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else + { + for(coord.x = 0; coord.x < width; coord.x++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } +} + +__kernel void cumsum_array_U8toU8_axis0_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + uint sum = (uint)(0); + uint dst = (uint)(0); + + int tmp_zp = convert_int_rte(output_zp); + dst.x = convert_uint_sat(tmp_zp); + + float cnt = 0.0f; + + Image img1 = create_image_from_image2d(input, 4); + Image img2 = create_image_from_image2d(output, 4); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global uint* in_ptr = (__global uint*)input_ptr; + __global uint* out_ptr = (__global uint*)output_ptr; + if(exclusive && rev) + { + coord.x = width - 1; + coord.z = coord.x; + 
output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + for(; coord.x > 0; coord.x--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global uint*)input_ptr; + uint data = in_ptr[0]; + coord.z--; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else if(exclusive) + { + coord.z = 0; + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + for(coord.x = 0; coord.x < width - 1; coord.x++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global uint*)input_ptr; + uint data = in_ptr[0]; + cnt += 1.0f; + coord.z++; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else if(rev) + { + for(coord.x = width - 1; coord.x >= 0; coord.x--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global uint*)input_ptr; + uint data = in_ptr[0]; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else + { + for(coord.x = 0; coord.x < width; coord.x++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global uint*)input_ptr; + uint data = in_ptr[0]; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } +} + +__kernel void cumsum_array_F32toU8_axis0_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + float4 sum = (float4)(0); + uint4 dst = (uint4)(0); + int tmp_zp = convert_int_rte(output_zp); + dst.x = convert_uint_sat(tmp_zp); + + float cnt = 0.0f; + Image img1 = create_image_from_image2d(input, 4); + Image img2 = create_image_from_image2d(output, 4); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global float* in_ptr = (__global float*)input_ptr; + __global uint* out_ptr = (__global uint*)output_ptr; + if(exclusive && rev) + { + coord.x = width - 1; + coord.z = coord.x; + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + for(; coord.x > 0; coord.x--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord.z--; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale 
+ tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else if(exclusive) + { + coord.z = 0; + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + for(coord.x = 0; coord.x < width - 1; coord.x++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + cnt += 1.0f; + coord.z++; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else if(rev) + { + for(coord.x = width - 1; coord.x >= 0; coord.x--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else + { + for(coord.x = 0; coord.x < width; coord.x++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum.x * in_out_scale + tmpAlpha; + + dst.x = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_2d_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_2d_axis1.cl new file mode 100644 index 00000000..545d05e1 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_2d_axis1.cl @@ -0,0 +1,321 @@ + +__kernel void cumsum_array_F32toF32_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + float sum = (float)(0); + Image img1 = create_image_from_image2d(input, 4); + Image img2 = create_image_from_image2d(output, 4); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global float* in_ptr = (__global float*)input_ptr; + __global float* out_ptr = (__global float*)output_ptr; + if(exclusive && rev) + { + coord.w = height - 1; + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + for(coord.y = height - 1; coord.y > 0; coord.y--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord.w--; + sum += data; + + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(exclusive) + { + write_imagef(output, coord.zw, sum); + for(coord.y = 0; coord.y < height - 1; coord.y++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global 
float*)input_ptr; + float data = in_ptr[0]; + coord.w++; + sum += data; + + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else + { + for(coord.y = 0; coord.y < height; coord.y++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } +} + +__kernel void cumsum_array_U8toU8_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + uint sum = (uint)(0); + uint dst = (uint)(0); + + int tmp_zp = convert_int_rte(output_zp); + dst = convert_uint_sat(tmp_zp); + + float cnt = 0; + Image img1 = create_image_from_image2d(input, 4); + Image img2 = create_image_from_image2d(output, 4); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global uint* in_ptr = (__global uint*)input_ptr; + __global uint* out_ptr = (__global uint*)output_ptr; + if(exclusive && rev) + { + coord.w = height - 1; + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = dst; + for(coord.y = height - 1; coord.y > 0; coord.y--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global uint*)input_ptr; + uint data = in_ptr[0]; + cnt += 1.0f; + coord.w--; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else if(exclusive) + { + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = dst; + for(coord.y = 0; coord.y < height - 1; coord.y++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global uint*)input_ptr; + uint data = in_ptr[0]; + cnt += 1.0f; + coord.w++; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else if(rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global uint*)input_ptr; + uint data = in_ptr[0]; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else + { + for(coord.y = 0; coord.y < height; 
coord.y++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global uint*)input_ptr; + uint data = in_ptr[0]; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } +} + +__kernel void cumsum_array_F32toU8_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int chn, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + float sum = (float)(0); + uint dst = (uint)(0); + int tmp_zp = convert_int_rte(output_zp); + dst = convert_uint_sat(tmp_zp); + + float cnt = 0; + Image img1 = create_image_from_image2d(input, 4); + Image img2 = create_image_from_image2d(output, 4); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global float* in_ptr = (__global float*)input_ptr; + __global uint* out_ptr = (__global uint*)output_ptr; + if(exclusive && rev) + { + coord.w = height - 1; + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + for(coord.y = height - 1; coord.y > 0; coord.y--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + cnt += 1.0f; + coord.w--; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else if(exclusive) + { + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + for(coord.y = 0; coord.y < height - 1; coord.y++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + cnt += 1.0f; + coord.w++; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.zw); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else if(rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = dst; + } + } + else + { + for(coord.y = 0; coord.y < height; coord.y++) + { + input_ptr = get_image_ptr_from_coord(img1, coord.xy); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + cnt += 1.0f; + sum += data; + + float tmpAlpha = cnt * in_out_zp_scale + output_zp; + float tmpSum = sum * in_out_scale + tmpAlpha; + + dst = (uint)convert_int_rte(tmpSum); + output_ptr = get_image_ptr_from_coord(img2, coord.xy); + out_ptr = (__global uint*)output_ptr; + out_ptr[0] = 
dst; + } + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis0.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis0.cl new file mode 100644 index 00000000..2b5f2296 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis0.cl @@ -0,0 +1,215 @@ + +__kernel void cumsum_array_F32toF32_axis0( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int channel, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + float sum = (float)(0); + Tensor img1 = create_tensor_from_image2d_array(input, 4); + Tensor img2 = create_tensor_from_image2d_array(output, 4); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global float* in_ptr = (__global float*)input_ptr; + __global float* out_ptr = (__global float*)output_ptr; + if(exclusive && rev) + { + coord_out.x = width - 1; + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + for(coord.x = width - 1; coord.x > 0; coord.x--) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord_out.x--; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(exclusive) + { + coord_out.x = 0; + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + for(coord.x = 0; coord.x < width - 1; coord.x++) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord_out.x++; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(rev) + { + for(coord.x = width - 1; coord.x >= 0; coord.x--) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else + { + for(coord.x = 0; coord.x < width; coord.x++) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } +} + +#define CUMSUM_ARRAY_toU8_AXIS0_SH(name, src_type) \ +__kernel void cumsum_array_##name##toU8_axis0( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, \ + int exclusive, \ + int rev, \ + int width, \ + int height, \ + int channel, \ + int input_zp, \ + float in_out_scale, \ + float in_out_zp_scale, \ + float output_zp \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_out = coord; \ + \ + src_type sum = (src_type)(0); \ + uint dst = (uint)(0); \ + int tmp_zp = convert_int_rte(output_zp); \ + dst = convert_uint_sat(tmp_zp); \ + \ + float cnt = 0; \ + \ + Tensor img1 = create_tensor_from_image2d_array(input, 4); \ + Tensor img2 = create_tensor_from_image2d_array(output, 4); \ + uchar* 
input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global uint* out_ptr = (__global uint*)output_ptr; \ + if(exclusive && rev) \ + { \ + coord_out.x = width - 1; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + for(coord.x = width - 1; coord.x > 0; coord.x--) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + coord_out.x--; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive) \ + { \ + coord_out.x = 0; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + for(coord.x = 0; coord.x < width - 1; coord.x++) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + coord_out.x++; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ + else if(rev) \ + { \ + for(coord.x = width - 1; coord.x >= 0; coord.x--) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ + else \ + { \ + for(coord.x = 0; coord.x < width; coord.x++) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ +} +CUMSUM_ARRAY_toU8_AXIS0_SH(U8,uint) +CUMSUM_ARRAY_toU8_AXIS0_SH(F32,float) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis1.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis1.cl new file mode 100644 index 00000000..92b9c743 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis1.cl @@ -0,0 +1,216 @@ + +__kernel void cumsum_array_F32toF32_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int channel, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + float sum = (float)(0); + Tensor img1 = 
create_tensor_from_image2d_array(input, 4); + Tensor img2 = create_tensor_from_image2d_array(output, 4); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global float* in_ptr = (__global float*)input_ptr; + __global float* out_ptr = (__global float*)output_ptr; + if(exclusive && rev) + { + coord_out.y = height - 1; + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + for(coord.y = height - 1; coord.y > 0; coord.y--) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord_out.y--; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(exclusive) + { + coord_out.y = 0; + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + for(coord.y = 0; coord.y < height - 1; coord.y++) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord_out.y++; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else + { + for(coord.y = 0; coord.y < height; coord.y++) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } +} + +#define CUMSUM_ARRAY_toU8_AXIS1_SH(name, src_type) \ +__kernel void cumsum_array_##name##toU8_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, \ + int exclusive, \ + int rev, \ + int width, \ + int height, \ + int channel, \ + int input_zp, \ + float in_out_scale, \ + float in_out_zp_scale, \ + float output_zp \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_out = coord; \ + \ + src_type sum = (src_type)(0); \ + uint dst = (uint4)(0); \ + int tmp_zp = convert_int_rte(output_zp); \ + dst = convert_uint_sat(tmp_zp); \ + \ + float cnt = 0; \ + \ + Tensor img1 = create_tensor_from_image2d_array(input, 4); \ + Tensor img2 = create_tensor_from_image2d_array(output, 4); \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global uint* out_ptr = (__global uint*)output_ptr; \ + if(exclusive && rev) \ + { \ + coord_out.y = height - 1; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + \ + for(coord.y = height - 1; coord.y > 0; coord.y--) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + cnt += 1.0f; \ + coord_out.y--; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float 
tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive) \ + { \ + coord_out.y = 0; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + for(coord.y = 0; coord.y < height - 1; coord.y++) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + cnt += 1.0f; \ + coord_out.y++; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ + else if(rev) \ + { \ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ + else \ + { \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ +} +CUMSUM_ARRAY_toU8_AXIS1_SH(U8,uint) +CUMSUM_ARRAY_toU8_AXIS1_SH(F32,float) diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis2.cl new file mode 100644 index 00000000..44940725 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/cumsum_array_axis2.cl @@ -0,0 +1,215 @@ +__kernel void cumsum_array_F32toF32_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, + int exclusive, + int rev, + int width, + int height, + int channel, + int input_zp, + float in_out_scale, + float in_out_zp_scale, + float output_zp + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + float sum = 0; + Tensor img1 = create_tensor_from_image2d_array(input, 4); + Tensor img2 = create_tensor_from_image2d_array(output, 4); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global float* in_ptr = (__global float*)input_ptr; + __global float* out_ptr = (__global float*)output_ptr; + if(exclusive && rev) + { + coord_out.z = channel - 1; + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + + for(coord.z = channel - 1; coord.z > 0; coord.z--) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord_out.z--; + sum += data; + output_ptr = 
get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(exclusive) + { + coord_out.z = 0; + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + for(coord.z = 0; coord.z < channel - 1; coord.z++) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + coord_out.z++; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else if(rev) + { + for(coord.z = channel - 1; coord.z >= 0; coord.z--) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } + else + { + for(coord.z = 0; coord.z < channel; coord.z++) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global float*)input_ptr; + float data = in_ptr[0]; + sum += data; + + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global float*)output_ptr; + out_ptr[0] = sum; + } + } +} + +#define CUMSUM_ARRAY_toU8_AXIS2_SH(name, src_type) \ +__kernel void cumsum_array_##name##toU8_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, \ + int exclusive, \ + int rev, \ + int width, \ + int height, \ + int channel, \ + int input_zp, \ + float in_out_scale, \ + float in_out_zp_scale, \ + float output_zp \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + int4 coord_out = coord; \ + \ + src_type sum = (src_type)(0); \ + uint dst = (uint)(0); \ + int tmp_zp = convert_int_rte(output_zp); \ + dst = convert_uint_sat(tmp_zp); \ + \ + float cnt = 0.0f; \ + Tensor img1 = create_tensor_from_image2d_array(input, 4); \ + Tensor img2 = create_tensor_from_image2d_array(output, 4); \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global uint* out_ptr = (__global uint*)output_ptr; \ + \ + if(exclusive && rev) \ + { \ + coord_out.z = channel - 1; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + for(coord.z = channel - 1; coord.z > 0; coord.z--) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + coord_out.z--; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive) \ + { \ + coord_out.z = 0; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + for(coord.z = 0; coord.z < channel - 1; coord.z++) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + coord_out.z++; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * 
in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ + else if(rev) \ + { \ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ + else \ + { \ + for(coord.z = 0; coord.z < channel; coord.z++) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src_type data = in_ptr[0]; \ + cnt += 1.0f; \ + sum += data; \ + \ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \ + float tmpSum = sum * in_out_scale + tmpAlpha; \ + \ + dst = (uint)convert_int_rte(tmpSum); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global uint*)output_ptr; \ + out_ptr[0] = dst; \ + } \ + } \ +} +CUMSUM_ARRAY_toU8_AXIS2_SH(U8,uint) +CUMSUM_ARRAY_toU8_AXIS2_SH(F32,float) + diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl index 0e6166c4..a215f1fe 100644 --- a/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk.cl @@ -18,8 +18,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ \ - __local float local_data[128]; \ - __local uint local_indices[128]; \ + __local float local_data[LOCAL_SIZE0 * 2]; \ + __local uint local_indices[LOCAL_SIZE0 * 2]; \ \ float left = read_imagef(input, coord.xy).x; \ coord.z += work_group_size; \ @@ -51,7 +51,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \ float right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -78,13 +78,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag write_imagei(indices, coord.xy, index.xxxx); \ write_imagei(indices, coord.zy, index.yyyy); \ } -TOPK_F32(1 << 0, 0) -TOPK_F32(1 << 1, 1) -TOPK_F32(1 << 2, 2) -TOPK_F32(1 << 3, 3) -TOPK_F32(1 << 4, 4) -TOPK_F32(1 << 5, 5) -TOPK_F32(1 << 6, 6) +TOPK_F32((1 << 0), 0) +TOPK_F32((1 << 1), 1) +TOPK_F32((1 << 2), 2) +TOPK_F32((1 << 3), 3) +TOPK_F32((1 << 4), 4) +TOPK_F32((1 << 5), 5) +TOPK_F32((1 << 6), 6) +TOPK_F32((1 << 9), 9) #define TOPK_U32(LOCAL_SIZE0, STAGES) \ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_U32toU32_I32 \ @@ -106,8 +107,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ \ - __local uint local_data[128]; \ - __local uint local_indices[128]; \ + __local uint local_data[LOCAL_SIZE0 * 2]; \ + __local uint local_indices[LOCAL_SIZE0 * 2]; \ \ uint left = read_imageui(input, coord.xy).x; \ coord.z += 
work_group_size; \ @@ -139,7 +140,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag uint left_elem = local_data[left_id]; \ uint right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -166,13 +167,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag write_imagei(indices, coord.xy, index.xxxx); \ write_imagei(indices, coord.zy, index.yyyy); \ } -TOPK_U32(1 << 0, 0) -TOPK_U32(1 << 1, 1) -TOPK_U32(1 << 2, 2) -TOPK_U32(1 << 3, 3) -TOPK_U32(1 << 4, 4) -TOPK_U32(1 << 5, 5) -TOPK_U32(1 << 6, 6) +TOPK_U32((1 << 0), 0) +TOPK_U32((1 << 1), 1) +TOPK_U32((1 << 2), 2) +TOPK_U32((1 << 3), 3) +TOPK_U32((1 << 4), 4) +TOPK_U32((1 << 5), 5) +TOPK_U32((1 << 6), 6) +TOPK_U32((1 << 9), 9) #define TOPK_I32(LOCAL_SIZE0, STAGES) \ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_I32toI32_I32 \ @@ -194,8 +196,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ \ - __local int local_data[128]; \ - __local int local_indices[128]; \ + __local int local_data[LOCAL_SIZE0 * 2]; \ + __local int local_indices[LOCAL_SIZE0 * 2]; \ \ int left = read_imagei(input, coord.xy).x; \ coord.z += work_group_size; \ @@ -227,7 +229,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag int left_elem = local_data[left_id]; \ int right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -254,13 +256,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag write_imagei(indices, coord.xy, index.xxxx); \ write_imagei(indices, coord.zy, index.yyyy); \ } -TOPK_I32(1 << 0, 0) -TOPK_I32(1 << 1, 1) -TOPK_I32(1 << 2, 2) -TOPK_I32(1 << 3, 3) -TOPK_I32(1 << 4, 4) -TOPK_I32(1 << 5, 5) -TOPK_I32(1 << 6, 6) +TOPK_I32((1 << 0), 0) +TOPK_I32((1 << 1), 1) +TOPK_I32((1 << 2), 2) +TOPK_I32((1 << 3), 3) +TOPK_I32((1 << 4), 4) +TOPK_I32((1 << 5), 5) +TOPK_I32((1 << 6), 6) +TOPK_I32((1 << 9), 9) #define TOPK_F32toU32(LOCAL_SIZE0, STAGES) \ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toU32_I32 \ @@ -282,8 +285,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ \ - __local float local_data[128]; \ - __local uint local_indices[128]; \ + __local float local_data[LOCAL_SIZE0 * 2]; \ + __local uint local_indices[LOCAL_SIZE0 * 2]; \ \ float left = read_imagef(input, coord.xy).x; \ coord.z += work_group_size; \ @@ -315,7 +318,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \ float right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -342,13 +345,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void 
topk_stag write_imagei(indices, coord.zy, index.yyyy); \ } -TOPK_F32toU32(1 << 0, 0) -TOPK_F32toU32(1 << 1, 1) -TOPK_F32toU32(1 << 2, 2) -TOPK_F32toU32(1 << 3, 3) -TOPK_F32toU32(1 << 4, 4) -TOPK_F32toU32(1 << 5, 5) -TOPK_F32toU32(1 << 6, 6) +TOPK_F32toU32((1 << 0), 0) +TOPK_F32toU32((1 << 1), 1) +TOPK_F32toU32((1 << 2), 2) +TOPK_F32toU32((1 << 3), 3) +TOPK_F32toU32((1 << 4), 4) +TOPK_F32toU32((1 << 5), 5) +TOPK_F32toU32((1 << 6), 6) +TOPK_F32toU32((1 << 9), 9) #define TOPK_F32toI32(LOCAL_SIZE0, STAGES) \ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toI32_I32 \ @@ -370,8 +374,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \ \ - __local float local_data[128]; \ - __local uint local_indices[128]; \ + __local float local_data[LOCAL_SIZE0 * 2]; \ + __local uint local_indices[LOCAL_SIZE0 * 2]; \ \ float left = read_imagef(input, coord.xy).x; \ coord.z += work_group_size; \ @@ -403,7 +407,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \ float right_elem = local_data[right_id]; \ \ - if ((left_elem < right_elem) ^ signo) \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ { \ local_data[left_id] = right_elem; \ local_data[right_id] = left_elem; \ @@ -430,10 +434,11 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag write_imagei(indices, coord.zy, index.yyyy); \ } -TOPK_F32toI32(1 << 0, 0) -TOPK_F32toI32(1 << 1, 1) -TOPK_F32toI32(1 << 2, 2) -TOPK_F32toI32(1 << 3, 3) -TOPK_F32toI32(1 << 4, 4) -TOPK_F32toI32(1 << 5, 5) -TOPK_F32toI32(1 << 6, 6) \ No newline at end of file +TOPK_F32toI32((1 << 0), 0) +TOPK_F32toI32((1 << 1), 1) +TOPK_F32toI32((1 << 2), 2) +TOPK_F32toI32((1 << 3), 3) +TOPK_F32toI32((1 << 4), 4) +TOPK_F32toI32((1 << 5), 5) +TOPK_F32toI32((1 << 6), 6) +TOPK_F32toI32((1 << 9), 9) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl b/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl new file mode 100644 index 00000000..0eae5ab2 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/cl/topk2.cl @@ -0,0 +1,368 @@ + +#define BITONIC_STEP(dtype) \ +void bitonic_step_##dtype(uint num_stages, int lx, \ + __local dtype *local_data, __local int *local_indices) \ +{ \ + for (uint stage = 0; stage < num_stages + 1; ++stage) \ + { \ + uint signo = (lx >> stage) & 1; \ + \ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + uint postShift = (stage - passOfStage); \ + uint pairDistance = 1 << postShift; \ + \ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \ + uint right_id = left_id + pairDistance; \ + \ + int left_idx = local_indices[left_id]; \ + int right_idx = local_indices[right_id]; \ + \ + dtype left_elem = local_data[left_id]; \ + dtype right_elem = local_data[right_id]; \ + \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ + { \ + local_data[left_id] = right_elem; \ + local_data[right_id] = left_elem; \ + \ + local_indices[left_id] = right_idx; \ + local_indices[right_id] = left_idx; \ + } \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + } \ + } \ +} +BITONIC_STEP(int) +BITONIC_STEP(uint) + +#define BITONIC_STEP_ASCEND(dtype) \ +void bitonic_step_ascend_##dtype(uint num_stages, int lx, \ + __local dtype *p_share_k, 
__local int *p_share_v) \ +{ \ + for (uint stage = 0; stage < num_stages + 1; ++stage) \ + { \ + uint signo = (lx >> stage) & 1; \ + \ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + uint postShift = (stage - passOfStage); \ + uint pairDistance = 1 << postShift; \ + \ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \ + uint right_id = left_id + pairDistance; \ + \ + int left_idx = p_share_v[left_id]; \ + int right_idx = p_share_v[right_id]; \ + \ + dtype left_elem = p_share_k[left_id]; \ + dtype right_elem = p_share_k[right_id]; \ + \ + if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \ + { \ + p_share_k[left_id] = right_elem; \ + p_share_k[right_id] = left_elem; \ + \ + p_share_v[left_id] = right_idx; \ + p_share_v[right_id] = left_idx; \ + } \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + } \ + } \ +} +BITONIC_STEP_ASCEND(int) +BITONIC_STEP_ASCEND(uint) + +#define BITONIC_MERGE(dtype) \ +void bitonic_merge_##dtype(uint num_stages, int lx, \ + __local dtype *local_data, __local int *local_indices) \ +{ \ + uint stage = num_stages; \ + uint signo = (lx >> stage) & 1; \ + \ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \ + { \ + uint postShift = (stage - passOfStage); \ + uint pairDistance = 1 << postShift; \ + \ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \ + uint right_id = left_id + pairDistance; \ + \ + int left_idx = local_indices[left_id]; \ + int right_idx = local_indices[right_id]; \ + \ + dtype left_elem = local_data[left_id]; \ + dtype right_elem = local_data[right_id]; \ + \ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \ + { \ + local_data[left_id] = right_elem; \ + local_data[right_id] = left_elem; \ + \ + local_indices[left_id] = right_idx; \ + local_indices[right_id] = left_idx; \ + } \ + \ + barrier(CLK_LOCAL_MEM_FENCE); \ + } \ +} +BITONIC_MERGE(int) +BITONIC_MERGE(uint) + +#define BLOCK_SIZE (512) + +__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32 +( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t indices, + float input_scale, + float input_tail, + float output_scale, + float output_tail, + int _num_stages, + int width + ) + { + uint lx = get_local_id(0); + const int init_k = -2147483647; + const int init_v = -2147483647; + const int num_stages = 9; + const int threads_per_block = BLOCK_SIZE; + const int index_minus_1 = threads_per_block * 2 - 1; + uint offset = 0; + uint lx1 = lx + threads_per_block; + + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + __local int local_data[1536]; + __local int local_indices[1536]; + + int left = read_imagei(input, coord.xy).x; + coord.z += threads_per_block; + int right = read_imagei(input, coord.zy).x; + + local_data[lx] = left; + local_indices[lx] = coord.x; + local_data[lx1] = right; + local_indices[lx1] = coord.z; + + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_int(num_stages, lx, local_data, local_indices); + + int min_data = local_data[511]; + + int *p_share_k = local_data + threads_per_block; + int *p_share_v = local_indices + threads_per_block; + + int limit = (width >> 10) << 10; + p_share_k[lx] = init_k; + p_share_v[lx] = init_v; + + p_share_k[lx1] = init_k; + p_share_v[lx1] = init_v; + barrier(CLK_LOCAL_MEM_FENCE); + + for (coord.x = lx + threads_per_block * 2; coord.x < limit; 
coord.x = coord.x + threads_per_block * 2) + { + int2 data; + coord.z = coord.x + threads_per_block; + data.x = read_imagei(input, coord.xy).x; + data.y = read_imagei(input, coord.zy).x; + + p_share_k[lx] = data.x; + p_share_v[lx] = coord.x; + + p_share_k[lx1] = data.y; + p_share_v[lx1] = coord.z; + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v); + + if (p_share_k[index_minus_1] < min_data) + { + continue; + } + + p_share_k[lx] = p_share_k[lx1]; + p_share_v[lx] = p_share_v[lx1]; + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_merge_int(num_stages, lx, local_data, local_indices); + + min_data = local_data[511]; + p_share_k[lx] = init_k; + p_share_v[lx] = init_v; + p_share_k[lx1] = init_k; + p_share_v[lx1] = init_v; + } + + if (width > limit) + { + if (coord.x < width) + { + int2 data; + data.x = read_imagei(input, coord.xy).x; + coord.z = coord.x + threads_per_block; + data.y = read_imagei(input, coord.zy).x; + + p_share_k[lx] = data.x; + p_share_v[lx] = coord.x; + + p_share_k[lx1] = coord.z < width ? data.y : init_k; + p_share_v[lx1] = coord.z < width ? coord.z : init_v; + } + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v); + + if (p_share_k[index_minus_1] >= min_data) + { + p_share_k[lx] = p_share_k[lx1]; + p_share_v[lx] = p_share_v[lx1]; + barrier(CLK_LOCAL_MEM_FENCE); + bitonic_merge_int(num_stages, lx, local_data, local_indices); + } + } + + int4 dst; + dst.x = local_data[lx]; + + coord.x = lx; + write_imagei(output, coord.xy, dst.xxxx); + + int4 index; + index.x = local_indices[lx]; + + write_imagei(indices, coord.xy, index.xxxx); +} + +__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32 +( + __read_only image2d_t input, + __write_only image2d_t output, + __write_only image2d_t indices, + float input_scale, + float input_tail, + float output_scale, + float output_tail, + int _num_stages, + int width + ) + { + uint lx = get_local_id(0); + const uint init_k = 0; + const int init_v = -2147483647; + const int num_stages = 9; + const int threads_per_block = BLOCK_SIZE; + const int index_minus_1 = threads_per_block * 2 - 1; + uint offset = 0; + uint lx1 = lx + threads_per_block; + + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); + + __local uint local_data[1536]; + __local int local_indices[1536]; + + uint left = read_imageui(input, coord.xy).x; + coord.z += threads_per_block; + uint right = read_imageui(input, coord.zy).x; + + local_data[lx] = left; + local_indices[lx] = coord.x; + local_data[lx1] = right; + local_indices[lx1] = coord.z; + + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_uint(num_stages, lx, local_data, local_indices); + + uint min_data = local_data[511]; + + uint *p_share_k = local_data + threads_per_block; + int *p_share_v = local_indices + threads_per_block; + + int limit = (width >> 10) << 10; + p_share_k[lx] = init_k; + p_share_v[lx] = init_v; + + p_share_k[lx1] = init_k; + p_share_v[lx1] = init_v; + barrier(CLK_LOCAL_MEM_FENCE); + + for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2) + { + uint2 data; + coord.z = coord.x + threads_per_block; + data.x = read_imageui(input, coord.xy).x; + data.y = read_imageui(input, coord.zy).x; + + p_share_k[lx] = data.x; + p_share_v[lx] = coord.x; + + p_share_k[lx1] = data.y; + p_share_v[lx1] = coord.z; + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v); + 
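+            /* The tile in p_share_k/p_share_v is now sorted ascending, so its largest
+               candidate sits at p_share_k[index_minus_1]. If even that value is below
+               min_data (the current 512th-largest element, held in local_data[511]),
+               the tile cannot alter the resident top-512 and is skipped; otherwise its
+               upper 512 entries are copied down and bitonic-merged with the top-512. */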
+ if (p_share_k[index_minus_1] < min_data) + { + continue; + } + + p_share_k[lx] = p_share_k[lx1]; + p_share_v[lx] = p_share_v[lx1]; + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_merge_uint(num_stages, lx, local_data, local_indices); + + min_data = local_data[511]; + p_share_k[lx] = init_k; + p_share_v[lx] = init_v; + p_share_k[lx1] = init_k; + p_share_v[lx1] = init_v; + } + + if (width > limit) + { + if (coord.x < width) + { + uint2 data; + data.x = read_imageui(input, coord.xy).x; + coord.z = coord.x + threads_per_block; + data.y = read_imageui(input, coord.zy).x; + + p_share_k[lx] = data.x; + p_share_v[lx] = coord.x; + + p_share_k[lx1] = coord.z < width ? data.y : init_k; + p_share_v[lx1] = coord.z < width ? coord.z : init_v; + } + barrier(CLK_LOCAL_MEM_FENCE); + + bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v); + + if (p_share_k[index_minus_1] >= min_data) + { + p_share_k[lx] = p_share_k[lx1]; + p_share_v[lx] = p_share_v[lx1]; + barrier(CLK_LOCAL_MEM_FENCE); + bitonic_merge_uint(num_stages, lx, local_data, local_indices); + } + } + + uint4 dst; + dst.x = local_data[lx]; + + coord.x = lx; + write_imageui(output, coord.xy, dst.xxxx); + + int4 index; + index.x = local_indices[lx]; + + write_imagei(indices, coord.xy, index.xxxx); +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array.vx new file mode 100644 index 00000000..6bce2234 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array.vx @@ -0,0 +1,344 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8; +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4; +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int channel; +_viv_uniform int input_zp; +_viv_uniform float in_out_scale; +_viv_uniform float in_out_zp_scale; +_viv_uniform float output_zp; +_viv_uniform int remainder; +_viv_uniform int w_size; + + +__kernel void cumsum_array_F16toF16_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + for(coord.z = 0; coord.z < channel; coord.z++) + { + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global 
vxc_short8*)output_ptr; + src = in_ptr[0]; + + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } +} + +#define CUMSUM_8BITS_ARRAY_AXIS2(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_array_##in_name##to##out_name##_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \ + \ + Tensor img1 = create_tensor_from_image2d_array(input, 1); \ + Tensor img2 = create_tensor_from_image2d_array(output, 1); \ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \ + { \ + coord.x = coord.x - (16 - remainder); \ + } \ + for(coord.z = 0; coord.z < channel; coord.z++) \ + { \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8);\ + out_ptr[0] = dst; \ + } \ +} +CUMSUM_8BITS_ARRAY_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_ARRAY_AXIS2(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_array_I16toI16_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + int4 sum0 = (int4)(0), sum1 = (int4)(0); + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + for(coord.z = 0; coord.z < channel; coord.z++) + { + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), 
uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8); + + out_ptr[0] = dst; + } +} + +__kernel void cumsum_array_F16toF16_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + for(coord.y = 0; coord.y < height; coord.y++) + { + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } +} + +#define CUMSUM_8BITS_ARRAY_AXIS1(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_array_##in_name##to##out_name##_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \ + Tensor img2 = create_tensor_from_image2d_array(output, 2); \ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \ + { \ + coord.x = coord.x - (16 - remainder); \ + } \ + \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); 
\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + out_ptr[0] = dst; \ + } \ +} +CUMSUM_8BITS_ARRAY_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_ARRAY_AXIS1(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_array_I16toI16_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + int4 sum0 = (int4)(0), sum1 = (int4)(0); + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + for(coord.y = 0; coord.y < height; coord.y++) + { + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + + out_ptr[0] = dst; + } +} + +__kernel void cumsum_array_F16toF16_axis0( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_short8 src, dst; + vxc_half8 data, tmpsum, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + + for(; coord.x < width; coord.x += 8) + { + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + _viv_asm(COPY, data, src, 16); + + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + + } +} + +#define CUMSUM_ARRAY_QINT_AXIS0(in_name, out_name, src_type, dst_type) \ +__kernel void 
cumsum_array_##in_name##to##out_name##_axis0( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + src_type src; \ + dst_type dst; \ + vxc_short8 rowSum; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0); \ + short zp = (short)input_zp; \ + \ + for(; coord.x < width; coord.x += 8) \ + { \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32B_4x4); \ + \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + out_ptr[0] = dst; \ + } \ +} + +CUMSUM_ARRAY_QINT_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_ARRAY_QINT_AXIS0(I8, I8, vxc_char16, vxc_char16) +CUMSUM_ARRAY_QINT_AXIS0(I16, I16, vxc_short8, vxc_short8) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_2d.vx new file mode 100644 index 00000000..83c11645 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_2d.vx @@ -0,0 +1,259 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8; +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4; +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int input_zp; +_viv_uniform float in_out_scale; +_viv_uniform float in_out_zp_scale; +_viv_uniform float output_zp; +_viv_uniform int remainder; +_viv_uniform int w_size; + + +__kernel void cumsum_array_F16toF16_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, 
sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + + Image img1 = create_image_from_image2d(input, 2); + Image img2 = create_image_from_image2d(output, 2); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + for(; coord.y < height; coord.y++) + { + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } +} + +#define CUMSUM_8BITS_ARRAY_AXIS1_2D(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_array_##in_name##to##out_name##_axis1_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0); \ + int4 sum1 = (int4)(0); \ + int4 sum2 = (int4)(0); \ + int4 sum3 = (int4)(0); \ + \ + Image img1 = create_image_from_image2d(input, 1); \ + Image img2 = create_image_from_image2d(output, 1); \ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \ + { \ + coord.x = coord.x - (16 - remainder); \ + } \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32D_4x4); \ + \ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + out_ptr[0] = dst; \ + } \ +} +CUMSUM_8BITS_ARRAY_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_ARRAY_AXIS1_2D(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_array_I16toI16_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src, dst; + int4 sum0 = (int4)(0), sum1 = (int4)(0); + + 
Image img1 = create_image_from_image2d(input, 2); + Image img2 = create_image_from_image2d(output, 2); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + + for(coord.y = 0; coord.y < height; coord.y++) + { + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniConvertInt32toUint8_2x8); + + out_ptr[0] = dst; + } +} + +__kernel void cumsum_array_F16toF16_axis0_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_short8 src, dst; + vxc_half8 data, tmpsum, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + Image img1 = create_image_from_image2d(input, 2); + Image img2 = create_image_from_image2d(output, 2); + for(; coord.x < width; coord.x += 8) + { + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + _viv_asm(COPY, data, src, 16); + + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniAccSumHorzF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } +} + +#define CUMSUM_ARRAY_QINT_AXIS0_2D(in_name, out_name, src_type, dst_type, stride_data) \ +__kernel void cumsum_array_##in_name##to##out_name##_axis0_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + src_type src; \ + dst_type dst; \ + vxc_short8 rowSum; \ + int4 sum0, sum1; \ + sum0 ^= sum0; \ + sum1 ^= sum1; \ + short zp = (short)input_zp; \ + Image img1 = create_image_from_image2d(input, stride_data); \ + Image img2 = create_image_from_image2d(output, stride_data); \ + \ + for(; coord.x < width; coord.x += 8) \ + { \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ 
+ __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniSubZpI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumHorzI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniAccSumHorzI16toI32B_4x4); \ + \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + out_ptr[0] = dst; \ + } \ +} + +CUMSUM_ARRAY_QINT_AXIS0_2D(U8, U8, vxc_uchar16, vxc_uchar16, 1) +CUMSUM_ARRAY_QINT_AXIS0_2D(I8, I8, vxc_char16, vxc_char16, 1) +CUMSUM_ARRAY_QINT_AXIS0_2D(I16, I16, vxc_short8, vxc_short8, 2) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_bf16.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_bf16.vx new file mode 100644 index 00000000..adc80187 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_bf16.vx @@ -0,0 +1,244 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8; +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8; +_viv_uniform VXC_512Bits uniExtractOddData_2x8; + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int channel; +_viv_uniform int remainder; +_viv_uniform int w_size; + + +__kernel void cumsum_array_BF16toBF16_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float4 sum0 = (float4)(0), sum1 = (float4)(0); + + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + + for(coord.z = 0; coord.z < channel; coord.z++) + { + float4 data0, data1; + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr; + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr; + src = in_ptr[0]; + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + + sum0 += data0; + sum1 += data1; + _viv_asm(COPY, dst0, sum0, 16); + _viv_asm(COPY, dst1, sum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + out_ptr[0] = dst; + } +} + +__kernel void cumsum_BF16toBF16_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), 
get_global_id(1), get_global_id(2), 0); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float4 sum0 = (float4)(0), sum1 = (float4)(0); + + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + + for(coord.y = 0; coord.y < height; coord.y++) + { + float4 data0, data1; + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr; + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr; + src = in_ptr[0]; + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + sum0 += data0; + sum1 += data1; + _viv_asm(COPY, dst0, sum0, 16); + _viv_asm(COPY, dst1, sum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + out_ptr[0] = dst; + } +} + +__kernel void cumsum_BF16toBF16_axis0( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float preSum = 0; + float4 one = (float4)(1.0, 1.0, 1.0, 1.0); + float4 q = (float4)(1.0, 1.0, 1.0, 0); + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + + for(; coord.x < width; coord.x += 8) + { + float4 data0, data1; + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr; + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr; + src = in_ptr[0]; + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + + float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one)); + float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one)); + tmpSum1 += tmpSum0.w; + + tmpSum0 += preSum; + tmpSum1 += preSum; + + preSum = tmpSum1.w; + + _viv_asm(COPY, dst0, tmpSum0, 16); + _viv_asm(COPY, dst1, tmpSum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8); + out_ptr[0] = dst; + } +} + +__kernel void cumsum_BF16toBF16_axis1_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), 0); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float4 sum0 = (float4)(0), sum1 = (float4)(0); + + Image img1 = create_image_from_image2d(input, 2); + Image img2 = 
create_image_from_image2d(output, 2); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + + for(; coord.y < height; coord.y++) + { + float4 data0, data1; + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr; + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr; + src = in_ptr[0]; + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + + sum0 += data0; + sum1 += data1; + + _viv_asm(COPY, dst0, sum0, 16); + _viv_asm(COPY, dst1, sum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtractOddData_2x8); + out_ptr[0] = dst; + } +} + +__kernel void cumsum_BF16toBF16_axis0_2D( + __read_only image2d_t input, + __write_only image2d_t output, + int axis, int exclusive, int rev + ) +{ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); + + vxc_ushort8 src, val0, val1; + vxc_ushort8 dst0, dst1, dst; + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0); + float preSum = 0; + float4 one = (float4)(1.0, 1.0, 1.0, 1.0); + float4 q = (float4)(1.0, 1.0, 1.0, 0); + + Image img1 = create_image_from_image2d(input, 2); + Image img2 = create_image_from_image2d(output, 2); + for(; coord.x < width; coord.x += 8) + { + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + float4 data0, data1; + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr; + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr; + src = in_ptr[0]; + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part0_2x8); + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniConvBF16toF32_Part1_2x8); + _viv_asm(COPY, data0, val0, 16); + _viv_asm(COPY, data1, val1, 16); + + float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one)); + float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one)); + tmpSum1 += tmpSum0.w; + + tmpSum0 += preSum; + tmpSum1 += preSum; + + preSum = tmpSum1.w; + + _viv_asm(COPY, dst0, tmpSum0, 16); + _viv_asm(COPY, dst1, tmpSum1, 16); + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniExtractOddData_2x8); + out_ptr[0] = dst; + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis0.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis0.vx new file mode 100644 index 00000000..78e33fbc --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis0.vx @@ -0,0 +1,259 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8; +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4; +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4; 
+_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzRevF16toF16_2x8; + +_viv_uniform VXC_512Bits uniSumHorzRevU8toI16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzRevU8toI16B_8x4; +_viv_uniform VXC_512Bits uniSubZpRevI16toI16_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32B_4x4; + + +_viv_uniform int width; +_viv_uniform int input_zp; +_viv_uniform float in_out_scale; +_viv_uniform float output_zp; + +_viv_uniform int remainder; +_viv_uniform int w_size; + + +__kernel void cumsum_ex_rev_array_F16toF16_axis0( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev + ) +{ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); + int4 coord_out = coord; + + vxc_short8 src, dst; + vxc_half8 data, tmpsum, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + if(exclusive == 0 && rev) + { + for(coord.x = width - 8; coord.x >= 0; coord.x -= 8) + { + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + input_ptr = get_tensor_ptr_from_coord(img1, coord); + output_ptr = get_tensor_ptr_from_coord(img2, coord); + in_ptr = (__global vxc_short8*)input_ptr; + out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + _viv_asm(COPY, data, src, 16); + + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniSumHorzRevF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } + } + else if(exclusive && rev == 0) + { + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + for(; coord.x < width - 8;) + { + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global vxc_short8*)input_ptr; + src = in_ptr[0]; + coord_out.x = coord.x + 1; + coord.x += 8; + _viv_asm(COPY, data, src, 16); + + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global vxc_short8*)output_ptr; + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } + } + else if(exclusive && rev) + { + coord.x = width 
- 8; + coord_out.x = width - 1; + _viv_asm(COPY, dst, sum, 16); + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); + out_ptr = (__global vxc_short8*)output_ptr; + out_ptr[0] = dst; + for(; coord.x > 0;) + { + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + input_ptr = get_tensor_ptr_from_coord(img1, coord); + output_ptr = get_tensor_ptr_from_coord(img2, coord); + in_ptr = (__global vxc_short8*)input_ptr; + out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + coord_out.x = coord.x - 1; + coord.x -= 8; + _viv_asm(COPY, data, src, 16); + + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4); + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4); + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), + uniSumHorzRevF16toF16C_2x8); + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } + } +} + +#define CUMSUM_QINT_EX_REV_ARRAY_AXIS0(in_name, out_name, src_type, dst_type, stride_data) \ +__kernel void cumsum_ex_rev_array_##in_name##to##out_name##_axis0( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); \ + int4 coord_out = coord; \ + \ + src_type src; \ + dst_type dst; \ + vxc_short8 rowSum; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0); \ + short zp = (short)input_zp; \ + \ + Tensor img1 = create_tensor_from_image2d_array(input, stride_data); \ + Tensor img2 = create_tensor_from_image2d_array(output, stride_data); \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + if(exclusive == 0 && rev) \ + { \ + for(coord.x = width - 8; coord.x >= 0; coord.x -= 8) \ + { \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzRevI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzRevI16toI32B_4x4); \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + for(coord.x = 
-1; coord.x < width - 8;) \ + { \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src = in_ptr[0]; \ + coord_out.x = coord.x + 1; \ + coord.x += 8; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \ + out_ptr = (__global dst_type*)output_ptr; \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzI16toI32B_4x4); \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive && rev) \ + { \ + for(coord.x = width - 7; coord.x > 0;) \ + { \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + coord_out.x = coord.x - 1; \ + coord.x -= 8; \ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzRevI16toI32A_4x4); \ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \ + uniAccSumHorzRevI16toI32B_4x4); \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ +} +CUMSUM_QINT_EX_REV_ARRAY_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, 1) +CUMSUM_QINT_EX_REV_ARRAY_AXIS0(I8, I8, vxc_char16, vxc_char16, 1) +CUMSUM_QINT_EX_REV_ARRAY_AXIS0(I16, I16, vxc_short8, vxc_short8, 2) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis1.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis1.vx new file mode 100644 index 00000000..5b548ec7 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis1.vx @@ -0,0 +1,330 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4; +_viv_uniform 
VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int height; +_viv_uniform float in_out_scale; +_viv_uniform float in_out_zp_scale; +_viv_uniform float output_zp; + +_viv_uniform int remainder; +_viv_uniform int w_size; + + +__kernel void cumsum_ex_rev_array_F16toF16_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + if(exclusive == 0 && rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + output_ptr = get_tensor_ptr_from_coord(img2, coord); + in_ptr = (__global vxc_short8*)input_ptr; + out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } + } + else if(exclusive && rev == 0) + { + dst ^= dst; + out_ptr[0] = dst; + for(; coord.y < height - 1;) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global vxc_short8*)input_ptr; + src = in_ptr[0]; + coord.y++; + _viv_asm(COPY, data, src, 16); + + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global vxc_short8*)output_ptr; + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } + } + else if(exclusive && rev) + { + dst ^= dst; + coord.y = height - 1; + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global vxc_short8*)output_ptr; + out_ptr[0] = dst; + + for(; coord.y > 0;) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global vxc_short8*)input_ptr; + src = in_ptr[0]; + coord.y--; + _viv_asm(COPY, data, src, 16); + + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global vxc_short8*)output_ptr; + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } + } +} + +#define CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_ex_rev_array_##in_name##to##out_name##_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \ + \ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \ + { \ + coord.x = coord.x - (16 - remainder); \ + } \ + Tensor img1 = create_tensor_from_image2d_array(input, 1); \ + Tensor img2 = create_tensor_from_image2d_array(output, 1); \ + uchar* input_ptr = 
get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + if(exclusive == 0 && rev) \ + { \ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + int tmpAlpha0 = convert_int_rte(output_zp); \ + int4 tmpVal; \ + tmpVal.x = tmpAlpha0; \ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \ + for(; coord.y < height - 1;) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src = in_ptr[0]; \ + coord.y++; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global dst_type*)output_ptr; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + out_ptr[0] = dst; \ + } \ + } \ + 
else if(exclusive && rev) \ + { \ + coord.y = height - 1; \ + int tmpAlpha0 = convert_int_rte(output_zp); \ + int4 tmpVal; \ + tmpVal.x = tmpAlpha0; \ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global vxc_short8*)output_ptr; \ + out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \ + for(; coord.y > 0;) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src = in_ptr[0]; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \ + coord.y--; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global dst_type*)output_ptr; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + out_ptr[0] = dst; \ + } \ + } \ +} +CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_ex_rev_array_I16toI16_axis1( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev) +{ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); + + vxc_short8 src, dst; + int4 sum0 = (int4)(0), sum1 = (int4)(0); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + if(exclusive == 0 && rev) + { + for(coord.y = height - 1; coord.y >= 0; coord.y--) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + output_ptr = get_tensor_ptr_from_coord(img2, coord); + in_ptr = (__global vxc_short8*)input_ptr; + out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = 
convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniConvertInt32toUint8_2x8); + out_ptr[0] = dst; + } + } + else if(exclusive && rev == 0) + { + int tmpAlpha0 = convert_int_rte(output_zp); + int4 tmpVal; + tmpVal.x = tmpAlpha0; + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + out_ptr[0] = dst.xxxxxxxx; + for(; coord.y < height - 1;) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global vxc_short8*)input_ptr; + src = in_ptr[0]; + coord.y++; + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global vxc_short8*)output_ptr; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniConvertInt32toUint8_2x8); + + out_ptr[0] = dst; + } + } + else if(exclusive && rev) + { + coord.y = height - 1; + int tmpAlpha0 = convert_int_rte(output_zp); + int4 tmpVal; + tmpVal.x = tmpAlpha0; + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global vxc_short8*)output_ptr; + out_ptr[0] = dst.xxxxxxxx; + for(; coord.y > 0;) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global vxc_short8*)input_ptr; + src = in_ptr[0]; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; + coord.y--; + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global vxc_short8*)output_ptr; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), + uniConvertInt32toUint8_2x8); + + out_ptr[0] = dst; + } + } +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis2.vx new file mode 100644 index 00000000..5d94783d --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_ex_rev_axis2.vx @@ -0,0 +1,322 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4; +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4; +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int channel; +_viv_uniform float in_out_scale; +_viv_uniform float in_out_zp_scale; +_viv_uniform float output_zp; + +_viv_uniform int remainder; +_viv_uniform int w_size; + + 
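+// Cumulative-sum kernels (exclusive / reverse variants) along axis 2, the
+// channel dimension. Each work-item handles one vector-wide column at its
+// (x, y) position, walking coord.z across `channel` while keeping a running
+// sum in registers; loads and stores go through the Tensor pointer helpers.
+// The (w_size, remainder) guard shifts the last partial vector back so the
+// full vector-width access stays in bounds, overlapping the previous vector.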
+__kernel void cumsum_ex_rev_array_F16toF16_axis2( + __read_only image2d_array_t input, + __write_only image2d_array_t output, + int axis, int exclusive, int rev) +{ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + + vxc_short8 src, dst; + vxc_half8 data, sum; + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) + { + coord.x = coord.x - (8 - remainder); + } + Tensor img1 = create_tensor_from_image2d_array(input, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; + if(rev && exclusive == 0) + { + for(coord.z = channel - 1; coord.z >= 0; coord.z--) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + output_ptr = get_tensor_ptr_from_coord(img2, coord); + in_ptr = (__global vxc_short8*)input_ptr; + out_ptr = (__global vxc_short8*)output_ptr; + src = in_ptr[0]; + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } + } + else if(rev == 0 && exclusive) + { + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + for(; coord.z < channel - 1;) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global vxc_short8*)input_ptr; + src = in_ptr[0]; + coord.z++; + _viv_asm(COPY, data, src, 16); + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } + } + else if(rev && exclusive) + { + _viv_asm(COPY, dst, sum, 16); + coord.z = channel - 1; + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global vxc_short8*)output_ptr; + out_ptr[0] = dst; + for(; coord.z > 0;) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global vxc_short8*)input_ptr; + src = in_ptr[0]; + coord.z--; + _viv_asm(COPY, data, src, 16); + + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global vxc_short8*)output_ptr; + + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); + _viv_asm(COPY, dst, sum, 16); + out_ptr[0] = dst; + } + } +} + +#define CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(in_name, out_name, src_type, dst_type) \ +__kernel void cumsum_ex_rev_array_##in_name##to##out_name##_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + \ + src_type src; \ + dst_type dst; \ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \ + \ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \ + { \ + coord.x = coord.x - (16 - remainder); \ + } \ + Tensor img1 = create_tensor_from_image2d_array(input, 1); \ + Tensor img2 = create_tensor_from_image2d_array(output, 1); \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global src_type* in_ptr = (__global src_type*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + if(rev && exclusive == 0) \ + { \ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \ + { \ + input_ptr = 
get_tensor_ptr_from_coord(img1, coord); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8);\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \ + uniConvertInt32toUint8_2x8);\ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + int tmpAlpha0 = convert_int_rte(output_zp); \ + int4 tmpVal; \ + tmpVal.x = tmpAlpha0; \ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \ + out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \ + for(; coord.z < channel - 1;) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global src_type*)input_ptr; \ + src = in_ptr[0]; \ + coord.z++; \ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ + float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp; \ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ + uniConvertInt32toUint8_2x8); \ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \ + uniConvertInt32toUint8_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ + else if(rev && exclusive) \ + { \ + coord.z = channel - 1; \ + int tmpAlpha0 = convert_int_rte(output_zp); \ + int4 tmpVal; \ + tmpVal.x = tmpAlpha0; \ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global vxc_short8*)output_ptr; \ + out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \ + for(; coord.z > 0;) \ + { \ + input_ptr = 
get_tensor_ptr_from_coord(img1, coord); \ +            in_ptr = (__global src_type*)input_ptr; \ +            src = in_ptr[0]; \ +            VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \ +            VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \ +            VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \ +            VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \ +            float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \ +            coord.z--; \ +            output_ptr = get_tensor_ptr_from_coord(img2, coord); \ +            out_ptr = (__global dst_type*)output_ptr; \ +            float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \ +            float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \ +            float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \ +            float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \ +            int4 tmpDst0 = convert_int4_rte(tmpSum0); \ +            int4 tmpDst1 = convert_int4_rte(tmpSum1); \ +            int4 tmpDst2 = convert_int4_rte(tmpSum2); \ +            int4 tmpDst3 = convert_int4_rte(tmpSum3); \ +            VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \ +            uniConvertInt32toUint8_2x8); \ +            VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \ +            uniConvertInt32toUint8_2x8); \ +            out_ptr[0] = dst; \ +        } \ +    } \ +} +CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16) +CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(I8, I8, vxc_char16, vxc_char16) + +__kernel void cumsum_ex_rev_array_I16toI16_axis2( +    __read_only image2d_array_t input, +    __write_only image2d_array_t output, +    int axis, int exclusive, int rev) +{ +    int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); + +    vxc_short8 src, dst; +    int4 sum0 = (int4)(0), sum1 = (int4)(0); +    if (coord.x == ((w_size >> 3) * 8) && remainder != 0) +    { +        coord.x = coord.x - (8 - remainder); +    } +    Tensor img1 = create_tensor_from_image2d_array(input, 2); +    Tensor img2 = create_tensor_from_image2d_array(output, 2); +    uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); +    uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); +    __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; +    __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr; +    if(exclusive == 0 && rev) +    { +        for(coord.z = channel - 1; coord.z >= 0; coord.z--) +        { +            input_ptr = get_tensor_ptr_from_coord(img1, coord); +            output_ptr = get_tensor_ptr_from_coord(img2, coord); +            in_ptr = (__global vxc_short8*)input_ptr; +            out_ptr = (__global vxc_short8*)output_ptr; +            src = in_ptr[0]; +            VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); +            VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); +            float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; +            float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; +            float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; +            int4 tmpDst0 = convert_int4_rte(tmpSum0); +            int4 tmpDst1 = convert_int4_rte(tmpSum1); +            VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), +            uniConvertInt32toUint8_2x8); + +            VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); +        } +    } +    else if(exclusive && rev == 0) +    { +        int tmpAlpha0 = convert_int_rte(output_zp); +        int4 tmpVal; +        tmpVal.x = tmpAlpha0; +        VXC_DP2x8(dst, tmpVal, tmpVal, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + out_ptr[0] = dst.xxxxxxxx; + for(; coord.z < channel - 1;) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global vxc_short8*)input_ptr; + src = in_ptr[0]; + coord.z++; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), + uniConvertInt32toUint8_2x8); + + out_ptr[0] = dst; + } + } + else if(exclusive && rev) + { + coord.z = channel - 1; + int tmpAlpha0 = convert_int_rte(output_zp); + int4 tmpVal; + tmpVal.x = tmpAlpha0; + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); + output_ptr = get_tensor_ptr_from_coord(img2, coord); + out_ptr = (__global vxc_short8*)output_ptr; + out_ptr[0] = dst.xxxxxxxx; + for(; coord.z > 0;) + { + input_ptr = get_tensor_ptr_from_coord(img1, coord); + in_ptr = (__global vxc_short8*)input_ptr; + src = in_ptr[0]; + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; + coord.z--; + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; + int4 tmpDst0 = convert_int4_rte(tmpSum0); + int4 tmpDst1 = convert_int4_rte(tmpSum1); + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), + uniConvertInt32toUint8_2x8); + + out_ptr[0] = dst; + } + } +} + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_f16_u8.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_f16_u8.vx new file mode 100644 index 00000000..41e9981f --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_f16_u8.vx @@ -0,0 +1,324 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int channel; + +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; + +_viv_uniform int remainder; +_viv_uniform int w_size; + + +#define CUMSUM_ARRAY_F16TOQINT_AXIS2(out_name, src_type, dst_type, stride_out) \ +__kernel void cumsum_array_F16to##out_name##_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, 
multAndoutZP0, 16); \ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + for(coord.z = 0; coord.z < channel; coord.z++) \ + { \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ +} +CUMSUM_ARRAY_F16TOQINT_AXIS2(I8, vxc_half8, vxc_char16, 1) +CUMSUM_ARRAY_F16TOQINT_AXIS2(I16, vxc_half8, vxc_short8, 2) +CUMSUM_ARRAY_F16TOQINT_AXIS2(U8, vxc_half8, vxc_uchar16, 1) + + +#define CUMSUM_ARRAY_F16TOQINT_AXIS1(out_name, src_type, dst_type, stride_out) \ +__kernel void cumsum_array_F16to##out_name##_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + for(coord.y = 0; coord.y < height; coord.y++) \ + { \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ +} +CUMSUM_ARRAY_F16TOQINT_AXIS1(I8, vxc_half8, vxc_char16, 1) +CUMSUM_ARRAY_F16TOQINT_AXIS1(I16, vxc_half8, vxc_short8, 2) +CUMSUM_ARRAY_F16TOQINT_AXIS1(U8, vxc_half8, vxc_uchar16, 1) + +#define CUMSUM_ARRAY_F16TOQINT_AXIS0(out_name, src_type, dst_type, stride_out) \ +__kernel void cumsum_array_F16to##out_name##_axis0( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, tmpsum, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \ + for(; coord.x < width; coord.x += 8) \ + { \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - 
remainder); \ + } \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); \ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4); \ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8); \ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ +} +CUMSUM_ARRAY_F16TOQINT_AXIS0(I8, vxc_half8, vxc_char16, 1) +CUMSUM_ARRAY_F16TOQINT_AXIS0(I16, vxc_half8, vxc_short8, 2) +CUMSUM_ARRAY_F16TOQINT_AXIS0(U8, vxc_half8, vxc_uchar16, 1) + +#define CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(out_name, src_type, dst_type, stride_out) \ +__kernel void cumsum_array_ex_rev_F16to##out_name##_axis2( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + if(exclusive == 0 && rev) \ + { \ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + in_ptr = (__global vxc_short8*)input_ptr; \ + out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + for(; coord.z < channel - 1;) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global vxc_short8*)input_ptr; \ + src = in_ptr[0]; \ + coord.z++; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global dst_type*)output_ptr; \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive && rev) \ + 
{ \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + coord.z = channel - 1; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global dst_type*)output_ptr; \ + out_ptr[0] = dst; \ + for(; coord.z > 0;) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global vxc_short8*)input_ptr; \ + src = in_ptr[0]; \ + coord.z--; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global dst_type*)output_ptr; \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ +} +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(I8, vxc_half8, vxc_char16, 1) +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(I16, vxc_half8, vxc_short8, 2) +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(U8, vxc_half8, vxc_uchar16, 1) + +#define CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(out_name, src_type, dst_type, stride_out) \ +__kernel void cumsum_array_ex_rev_F16to##out_name##_axis1( \ + __read_only image2d_array_t input, \ + __write_only image2d_array_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + if(exclusive == 0 && rev) \ + { \ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + in_ptr = (__global vxc_short8*)input_ptr; \ + out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive && rev == 0) \ + { \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + for(; coord.y < height - 1;) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global vxc_short8*)input_ptr; \ + src = in_ptr[0]; \ + coord.y++; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global dst_type*)output_ptr; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ + else if(exclusive && rev) \ + { \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, 
VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + coord.y = height - 1; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global dst_type*)output_ptr; \ + out_ptr[0] = dst; \ + for(; coord.y > 0;) \ + { \ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + in_ptr = (__global vxc_short8*)input_ptr; \ + src = in_ptr[0]; \ + coord.y--; \ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + out_ptr = (__global dst_type*)output_ptr; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ + } \ +} +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(I8, vxc_half8, vxc_char16, 1) +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(I16, vxc_half8, vxc_short8, 2) +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(U8, vxc_half8, vxc_uchar16, 1) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_f16_u8_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_f16_u8_2d.vx new file mode 100644 index 00000000..21d37e09 --- /dev/null +++ b/src/tim/vx/internal/src/libnnext/ops/vx/cumsum_array_f16_u8_2d.vx @@ -0,0 +1,108 @@ +#include "cl_viv_vx_ext.h" + +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8; +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4; +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8; +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8; + +_viv_uniform VXC_512Bits uniSetZeroF16_2x8; + +_viv_uniform int width; +_viv_uniform int height; +_viv_uniform int channel; + +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8; + +_viv_uniform int remainder; +_viv_uniform int w_size; + + +#define CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(out_name, src_type, dst_type, stride_out) \ +__kernel void cumsum_array_F16to##out_name##_axis1_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), 0); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + for(; coord.y < height; coord.y++) \ + { \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniAccSumVertF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ +} +CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(I8, vxc_half8, vxc_char16, 1) +CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(I16, vxc_half8, vxc_short8, 2) +CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(U8, vxc_half8, vxc_uchar16, 1) + +#define CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(out_name, src_type, 
dst_type, stride_out) \ +__kernel void cumsum_array_F16to##out_name##_axis0_2D( \ + __read_only image2d_t input, \ + __write_only image2d_t output, \ + int axis, int exclusive, int rev \ + ) \ +{ \ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \ + \ + vxc_short8 src; \ + dst_type dst; \ + vxc_half8 data, tmpsum, sum; \ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \ + for(; coord.x < width; coord.x += 8) \ + { \ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \ + { \ + coord.x = coord.x - (8 - remainder); \ + } \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \ + src = in_ptr[0]; \ + _viv_asm(COPY, data, src, 16); \ + \ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16A_4x4); \ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16B_4x4); \ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniSumHorzF16toF16C_2x8); \ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\ + uniAccSumHorzF16toF16_2x8); \ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \ + uniU8MulAndPostShift_0_Lo_2x8); \ + out_ptr[0] = dst; \ + } \ +} +CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16, 1) +CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8, 2) +CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16, 1) diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx index f6aa7c7c..77abb3b2 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd.vx @@ -92,3 +92,116 @@ __kernel void gather_nd_F16toF16_1D( VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } + +__kernel void gather_nd_array_I8toI8_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + coord.w = indice.x; + + Image img1 = create_image_from_image2d(input0, 1); + Image img2 = create_image_from_image2d(output, 1); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global char* data_ptr = (__global char*)input_ptr; + __global char* dst_ptr = (__global char*)output_ptr; + char src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_U8toU8_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = 
(int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + coord.w = indice.x; + + Image img1 = create_image_from_image2d(input0, 1); + Image img2 = create_image_from_image2d(output, 1); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global uchar* data_ptr = (__global uchar*)input_ptr; + __global uchar* dst_ptr = (__global uchar*)output_ptr; + uchar src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_I16toI16_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + coord.w = indice.x; + + Image img1 = create_image_from_image2d(input0, 2); + Image img2 = create_image_from_image2d(output, 2); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; + +} + +__kernel void gather_nd_array_F16toF16_1D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + coord.w = indice.x; + + Image img1 = create_image_from_image2d(input0, 2); + Image img2 = create_image_from_image2d(output, 2); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx index 74c1a229..eb127a58 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d.vx @@ -92,3 +92,116 @@ __kernel void gather_nd_F16toF16_2D( VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } + +__kernel void gather_nd_array_I8toI8_2D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + + Image img1 = create_image_from_image2d(input0, 1); + Image img2 = create_image_from_image2d(output, 1); + uchar* 
input_ptr = get_image_ptr_from_coord(img1, indice.xy); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global char* data_ptr = (__global char*)input_ptr; + __global char* dst_ptr = (__global char*)output_ptr; + char src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_U8toU8_2D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + + Image img1 = create_image_from_image2d(input0, 1); + Image img2 = create_image_from_image2d(output, 1); + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global uchar* data_ptr = (__global uchar*)input_ptr; + __global uchar* dst_ptr = (__global uchar*)output_ptr; + uchar src = data_ptr[0]; + dst_ptr[0] = src; + +} + +__kernel void gather_nd_array_I16toI16_2D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + + Image img1 = create_image_from_image2d(input0, 2); + Image img2 = create_image_from_image2d(output, 2); + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_F16toF16_2D( + __read_only image2d_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + + Image img1 = create_image_from_image2d(input0, 2); + Image img2 = create_image_from_image2d(output, 2); + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx index e45482c7..175b4785 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_2d_mix.vx @@ -80,3 +80,85 @@ __kernel void gather_nd_F16to##src1_type_name##_2D( \ GATHER_ND_F16_TO_QINT_2D(U8, vxc_uchar16) GATHER_ND_F16_TO_QINT_2D(I8, vxc_char16) GATHER_ND_F16_TO_QINT_2D(I16, vxc_short8) + +#define 
GATHER_ND_ARRAY_QINT_TO_F16_2D(src0_type_name, read_type, ptr_type, stride) \ +__kernel void gather_nd_array_##src0_type_name##toF16_2D( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int4 coord = (int4)(0, gidy, gidx, 0); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ + indice.x = indice.x * block_size + gidx; \ + \ + Image img1 = create_image_from_image2d(input0, stride); \ + Image img2 = create_image_from_image2d(output, 2); \ + \ + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); \ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \ + \ + __global ptr_type data_ptr = (__global ptr_type)input_ptr; \ + __global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \ + read_type src = data_ptr[0]; \ + \ + vxc_half8 src0; \ + vxc_short8 dst0; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst0, src0, 16); \ + dst_ptr[0] = dst0; \ +} +GATHER_ND_ARRAY_QINT_TO_F16_2D(U8, vxc_uchar16, vxc_uchar16*, 1) +GATHER_ND_ARRAY_QINT_TO_F16_2D(I8, vxc_char16, vxc_char16*, 1) +GATHER_ND_ARRAY_QINT_TO_F16_2D(I16, vxc_short8, vxc_short8*, 2) + +#define GATHER_ND_ARRAY_F16_TO_QINT_2D(src1_type_name, write_type, ptr_type, stride) \ +__kernel void gather_nd_array_F16to##src1_type_name##_2D( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int4 coord = (int4)(0, gidy, gidx, 0); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ + indice.x = indice.x * block_size + gidx; \ + \ + Image img1 = create_image_from_image2d(input0, 2); \ + Image img2 = create_image_from_image2d(output, stride); \ + \ + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); \ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \ + \ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \ + __global ptr_type dst_ptr = (__global ptr_type )output_ptr; \ + vxc_short8 src = data_ptr[0]; \ + \ + vxc_ushort8 mp1; \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + vxc_half8 data; \ + write_type dst; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven,1),uniConvertFp16toU8_2x8); \ + dst_ptr[0] = dst; \ +} +GATHER_ND_ARRAY_F16_TO_QINT_2D(U8, vxc_uchar16, vxc_uchar16*, 1) +GATHER_ND_ARRAY_F16_TO_QINT_2D(I8, vxc_char16, vxc_char16*, 1) +GATHER_ND_ARRAY_F16_TO_QINT_2D(I16, vxc_short8, vxc_short8*, 2) \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx index 566aaa55..7cf0cb89 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d.vx @@ -98,3 +98,120 @@ __kernel void gather_nd_F16toF16_3D( VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } +__kernel void gather_nd_array_I8toI8_3D( + __read_only image2d_array_t input0, + 
__read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.w = 0; + + Tensor img1 = create_tensor_from_image2d_array(input0, 1); + Image img2 = create_image_from_image2d(output, 1); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global char* data_ptr = (__global char*)input_ptr; + __global char* dst_ptr = (__global char*)output_ptr; + char src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_U8toU8_3D( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.w = 0; + + Tensor img1 = create_tensor_from_image2d_array(input0, 1); + Image img2 = create_image_from_image2d(output, 1); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global uchar* data_ptr = (__global uchar*)input_ptr; + __global uchar* dst_ptr = (__global uchar*)output_ptr; + uchar src = data_ptr[0]; + dst_ptr[0] = src; + +} + +__kernel void gather_nd_array_I16toI16_3D( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.w = 0; + + Tensor img1 = create_tensor_from_image2d_array(input0, 2); + Image img2 = create_image_from_image2d(output, 2); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_F16toF16_3D( + __read_only image2d_array_t input0, + __read_only image2d_t input1, + __write_only image2d_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // indices_num + + int4 coord = (int4)(0, gidy, gidx, 0); + Image img = create_image_from_image2d(input1, 4); + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.w = 0; + + Tensor img1 = create_tensor_from_image2d_array(input0, 2); + Image img2 = create_image_from_image2d(output, 2); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); + __global short* 
data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx index e9ca9ecd..28397fe4 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_3d_mix.vx @@ -80,3 +80,86 @@ GATHER_ND_F16_TO_QINT_3D(U8, vxc_uchar16) GATHER_ND_F16_TO_QINT_3D(I8, vxc_char16) GATHER_ND_F16_TO_QINT_3D(I16, vxc_short8) +#define GATHER_ND_ARRAY_QINT_TO_F16_3D(src0_type_name, read_type, ptr_type, stride) \ +__kernel void gather_nd_array_##src0_type_name##toF16_3D( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int4 coord = (int4)(0, gidy, gidx, 0); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ + indice.x = indice.x * block_size + gidx; \ + indice.w = 0; \ + Tensor img1 = create_tensor_from_image2d_array(input0, stride); \ + Image img2 = create_image_from_image2d(output, 2); \ + \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); \ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \ + \ + __global ptr_type data_ptr = (__global ptr_type)input_ptr; \ + __global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \ + read_type src = data_ptr[0]; \ + \ + vxc_half8 src0; \ + vxc_short8 dst0; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst0, src0, 16); \ + dst_ptr[0] = dst0; \ +} +GATHER_ND_ARRAY_QINT_TO_F16_3D(U8, vxc_uchar16, vxc_uchar16*, 1) +GATHER_ND_ARRAY_QINT_TO_F16_3D(I8, vxc_char16, vxc_char16*, 1) +GATHER_ND_ARRAY_QINT_TO_F16_3D(I16, vxc_short8, vxc_short8*, 2) + +#define GATHER_ND_ARRAY_F16_TO_QINT_3D(src1_type_name, write_type, ptr_type, stride) \ +__kernel void gather_nd_array_F16to##src1_type_name##_3D( \ + __read_only image2d_array_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int4 coord = (int4)(0, gidy, gidx, 0); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ + indice.x = indice.x * block_size + gidx; \ + indice.w = 0; \ + \ + Tensor img1 = create_tensor_from_image2d_array(input0, 2); \ + Image img2 = create_image_from_image2d(output, stride); \ + \ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); \ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \ + \ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \ + __global ptr_type dst_ptr = (__global ptr_type )output_ptr; \ + vxc_short8 src = data_ptr[0]; \ + \ + vxc_ushort8 mp1; \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + vxc_half8 data; \ + write_type dst; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven,1), uniConvertFp16toU8_2x8); \ + dst_ptr[0] = dst; \ +} +GATHER_ND_ARRAY_F16_TO_QINT_3D(U8, vxc_uchar16, vxc_uchar16*, 1) 
+GATHER_ND_ARRAY_F16_TO_QINT_3D(I8, vxc_char16, vxc_char16*, 1) +GATHER_ND_ARRAY_F16_TO_QINT_3D(I16, vxc_short8, vxc_short8*, 2) + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx index e467f252..b3632383 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch.vx @@ -95,3 +95,118 @@ __kernel void gather_nd_batch_F16toF16_1D( VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } + +__kernel void gather_nd_array_batch_I8toI8_1D( + __read_only image2d_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num + + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); + int4 indice = ((int4 *)indice_ptr)[0]; + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); + + Image img1 = create_image_from_image2d(input0, 1); + Tensor img2 = create_tensor_from_image2d_array(output, 1); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord0); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global char* data_ptr = (__global char*)input_ptr; + __global char* dst_ptr = (__global char*)output_ptr; + char src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_batch_U8toU8_1D( + __read_only image2d_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num + + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); + int4 indice = ((int4 *)indice_ptr)[0]; + + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); + + Image img1 = create_image_from_image2d(input0, 1); + Tensor img2 = create_tensor_from_image2d_array(output, 1); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord0); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global uchar* data_ptr = (__global uchar*)input_ptr; + __global uchar* dst_ptr = (__global uchar*)output_ptr; + uchar src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_batch_I16toI16_1D( + __read_only image2d_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num + + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); + int4 indice = ((int4 *)indice_ptr)[0]; + + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); + + Image img1 = create_image_from_image2d(input0, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord0); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global short* data_ptr 
= (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_batch_F16toF16_1D( + __read_only image2d_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num + + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); + int4 indice = ((int4 *)indice_ptr)[0]; + + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz); + + Image img1 = create_image_from_image2d(input0, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + uchar* input_ptr = get_image_ptr_from_coord(img1, coord0); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; +} \ No newline at end of file diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx index 58c2af34..8e52eeac 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_batch_2d.vx @@ -26,7 +26,7 @@ __kernel void gather_nd_batch_I8toI8_2D( VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } -__kernel void gather_nd_U8toU8_2D( +__kernel void gather_nd_batch_U8toU8_2D( __read_only image2d_array_t input0, __read_only image2d_array_t input1, __write_only image2d_array_t output, @@ -51,7 +51,7 @@ __kernel void gather_nd_U8toU8_2D( VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } -__kernel void gather_nd_I16toI16_2D( +__kernel void gather_nd_batch_I16toI16_2D( __read_only image2d_array_t input0, __read_only image2d_array_t input1, __write_only image2d_array_t output, @@ -76,7 +76,7 @@ __kernel void gather_nd_I16toI16_2D( VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } -__kernel void gather_nd_F16toF16_2D( +__kernel void gather_nd_batch_F16toF16_2D( __read_only image2d_array_t input0, __read_only image2d_array_t input1, __write_only image2d_array_t output, @@ -100,3 +100,123 @@ __kernel void gather_nd_F16toF16_2D( VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0)); } + +__kernel void gather_nd_array_batch_I8toI8_2D( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num + + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.zw = coord.zw; + + Tensor img1 = create_tensor_from_image2d_array(input0, 1); + Tensor img2 = create_tensor_from_image2d_array(output, 1); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); + uchar* 
output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global char* data_ptr = (__global char*)input_ptr; + __global char* dst_ptr = (__global char*)output_ptr; + char src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_batch_U8toU8_2D( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num + + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.zw = coord.zw; + + Tensor img1 = create_tensor_from_image2d_array(input0, 1); + Tensor img2 = create_tensor_from_image2d_array(output, 1); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global uchar* data_ptr = (__global uchar*)input_ptr; + __global uchar* dst_ptr = (__global uchar*)output_ptr; + uchar src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_batch_I16toI16_2D( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num + + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.zw = coord.zw; + + Tensor img1 = create_tensor_from_image2d_array(input0, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; +} + +__kernel void gather_nd_array_batch_F16toF16_2D( + __read_only image2d_array_t input0, + __read_only image2d_array_t input1, + __write_only image2d_array_t output, + int block_size, + int coord_dim + ) +{ + int gidx = get_global_id(0); // block_size + int gidy = get_global_id(1); // index num + int gidz = get_global_id(2); // batch num + + int4 coord = (int4)(gidx, gidy, gidz, 0); + Tensor img = create_tensor_from_image2d_array(input1, 4); + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw); + int4 indice = ((int4 *)indice_ptr)[0]; + + indice.x = indice.x * block_size + gidx; + indice.zw = coord.zw; + + Tensor img1 = create_tensor_from_image2d_array(input0, 2); + Tensor img2 = create_tensor_from_image2d_array(output, 2); + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); + __global short* data_ptr = (__global short*)input_ptr; + __global short* dst_ptr = (__global short*)output_ptr; + short src = data_ptr[0]; + dst_ptr[0] = src; +} diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx index 8288ab05..b4660c29 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx +++ 
b/src/tim/vx/internal/src/libnnext/ops/vx/gather_nd_mix.vx @@ -81,3 +81,85 @@ GATHER_ND_F16_TO_QINT_1D(U8, vxc_uchar16) GATHER_ND_F16_TO_QINT_1D(I8, vxc_char16) GATHER_ND_F16_TO_QINT_1D(I16, vxc_short8) +#define GATHER_ND_ARRAY_QINT_TO_F16_1D(src0_type_name, read_type, ptr_type, stride) \ +__kernel void gather_nd_array_##src0_type_name##toF16_1D( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int4 coord = (int4)(0, gidy, gidx, 0); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ + coord.w = indice.x; \ + \ + Image img1 = create_image_from_image2d(input0, stride); \ + Image img2 = create_image_from_image2d(output, 2); \ + \ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); \ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \ + \ + __global ptr_type data_ptr = (__global ptr_type)input_ptr; \ + __global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \ + read_type src = data_ptr[0]; \ + \ + vxc_half8 src0; \ + vxc_short8 dst0; \ + vxc_ushort8 ms0; \ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \ + VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \ + _viv_asm(COPY, dst0, src0, 16); \ + dst_ptr[0] = dst0; \ +} +GATHER_ND_ARRAY_QINT_TO_F16_1D(U8, vxc_uchar16, vxc_uchar16*, 1) +GATHER_ND_ARRAY_QINT_TO_F16_1D(I8, vxc_char16, vxc_char16*, 1) +GATHER_ND_ARRAY_QINT_TO_F16_1D(I16, vxc_short8, vxc_short8*, 2) + +#define GATHER_ND_ARRAY_F16_TO_QINT_1D(src1_type_name, write_type, ptr_type, stride) \ +__kernel void gather_nd_array_F16to##src1_type_name##_1D( \ + __read_only image2d_t input0, \ + __read_only image2d_t input1, \ + __write_only image2d_t output, \ + int block_size, \ + int coord_dim \ + ) \ +{ \ + int gidx = get_global_id(0); \ + int gidy = get_global_id(1); \ + \ + int4 coord = (int4)(0, gidy, gidx, 0); \ + Image img = create_image_from_image2d(input1, 4); \ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \ + int4 indice = ((int4 *)indice_ptr)[0]; \ + \ + coord.w = indice.x; \ + \ + Image img1 = create_image_from_image2d(input0, 2); \ + Image img2 = create_image_from_image2d(output, stride); \ + \ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); \ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \ + \ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \ + __global ptr_type dst_ptr = (__global ptr_type )output_ptr; \ + vxc_short8 src = data_ptr[0]; \ + vxc_ushort8 mp1; \ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \ + vxc_half8 data; \ + write_type dst; \ + _viv_asm(COPY, data, src, 16); \ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \ + dst_ptr[0] = dst; \ +} +GATHER_ND_ARRAY_F16_TO_QINT_1D(U8, vxc_uchar16, vxc_uchar16*, 1) +GATHER_ND_ARRAY_F16_TO_QINT_1D(I8, vxc_char16, vxc_char16*, 1) +GATHER_ND_ARRAY_F16_TO_QINT_1D(I16, vxc_short8, vxc_short8*, 2) + + diff --git a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray_2.vx b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray_2.vx index 3396163a..92cd9fba 100644 --- a/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray_2.vx +++ b/src/tim/vx/internal/src/libnnext/ops/vx/pre_process_gray_2.vx @@ -65,5 +65,5 @@ __kernel void pre_process_gray_half_U8toU8 
coord_in.xy = coord_in.xy >> 1; - VXC_WriteImage(output, coord_in.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0)); + VXC_WriteImage(output, coord_in.xy, src0.s02468ace, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); } diff --git a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c index debd6873..5d4159ac 100644 --- a/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c +++ b/src/tim/vx/internal/src/libnnext/vsi_nn_libnnext_resource.c @@ -6431,7 +6431,613 @@ CUMSUM_QINT_AXIS0_2D(I8, I8, vxc_char16, vxc_char16)\n\ CUMSUM_QINT_AXIS0_2D(I16, I16, vxc_short8, vxc_short8)\n\ "; /* end of cumsum_2d_vx*/ -static const char cumsum_bf16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char cumsum_array_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;\n\ +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int channel;\n\ +_viv_uniform int input_zp;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float in_out_zp_scale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int remainder;\n\ +_viv_uniform int w_size;\n\ +\n\ +\n\ +__kernel void cumsum_array_F16toF16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ +\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + out_ptr[0] = dst;\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_ARRAY_AXIS2(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_array_##in_name##to##out_name##_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ 
+{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ + \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 1); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 1); \\\n\ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (16 - remainder); \\\n\ + } \\\n\ + for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ + { \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8);\\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ +}\n\ +CUMSUM_8BITS_ARRAY_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_ARRAY_AXIS2(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void cumsum_array_I16toI16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(coord.z + 1) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + 
tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1), uniConvertInt32toUint8_2x8);\n\ +\n\ + out_ptr[0] = dst;\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_array_F16toF16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + out_ptr[0] = dst;\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_ARRAY_AXIS1(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_array_##in_name##to##out_name##_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2); \\\n\ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (16 - remainder); \\\n\ + } \\\n\ + \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = 
convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ +}\n\ +CUMSUM_8BITS_ARRAY_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_ARRAY_AXIS1(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void cumsum_array_I16toI16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ +\n\ + out_ptr[0] = dst;\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_array_F16toF16_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, tmpsum, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ +\n\ + for(; coord.x < width; coord.x += 8)\n\ + {\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), 
uniSumHorzF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + out_ptr[0] = dst;\n\ +\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_ARRAY_QINT_AXIS0(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_array_##in_name##to##out_name##_axis0( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + vxc_short8 rowSum; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0); \\\n\ + short zp = (short)input_zp; \\\n\ + \\\n\ + for(; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumHorzI16toI32B_4x4); \\\n\ + \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ +}\n\ +\n\ +CUMSUM_ARRAY_QINT_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_ARRAY_QINT_AXIS0(I8, I8, vxc_char16, vxc_char16)\n\ +CUMSUM_ARRAY_QINT_AXIS0(I16, I16, vxc_short8, vxc_short8)\n\ +"; /* end of cumsum_array_vx*/ + +static const char cumsum_array_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;\n\ +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int input_zp;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float 
in_out_zp_scale;\n\ +_viv_uniform float output_zp;\n\ +_viv_uniform int remainder;\n\ +_viv_uniform int w_size;\n\ +\n\ +\n\ +__kernel void cumsum_array_F16toF16_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ +\n\ + Image img1 = create_image_from_image2d(input, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + for(; coord.y < height; coord.y++)\n\ + {\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + out_ptr[0] = dst;\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_ARRAY_AXIS1_2D(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_array_##in_name##to##out_name##_axis1_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0); \\\n\ + int4 sum1 = (int4)(0); \\\n\ + int4 sum2 = (int4)(0); \\\n\ + int4 sum3 = (int4)(0); \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(input, 1); \\\n\ + Image img2 = create_image_from_image2d(output, 1); \\\n\ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (16 - remainder); \\\n\ + } \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32D_4x4); \\\n\ + \\\n\ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); 
\\\n\ + \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ +}\n\ +CUMSUM_8BITS_ARRAY_AXIS1_2D(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_ARRAY_AXIS1_2D(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void cumsum_array_I16toI16_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ +\n\ + Image img1 = create_image_from_image2d(input, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ +\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(coord.y + 1) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + out_ptr[0] = dst;\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_array_F16toF16_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, tmpsum, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + Image img1 = create_image_from_image2d(input, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + for(; coord.x < width; coord.x += 8)\n\ + {\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumHorzF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + out_ptr[0] = dst;\n\ + }\n\ +}\n\ 
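Editor's note (illustrative sketch, not part of the patch): the axis-0 cumsum kernel above and the quantized CUMSUM_ARRAY_QINT_AXIS0_2D variants that follow walk each row eight (or sixteen) elements at a time, keep a running horizontal prefix sum in registers, and handle a row width that is not a multiple of the vector size by stepping the final x coordinate back by `8 - remainder` so the last load/store stays inside the row. The quantized path subtracts the input zero point, accumulates in int32, then rescales with `in_out_scale` and `output_zp` and converts with round-to-nearest-even. The C sketch below mirrors only that scalar arithmetic and the tail index fix-up; the names `clamp_tail_x` and `cumsum_axis0_ref` are hypothetical helpers introduced here for illustration, and the real kernels additionally carry an 8-wide partial sum across loop iterations.

/* Scalar reference sketch for the quantized axis-0 cumulative sum arithmetic
 * used by the CUMSUM_ARRAY_QINT_AXIS0* kernels.  Hypothetical helper names;
 * not part of the TIM-VX patch. */
#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Mirror of the in-kernel tail fix-up: if x points at the last, partial
 * vector of `vec` elements, pull it back so the vector ends exactly at
 * `width` instead of reading past the end of the row. */
static int clamp_tail_x(int x, int width, int vec)
{
    int remainder = width % vec;
    if (remainder != 0 && x == (width / vec) * vec)
        x -= vec - remainder;
    return x;
}

/* out[x] = saturate_u8(round((sum_{i<=x} (in[i] - input_zp)) * in_out_scale + output_zp)) */
static void cumsum_axis0_ref(const uint8_t *in, uint8_t *out, int width,
                             int input_zp, float in_out_scale, float output_zp)
{
    int32_t acc = 0;
    for (int x = 0; x < width; ++x)
    {
        acc += (int32_t)in[x] - input_zp;      /* remove zero point, accumulate in int32 */
        float v = (float)acc * in_out_scale + output_zp;
        long q = lrintf(v);                    /* round to nearest (ties to even by default) */
        if (q < 0)   q = 0;                    /* saturate to the U8 output range */
        if (q > 255) q = 255;
        out[x] = (uint8_t)q;
    }
}

int main(void)
{
    uint8_t in[11] = {3, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10};
    uint8_t out[11];
    cumsum_axis0_ref(in, out, 11, 3, 1.0f, 0.0f);
    /* width = 11 with an 8-wide vector: the second iteration would start at
     * x = 8 and is pulled back to x = 3 so the window ends at element 10. */
    printf("clamped tail x = %d, out[10] = %d\n",
           clamp_tail_x(8, 11, 8), (int)out[10]);
    return 0;
}

For the float16 kernel above, the same tail clamp makes the last vector overlap elements that were already written; the overlapped positions are simply rewritten, so only the store pattern changes, not the per-element formula sketched here.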
+\n\ +#define CUMSUM_ARRAY_QINT_AXIS0_2D(in_name, out_name, src_type, dst_type, stride_data) \\\n\ +__kernel void cumsum_array_##in_name##to##out_name##_axis0_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + vxc_short8 rowSum; \\\n\ + int4 sum0, sum1; \\\n\ + sum0 ^= sum0; \\\n\ + sum1 ^= sum1; \\\n\ + short zp = (short)input_zp; \\\n\ + Image img1 = create_image_from_image2d(input, stride_data); \\\n\ + Image img2 = create_image_from_image2d(output, stride_data); \\\n\ + \\\n\ + for(; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSubZpI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumHorzI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumHorzI16toI32B_4x4); \\\n\ + \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ +}\n\ +\n\ +CUMSUM_ARRAY_QINT_AXIS0_2D(U8, U8, vxc_uchar16, vxc_uchar16, 1)\n\ +CUMSUM_ARRAY_QINT_AXIS0_2D(I8, I8, vxc_char16, vxc_char16, 1)\n\ +CUMSUM_ARRAY_QINT_AXIS0_2D(I16, I16, vxc_short8, vxc_short8, 2)"; /* end of cumsum_array_2d_vx*/ + +static const char cumsum_array_bf16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ _viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ @@ -6440,8 +7046,11 @@ _viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ _viv_uniform int width;\n\ _viv_uniform int height;\n\ _viv_uniform int channel;\n\ +_viv_uniform int remainder;\n\ +_viv_uniform int w_size;\n\ \n\ -__kernel void cumsum_BF16toBF16_axis2(\n\ +\n\ +__kernel void cumsum_array_BF16toBF16_axis2(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ int axis, int exclusive, int rev\n\ @@ -6453,11 +7062,22 @@ __kernel void cumsum_BF16toBF16_axis2(\n\ vxc_ushort8 dst0, dst1, dst;\n\ vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ float4 sum0 = (float4)(0), sum1 = (float4)(0);\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ \n\ for(coord.z = 0; coord.z < channel; 
coord.z++)\n\ {\n\ float4 data0, data1;\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;\n\ + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;\n\ + src = in_ptr[0];\n\ VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ _viv_asm(COPY, data0, val0, 16);\n\ @@ -6468,7 +7088,7 @@ __kernel void cumsum_BF16toBF16_axis2(\n\ _viv_asm(COPY, dst0, sum0, 16);\n\ _viv_asm(COPY, dst1, sum1, 16);\n\ VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ \n\ @@ -6484,11 +7104,22 @@ __kernel void cumsum_BF16toBF16_axis1(\n\ vxc_ushort8 dst0, dst1, dst;\n\ vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ float4 sum0 = (float4)(0), sum1 = (float4)(0);\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ \n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ float4 data0, data1;\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;\n\ + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;\n\ + src = in_ptr[0];\n\ VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ _viv_asm(COPY, data0, val0, 16);\n\ @@ -6498,7 +7129,7 @@ __kernel void cumsum_BF16toBF16_axis1(\n\ _viv_asm(COPY, dst0, sum0, 16);\n\ _viv_asm(COPY, dst1, sum1, 16);\n\ VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ \n\ @@ -6516,11 +7147,21 @@ __kernel void cumsum_BF16toBF16_axis0(\n\ float preSum = 0;\n\ float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ float4 q = (float4)(1.0, 1.0, 1.0, 0);\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ \n\ for(; coord.x < width; coord.x += 8)\n\ {\n\ float4 data0, data1;\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;\n\ + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;\n\ + src = in_ptr[0];\n\ VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ 
VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ _viv_asm(COPY, data0, val0, 16);\n\ @@ -6538,7 +7179,7 @@ __kernel void cumsum_BF16toBF16_axis0(\n\ _viv_asm(COPY, dst0, tmpSum0, 16);\n\ _viv_asm(COPY, dst1, tmpSum1, 16);\n\ VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ \n\ @@ -6554,11 +7195,22 @@ __kernel void cumsum_BF16toBF16_axis1_2D(\n\ vxc_ushort8 dst0, dst1, dst;\n\ vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ float4 sum0 = (float4)(0), sum1 = (float4)(0);\n\ +\n\ + Image img1 = create_image_from_image2d(input, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ \n\ for(; coord.y < height; coord.y++)\n\ {\n\ float4 data0, data1;\n\ - VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;\n\ + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;\n\ + src = in_ptr[0];\n\ VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ uniConvBF16toF32_Part0_2x8);\n\ VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ @@ -6573,7 +7225,7 @@ __kernel void cumsum_BF16toBF16_axis1_2D(\n\ _viv_asm(COPY, dst1, sum1, 16);\n\ VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ uniExtractOddData_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ \n\ @@ -6592,10 +7244,20 @@ __kernel void cumsum_BF16toBF16_axis0_2D(\n\ float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ float4 q = (float4)(1.0, 1.0, 1.0, 0);\n\ \n\ + Image img1 = create_image_from_image2d(input, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ for(; coord.x < width; coord.x += 8)\n\ {\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ float4 data0, data1;\n\ - VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global vxc_ushort8* in_ptr = (__global vxc_ushort8*)input_ptr;\n\ + __global vxc_ushort8* out_ptr = (__global vxc_ushort8*)output_ptr;\n\ + src = in_ptr[0];\n\ VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ uniConvBF16toF32_Part0_2x8);\n\ VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ @@ -6616,12 +7278,12 @@ __kernel void cumsum_BF16toBF16_axis0_2D(\n\ _viv_asm(COPY, dst1, tmpSum1, 16);\n\ VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ uniExtractOddData_2x8);\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ -"; /* end of cumsum_bf16_vx*/ +"; /* end of cumsum_array_bf16_vx*/ -static const char cumsum_ex_rev_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char cumsum_array_ex_rev_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ \n\ @@ 
-6654,7 +7316,11 @@ _viv_uniform int input_zp;\n\ _viv_uniform float in_out_scale;\n\ _viv_uniform float output_zp;\n\ \n\ -__kernel void cumsum_ex_rev_F16toF16_axis0(\n\ +_viv_uniform int remainder;\n\ +_viv_uniform int w_size;\n\ +\n\ +\n\ +__kernel void cumsum_ex_rev_array_F16toF16_axis0(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ int axis, int exclusive, int rev\n\ @@ -6666,11 +7332,26 @@ __kernel void cumsum_ex_rev_F16toF16_axis0(\n\ vxc_short8 src, dst;\n\ vxc_half8 data, tmpsum, sum;\n\ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ if(exclusive == 0 && rev)\n\ {\n\ for(coord.x = width - 8; coord.x >= 0; coord.x -= 8)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ _viv_asm(COPY, data, src, 16);\n\ \n\ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);\n\ @@ -6679,26 +7360,34 @@ __kernel void cumsum_ex_rev_F16toF16_axis0(\n\ uniSumHorzRevF16toF16C_2x8);\n\ VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive && rev == 0)\n\ {\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ for(; coord.x < width - 8;)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + src = in_ptr[0];\n\ coord_out.x = coord.x + 1;\n\ coord.x += 8;\n\ _viv_asm(COPY, data, src, 16);\n\ \n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4);\n\ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4);\n\ VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8);\n\ VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8);\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive && rev)\n\ @@ -6706,10 +7395,20 @@ __kernel void cumsum_ex_rev_F16toF16_axis0(\n\ coord.x = width - 8;\n\ coord_out.x 
= width - 1;\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + out_ptr[0] = dst;\n\ for(; coord.x > 0;)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ coord_out.x = coord.x - 1;\n\ coord.x -= 8;\n\ _viv_asm(COPY, data, src, 16);\n\ @@ -6720,13 +7419,13 @@ __kernel void cumsum_ex_rev_F16toF16_axis0(\n\ uniSumHorzRevF16toF16C_2x8);\n\ VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ }\n\ \n\ -#define CUMSUM_QINT_EX_REV_AXIS0(in_name, out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis0( \\\n\ +#define CUMSUM_QINT_EX_REV_ARRAY_AXIS0(in_name, out_name, src_type, dst_type, stride_data) \\\n\ +__kernel void cumsum_ex_rev_array_##in_name##to##out_name##_axis0( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, int exclusive, int rev \\\n\ @@ -6741,10 +7440,25 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis0( \\\n\ int4 sum0 = (int4)(0), sum1 = (int4)(0); \\\n\ short zp = (short)input_zp; \\\n\ \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, stride_data); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, stride_data); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ if(exclusive == 0 && rev) \\\n\ { \\\n\ for(coord.x = width - 8; coord.x >= 0; coord.x -= 8) \\\n\ { \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \\\n\ VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \\\n\ @@ -6759,16 +7473,24 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis0( \\\n\ int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(exclusive && rev == 0) \\\n\ { \\\n\ for(coord.x = -1; coord.x < width - 8;) \\\n\ { \\\n\ - 
VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src = in_ptr[0]; \\\n\ coord_out.x = coord.x + 1; \\\n\ coord.x += 8; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \\\n\ VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \\\n\ VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \\\n\ @@ -6782,14 +7504,22 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis0( \\\n\ int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(exclusive && rev) \\\n\ { \\\n\ for(coord.x = width - 7; coord.x > 0;) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ coord_out.x = coord.x - 1; \\\n\ coord.x -= 8; \\\n\ VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \\\n\ @@ -6805,16 +7535,16 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis0( \\\n\ int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ }\n\ -CUMSUM_QINT_EX_REV_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16)\n\ -CUMSUM_QINT_EX_REV_AXIS0(I8, I8, vxc_char16, vxc_char16)\n\ -CUMSUM_QINT_EX_REV_AXIS0(I16, I16, vxc_short8, vxc_short8)\n\ -"; /* end of cumsum_ex_rev_axis0_vx*/ +CUMSUM_QINT_EX_REV_ARRAY_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16, 1)\n\ +CUMSUM_QINT_EX_REV_ARRAY_AXIS0(I8, I8, vxc_char16, vxc_char16, 1)\n\ +CUMSUM_QINT_EX_REV_ARRAY_AXIS0(I16, I16, vxc_short8, vxc_short8, 2)\n\ +"; /* end of cumsum_array_ex_rev_axis0_vx*/ -static const char cumsum_ex_rev_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char cumsum_array_ex_rev_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ _viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ @@ -6830,7 +7560,11 @@ _viv_uniform float in_out_scale;\n\ _viv_uniform float in_out_zp_scale;\n\ _viv_uniform float output_zp;\n\ \n\ -__kernel void cumsum_ex_rev_F16toF16_axis1(\n\ +_viv_uniform int remainder;\n\ +_viv_uniform int w_size;\n\ +\n\ +\n\ +__kernel void cumsum_ex_rev_array_F16toF16_axis1(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ int axis, int exclusive, int rev)\n\ @@ 
-6840,54 +7574,80 @@ __kernel void cumsum_ex_rev_F16toF16_axis1(\n\ vxc_short8 src, dst;\n\ vxc_half8 data, sum;\n\ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ if(exclusive == 0 && rev)\n\ {\n\ for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ _viv_asm(COPY, data, src, 16);\n\ \n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive && rev == 0)\n\ {\n\ dst ^= dst;\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ for(; coord.y < height - 1;)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + src = in_ptr[0];\n\ coord.y++;\n\ _viv_asm(COPY, data, src, 16);\n\ +\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ \n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive && rev)\n\ {\n\ dst ^= dst;\n\ coord.y = height - 1;\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + out_ptr[0] = dst;\n\ \n\ for(; coord.y > 0;)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + src = in_ptr[0];\n\ coord.y--;\n\ _viv_asm(COPY, data, src, 16);\n\ +\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ \n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ }\n\ \n\ -#define CUMSUM_8BITS_EX_REV_AXIS1(in_name, out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \\\n\ +#define CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void 
cumsum_ex_rev_array_##in_name##to##out_name##_axis1( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, int exclusive, int rev) \\\n\ @@ -6898,11 +7658,25 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \\\n\ dst_type dst; \\\n\ int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ \\\n\ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (16 - remainder); \\\n\ + } \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 1); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 1); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ if(exclusive == 0 && rev) \\\n\ { \\\n\ for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ @@ -6920,7 +7694,7 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \\\n\ uniConvertInt32toUint8_2x8); \\\n\ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(exclusive && rev == 0) \\\n\ @@ -6929,11 +7703,15 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \\\n\ int4 tmpVal; \\\n\ tmpVal.x = tmpAlpha0; \\\n\ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ - VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \\\n\ for(; coord.y < height - 1;) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src = in_ptr[0]; \\\n\ coord.y++; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ @@ -6951,7 +7729,7 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \\\n\ uniConvertInt32toUint8_2x8);\\\n\ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\ uniConvertInt32toUint8_2x8);\\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 
0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(exclusive && rev) \\\n\ @@ -6961,16 +7739,22 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \\\n\ int4 tmpVal; \\\n\ tmpVal.x = tmpAlpha0; \\\n\ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ - VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global vxc_short8*)output_ptr; \\\n\ + out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \\\n\ for(; coord.y > 0;) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src = in_ptr[0]; \\\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \\\n\ coord.y--; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ @@ -6983,14 +7767,14 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \\\n\ uniConvertInt32toUint8_2x8);\\\n\ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\ uniConvertInt32toUint8_2x8);\\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ }\n\ -CUMSUM_8BITS_EX_REV_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)\n\ -CUMSUM_8BITS_EX_REV_AXIS1(I8, I8, vxc_char16, vxc_char16)\n\ +CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_EX_REV_ARRAY_AXIS1(I8, I8, vxc_char16, vxc_char16)\n\ \n\ -__kernel void cumsum_ex_rev_I16toI16_axis1(\n\ +__kernel void cumsum_ex_rev_array_I16toI16_axis1(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ int axis, int exclusive, int rev)\n\ @@ -6999,11 +7783,25 @@ __kernel void cumsum_ex_rev_I16toI16_axis1(\n\ \n\ vxc_short8 src, dst;\n\ int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ if(exclusive == 0 && rev)\n\ {\n\ for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + in_ptr 
= (__global vxc_short8*)input_ptr;\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;\n\ @@ -7013,8 +7811,7 @@ __kernel void cumsum_ex_rev_I16toI16_axis1(\n\ int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ uniConvertInt32toUint8_2x8);\n\ -\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive && rev == 0)\n\ @@ -7023,12 +7820,15 @@ __kernel void cumsum_ex_rev_I16toI16_axis1(\n\ int4 tmpVal;\n\ tmpVal.x = tmpAlpha0;\n\ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ + out_ptr[0] = dst.xxxxxxxx;\n\ for(; coord.y < height - 1;)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + src = in_ptr[0];\n\ coord.y++;\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp;\n\ @@ -7039,7 +7839,7 @@ __kernel void cumsum_ex_rev_I16toI16_axis1(\n\ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ uniConvertInt32toUint8_2x8);\n\ \n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive && rev)\n\ @@ -7049,15 +7849,20 @@ __kernel void cumsum_ex_rev_I16toI16_axis1(\n\ int4 tmpVal;\n\ tmpVal.x = tmpAlpha0;\n\ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ -\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + out_ptr[0] = dst.xxxxxxxx;\n\ for(; coord.y > 0;)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + src = in_ptr[0];\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;\n\ coord.y--;\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ @@ -7065,13 +7870,13 @@ __kernel void cumsum_ex_rev_I16toI16_axis1(\n\ VXC_DP2x8(dst, tmpDst0, 
tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ uniConvertInt32toUint8_2x8);\n\ \n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ }\n\ -"; /* end of cumsum_ex_rev_axis1_vx*/ +"; /* end of cumsum_array_ex_rev_axis1_vx*/ -static const char cumsum_ex_rev_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char cumsum_array_ex_rev_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ _viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ @@ -7087,7 +7892,11 @@ _viv_uniform float in_out_scale;\n\ _viv_uniform float in_out_zp_scale;\n\ _viv_uniform float output_zp;\n\ \n\ -__kernel void cumsum_ex_rev_F16toF16_axis2(\n\ +_viv_uniform int remainder;\n\ +_viv_uniform int w_size;\n\ +\n\ +\n\ +__kernel void cumsum_ex_rev_array_F16toF16_axis2(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ int axis, int exclusive, int rev)\n\ @@ -7097,53 +7906,76 @@ __kernel void cumsum_ex_rev_F16toF16_axis2(\n\ vxc_short8 src, dst;\n\ vxc_half8 data, sum;\n\ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ if(rev && exclusive == 0)\n\ {\n\ for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ _viv_asm(COPY, data, src, 16);\n\ \n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(rev == 0 && exclusive)\n\ {\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ for(; coord.z < channel - 1;)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + src = in_ptr[0];\n\ coord.z++;\n\ _viv_asm(COPY, data, src, 16);\n\ \n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(rev && exclusive)\n\ {\n\ _viv_asm(COPY, dst, sum, 16);\n\ coord.z = channel - 1;\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + out_ptr[0] = dst;\n\ for(; coord.z > 0;)\n\ {\n\ - 
VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + src = in_ptr[0];\n\ coord.z--;\n\ _viv_asm(COPY, data, src, 16);\n\ +\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ \n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ _viv_asm(COPY, dst, sum, 16);\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ }\n\ \n\ -#define CUMSUM_8BITS_EX_REV_AXIS2(in_name, out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \\\n\ +#define CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_array_##in_name##to##out_name##_axis2( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, int exclusive, int rev) \\\n\ @@ -7154,11 +7986,25 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \\\n\ dst_type dst; \\\n\ int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ \\\n\ + if (coord.x == ((w_size >> 4) * 16) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (16 - remainder); \\\n\ + } \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 1); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 1); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ if(rev && exclusive == 0) \\\n\ { \\\n\ for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ @@ -7176,7 +8022,7 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \\\n\ uniConvertInt32toUint8_2x8);\\\n\ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \\\n\ uniConvertInt32toUint8_2x8);\\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(exclusive && rev == 0) \\\n\ @@ -7185,10 +8031,12 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \\\n\ int4 tmpVal; \\\n\ tmpVal.x = tmpAlpha0; \\\n\ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \\\n\ for(; coord.z < channel - 1;) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + 
input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src = in_ptr[0]; \\\n\ coord.z++; \\\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ @@ -7207,7 +8055,7 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \\\n\ uniConvertInt32toUint8_2x8); \\\n\ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \\\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(rev && exclusive) \\\n\ @@ -7217,16 +8065,22 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \\\n\ int4 tmpVal; \\\n\ tmpVal.x = tmpAlpha0; \\\n\ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ - VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global vxc_short8*)output_ptr; \\\n\ + out_ptr[0] = dst.xxxxxxxxxxxxxxxx; \\\n\ for(; coord.z > 0;) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src = in_ptr[0]; \\\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \\\n\ coord.z--; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ @@ -7239,14 +8093,14 @@ __kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \\\n\ uniConvertInt32toUint8_2x8); \\\n\ VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1),\n\ uniConvertInt32toUint8_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ }\n\ -CUMSUM_8BITS_EX_REV_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)\n\ -CUMSUM_8BITS_EX_REV_AXIS2(I8, I8, vxc_char16, vxc_char16)\n\ +CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_EX_REV_ARRAY_AXIS2(I8, I8, vxc_char16, vxc_char16)\n\ \n\ -__kernel void cumsum_ex_rev_I16toI16_axis2(\n\ +__kernel void cumsum_ex_rev_array_I16toI16_axis2(\n\ __read_only image2d_array_t input,\n\ __write_only image2d_array_t output,\n\ int axis, int exclusive, int rev)\n\ @@ -7255,11 +8109,25 @@ __kernel void cumsum_ex_rev_I16toI16_axis2(\n\ \n\ vxc_short8 src, dst;\n\ int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0)\n\ + {\n\ + coord.x = coord.x - (8 - remainder);\n\ + }\n\ + Tensor img1 = 
create_tensor_from_image2d_array(input, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr;\n\ + __global vxc_short8* out_ptr = (__global vxc_short8*)output_ptr;\n\ if(exclusive == 0 && rev)\n\ {\n\ for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + src = in_ptr[0];\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;\n\ @@ -7279,10 +8147,12 @@ __kernel void cumsum_ex_rev_I16toI16_axis2(\n\ int4 tmpVal;\n\ tmpVal.x = tmpAlpha0;\n\ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst.xxxxxxxx;\n\ for(; coord.z < channel - 1;)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + src = in_ptr[0];\n\ coord.z++;\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ @@ -7294,7 +8164,7 @@ __kernel void cumsum_ex_rev_I16toI16_axis2(\n\ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),\n\ uniConvertInt32toUint8_2x8);\n\ \n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive && rev)\n\ @@ -7304,10 +8174,14 @@ __kernel void cumsum_ex_rev_I16toI16_axis2(\n\ int4 tmpVal;\n\ tmpVal.x = tmpAlpha0;\n\ VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ - VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global vxc_short8*)output_ptr;\n\ + out_ptr[0] = dst.xxxxxxxx;\n\ for(; coord.z > 0;)\n\ {\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global vxc_short8*)input_ptr;\n\ + src = in_ptr[0];\n\ VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;\n\ @@ -7319,13 +8193,14 @@ __kernel void cumsum_ex_rev_I16toI16_axis2(\n\ VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),\n\ uniConvertInt32toUint8_2x8);\n\ \n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + out_ptr[0] = 
dst;\n\ }\n\ }\n\ }\n\ -"; /* end of cumsum_ex_rev_axis2_vx*/ +\n\ +"; /* end of cumsum_array_ex_rev_axis2_vx*/ -static const char cumsum_f16_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char cumsum_array_f16_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ _viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ _viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ @@ -7342,8 +8217,12 @@ _viv_uniform int channel;\n\ _viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ _viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ \n\ -#define CUMSUM_F16TOQINT_AXIS2(out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_F16to##out_name##_axis2( \\\n\ +_viv_uniform int remainder;\n\ +_viv_uniform int w_size;\n\ +\n\ +\n\ +#define CUMSUM_ARRAY_F16TOQINT_AXIS2(out_name, src_type, dst_type, stride_out) \\\n\ +__kernel void cumsum_array_F16to##out_name##_axis2( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, int exclusive, int rev \\\n\ @@ -7357,24 +8236,34 @@ __kernel void cumsum_F16to##out_name##_axis2( \\\n\ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ vxc_ushort8 ms0; \\\n\ _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ _viv_asm(COPY, data, src, 16); \\\n\ \\\n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ }\n\ -CUMSUM_F16TOQINT_AXIS2(I8, vxc_half8, vxc_char16)\n\ -CUMSUM_F16TOQINT_AXIS2(I16, vxc_half8, vxc_short8)\n\ -CUMSUM_F16TOQINT_AXIS2(U8, vxc_half8, vxc_uchar16)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS2(I8, vxc_half8, vxc_char16, 1)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS2(I16, vxc_half8, vxc_short8, 2)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS2(U8, vxc_half8, vxc_uchar16, 1)\n\ \n\ \n\ -#define CUMSUM_F16TOQINT_AXIS1(out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_F16to##out_name##_axis1( \\\n\ +#define CUMSUM_ARRAY_F16TOQINT_AXIS1(out_name, src_type, dst_type, stride_out) \\\n\ +__kernel void cumsum_array_F16to##out_name##_axis1( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, int exclusive, int rev \\\n\ @@ -7388,23 +8277,33 @@ __kernel void cumsum_F16to##out_name##_axis1( \\\n\ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ vxc_ushort8 ms0; \\\n\ _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \\\n\ + if (coord.x == ((w_size 
>> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ for(coord.y = 0; coord.y < height; coord.y++) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ _viv_asm(COPY, data, src, 16); \\\n\ \\\n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ }\n\ -CUMSUM_F16TOQINT_AXIS1(I8, vxc_half8, vxc_char16)\n\ -CUMSUM_F16TOQINT_AXIS1(I16, vxc_half8, vxc_short8)\n\ -CUMSUM_F16TOQINT_AXIS1(U8, vxc_half8, vxc_uchar16)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS1(I8, vxc_half8, vxc_char16, 1)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS1(I16, vxc_half8, vxc_short8, 2)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS1(U8, vxc_half8, vxc_uchar16, 1)\n\ \n\ -#define CUMSUM_F16TOQINT_AXIS0(out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_F16to##out_name##_axis0( \\\n\ +#define CUMSUM_ARRAY_F16TOQINT_AXIS0(out_name, src_type, dst_type, stride_out) \\\n\ +__kernel void cumsum_array_F16to##out_name##_axis0( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, int exclusive, int rev \\\n\ @@ -7418,9 +8317,19 @@ __kernel void cumsum_F16to##out_name##_axis0( \\\n\ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ vxc_ushort8 ms0; \\\n\ _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \\\n\ for(; coord.x < width; coord.x += 8) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ _viv_asm(COPY, data, src, 16); \\\n\ \\\n\ VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); \\\n\ @@ -7429,83 +8338,15 @@ __kernel void cumsum_F16to##out_name##_axis0( \\\n\ VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - } \\\n\ -}\n\ -CUMSUM_F16TOQINT_AXIS0(I8, vxc_half8, vxc_char16)\n\ -CUMSUM_F16TOQINT_AXIS0(I16, vxc_half8, vxc_short8)\n\ -CUMSUM_F16TOQINT_AXIS0(U8, vxc_half8, vxc_uchar16)\n\ -\n\ -#define CUMSUM_F16TOQINT_AXIS1_2D(out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_F16to##out_name##_axis1_2D( \\\n\ - __read_only image2d_t input, 
\\\n\ - __write_only image2d_t output, \\\n\ - int axis, int exclusive, int rev \\\n\ - ) \\\n\ -{ \\\n\ - int2 coord = (int2)(get_global_id(0), 0); \\\n\ - \\\n\ - vxc_short8 src; \\\n\ - dst_type dst; \\\n\ - vxc_half8 data, sum; \\\n\ - VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ - vxc_ushort8 ms0; \\\n\ - _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ - for(; coord.y < height; coord.y++) \\\n\ - { \\\n\ - VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - _viv_asm(COPY, data, src, 16); \\\n\ - \\\n\ - VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniAccSumVertF16toF16_2x8); \\\n\ - VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - } \\\n\ -}\n\ -CUMSUM_F16TOQINT_AXIS1_2D(I8, vxc_half8, vxc_char16)\n\ -CUMSUM_F16TOQINT_AXIS1_2D(I16, vxc_half8, vxc_short8)\n\ -CUMSUM_F16TOQINT_AXIS1_2D(U8, vxc_half8, vxc_uchar16)\n\ -\n\ -#define CUMSUM_F16TOQINT_AXIS0_2D(out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_F16to##out_name##_axis0_2D( \\\n\ - __read_only image2d_t input, \\\n\ - __write_only image2d_t output, \\\n\ - int axis, int exclusive, int rev \\\n\ - ) \\\n\ -{ \\\n\ - int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ - \\\n\ - vxc_short8 src; \\\n\ - dst_type dst; \\\n\ - vxc_half8 data, tmpsum, sum; \\\n\ - VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ - vxc_ushort8 ms0; \\\n\ - _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ - for(; coord.x < width; coord.x += 8) \\\n\ - { \\\n\ - VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ - _viv_asm(COPY, data, src, 16); \\\n\ - \\\n\ - VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ - uniSumHorzF16toF16A_4x4); \\\n\ - VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniSumHorzF16toF16B_4x4); \\\n\ - VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniSumHorzF16toF16C_2x8); \\\n\ - VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ - uniAccSumHorzF16toF16_2x8); \\\n\ - VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ - uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ }\n\ -CUMSUM_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16)\n\ -CUMSUM_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8)\n\ -CUMSUM_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS0(I8, vxc_half8, vxc_char16, 1)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS0(I16, vxc_half8, vxc_short8, 2)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS0(U8, vxc_half8, vxc_uchar16, 1)\n\ \n\ -#define CUMSUM_F16TOQINT_EX_REV_AXIS2(out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_ex_rev_F16to##out_name##_axis2( \\\n\ +#define CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(out_name, src_type, dst_type, stride_out) \\\n\ +__kernel void cumsum_array_ex_rev_F16to##out_name##_axis2( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, int exclusive, int rev \\\n\ @@ -7519,33 +8360,51 @@ __kernel void cumsum_ex_rev_F16to##out_name##_axis2( \\\n\ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), 
uniSetZeroF16_2x8); \\\n\ vxc_ushort8 ms0; \\\n\ _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ if(exclusive == 0 && rev) \\\n\ { \\\n\ for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ _viv_asm(COPY, data, src, 16); \\\n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(exclusive && rev == 0) \\\n\ { \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ for(; coord.z < channel - 1;) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + src = in_ptr[0]; \\\n\ coord.z++; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ _viv_asm(COPY, data, src, 16); \\\n\ \\\n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(exclusive && rev) \\\n\ @@ -7553,26 +8412,32 @@ __kernel void cumsum_ex_rev_F16to##out_name##_axis2( \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ coord.z = channel - 1; \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ for(; coord.z > 0;) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + src = in_ptr[0]; \\\n\ coord.z--; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ _viv_asm(COPY, data, src, 16); \\\n\ \\\n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ }\n\ -CUMSUM_F16TOQINT_EX_REV_AXIS2(I8, vxc_half8, vxc_char16)\n\ -CUMSUM_F16TOQINT_EX_REV_AXIS2(I16, vxc_half8, vxc_short8)\n\ -CUMSUM_F16TOQINT_EX_REV_AXIS2(U8, vxc_half8, vxc_uchar16)\n\ +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(I8, vxc_half8, vxc_char16, 1)\n\ +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(I16, vxc_half8, vxc_short8, 2)\n\ +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS2(U8, vxc_half8, vxc_uchar16, 1)\n\ \n\ -#define CUMSUM_F16TOQINT_EX_REV_AXIS1(out_name, src_type, dst_type) \\\n\ -__kernel void cumsum_ex_rev_F16to##out_name##_axis1( \\\n\ +#define CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(out_name, src_type, dst_type, stride_out) \\\n\ +__kernel void cumsum_array_ex_rev_F16to##out_name##_axis1( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, int exclusive, int rev \\\n\ @@ -7586,32 +8451,50 @@ __kernel void cumsum_ex_rev_F16to##out_name##_axis1( \\\n\ VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ vxc_ushort8 ms0; \\\n\ _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ if(exclusive == 0 && rev) \\\n\ { \\\n\ for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ _viv_asm(COPY, data, src, 16); \\\n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(exclusive && rev == 0) \\\n\ { \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ for(; coord.y < height - 1;) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + src = in_ptr[0]; \\\n\ coord.y++; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ _viv_asm(COPY, data, src, 16); \\\n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, 
VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ else if(exclusive && rev) \\\n\ @@ -7619,191 +8502,1512 @@ __kernel void cumsum_ex_rev_F16to##out_name##_axis1( \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ coord.y = height - 1; \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ for(; coord.y > 0;) \\\n\ { \\\n\ - VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + src = in_ptr[0]; \\\n\ coord.y--; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global dst_type*)output_ptr; \\\n\ _viv_asm(COPY, data, src, 16); \\\n\ VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ uniU8MulAndPostShift_0_Lo_2x8); \\\n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + out_ptr[0] = dst; \\\n\ } \\\n\ } \\\n\ }\n\ -CUMSUM_F16TOQINT_EX_REV_AXIS1(I8, vxc_half8, vxc_char16)\n\ -CUMSUM_F16TOQINT_EX_REV_AXIS1(I16, vxc_half8, vxc_short8)\n\ -CUMSUM_F16TOQINT_EX_REV_AXIS1(U8, vxc_half8, vxc_uchar16)\n\ -"; /* end of cumsum_f16_u8_vx*/ +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(I8, vxc_half8, vxc_char16, 1)\n\ +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(I16, vxc_half8, vxc_short8, 2)\n\ +CUMSUM_ARRAY_F16TOQINT_EX_REV_AXIS1(U8, vxc_half8, vxc_uchar16, 1)"; /* end of cumsum_array_f16_u8_vx*/ -static const char custom_softmax_vx[] = "/*\n\ - ============================================================================\n\ - Name : Softmax2.vx\n\ - Author : VSI\n\ - Version :\n\ - Copyright : Your copyright notice\n\ - Description :\n\ - ============================================================================\n\ - */\n\ -#include \"cl_viv_vx_ext.h\"\n\ +static const char cumsum_array_f16_u8_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -_viv_uniform VXC_512Bits Uni4x4_Fp16ToFp32;\n\ -_viv_uniform int sf_size;\n\ - #define F_MAX(a,b) ((a)>(b)?(a):(b))\n\ -__kernel void Softmax2VXC\n\ - (\n\ - image2d_array_t input,\n\ - image2d_array_t output,\n\ - int axis\n\ - )\n\ -{\n\ - int4 coord_in = (int4)(0,0,0,0);\n\ - float fMax = 0.0;\n\ - for (int i = 0; i < sf_size; i++)\n\ - {\n\ - vxc_char8 val;\n\ - coord_in.x = i;\n\ - VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ - float fval;\n\ - VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;\n\ \n\ - fMax = F_MAX(fMax, fval);\n\ - }\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ \n\ - float fProbSum = 0.0f;\n\ - vxc_short8 dst;\n\ - for (int i = 0; 
i < sf_size; i++)\n\ - {\n\ - vxc_char8 val;\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int channel;\n\ \n\ - coord_in.x = i;\n\ - VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ - float fval;\n\ - VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ \n\ - float fOut = (float)exp(fval - fMax);\n\ - fProbSum += fOut;\n\ - half hVal;\n\ - _viv_asm(CONV,hVal,fOut);\n\ - _viv_asm(COPY,dst,hVal, 4);\n\ - VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ +_viv_uniform int remainder;\n\ +_viv_uniform int w_size;\n\ \n\ - for (int i = 0; i < sf_size; i++)\n\ - {\n\ - vxc_short8 val;\n\ - vxc_half8 val_h;\n\ - coord_in.x = i;\n\ - VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ - float fval;\n\ - _viv_asm(COPY, val_h,val, 16);\n\ - VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ \n\ - float fOut =fval/fProbSum;\n\ - half hVal;\n\ - _viv_asm(CONV,hVal,fOut);\n\ - _viv_asm(COPY,dst,hVal, 4);\n\ - VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - }\n\ +#define CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(out_name, src_type, dst_type, stride_out) \\\n\ +__kernel void cumsum_array_F16to##out_name##_axis1_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + for(; coord.y < height; coord.y++) \\\n\ + { \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ }\n\ -"; /* end of custom_softmax_vx*/ +CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(I8, vxc_half8, vxc_char16, 1)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(I16, vxc_half8, vxc_short8, 2)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS1_2D(U8, vxc_half8, vxc_uchar16, 1)\n\ +\n\ +#define CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(out_name, src_type, dst_type, stride_out) \\\n\ +__kernel void cumsum_array_F16to##out_name##_axis0_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + vxc_short8 src; 
\\\n\ + dst_type dst; \\\n\ + vxc_half8 data, tmpsum, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 2); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, stride_out); \\\n\ + for(; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + if (coord.x == ((w_size >> 3) * 8) && remainder != 0) \\\n\ + { \\\n\ + coord.x = coord.x - (8 - remainder); \\\n\ + } \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global vxc_short8* in_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global dst_type* out_ptr = (__global dst_type*)output_ptr; \\\n\ + src = in_ptr[0]; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16A_4x4); \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16B_4x4); \\\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16C_2x8); \\\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumHorzF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ +}\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16, 1)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8, 2)\n\ +CUMSUM_ARRAY_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16, 1)\n\ +"; /* end of cumsum_array_f16_u8_2d_vx*/ -static const char custom_warp_affine_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +static const char cumsum_bf16_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -#include \"cl_viv_vx_ext.h\"\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part0_2x8;\n\ +_viv_uniform VXC_512Bits uniConvBF16toF32_Part1_2x8;\n\ +_viv_uniform VXC_512Bits uniExtractOddData_2x8;\n\ \n\ -_viv_uniform float4 matrix0;\n\ -_viv_uniform float2 matrix1;\n\ -_viv_uniform float4 matrix4;\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int channel;\n\ \n\ -__kernel void custom_warp_affine_nearest_neighbor_U8toU8\n\ -(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ - float _m0,\n\ - float _m1,\n\ - float _m2,\n\ - float _m3,\n\ - float _m4,\n\ - float _m5\n\ -)\n\ +__kernel void cumsum_BF16toBF16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ {\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ -\n\ - float4 coord_f = convert_float4(coord_in);\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ \n\ - coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float4 sum0 = (float4)(0), sum1 = (float4)(0);\n\ \n\ - coord_in = convert_int4(coord_f);\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage2DArray(src, input, 
coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ \n\ - int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_input.w, baseAddr);\n\ + sum0 += data0;\n\ + sum1 += data1;\n\ + _viv_asm(COPY, dst0, sum0, 16);\n\ + _viv_asm(COPY, dst1, sum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ \n\ - vxc_uchar16 dst;\n\ - VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_input.xy = coord_in.zw;\n\ - VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_f = coord_f.zwzw + matrix4;\n\ - coord_in = convert_int4(coord_f);\n\ - coord_input.xy = coord_in.xy;\n\ - VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_input.xy = coord_in.zw;\n\ - VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ - coord_f = coord_f.zwzw + matrix4;\n\ - coord_in = convert_int4(coord_f);\n\ - coord_input.xy = coord_in.xy;\n\ - VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ - coord_input.xy = coord_in.zw;\n\ - VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ - coord_f = coord_f.zwzw + matrix4;\n\ - coord_in = convert_int4(coord_f);\n\ - coord_input.xy = coord_in.xy;\n\ - VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ - coord_input.xy = coord_in.zw;\n\ - VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +__kernel void cumsum_BF16toBF16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ \n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float4 sum0 = (float4)(0), sum1 = (float4)(0);\n\ \n\ - VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ + sum0 += data0;\n\ + sum1 += data1;\n\ + _viv_asm(COPY, dst0, sum0, 16);\n\ + _viv_asm(COPY, dst1, sum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, 
coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ }\n\ \n\ -__kernel void custom_warp_affine_bilinear_U8toU8\n\ -(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ - float _m0,\n\ - float _m1,\n\ - float _m2,\n\ - float _m3,\n\ - float _m4,\n\ - float _m5\n\ -)\n\ +__kernel void cumsum_BF16toBF16_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ {\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ - int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ \n\ - float4 coord_f = convert_float4(coord_in);\n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float preSum = 0;\n\ + float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ + float4 q = (float4)(1.0, 1.0, 1.0, 0);\n\ \n\ - coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ + for(; coord.x < width; coord.x += 8)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ \n\ - coord_in = convert_int4(coord_f);\n\ + float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one));\n\ + float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one));\n\ + tmpSum1 += tmpSum0.w;\n\ \n\ - int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ - int8 input_desc;\n\ - _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ - int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ - _viv_asm(MOV, coord_input.w, baseAddr);\n\ + tmpSum0 += preSum;\n\ + tmpSum1 += preSum;\n\ \n\ - vxc_uchar16 src0, src1, dst;\n\ - VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ - VXC_OP4(img_load_3d, src1, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 1),\n\ - VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ -#if (VX_VERSION==1)\n\ + preSum = tmpSum1.w;\n\ +\n\ + _viv_asm(COPY, dst0, tmpSum0, 16);\n\ + _viv_asm(COPY, dst1, tmpSum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniExtractOddData_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_BF16toBF16_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), 0);\n\ +\n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float4 sum0 = (float4)(0), sum1 = (float4)(0);\n\ +\n\ + for(; coord.y < height; coord.y++)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + 
VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ +\n\ + sum0 += data0;\n\ + sum1 += data1;\n\ +\n\ + _viv_asm(COPY, dst0, sum0, 16);\n\ + _viv_asm(COPY, dst1, sum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_BF16toBF16_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1));\n\ +\n\ + vxc_ushort8 src, val0, val1;\n\ + vxc_ushort8 dst0, dst1, dst;\n\ + vxc_ushort8 zero = (vxc_ushort8)(0, 0, 0, 0, 0, 0, 0, 0);\n\ + float preSum = 0;\n\ + float4 one = (float4)(1.0, 1.0, 1.0, 1.0);\n\ + float4 q = (float4)(1.0, 1.0, 1.0, 0);\n\ +\n\ + for(; coord.x < width; coord.x += 8)\n\ + {\n\ + float4 data0, data1;\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP2x8(val0, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part0_2x8);\n\ + VXC_DP2x8(val1, src, zero, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniConvBF16toF32_Part1_2x8);\n\ + _viv_asm(COPY, data0, val0, 16);\n\ + _viv_asm(COPY, data1, val1, 16);\n\ +\n\ + float4 tmpSum0 = (float4)(data0.x, data0.x + data0.y, dot(data0, q), dot(data0, one));\n\ + float4 tmpSum1 = (float4)(data1.x, data1.x + data1.y, dot(data1, q), dot(data1, one));\n\ + tmpSum1 += tmpSum0.w;\n\ +\n\ + tmpSum0 += preSum;\n\ + tmpSum1 += preSum;\n\ +\n\ + preSum = tmpSum1.w;\n\ +\n\ + _viv_asm(COPY, dst0, tmpSum0, 16);\n\ + _viv_asm(COPY, dst1, tmpSum1, 16);\n\ + VXC_DP2x8(dst, dst0, dst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniExtractOddData_2x8);\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of cumsum_bf16_vx*/ + +static const char cumsum_ex_rev_axis0_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzU8toI16B_8x4;\n\ +_viv_uniform VXC_512Bits uniSubZpI16toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzI16toI32B_4x4;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzRevF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzRevF16toF16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSumHorzRevU8toI16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzRevU8toI16B_8x4;\n\ +_viv_uniform VXC_512Bits uniSubZpRevI16toI16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzRevI16toI32B_4x4;\n\ +\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int input_zp;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +__kernel void cumsum_ex_rev_F16toF16_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only 
image2d_array_t output,\n\ + int axis, int exclusive, int rev\n\ + )\n\ +{\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, tmpsum, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + if(exclusive == 0 && rev)\n\ + {\n\ + for(coord.x = width - 8; coord.x >= 0; coord.x -= 8)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniSumHorzRevF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev == 0)\n\ + {\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.x < width - 8;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x = coord.x + 1;\n\ + coord.x += 8;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev)\n\ + {\n\ + coord.x = width - 8;\n\ + coord_out.x = width - 1;\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.x > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord_out.x = coord.x - 1;\n\ + coord.x -= 8;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16A_4x4);\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevF16toF16B_4x4);\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\n\ + uniSumHorzRevF16toF16C_2x8);\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzRevF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_QINT_EX_REV_AXIS0(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis0( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(0, get_global_id(1), get_global_id(2), 0); \\\n\ + int4 
coord_out = coord; \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + vxc_short8 rowSum; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0); \\\n\ + short zp = (short)input_zp; \\\n\ + \\\n\ + if(exclusive == 0 && rev) \\\n\ + { \\\n\ + for(coord.x = width - 8; coord.x >= 0; coord.x -= 8) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzRevI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzRevI16toI32B_4x4); \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + for(coord.x = -1; coord.x < width - 8;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_out.x = coord.x + 1; \\\n\ + coord.x += 8; \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzI16toI32B_4x4); \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev) \\\n\ + { \\\n\ + for(coord.x = width - 7; coord.x > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord_out.x = coord.x - 1; \\\n\ + coord.x -= 8; \\\n\ + VXC_DP4x4(rowSum, src, src, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16A_4x4); \\\n\ + VXC_DP8x4(rowSum, src, src, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzRevU8toI16B_8x4); \\\n\ + VXC_DP2x8(rowSum, rowSum, zp, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSubZpRevI16toI16_2x8); \\\n\ + VXC_DP4x4(sum0, rowSum, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzRevI16toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, rowSum, sum1, VXC_MODIFIER(0, 3, 0, 
VXC_RM_TowardZero, 0), \\\n\ + uniAccSumHorzRevI16toI32B_4x4); \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + output_zp; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + output_zp; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + VXC_DP2x8(dst, tmpDst1, tmpDst0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord_out, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_QINT_EX_REV_AXIS0(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_QINT_EX_REV_AXIS0(I8, I8, vxc_char16, vxc_char16)\n\ +CUMSUM_QINT_EX_REV_AXIS0(I16, I16, vxc_short8, vxc_short8)\n\ +"; /* end of cumsum_ex_rev_axis0_vx*/ + +static const char cumsum_ex_rev_axis1_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int height;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float in_out_zp_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +__kernel void cumsum_ex_rev_F16toF16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + if(exclusive == 0 && rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev == 0)\n\ + {\n\ + dst ^= dst;\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.y < height - 1;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev)\n\ + {\n\ + dst ^= dst;\n\ + coord.y = height - 1;\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + for(; coord.y > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y--;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_EX_REV_AXIS1(in_name, 
out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ + \\\n\ + if(exclusive == 0 && rev) \\\n\ + { \\\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + int tmpAlpha0 = convert_int_rte(output_zp); \\\n\ + int4 tmpVal; \\\n\ + tmpVal.x = tmpAlpha0; \\\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.y < height - 1;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 
= convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev) \\\n\ + { \\\n\ + coord.y = height - 1; \\\n\ + int tmpAlpha0 = convert_int_rte(output_zp); \\\n\ + int4 tmpVal; \\\n\ + tmpVal.x = tmpAlpha0; \\\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.y > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp; \\\n\ + coord.y--; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15,0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_8BITS_EX_REV_AXIS1(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_EX_REV_AXIS1(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void cumsum_ex_rev_I16toI16_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ + if(exclusive == 0 && rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, 
VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev == 0)\n\ + {\n\ + int tmpAlpha0 = convert_int_rte(output_zp);\n\ + int4 tmpVal;\n\ + tmpVal.x = tmpAlpha0;\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + for(; coord.y < height - 1;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.y++;\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(coord.y) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev)\n\ + {\n\ + coord.y = height - 1;\n\ + int tmpAlpha0 = convert_int_rte(output_zp);\n\ + int4 tmpVal;\n\ + tmpVal.x = tmpAlpha0;\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + for(; coord.y > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(height - coord.y) * in_out_zp_scale + output_zp;\n\ + coord.y--;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +"; /* end of cumsum_ex_rev_axis1_vx*/ + +static const char cumsum_ex_rev_axis2_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32A_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32B_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32C_4x4;\n\ +_viv_uniform VXC_512Bits uniAccSumVertU8toI32D_4x4;\n\ +_viv_uniform VXC_512Bits uniConvertInt32toUint8_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int channel;\n\ +_viv_uniform float in_out_scale;\n\ +_viv_uniform float in_out_zp_scale;\n\ +_viv_uniform float output_zp;\n\ +\n\ +__kernel void cumsum_ex_rev_F16toF16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, 
int rev)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + vxc_half8 data, sum;\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8);\n\ + if(rev && exclusive == 0)\n\ + {\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(rev == 0 && exclusive)\n\ + {\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.z < channel - 1;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(rev && exclusive)\n\ + {\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + coord.z = channel - 1;\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.z > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.z--;\n\ + _viv_asm(COPY, data, src, 16);\n\ +\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8);\n\ + _viv_asm(COPY, dst, sum, 16);\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_8BITS_EX_REV_AXIS2(in_name, out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_##in_name##to##out_name##_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + \\\n\ + src_type src; \\\n\ + dst_type dst; \\\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0), sum2 = (int4)(0), sum3 = (int4)(0); \\\n\ + \\\n\ + if(rev && exclusive == 0) \\\n\ + { \\\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 
tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \\\n\ + uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + int tmpAlpha0 = convert_int_rte(output_zp); \\\n\ + int4 tmpVal; \\\n\ + tmpVal.x = tmpAlpha0; \\\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.z < channel - 1;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z++; \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(rev && exclusive) \\\n\ + { \\\n\ + coord.z = channel - 1; \\\n\ + int tmpAlpha0 = convert_int_rte(output_zp); \\\n\ + int4 tmpVal; \\\n\ + tmpVal.x = tmpAlpha0; \\\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\\\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxxxxxxxxxx, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.z > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4); \\\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4); \\\n\ + VXC_DP4x4(sum2, src, sum2, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32C_4x4); \\\n\ + VXC_DP4x4(sum3, src, sum3, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32D_4x4); \\\n\ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp; \\\n\ + coord.z--; \\\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + 
tmpAlpha; \\\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum2 = convert_float4(sum2) * in_out_scale + tmpAlpha; \\\n\ + float4 tmpSum3 = convert_float4(sum3) * in_out_scale + tmpAlpha; \\\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0); \\\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1); \\\n\ + int4 tmpDst2 = convert_int4_rte(tmpSum2); \\\n\ + int4 tmpDst3 = convert_int4_rte(tmpSum3); \\\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_DP2x8(dst, tmpDst2, tmpDst3, VXC_MODIFIER(8, 15, 0, VXC_RM_TowardZero,1), \\\n\ + uniConvertInt32toUint8_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 15, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_8BITS_EX_REV_AXIS2(U8, U8, vxc_uchar16, vxc_uchar16)\n\ +CUMSUM_8BITS_EX_REV_AXIS2(I8, I8, vxc_char16, vxc_char16)\n\ +\n\ +__kernel void cumsum_ex_rev_I16toI16_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis, int exclusive, int rev)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + int4 sum0 = (int4)(0), sum1 = (int4)(0);\n\ + if(exclusive == 0 && rev)\n\ + {\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev == 0)\n\ + {\n\ + int tmpAlpha0 = convert_int_rte(output_zp);\n\ + int4 tmpVal;\n\ + tmpVal.x = tmpAlpha0;\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.z < channel - 1;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + coord.z++;\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(coord.z) * in_out_zp_scale + output_zp;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ + else if(exclusive && rev)\n\ + {\n\ + coord.z = channel - 1;\n\ + int tmpAlpha0 = 
convert_int_rte(output_zp);\n\ + int4 tmpVal;\n\ + tmpVal.x = tmpAlpha0;\n\ + VXC_DP2x8(dst, tmpVal, tmpVal, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1), uniConvertInt32toUint8_2x8);\n\ + VXC_WriteImage2DArray(output, coord, dst.xxxxxxxx, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + for(; coord.z > 0;)\n\ + {\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + VXC_DP4x4(sum0, src, sum0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32A_4x4);\n\ + VXC_DP4x4(sum1, src, sum1, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniAccSumVertU8toI32B_4x4);\n\ + float tmpAlpha = convert_float(channel - coord.z) * in_out_zp_scale + output_zp;\n\ + coord.z--;\n\ + float4 tmpSum0 = convert_float4(sum0) * in_out_scale + tmpAlpha;\n\ + float4 tmpSum1 = convert_float4(sum1) * in_out_scale + tmpAlpha;\n\ + int4 tmpDst0 = convert_int4_rte(tmpSum0);\n\ + int4 tmpDst1 = convert_int4_rte(tmpSum1);\n\ + VXC_DP2x8(dst, tmpDst0, tmpDst1, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero,1),\n\ + uniConvertInt32toUint8_2x8);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ + }\n\ +}\n\ +"; /* end of cumsum_ex_rev_axis2_vx*/ + +static const char cumsum_f16_u8_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits uniAccSumVertF16toF16_2x8;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16A_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16B_4x4;\n\ +_viv_uniform VXC_512Bits uniSumHorzF16toF16C_2x8;\n\ +_viv_uniform VXC_512Bits uniAccSumHorzF16toF16_2x8;\n\ +\n\ +_viv_uniform VXC_512Bits uniSetZeroF16_2x8;\n\ +\n\ +_viv_uniform int width;\n\ +_viv_uniform int height;\n\ +_viv_uniform int channel;\n\ +\n\ +_viv_uniform int2 multAndoutZP0;//[0:15] multiplier, [31:63] output zp\n\ +_viv_uniform VXC_512Bits uniU8MulAndPostShift_0_Lo_2x8;\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS2(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS2(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS2(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS2(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS1(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 
0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS1(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS1(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS1(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS0(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis0( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, tmpsum, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16A_4x4); \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16B_4x4); \\\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSumHorzF16toF16C_2x8); \\\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumHorzF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS0(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS0(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS0(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS1_2D(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis1_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(; coord.y < height; coord.y++) \\\n\ + { \\\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 
7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS1_2D(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS1_2D(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS1_2D(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_AXIS0_2D(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_F16to##out_name##_axis0_2D( \\\n\ + __read_only image2d_t input, \\\n\ + __write_only image2d_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int2 coord = (int2)(get_global_id(0), get_global_id(1)); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, tmpsum, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + for(; coord.x < width; coord.x += 8) \\\n\ + { \\\n\ + VXC_ReadImage(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16A_4x4); \\\n\ + VXC_DP4x4(tmpsum, data, data, VXC_MODIFIER(4, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16B_4x4); \\\n\ + VXC_DP2x8(tmpsum, tmpsum, tmpsum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniSumHorzF16toF16C_2x8); \\\n\ + VXC_DP2x8(sum, tmpsum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0),\\\n\ + uniAccSumHorzF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_AXIS0_2D(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_AXIS0_2D(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_AXIS0_2D(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_EX_REV_AXIS2(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_F16to##out_name##_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), 0, 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + if(exclusive == 0 && rev) \\\n\ + { \\\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.z < channel - 1;) \\\n\ + { \\\n\ + 
VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z++; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev) \\\n\ + { \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + coord.z = channel - 1; \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.z > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.z--; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS2(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS2(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS2(U8, vxc_half8, vxc_uchar16)\n\ +\n\ +#define CUMSUM_F16TOQINT_EX_REV_AXIS1(out_name, src_type, dst_type) \\\n\ +__kernel void cumsum_ex_rev_F16to##out_name##_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, int exclusive, int rev \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), 0, get_global_id(2), 0); \\\n\ + \\\n\ + vxc_short8 src; \\\n\ + dst_type dst; \\\n\ + vxc_half8 data, sum; \\\n\ + VXC_DP2x8(sum, sum, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniSetZeroF16_2x8); \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + if(exclusive == 0 && rev) \\\n\ + { \\\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive && rev == 0) \\\n\ + { \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.y < height - 1;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y++; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ + 
else if(exclusive && rev) \\\n\ + { \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + coord.y = height - 1; \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + for(; coord.y > 0;) \\\n\ + { \\\n\ + VXC_ReadImage2DArray(src, input, coord, 0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + coord.y--; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(sum, data, sum, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0), uniAccSumVertF16toF16_2x8); \\\n\ + VXC_DP2x8(dst, sum, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_ToNearestEven, 1), \\\n\ + uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0)); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS1(I8, vxc_half8, vxc_char16)\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS1(I16, vxc_half8, vxc_short8)\n\ +CUMSUM_F16TOQINT_EX_REV_AXIS1(U8, vxc_half8, vxc_uchar16)\n\ +"; /* end of cumsum_f16_u8_vx*/ + +static const char custom_softmax_vx[] = "/*\n\ + ============================================================================\n\ + Name : Softmax2.vx\n\ + Author : VSI\n\ + Version :\n\ + Copyright : Your copyright notice\n\ + Description :\n\ + ============================================================================\n\ + */\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform VXC_512Bits Uni4x4_Fp16ToFp32;\n\ +_viv_uniform int sf_size;\n\ + #define F_MAX(a,b) ((a)>(b)?(a):(b))\n\ +__kernel void Softmax2VXC\n\ + (\n\ + image2d_array_t input,\n\ + image2d_array_t output,\n\ + int axis\n\ + )\n\ +{\n\ + int4 coord_in = (int4)(0,0,0,0);\n\ + float fMax = 0.0;\n\ + for (int i = 0; i < sf_size; i++)\n\ + {\n\ + vxc_char8 val;\n\ + coord_in.x = i;\n\ + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ + float fval;\n\ + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ +\n\ + fMax = F_MAX(fMax, fval);\n\ + }\n\ +\n\ + float fProbSum = 0.0f;\n\ + vxc_short8 dst;\n\ + for (int i = 0; i < sf_size; i++)\n\ + {\n\ + vxc_char8 val;\n\ +\n\ + coord_in.x = i;\n\ + VXC_ReadImage2DArray(val, input, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ + float fval;\n\ + VXC_DP4x4(fval, val, val, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ +\n\ + float fOut = (float)exp(fval - fMax);\n\ + fProbSum += fOut;\n\ + half hVal;\n\ + _viv_asm(CONV,hVal,fOut);\n\ + _viv_asm(COPY,dst,hVal, 4);\n\ + VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +\n\ + for (int i = 0; i < sf_size; i++)\n\ + {\n\ + vxc_short8 val;\n\ + vxc_half8 val_h;\n\ + coord_in.x = i;\n\ + VXC_ReadImage2DArray(val, output, coord_in, 0, VXC_MODIFIER(0,0, 0, VXC_RM_TowardZero, 0));\n\ + float fval;\n\ + _viv_asm(COPY, val_h,val, 16);\n\ + VXC_DP4x4(fval, val_h, val_h, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 1), Uni4x4_Fp16ToFp32);\n\ +\n\ + float fOut =fval/fProbSum;\n\ + half hVal;\n\ + _viv_asm(CONV,hVal,fOut);\n\ + _viv_asm(COPY,dst,hVal, 4);\n\ + VXC_WriteImage2DArray(output, coord_in, dst, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + }\n\ +}\n\ +"; /* end of custom_softmax_vx*/ + +static const char custom_warp_affine_vx[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform float4 matrix0;\n\ +_viv_uniform float2 matrix1;\n\ +_viv_uniform float4 matrix4;\n\ +\n\ 
+__kernel void custom_warp_affine_nearest_neighbor_U8toU8\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 dst;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(4, 4, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(5, 5, 0, VXC_RM_TowardZero, 0));\n\ + coord_f = coord_f.zwzw + matrix4;\n\ + coord_in = convert_int4(coord_f);\n\ + coord_input.xy = coord_in.xy;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(6, 6, 0, VXC_RM_TowardZero, 0));\n\ + coord_input.xy = coord_in.zw;\n\ + VXC_OP4(img_load_3d, dst, input, coord_input.xywz, 0, VXC_MODIFIER(7, 7, 0, VXC_RM_TowardZero, 0));\n\ +\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void custom_warp_affine_bilinear_U8toU8\n\ +(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + float _m0,\n\ + float _m1,\n\ + float _m2,\n\ + float _m3,\n\ + float _m4,\n\ + float _m5\n\ +)\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), get_global_id(2));\n\ + int4 coord_in = (int4)(get_global_id(0), get_global_id(1), get_global_id(0) + 1, get_global_id(1));\n\ +\n\ + float4 coord_f = convert_float4(coord_in);\n\ +\n\ + coord_f = coord_f.xxzz * matrix0.xyxy + coord_f.yyww * matrix0.zwzw + matrix1.xyxy;\n\ +\n\ + coord_in = convert_int4(coord_f);\n\ +\n\ + int4 coord_input = (int4)(coord_in.xy, get_global_id(2), get_global_id(2));\n\ + int8 input_desc;\n\ + _viv_asm(COPY, input_desc, input, sizeof(input_desc));\n\ + int baseAddr = (int)coord_input.z * input_desc.s4 + input_desc.s0;\n\ + _viv_asm(MOV, coord_input.w, baseAddr);\n\ +\n\ + vxc_uchar16 src0, src1, dst;\n\ + VXC_OP4(img_load_3d, src0, input, coord_input.xywz, VXC_5BITOFFSET_XY(0, 0),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ + VXC_OP4(img_load_3d, src1, input, coord_input.xywz, 
VXC_5BITOFFSET_XY(0, 1),\n\ + VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ +#if (VX_VERSION==1)\n\ VXC_BiLinear(dst, src0, src1, coord_f.xy, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ #else\n\ VXC_Lerp(src0, src0, src1, coord_f.y, VXC_MODIFIER(0, 1, 0, VXC_RM_TowardZero, 0));\n\ @@ -13458,35 +15662,244 @@ __kernel void gather_batch_I16toF16_axis0(\n\ int is_array\n\ )\n\ {\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - int4 indices = read_imagei(input1, coord.xz);\n\ - indices = indices >= 0 ? indices : indices + axis_num;\n\ - int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 indices = read_imagei(input1, coord.xz);\n\ + indices = indices >= 0 ? indices : indices + axis_num;\n\ + int4 coord_in = (int4)(indices.x, get_global_id(1), get_global_id(2), 0);\n\ +\n\ + vxc_short8 src, dst;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.y;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.z;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ + coord_in.x = indices.w;\n\ + VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + vxc_half8 src0;\n\ + vxc_short8 dst0;\n\ + vxc_ushort8 ms0;\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ + VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ + uniU8MulAndPostShift_0_Lo_2x8);\n\ + _viv_asm(COPY, dst0, src0, 16);\n\ +\n\ + VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +"; /* end of gather_mix_batch_vx*/ + +static const char gather_nd_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +\n\ +__kernel void gather_nd_I8toI8_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + coord.w = indice.x;\n\ +\n\ + vxc_char16 src;\n\ + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_nd_U8toU8_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + coord.w = indice.x;\n\ +\n\ + vxc_uchar16 src;\n\ + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_nd_I16toI16_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t 
input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + coord.w = indice.x;\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_nd_F16toF16_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + coord.w = indice.x;\n\ +\n\ + vxc_short8 src;\n\ + VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ +}\n\ +\n\ +__kernel void gather_nd_array_I8toI8_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + coord.w = indice.x;\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 1);\n\ + Image img2 = create_image_from_image2d(output, 1);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global char* data_ptr = (__global char*)input_ptr;\n\ + __global char* dst_ptr = (__global char*)output_ptr;\n\ + char src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_U8toU8_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + coord.w = indice.x;\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 1);\n\ + Image img2 = create_image_from_image2d(output, 1);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global uchar* data_ptr = (__global uchar*)input_ptr;\n\ + __global uchar* dst_ptr = (__global uchar*)output_ptr;\n\ + uchar src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_I16toI16_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int 
coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + coord.w = indice.x;\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +\n\ +}\n\ +\n\ +__kernel void gather_nd_array_F16toF16_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ \n\ - vxc_short8 src, dst;\n\ - VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = indices.y;\n\ - VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(1, 1, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = indices.z;\n\ - VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(2, 2, 0, VXC_RM_TowardZero, 0));\n\ - coord_in.x = indices.w;\n\ - VXC_ReadImage2DArray(src, input0, coord_in, 0, VXC_MODIFIER(3, 3, 0, VXC_RM_TowardZero, 0));\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ - vxc_half8 src0;\n\ - vxc_short8 dst0;\n\ - vxc_ushort8 ms0;\n\ - _viv_asm(COPY, ms0, multAndoutZP0, 16);\n\ - VXC_DP2x8(src0, src, ms0, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 1),\\\n\ - uniU8MulAndPostShift_0_Lo_2x8);\n\ - _viv_asm(COPY, dst0, src0, 16);\n\ + coord.w = indice.x;\n\ \n\ - VXC_WriteImage2DArray(output, coord, dst0, VXC_MODIFIER(0, 3, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(input0, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ }\n\ -"; /* end of gather_mix_batch_vx*/ +"; /* end of gather_nd_vx*/ -static const char gather_nd_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ +static const char gather_nd_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -__kernel void gather_nd_I8toI8_1D(\n\ +__kernel void gather_nd_I8toI8_2D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ @@ -13502,15 +15915,15 @@ __kernel void gather_nd_I8toI8_1D(\n\ uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ - coord.w = indice.x;\n\ + indice.x = indice.x * block_size + gidx;\n\ \n\ vxc_char16 src;\n\ - VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ \n\ VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, 
VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void gather_nd_U8toU8_1D(\n\ +__kernel void gather_nd_U8toU8_2D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ @@ -13526,14 +15939,14 @@ __kernel void gather_nd_U8toU8_1D(\n\ uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ - coord.w = indice.x;\n\ + indice.x = indice.x * block_size + gidx;\n\ \n\ vxc_uchar16 src;\n\ - VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void gather_nd_I16toI16_1D(\n\ +__kernel void gather_nd_I16toI16_2D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ @@ -13549,14 +15962,14 @@ __kernel void gather_nd_I16toI16_1D(\n\ uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ - coord.w = indice.x;\n\ + indice.x = indice.x * block_size + gidx;\n\ \n\ vxc_short8 src;\n\ - VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void gather_nd_F16toF16_1D(\n\ +__kernel void gather_nd_F16toF16_2D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ @@ -13572,17 +15985,14 @@ __kernel void gather_nd_F16toF16_1D(\n\ uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ int4 indice = ((int4 *)indice_ptr)[0];\n\ \n\ - coord.w = indice.x;\n\ + indice.x = indice.x * block_size + gidx;\n\ \n\ vxc_short8 src;\n\ - VXC_ReadImage(src, input0, coord.zw, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ -"; /* end of gather_nd_vx*/ - -static const char gather_nd_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ -__kernel void gather_nd_I8toI8_2D(\n\ +__kernel void gather_nd_array_I8toI8_2D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ @@ -13600,13 +16010,17 @@ __kernel void gather_nd_I8toI8_2D(\n\ \n\ indice.x = indice.x * block_size + gidx;\n\ \n\ - vxc_char16 src;\n\ - VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ -\n\ - VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(input0, 1);\n\ + Image img2 = create_image_from_image2d(output, 1);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global char* data_ptr = (__global char*)input_ptr;\n\ + __global char* dst_ptr = (__global char*)output_ptr;\n\ + char src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ }\n\ \n\ -__kernel void gather_nd_U8toU8_2D(\n\ +__kernel void gather_nd_array_U8toU8_2D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ @@ -13624,12 +16038,18 @@ __kernel void gather_nd_U8toU8_2D(\n\ \n\ indice.x = indice.x * block_size + gidx;\n\ 
\n\ - vxc_uchar16 src;\n\ - VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(input0, 1);\n\ + Image img2 = create_image_from_image2d(output, 1);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global uchar* data_ptr = (__global uchar*)input_ptr;\n\ + __global uchar* dst_ptr = (__global uchar*)output_ptr;\n\ + uchar src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +\n\ }\n\ \n\ -__kernel void gather_nd_I16toI16_2D(\n\ +__kernel void gather_nd_array_I16toI16_2D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ @@ -13647,12 +16067,17 @@ __kernel void gather_nd_I16toI16_2D(\n\ \n\ indice.x = indice.x * block_size + gidx;\n\ \n\ - vxc_short8 src;\n\ - VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(input0, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ }\n\ \n\ -__kernel void gather_nd_F16toF16_2D(\n\ +__kernel void gather_nd_array_F16toF16_2D(\n\ __read_only image2d_t input0,\n\ __read_only image2d_t input1,\n\ __write_only image2d_t output,\n\ @@ -13670,9 +16095,14 @@ __kernel void gather_nd_F16toF16_2D(\n\ \n\ indice.x = indice.x * block_size + gidx;\n\ \n\ - vxc_short8 src;\n\ - VXC_ReadImage(src, input0, indice.xy, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ - VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ + Image img1 = create_image_from_image2d(input0, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ }\n\ "; /* end of gather_nd_2d_vx*/ @@ -13758,7 +16188,88 @@ __kernel void gather_nd_F16to##src1_type_name##_2D( \\\n\ GATHER_ND_F16_TO_QINT_2D(U8, vxc_uchar16)\n\ GATHER_ND_F16_TO_QINT_2D(I8, vxc_char16)\n\ GATHER_ND_F16_TO_QINT_2D(I16, vxc_short8)\n\ -"; /* end of gather_nd_2d_mix_vx*/ +\n\ +#define GATHER_ND_ARRAY_QINT_TO_F16_2D(src0_type_name, read_type, ptr_type, stride) \\\n\ +__kernel void gather_nd_array_##src0_type_name##toF16_2D( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ + indice.x = indice.x * block_size + gidx; \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(input0, 
stride); \\\n\ + Image img2 = create_image_from_image2d(output, 2); \\\n\ + \\\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); \\\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \\\n\ + \\\n\ + __global ptr_type data_ptr = (__global ptr_type)input_ptr; \\\n\ + __global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \\\n\ + read_type src = data_ptr[0]; \\\n\ + \\\n\ + vxc_half8 src0; \\\n\ + vxc_short8 dst0; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst0, src0, 16); \\\n\ + dst_ptr[0] = dst0; \\\n\ +}\n\ +GATHER_ND_ARRAY_QINT_TO_F16_2D(U8, vxc_uchar16, vxc_uchar16*, 1)\n\ +GATHER_ND_ARRAY_QINT_TO_F16_2D(I8, vxc_char16, vxc_char16*, 1)\n\ +GATHER_ND_ARRAY_QINT_TO_F16_2D(I16, vxc_short8, vxc_short8*, 2)\n\ +\n\ +#define GATHER_ND_ARRAY_F16_TO_QINT_2D(src1_type_name, write_type, ptr_type, stride) \\\n\ +__kernel void gather_nd_array_F16to##src1_type_name##_2D( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ + indice.x = indice.x * block_size + gidx; \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(input0, 2); \\\n\ + Image img2 = create_image_from_image2d(output, stride); \\\n\ + \\\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, indice.xy); \\\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \\\n\ + \\\n\ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global ptr_type dst_ptr = (__global ptr_type )output_ptr; \\\n\ + vxc_short8 src = data_ptr[0]; \\\n\ + \\\n\ + vxc_ushort8 mp1; \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + vxc_half8 data; \\\n\ + write_type dst; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven,1),uniConvertFp16toU8_2x8); \\\n\ + dst_ptr[0] = dst; \\\n\ +}\n\ +GATHER_ND_ARRAY_F16_TO_QINT_2D(U8, vxc_uchar16, vxc_uchar16*, 1)\n\ +GATHER_ND_ARRAY_F16_TO_QINT_2D(I8, vxc_char16, vxc_char16*, 1)\n\ +GATHER_ND_ARRAY_F16_TO_QINT_2D(I16, vxc_short8, vxc_short8*, 2)"; /* end of gather_nd_2d_mix_vx*/ static const char gather_nd_3d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -13860,6 +16371,123 @@ __kernel void gather_nd_F16toF16_3D(\n\ VXC_WriteImage(output, coord.zy, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ +__kernel void gather_nd_array_I8toI8_3D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.w = 0;\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, 1);\n\ + Image img2 = 
create_image_from_image2d(output, 1);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global char* data_ptr = (__global char*)input_ptr;\n\ + __global char* dst_ptr = (__global char*)output_ptr;\n\ + char src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_U8toU8_3D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ +\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.w = 0;\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, 1);\n\ + Image img2 = create_image_from_image2d(output, 1);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global uchar* data_ptr = (__global uchar*)input_ptr;\n\ + __global uchar* dst_ptr = (__global uchar*)output_ptr;\n\ + uchar src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +\n\ +}\n\ +\n\ +__kernel void gather_nd_array_I16toI16_3D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.w = 0;\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_F16toF16_3D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_t input1,\n\ + __write_only image2d_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // indices_num\n\ +\n\ + int4 coord = (int4)(0, gidy, gidx, 0);\n\ + Image img = create_image_from_image2d(input1, 4);\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.w = 0;\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, 2);\n\ + Image img2 = create_image_from_image2d(output, 2);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ "; /* end of gather_nd_3d_vx*/ static const char gather_nd_3d_mix_vx[] = "#include 
\"cl_viv_vx_ext.h\"\n\ @@ -13944,6 +16572,89 @@ GATHER_ND_F16_TO_QINT_3D(U8, vxc_uchar16)\n\ GATHER_ND_F16_TO_QINT_3D(I8, vxc_char16)\n\ GATHER_ND_F16_TO_QINT_3D(I16, vxc_short8)\n\ \n\ +#define GATHER_ND_ARRAY_QINT_TO_F16_3D(src0_type_name, read_type, ptr_type, stride) \\\n\ +__kernel void gather_nd_array_##src0_type_name##toF16_3D( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ + indice.x = indice.x * block_size + gidx; \\\n\ + indice.w = 0; \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, stride); \\\n\ + Image img2 = create_image_from_image2d(output, 2); \\\n\ + \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); \\\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \\\n\ + \\\n\ + __global ptr_type data_ptr = (__global ptr_type)input_ptr; \\\n\ + __global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \\\n\ + read_type src = data_ptr[0]; \\\n\ + \\\n\ + vxc_half8 src0; \\\n\ + vxc_short8 dst0; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst0, src0, 16); \\\n\ + dst_ptr[0] = dst0; \\\n\ +}\n\ +GATHER_ND_ARRAY_QINT_TO_F16_3D(U8, vxc_uchar16, vxc_uchar16*, 1)\n\ +GATHER_ND_ARRAY_QINT_TO_F16_3D(I8, vxc_char16, vxc_char16*, 1)\n\ +GATHER_ND_ARRAY_QINT_TO_F16_3D(I16, vxc_short8, vxc_short8*, 2)\n\ +\n\ +#define GATHER_ND_ARRAY_F16_TO_QINT_3D(src1_type_name, write_type, ptr_type, stride) \\\n\ +__kernel void gather_nd_array_F16to##src1_type_name##_3D( \\\n\ + __read_only image2d_array_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ + indice.x = indice.x * block_size + gidx; \\\n\ + indice.w = 0; \\\n\ + \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, 2); \\\n\ + Image img2 = create_image_from_image2d(output, stride); \\\n\ + \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice); \\\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \\\n\ + \\\n\ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global ptr_type dst_ptr = (__global ptr_type )output_ptr; \\\n\ + vxc_short8 src = data_ptr[0]; \\\n\ + \\\n\ + vxc_ushort8 mp1; \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + vxc_half8 data; \\\n\ + write_type dst; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven,1), uniConvertFp16toU8_2x8); \\\n\ + dst_ptr[0] = dst; \\\n\ +}\n\ +GATHER_ND_ARRAY_F16_TO_QINT_3D(U8, vxc_uchar16, vxc_uchar16*, 1)\n\ +GATHER_ND_ARRAY_F16_TO_QINT_3D(I8, vxc_char16, vxc_char16*, 1)\n\ 
+GATHER_ND_ARRAY_F16_TO_QINT_3D(I16, vxc_short8, vxc_short8*, 2)\n\ +\n\ "; /* end of gather_nd_3d_mix_vx*/ static const char gather_nd_batch_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -14043,7 +16754,121 @@ __kernel void gather_nd_batch_F16toF16_1D(\n\ VXC_ReadImage(src, input0, coord0, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ -"; /* end of gather_nd_batch_vx*/ +\n\ +__kernel void gather_nd_array_batch_I8toI8_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ +\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 1);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 1);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord0);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global char* data_ptr = (__global char*)input_ptr;\n\ + __global char* dst_ptr = (__global char*)output_ptr;\n\ + char src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_batch_U8toU8_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ +\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 1);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 1);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord0);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global uchar* data_ptr = (__global uchar*)input_ptr;\n\ + __global uchar* dst_ptr = (__global uchar*)output_ptr;\n\ + uchar src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_batch_I16toI16_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ +\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord0);\n\ + uchar* output_ptr = 
get_tensor_ptr_from_coord(img2, coord);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_batch_F16toF16_1D(\n\ + __read_only image2d_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ +\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + int2 coord0 = (int2)(indice.x * block_size + gidx, gidz);\n\ +\n\ + Image img1 = create_image_from_image2d(input0, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord0);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}"; /* end of gather_nd_batch_vx*/ static const char gather_nd_batch_2d_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -14073,7 +16898,7 @@ __kernel void gather_nd_batch_I8toI8_2D(\n\ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void gather_nd_U8toU8_2D(\n\ +__kernel void gather_nd_batch_U8toU8_2D(\n\ __read_only image2d_array_t input0,\n\ __read_only image2d_array_t input1,\n\ __write_only image2d_array_t output,\n\ @@ -14098,7 +16923,7 @@ __kernel void gather_nd_U8toU8_2D(\n\ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void gather_nd_I16toI16_2D(\n\ +__kernel void gather_nd_batch_I16toI16_2D(\n\ __read_only image2d_array_t input0,\n\ __read_only image2d_array_t input1,\n\ __write_only image2d_array_t output,\n\ @@ -14123,7 +16948,7 @@ __kernel void gather_nd_I16toI16_2D(\n\ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ \n\ -__kernel void gather_nd_F16toF16_2D(\n\ +__kernel void gather_nd_batch_F16toF16_2D(\n\ __read_only image2d_array_t input0,\n\ __read_only image2d_array_t input1,\n\ __write_only image2d_array_t output,\n\ @@ -14147,6 +16972,126 @@ __kernel void gather_nd_F16toF16_2D(\n\ VXC_ReadImage2DArray(src, input0, indice, 0, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ VXC_WriteImage2DArray(output, coord, src, VXC_MODIFIER(0, 0, 0, VXC_RM_TowardZero, 0));\n\ }\n\ +\n\ +__kernel void gather_nd_array_batch_I8toI8_2D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ +\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.zw = coord.zw;\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, 1);\n\ + 
Tensor img2 = create_tensor_from_image2d_array(output, 1);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global char* data_ptr = (__global char*)input_ptr;\n\ + __global char* dst_ptr = (__global char*)output_ptr;\n\ + char src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_batch_U8toU8_2D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ +\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.zw = coord.zw;\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, 1);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 1);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global uchar* data_ptr = (__global uchar*)input_ptr;\n\ + __global uchar* dst_ptr = (__global uchar*)output_ptr;\n\ + uchar src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_batch_I16toI16_2D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ +\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.zw = coord.zw;\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ +\n\ +__kernel void gather_nd_array_batch_F16toF16_2D(\n\ + __read_only image2d_array_t input0,\n\ + __read_only image2d_array_t input1,\n\ + __write_only image2d_array_t output,\n\ + int block_size,\n\ + int coord_dim\n\ + )\n\ +{\n\ + int gidx = get_global_id(0); // block_size\n\ + int gidy = get_global_id(1); // index num\n\ + int gidz = get_global_id(2); // batch num\n\ +\n\ + int4 coord = (int4)(gidx, gidy, gidz, 0);\n\ + Tensor img = create_tensor_from_image2d_array(input1, 4);\n\ + uchar* indice_ptr = get_tensor_ptr_from_coord(img, coord.wyzw);\n\ + int4 indice = ((int4 *)indice_ptr)[0];\n\ +\n\ + indice.x = indice.x * block_size + gidx;\n\ + indice.zw = coord.zw;\n\ +\n\ + Tensor img1 = create_tensor_from_image2d_array(input0, 2);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 2);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, indice);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, 
coord);\n\ + __global short* data_ptr = (__global short*)input_ptr;\n\ + __global short* dst_ptr = (__global short*)output_ptr;\n\ + short src = data_ptr[0];\n\ + dst_ptr[0] = src;\n\ +}\n\ "; /* end of gather_nd_batch_2d_vx*/ static const char gather_nd_mix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -14232,6 +17177,88 @@ GATHER_ND_F16_TO_QINT_1D(U8, vxc_uchar16)\n\ GATHER_ND_F16_TO_QINT_1D(I8, vxc_char16)\n\ GATHER_ND_F16_TO_QINT_1D(I16, vxc_short8)\n\ \n\ +#define GATHER_ND_ARRAY_QINT_TO_F16_1D(src0_type_name, read_type, ptr_type, stride) \\\n\ +__kernel void gather_nd_array_##src0_type_name##toF16_1D( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ + coord.w = indice.x; \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(input0, stride); \\\n\ + Image img2 = create_image_from_image2d(output, 2); \\\n\ + \\\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); \\\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \\\n\ + \\\n\ + __global ptr_type data_ptr = (__global ptr_type)input_ptr; \\\n\ + __global vxc_short8* dst_ptr = (__global vxc_short8* )output_ptr; \\\n\ + read_type src = data_ptr[0]; \\\n\ + \\\n\ + vxc_half8 src0; \\\n\ + vxc_short8 dst0; \\\n\ + vxc_ushort8 ms0; \\\n\ + _viv_asm(COPY, ms0, multAndoutZP0, 16); \\\n\ + VXC_DP2x8(src0,src,ms0,VXC_MODIFIER(0,7,0,VXC_RM_TowardZero,1),uniU8MulAndPostShift_0_Lo_2x8); \\\n\ + _viv_asm(COPY, dst0, src0, 16); \\\n\ + dst_ptr[0] = dst0; \\\n\ +}\n\ +GATHER_ND_ARRAY_QINT_TO_F16_1D(U8, vxc_uchar16, vxc_uchar16*, 1)\n\ +GATHER_ND_ARRAY_QINT_TO_F16_1D(I8, vxc_char16, vxc_char16*, 1)\n\ +GATHER_ND_ARRAY_QINT_TO_F16_1D(I16, vxc_short8, vxc_short8*, 2)\n\ +\n\ +#define GATHER_ND_ARRAY_F16_TO_QINT_1D(src1_type_name, write_type, ptr_type, stride) \\\n\ +__kernel void gather_nd_array_F16to##src1_type_name##_1D( \\\n\ + __read_only image2d_t input0, \\\n\ + __read_only image2d_t input1, \\\n\ + __write_only image2d_t output, \\\n\ + int block_size, \\\n\ + int coord_dim \\\n\ + ) \\\n\ +{ \\\n\ + int gidx = get_global_id(0); \\\n\ + int gidy = get_global_id(1); \\\n\ + \\\n\ + int4 coord = (int4)(0, gidy, gidx, 0); \\\n\ + Image img = create_image_from_image2d(input1, 4); \\\n\ + uchar* indice_ptr = get_image_ptr_from_coord(img, coord.xy); \\\n\ + int4 indice = ((int4 *)indice_ptr)[0]; \\\n\ + \\\n\ + coord.w = indice.x; \\\n\ + \\\n\ + Image img1 = create_image_from_image2d(input0, 2); \\\n\ + Image img2 = create_image_from_image2d(output, stride); \\\n\ + \\\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord.zw); \\\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord.zy); \\\n\ + \\\n\ + __global vxc_short8* data_ptr = (__global vxc_short8*)input_ptr; \\\n\ + __global ptr_type dst_ptr = (__global ptr_type )output_ptr; \\\n\ + vxc_short8 src = data_ptr[0]; \\\n\ + vxc_ushort8 mp1; \\\n\ + _viv_asm(COPY, mp1, multAndoutZP1, 16); \\\n\ + vxc_half8 data; \\\n\ + write_type dst; \\\n\ + _viv_asm(COPY, data, src, 16); \\\n\ + VXC_DP2x8(dst,data,mp1,VXC_MODIFIER(0,7,0,VXC_RM_ToNearestEven, 1),uniConvertFp16toU8_2x8); \\\n\ + dst_ptr[0] = dst; \\\n\ +}\n\ 
+GATHER_ND_ARRAY_F16_TO_QINT_1D(U8, vxc_uchar16, vxc_uchar16*, 1)\n\ +GATHER_ND_ARRAY_F16_TO_QINT_1D(I8, vxc_char16, vxc_char16*, 1)\n\ +GATHER_ND_ARRAY_F16_TO_QINT_1D(I16, vxc_short8, vxc_short8*, 2)\n\ +\n\ +\n\ "; /* end of gather_nd_mix_vx*/ static const char get_matrix_vx[] = "#include \"cl_viv_vx_ext.h\"\n\ @@ -39749,7 +42776,7 @@ __kernel void pre_process_gray_half_U8toU8\n\ \n\ coord_in.xy = coord_in.xy >> 1;\n\ \n\ - VXC_WriteImage(output, coord_in.xy, src0.s02468ace, VXC_MODIFIER(0, 11, 0, VXC_RM_TowardZero, 0));\n\ + VXC_WriteImage(output, coord_in.xy, src0.s02468ace, VXC_MODIFIER(0, 7, 0, VXC_RM_TowardZero, 0));\n\ }\n\ "; /* end of pre_process_gray_2_vx*/ @@ -60368,6 +63395,169 @@ __kernel void clip_U8toF32_2D(\n\ }\n\ "; /* end of clip_U8_cl*/ +static const char col2im_cl[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ +#include \"cl_viv_vx_ext.h\"\n\ +\n\ +_viv_uniform int width_pad;\n\ +_viv_uniform int height_pad;\n\ +_viv_uniform int depth_pad;\n\ +_viv_uniform int move_time_x;\n\ +_viv_uniform int move_time_y;\n\ +_viv_uniform int kernel_x_new;\n\ +_viv_uniform int kernel_y_new;\n\ +_viv_uniform int kernel_z_new;\n\ +_viv_uniform int depth;\n\ +\n\ +#define COL2IM(name, read_type, dst_type ,convert_type, write_type) \\\n\ +__kernel void col2im_##name \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int stride_w, \\\n\ + int stride_h, \\\n\ + int stride_d, \\\n\ + int dilation_w, \\\n\ + int dilation_h, \\\n\ + int dilation_d, \\\n\ + int pad_w_front, \\\n\ + int pad_w_end, \\\n\ + int pad_h_front, \\\n\ + int pad_h_end, \\\n\ + int pad_d_front, \\\n\ + int pad_d_end, \\\n\ + int kernel_x, \\\n\ + int kernel_y, \\\n\ + int kernel_z, \\\n\ + float inOutScale, \\\n\ + float inOutTile \\\n\ +) \\\n\ +{ \\\n\ + int x = get_global_id(0); \\\n\ + int y = get_global_id(1); \\\n\ + int z = get_global_id(2); \\\n\ + int4 coord_out = (int4)(x,y,z,0); \\\n\ + int b = z / depth; \\\n\ + z = z % depth; \\\n\ + int4 coord_in = (int4)(0,0,b,0); \\\n\ + \\\n\ + float sum = 0.0f; \\\n\ + x = x + pad_w_front; \\\n\ + y = y + pad_h_front; \\\n\ + z = z + pad_d_front; \\\n\ + int offset_x = x % stride_w; \\\n\ + int offset_y = y % stride_h; \\\n\ + int offset_z = z % stride_d; \\\n\ + int i,j,k; \\\n\ + for (k = offset_z; k < kernel_z_new; k += stride_d) \\\n\ + { \\\n\ + if ((z - k) < 0 || (z + (kernel_z_new - k)) > depth_pad || k % dilation_d != 0) \\\n\ + { \\\n\ + continue; \\\n\ + } \\\n\ + for (j = offset_y; j < kernel_y_new; j = j + stride_h) \\\n\ + { \\\n\ + if ((y - j) < 0 || (y + (kernel_y_new - j)) > height_pad || j % dilation_h != 0) \\\n\ + { \\\n\ + continue; \\\n\ + } \\\n\ + for (i = offset_x; i < kernel_x_new; i = i + stride_w) \\\n\ + { \\\n\ + if ((x - i) < 0 || (x + (kernel_x_new - i)) > width_pad || i % dilation_w != 0) \\\n\ + { \\\n\ + continue; \\\n\ + } \\\n\ + coord_in.x = (x - i + stride_w - 1) / stride_w + \\\n\ + (y - j + stride_h - 1) / stride_h * move_time_x + \\\n\ + (z - k + stride_d - 1) / stride_d * move_time_y * move_time_x; \\\n\ + coord_in.y = i / dilation_w + j * kernel_x / dilation_h + k * kernel_x * kernel_y / dilation_d; \\\n\ + sum = sum + convert_float(read_type(input, coord_in).x); \\\n\ + } \\\n\ + } \\\n\ + } \\\n\ + sum = sum * inOutScale + inOutTile; \\\n\ + dst_type dst = 0; \\\n\ + dst.x = convert_type(sum); \\\n\ + write_type(output, coord_out, dst); \\\n\ +}\n\ +COL2IM(U32toU32, read_imageui, uint4, convert_uint, write_imageui)\n\ +COL2IM(U32toI32, read_imageui, int4, 
convert_int, write_imagei)\n\ +COL2IM(U32toF32, read_imageui, float4, convert_float, write_imagef)\n\ +COL2IM(I32toU32, read_imagei, uint4, convert_uint, write_imageui)\n\ +COL2IM(I32toI32, read_imagei, int4, convert_int, write_imagei)\n\ +COL2IM(I32toF32, read_imagei, float4, convert_float, write_imagef)\n\ +COL2IM(F32toU32, read_imagef, uint4, convert_uint, write_imageui)\n\ +COL2IM(F32toI32, read_imagef, int4, convert_int, write_imagei)\n\ +COL2IM(F32toF32, read_imagef, float4, convert_float, write_imagef)\n\ +\n\ +#define COL2IM_2D(name, read_type, dst_type ,convert_type, write_type) \\\n\ +__kernel void col2im_##name##_2D \\\n\ +( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int stride_w, \\\n\ + int stride_h, \\\n\ + int stride_d, \\\n\ + int dilation_w, \\\n\ + int dilation_h, \\\n\ + int dilation_d, \\\n\ + int pad_w_front, \\\n\ + int pad_w_end, \\\n\ + int pad_h_front, \\\n\ + int pad_h_end, \\\n\ + int pad_d_front, \\\n\ + int pad_d_end, \\\n\ + int kernel_x, \\\n\ + int kernel_y, \\\n\ + int kernel_z, \\\n\ + float inOutScale, \\\n\ + float inOutTile \\\n\ +) \\\n\ +{ \\\n\ + int x = get_global_id(0); \\\n\ + int y = get_global_id(1); \\\n\ + int z = get_global_id(2); \\\n\ + int4 coord_out = (int4)(x,y,z,0); \\\n\ + int4 coord_in = (int4)(0,0,z,0); \\\n\ + \\\n\ + float sum = 0.0f; \\\n\ + x = x + pad_w_front; \\\n\ + y = y + pad_h_front; \\\n\ + int offset_x = x % stride_w; \\\n\ + int offset_y = y % stride_h; \\\n\ + int i,j; \\\n\ + for (j = offset_y; j < kernel_y_new; j = j + stride_h) \\\n\ + { \\\n\ + if ((y - j) < 0 || (y + (kernel_y_new - j)) > height_pad || j % dilation_h != 0) \\\n\ + { \\\n\ + continue; \\\n\ + } \\\n\ + for (i = offset_x; i < kernel_x_new; i = i + stride_w) \\\n\ + { \\\n\ + if ((x - i) < 0 || (x + (kernel_x_new - i)) > width_pad || i % dilation_w != 0) \\\n\ + { \\\n\ + continue; \\\n\ + } \\\n\ + coord_in.x = (x - i + stride_w - 1) / stride_w + \\\n\ + (y - j + stride_h - 1) / stride_h * move_time_x; \\\n\ + coord_in.y = i / dilation_w + j * kernel_x / dilation_h; \\\n\ + sum = sum + convert_float(read_type(input, coord_in).x); \\\n\ + } \\\n\ + } \\\n\ + sum = sum * inOutScale + inOutTile; \\\n\ + dst_type dst = 0; \\\n\ + dst.x = convert_type(sum); \\\n\ + write_type(output, coord_out, dst); \\\n\ +}\n\ +COL2IM_2D(U32toU32, read_imageui, uint4, convert_uint, write_imageui)\n\ +COL2IM_2D(U32toI32, read_imageui, int4, convert_int, write_imagei)\n\ +COL2IM_2D(U32toF32, read_imageui, float4, convert_float, write_imagef)\n\ +COL2IM_2D(I32toU32, read_imagei, uint4, convert_uint, write_imageui)\n\ +COL2IM_2D(I32toI32, read_imagei, int4, convert_int, write_imagei)\n\ +COL2IM_2D(I32toF32, read_imagei, float4, convert_float, write_imagef)\n\ +COL2IM_2D(F32toU32, read_imagef, uint4, convert_uint, write_imageui)\n\ +COL2IM_2D(F32toI32, read_imagef, int4, convert_int, write_imagei)\n\ +COL2IM_2D(F32toF32, read_imagef, float4, convert_float, write_imagef)"; /* end of col2im_cl*/ + static const char crop_and_resize_bilinear_cl[] = "#pragma OPENCL EXTENSION cl_viv_vx_extension : enable\n\ #include \"cl_viv_vx_ext.h\"\n\ \n\ @@ -60582,7 +63772,339 @@ static const char cumsum_cl[] = "__kernel void cumsum_F32toF32_axis2(\n\ for(coord.z = channel - 1; coord.z > 0; coord.z--)\n\ {\n\ float4 data = read_imagef(input, coord);\n\ - coord_out.z--;\n\ + coord_out.z--;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord_out.z = 0;\n\ + 
write_imagef(output, coord_out, sum);\n\ + for(coord.z = 0; coord.z < channel - 1; coord.z++)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.z++;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_toU8_AXIS2_SH(name, src_type, read_image_type) \\\n\ +__kernel void cumsum_##name##toU8_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int exclusive, \\\n\ + int rev, \\\n\ + int width, \\\n\ + int height, \\\n\ + int channel, \\\n\ + int input_zp, \\\n\ + float in_out_scale, \\\n\ + float in_out_zp_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type sum = (src_type)(0); \\\n\ + uint4 dst = (uint4)(0); \\\n\ + int tmp_zp = convert_int_rte(output_zp); \\\n\ + dst.x = convert_uint_sat(tmp_zp); \\\n\ + \\\n\ + float cnt = 0.0f; \\\n\ + \\\n\ + if(exclusive && rev) \\\n\ + { \\\n\ + coord_out.z = channel - 1; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + for(coord.z = channel - 1; coord.z > 0; coord.z--) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + coord_out.z--; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive) \\\n\ + { \\\n\ + coord_out.z = 0; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + for(coord.z = 0; coord.z < channel - 1; coord.z++) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + coord_out.z++; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(rev) \\\n\ + { \\\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord, dst); \\\n\ + } \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord, dst); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_toU8_AXIS2_SH(U8,uint4,read_imageui)\n\ +CUMSUM_toU8_AXIS2_SH(F32,float4,read_imagef)\n\ +\n\ +\n\ +\n\ +__kernel void 
cumsum_F32toF32_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int channel,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord_out.y = height - 1;\n\ + write_imagef(output, coord_out, sum);\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.y--;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord_out.y = 0;\n\ + write_imagef(output, coord_out, sum);\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.y++;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord_out, sum);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord, sum);\n\ + }\n\ + }\n\ +}\n\ +\n\ +#define CUMSUM_toU8_AXIS1_SH(name, src_type, read_image_type) \\\n\ +__kernel void cumsum_##name##toU8_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int exclusive, \\\n\ + int rev, \\\n\ + int width, \\\n\ + int height, \\\n\ + int channel, \\\n\ + int input_zp, \\\n\ + float in_out_scale, \\\n\ + float in_out_zp_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type sum = (src_type)(0); \\\n\ + uint4 dst = (uint4)(0); \\\n\ + int tmp_zp = convert_int_rte(output_zp); \\\n\ + dst.x = convert_uint_sat(tmp_zp); \\\n\ + \\\n\ + float cnt = 0; \\\n\ + \\\n\ + if(exclusive && rev) \\\n\ + { \\\n\ + coord_out.y = height - 1; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + \\\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; \\\n\ + coord_out.y--; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive) \\\n\ + { \\\n\ + coord_out.y = 0; \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; \\\n\ + coord_out.y++; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord_out, dst); \\\n\ + } \\\n\ + } \\\n\ + else if(rev) \\\n\ + { \\\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; 
\\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord, dst); \\\n\ + } \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + src_type data = read_image_type(input, coord); \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst.x = (uint)convert_int_rte(tmpSum); \\\n\ + write_imageui(output, coord, dst); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_toU8_AXIS1_SH(U8,uint4,read_imageui)\n\ +CUMSUM_toU8_AXIS1_SH(F32,float4,read_imagef)\n\ +\n\ +\n\ +__kernel void cumsum_F32toF32_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int channel,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord_out.x = width - 1;\n\ + write_imagef(output, coord_out, sum);\n\ + for(coord.x = width - 1; coord.x > 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord);\n\ + coord_out.x--;\n\ sum += data;\n\ \n\ write_imagef(output, coord_out, sum);\n\ @@ -60590,12 +64112,12 @@ static const char cumsum_cl[] = "__kernel void cumsum_F32toF32_axis2(\n\ }\n\ else if(exclusive)\n\ {\n\ - coord_out.z = 0;\n\ + coord_out.x = 0;\n\ write_imagef(output, coord_out, sum);\n\ - for(coord.z = 0; coord.z < channel - 1; coord.z++)\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ {\n\ float4 data = read_imagef(input, coord);\n\ - coord_out.z++;\n\ + coord_out.x++;\n\ sum += data;\n\ \n\ write_imagef(output, coord_out, sum);\n\ @@ -60603,7 +64125,7 @@ static const char cumsum_cl[] = "__kernel void cumsum_F32toF32_axis2(\n\ }\n\ else if(rev)\n\ {\n\ - for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ {\n\ float4 data = read_imagef(input, coord);\n\ sum += data;\n\ @@ -60613,7 +64135,7 @@ static const char cumsum_cl[] = "__kernel void cumsum_F32toF32_axis2(\n\ }\n\ else\n\ {\n\ - for(coord.z = 0; coord.z < channel; coord.z++)\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ {\n\ float4 data = read_imagef(input, coord);\n\ sum += data;\n\ @@ -60623,8 +64145,8 @@ static const char cumsum_cl[] = "__kernel void cumsum_F32toF32_axis2(\n\ }\n\ }\n\ \n\ -#define CUMSUM_toU8_AXIS2_SH(name, src_type, read_image_type) \\\n\ -__kernel void cumsum_##name##toU8_axis2( \\\n\ +#define CUMSUM_toU8_AXIS0_SH(name, src_type, read_image_type) \\\n\ +__kernel void cumsum_##name##toU8_axis0( \\\n\ __read_only image2d_array_t input, \\\n\ __write_only image2d_array_t output, \\\n\ int axis, \\\n\ @@ -60647,16 +64169,16 @@ __kernel void cumsum_##name##toU8_axis2( \\\n\ int tmp_zp = convert_int_rte(output_zp); \\\n\ dst.x = convert_uint_sat(tmp_zp); \\\n\ \\\n\ - float cnt = 0.0f; \\\n\ + float cnt = 0; \\\n\ \\\n\ if(exclusive && rev) \\\n\ { \\\n\ - coord_out.z = channel - 1; \\\n\ + coord_out.x = width - 1; \\\n\ write_imageui(output, coord_out, dst); \\\n\ - for(coord.z = channel - 1; coord.z > 0; coord.z--) \\\n\ + for(coord.x = width - 1; coord.x > 0; 
coord.x--) \\\n\ { \\\n\ src_type data = read_image_type(input, coord); \\\n\ - coord_out.z--; \\\n\ + coord_out.x--; \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ \\\n\ @@ -60669,12 +64191,12 @@ __kernel void cumsum_##name##toU8_axis2( \\\n\ } \\\n\ else if(exclusive) \\\n\ { \\\n\ - coord_out.z = 0; \\\n\ + coord_out.x = 0; \\\n\ write_imageui(output, coord_out, dst); \\\n\ - for(coord.z = 0; coord.z < channel - 1; coord.z++) \\\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++) \\\n\ { \\\n\ src_type data = read_image_type(input, coord); \\\n\ - coord_out.z++; \\\n\ + coord_out.x++; \\\n\ cnt += 1.0f; \\\n\ sum += data; \\\n\ \\\n\ @@ -60687,7 +64209,7 @@ __kernel void cumsum_##name##toU8_axis2( \\\n\ } \\\n\ else if(rev) \\\n\ { \\\n\ - for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--) \\\n\ { \\\n\ src_type data = read_image_type(input, coord); \\\n\ cnt += 1.0f; \\\n\ @@ -60702,7 +64224,7 @@ __kernel void cumsum_##name##toU8_axis2( \\\n\ } \\\n\ else \\\n\ { \\\n\ - for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ + for(coord.x = 0; coord.x < width; coord.x++) \\\n\ { \\\n\ src_type data = read_image_type(input, coord); \\\n\ cnt += 1.0f; \\\n\ @@ -60716,344 +64238,851 @@ __kernel void cumsum_##name##toU8_axis2( \\\n\ } \\\n\ } \\\n\ }\n\ -CUMSUM_toU8_AXIS2_SH(U8,uint4,read_imageui)\n\ -CUMSUM_toU8_AXIS2_SH(F32,float4,read_imagef)\n\ +CUMSUM_toU8_AXIS0_SH(U8,uint4,read_imageui)\n\ +CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef)\n\ +"; /* end of cumsum_cl*/ + +static const char cumsum_2d_cl[] = "\n\ +__kernel void cumsum_F32toF32_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.w = height - 1;\n\ + write_imagef(output, coord.zw, sum);\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + coord.w--;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.zw, sum);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + write_imagef(output, coord.zw, sum);\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + coord.w++;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.zw, sum);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.xy, sum);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.xy, sum);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_U8toU8_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + uint4 sum = (uint4)(0);\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + int 
tmp_zp = convert_int_rte(output_zp);\n\ + dst.x = convert_uint_sat(tmp_zp);\n\ +\n\ + float cnt = 0;\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.w = height - 1;\n\ + write_imageui(output, coord.zw, dst);\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.w--;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + write_imageui(output, coord.zw, dst);\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.w++;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_F32toU8_axis1_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + float4 sum = (float4)(0);\n\ + uint4 dst = (uint4)(0);\n\ + int tmp_zp = convert_int_rte(output_zp);\n\ + dst.x = convert_uint_sat(tmp_zp);\n\ +\n\ + float cnt = 0;\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.w = height - 1;\n\ + write_imageui(output, coord.zw, dst);\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.w--;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + write_imageui(output, coord.zw, dst);\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.w++;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ 
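+ /* Requantize the running sum for the U8 output: tmpSum = sum.x * in_out_scale */\n\
+ /* + cnt * in_out_zp_scale + output_zp, rounded to nearest even; the cnt term */\n\
+ /* presumably carries a host-folded per-element zero-point correction. */\n\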
+\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_F32toF32_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + float4 sum = (float4)(0);\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.x = width - 1;\n\ + coord.z = coord.x;\n\ + write_imagef(output, coord.zw, sum);\n\ + for(; coord.x > 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + coord.z--;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.zw, sum);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord.z = 0;\n\ + write_imagef(output, coord.zw, sum);\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + coord.z++;\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.zw, sum);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.xy, sum);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + sum += data;\n\ +\n\ + write_imagef(output, coord.xy, sum);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_U8toU8_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + uint4 sum = (uint4)(0);\n\ + uint4 dst = (uint4)(0);\n\ +\n\ + int tmp_zp = convert_int_rte(output_zp);\n\ + dst.x = convert_uint_sat(tmp_zp);\n\ +\n\ + float cnt = 0.0f;\n\ +\n\ + if(exclusive && rev)\n\ + {\n\ + coord.x = width - 1;\n\ + coord.z = coord.x;\n\ + write_imageui(output, coord.zw, dst);\n\ + for(; coord.x > 0; coord.x--)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + coord.z--;\n\ + cnt += 1.0;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord.z = 0;\n\ + write_imageui(output, coord.zw, dst);\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.z++;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + 
tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ + {\n\ + uint4 data = read_imageui(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_F32toU8_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + float4 sum = (float4)(0);\n\ + uint4 dst = (uint4)(0);\n\ + int tmp_zp = convert_int_rte(output_zp);\n\ + dst.x = convert_uint_sat(tmp_zp);\n\ +\n\ + float cnt = 0.0f;\n\ + if(exclusive && rev)\n\ + {\n\ + coord.x = width - 1;\n\ + coord.z = coord.x;\n\ + write_imageui(output, coord.zw, dst);\n\ + for(; coord.x > 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + coord.z--;\n\ + cnt += 1.0;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord.z = 0;\n\ + write_imageui(output, coord.zw, dst);\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + coord.z++;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.zw, dst);\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ \n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ + {\n\ + float4 data = read_imagef(input, coord.xy);\n\ + cnt += 1.0f;\n\ + sum += data;\n\ \n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ \n\ -__kernel void cumsum_F32toF32_axis1(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + write_imageui(output, coord.xy, dst);\n\ + }\n\ + }\n\ +}\n\ +"; /* end of cumsum_2d_cl*/ + +static const char cumsum_array_2d_axis0_cl[] = "\n\ +__kernel void cumsum_array_F32toF32_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t 
output,\n\ int axis,\n\ int exclusive,\n\ int rev,\n\ int width,\n\ int height,\n\ - int channel,\n\ + int chn,\n\ int input_zp,\n\ float in_out_scale,\n\ float in_out_zp_scale,\n\ float output_zp\n\ )\n\ {\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - int4 coord_out = coord;\n\ -\n\ - float4 sum = (float4)(0);\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ \n\ + float sum = (float)(0);\n\ + Image img1 = create_image_from_image2d(input, 4);\n\ + Image img2 = create_image_from_image2d(output, 4);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global float* in_ptr = (__global float*)input_ptr;\n\ + __global float* out_ptr = (__global float*)output_ptr;\n\ if(exclusive && rev)\n\ {\n\ - coord_out.y = height - 1;\n\ - write_imagef(output, coord_out, sum);\n\ - for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ + coord.x = width - 1;\n\ + coord.z = coord.x;\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ +\n\ + for(; coord.x > 0; coord.x--)\n\ {\n\ - float4 data = read_imagef(input, coord);\n\ - coord_out.y--;\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + coord.z--;\n\ sum += data;\n\ \n\ - write_imagef(output, coord_out, sum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(exclusive)\n\ {\n\ - coord_out.y = 0;\n\ - write_imagef(output, coord_out, sum);\n\ - for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ + coord.z = 0;\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ {\n\ - float4 data = read_imagef(input, coord);\n\ - coord_out.y++;\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + coord.z++;\n\ sum += data;\n\ \n\ - write_imagef(output, coord_out, sum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(rev)\n\ {\n\ - for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ {\n\ - float4 data = read_imagef(input, coord);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ \n\ - write_imagef(output, coord, sum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else\n\ {\n\ - for(coord.y = 0; coord.y < height; coord.y++)\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ {\n\ - float4 data = read_imagef(input, coord);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ \n\ - write_imagef(output, coord, sum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ }\n\ \n\ -#define CUMSUM_toU8_AXIS1_SH(name, src_type, read_image_type) \\\n\ -__kernel void cumsum_##name##toU8_axis1( \\\n\ - __read_only 
image2d_array_t input, \\\n\ - __write_only image2d_array_t output, \\\n\ - int axis, \\\n\ - int exclusive, \\\n\ - int rev, \\\n\ - int width, \\\n\ - int height, \\\n\ - int channel, \\\n\ - int input_zp, \\\n\ - float in_out_scale, \\\n\ - float in_out_zp_scale, \\\n\ - float output_zp \\\n\ - ) \\\n\ -{ \\\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ - int4 coord_out = coord; \\\n\ - \\\n\ - src_type sum = (src_type)(0); \\\n\ - uint4 dst = (uint4)(0); \\\n\ - int tmp_zp = convert_int_rte(output_zp); \\\n\ - dst.x = convert_uint_sat(tmp_zp); \\\n\ - \\\n\ - float cnt = 0; \\\n\ - \\\n\ - if(exclusive && rev) \\\n\ - { \\\n\ - coord_out.y = height - 1; \\\n\ - write_imageui(output, coord_out, dst); \\\n\ - \\\n\ - for(coord.y = height - 1; coord.y > 0; coord.y--) \\\n\ - { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ - cnt += 1.0f; \\\n\ - coord_out.y--; \\\n\ - sum += data; \\\n\ - \\\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ - \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ - } \\\n\ - } \\\n\ - else if(exclusive) \\\n\ - { \\\n\ - coord_out.y = 0; \\\n\ - write_imageui(output, coord_out, dst); \\\n\ - for(coord.y = 0; coord.y < height - 1; coord.y++) \\\n\ - { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ - cnt += 1.0f; \\\n\ - coord_out.y++; \\\n\ - sum += data; \\\n\ - \\\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ - \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ - } \\\n\ - } \\\n\ - else if(rev) \\\n\ - { \\\n\ - for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ - { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ - cnt += 1.0f; \\\n\ - sum += data; \\\n\ - \\\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ - \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ - } \\\n\ - } \\\n\ - else \\\n\ - { \\\n\ - for(coord.y = 0; coord.y < height; coord.y++) \\\n\ - { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ - cnt += 1.0f; \\\n\ - sum += data; \\\n\ - \\\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ - \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ - } \\\n\ - } \\\n\ -}\n\ -CUMSUM_toU8_AXIS1_SH(U8,uint4,read_imageui)\n\ -CUMSUM_toU8_AXIS1_SH(F32,float4,read_imagef)\n\ +__kernel void cumsum_array_U8toU8_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + int axis,\n\ + int exclusive,\n\ + int rev,\n\ + int width,\n\ + int height,\n\ + int chn,\n\ + int input_zp,\n\ + float in_out_scale,\n\ + float in_out_zp_scale,\n\ + float output_zp\n\ + )\n\ +{\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ \n\ + uint sum = (uint)(0);\n\ + uint dst = (uint)(0);\n\ \n\ -__kernel void cumsum_F32toF32_axis0(\n\ - __read_only image2d_array_t input,\n\ - __write_only image2d_array_t output,\n\ + int tmp_zp = convert_int_rte(output_zp);\n\ + dst.x = convert_uint_sat(tmp_zp);\n\ +\n\ + float cnt = 0.0f;\n\ +\n\ + Image img1 = create_image_from_image2d(input, 4);\n\ + Image img2 = create_image_from_image2d(output, 4);\n\ + 
uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global uint* in_ptr = (__global uint*)input_ptr;\n\ + __global uint* out_ptr = (__global uint*)output_ptr;\n\ + if(exclusive && rev)\n\ + {\n\ + coord.x = width - 1;\n\ + coord.z = coord.x;\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = dst;\n\ + for(; coord.x > 0; coord.x--)\n\ + {\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global uint*)input_ptr;\n\ + uint data = in_ptr[0];\n\ + coord.z--;\n\ + cnt += 1.0;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ +\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ + }\n\ + }\n\ + else if(exclusive)\n\ + {\n\ + coord.z = 0;\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = dst;\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + {\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global uint*)input_ptr;\n\ + uint data = in_ptr[0];\n\ + cnt += 1.0f;\n\ + coord.z++;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ +\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = dst;\n\ + }\n\ + }\n\ + else if(rev)\n\ + {\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + {\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global uint*)input_ptr;\n\ + uint data = in_ptr[0];\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ +\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = dst;\n\ + }\n\ + }\n\ + else\n\ + {\n\ + for(coord.x = 0; coord.x < width; coord.x++)\n\ + {\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global uint*)input_ptr;\n\ + uint data = in_ptr[0];\n\ + cnt += 1.0f;\n\ + sum += data;\n\ +\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ +\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = dst;\n\ + }\n\ + }\n\ +}\n\ +\n\ +__kernel void cumsum_array_F32toU8_axis0_2D(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ int axis,\n\ int exclusive,\n\ int rev,\n\ int width,\n\ int height,\n\ - int channel,\n\ + int chn,\n\ int input_zp,\n\ float in_out_scale,\n\ float in_out_zp_scale,\n\ float output_zp\n\ )\n\ {\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ - int4 coord_out = coord;\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ \n\ float4 sum = (float4)(0);\n\ + uint4 dst = (uint4)(0);\n\ + int tmp_zp = convert_int_rte(output_zp);\n\ + dst.x = convert_uint_sat(tmp_zp);\n\ \n\ + float cnt = 0.0f;\n\ + Image img1 = create_image_from_image2d(input, 4);\n\ + 
Image img2 = create_image_from_image2d(output, 4);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global float* in_ptr = (__global float*)input_ptr;\n\ + __global uint* out_ptr = (__global uint*)output_ptr;\n\ if(exclusive && rev)\n\ {\n\ - coord_out.x = width - 1;\n\ - write_imagef(output, coord_out, sum);\n\ - for(coord.x = width - 1; coord.x > 0; coord.x--)\n\ + coord.x = width - 1;\n\ + coord.z = coord.x;\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ + for(; coord.x > 0; coord.x--)\n\ {\n\ - float4 data = read_imagef(input, coord);\n\ - coord_out.x--;\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + coord.z--;\n\ + cnt += 1.0;\n\ sum += data;\n\ \n\ - write_imagef(output, coord_out, sum);\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ +\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive)\n\ {\n\ - coord_out.x = 0;\n\ - write_imagef(output, coord_out, sum);\n\ + coord.z = 0;\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ {\n\ - float4 data = read_imagef(input, coord);\n\ - coord_out.x++;\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + cnt += 1.0f;\n\ + coord.z++;\n\ sum += data;\n\ \n\ - write_imagef(output, coord_out, sum);\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ +\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(rev)\n\ {\n\ for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ {\n\ - float4 data = read_imagef(input, coord);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + cnt += 1.0f;\n\ sum += data;\n\ \n\ - write_imagef(output, coord, sum);\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ +\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else\n\ {\n\ for(coord.x = 0; coord.x < width; coord.x++)\n\ {\n\ - float4 data = read_imagef(input, coord);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + cnt += 1.0f;\n\ sum += data;\n\ \n\ - write_imagef(output, coord, sum);\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ + float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ +\n\ + dst.x = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ }\n\ -\n\ -#define CUMSUM_toU8_AXIS0_SH(name, src_type, read_image_type) \\\n\ -__kernel void cumsum_##name##toU8_axis0( \\\n\ - __read_only image2d_array_t input, \\\n\ - __write_only 
image2d_array_t output, \\\n\ - int axis, \\\n\ - int exclusive, \\\n\ - int rev, \\\n\ - int width, \\\n\ - int height, \\\n\ - int channel, \\\n\ - int input_zp, \\\n\ - float in_out_scale, \\\n\ - float in_out_zp_scale, \\\n\ - float output_zp \\\n\ - ) \\\n\ -{ \\\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ - int4 coord_out = coord; \\\n\ - \\\n\ - src_type sum = (src_type)(0); \\\n\ - uint4 dst = (uint4)(0); \\\n\ - int tmp_zp = convert_int_rte(output_zp); \\\n\ - dst.x = convert_uint_sat(tmp_zp); \\\n\ - \\\n\ - float cnt = 0; \\\n\ - \\\n\ - if(exclusive && rev) \\\n\ - { \\\n\ - coord_out.x = width - 1; \\\n\ - write_imageui(output, coord_out, dst); \\\n\ - for(coord.x = width - 1; coord.x > 0; coord.x--) \\\n\ - { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ - coord_out.x--; \\\n\ - cnt += 1.0f; \\\n\ - sum += data; \\\n\ - \\\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ - \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ - } \\\n\ - } \\\n\ - else if(exclusive) \\\n\ - { \\\n\ - coord_out.x = 0; \\\n\ - write_imageui(output, coord_out, dst); \\\n\ - for(coord.x = 0; coord.x < width - 1; coord.x++) \\\n\ - { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ - coord_out.x++; \\\n\ - cnt += 1.0f; \\\n\ - sum += data; \\\n\ - \\\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ - \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord_out, dst); \\\n\ - } \\\n\ - } \\\n\ - else if(rev) \\\n\ - { \\\n\ - for(coord.x = width - 1; coord.x >= 0; coord.x--) \\\n\ - { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ - cnt += 1.0f; \\\n\ - sum += data; \\\n\ - \\\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ - \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ - } \\\n\ - } \\\n\ - else \\\n\ - { \\\n\ - for(coord.x = 0; coord.x < width; coord.x++) \\\n\ - { \\\n\ - src_type data = read_image_type(input, coord); \\\n\ - cnt += 1.0f; \\\n\ - sum += data; \\\n\ - \\\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha; \\\n\ - \\\n\ - dst.x = (uint)convert_int_rte(tmpSum); \\\n\ - write_imageui(output, coord, dst); \\\n\ - } \\\n\ - } \\\n\ -}\n\ -CUMSUM_toU8_AXIS0_SH(U8,uint4,read_imageui)\n\ -CUMSUM_toU8_AXIS0_SH(F32,float4,read_imagef)\n\ -"; /* end of cumsum_cl*/ +"; /* end of cumsum_array_2d_axis0_cl*/ -static const char cumsum_2d_cl[] = "\n\ -__kernel void cumsum_F32toF32_axis1_2D(\n\ +static const char cumsum_array_2d_axis1_cl[] = "\n\ +__kernel void cumsum_array_F32toF32_axis1_2D(\n\ __read_only image2d_t input,\n\ __write_only image2d_t output,\n\ int axis,\n\ @@ -61070,19 +65099,30 @@ __kernel void cumsum_F32toF32_axis1_2D(\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ \n\ - float4 sum = (float4)(0);\n\ -\n\ + float sum = (float)(0);\n\ + Image img1 = create_image_from_image2d(input, 4);\n\ + Image img2 = create_image_from_image2d(output, 4);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global float* in_ptr = (__global float*)input_ptr;\n\ + __global float* out_ptr = 
(__global float*)output_ptr;\n\ if(exclusive && rev)\n\ {\n\ coord.w = height - 1;\n\ - write_imagef(output, coord.zw, sum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ coord.w--;\n\ sum += data;\n\ \n\ - write_imagef(output, coord.zw, sum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(exclusive)\n\ @@ -61090,36 +65130,47 @@ __kernel void cumsum_F32toF32_axis1_2D(\n\ write_imagef(output, coord.zw, sum);\n\ for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ coord.w++;\n\ sum += data;\n\ \n\ - write_imagef(output, coord.zw, sum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(rev)\n\ {\n\ for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ -\n\ - write_imagef(output, coord.xy, sum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else\n\ {\n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ \n\ - write_imagef(output, coord.xy, sum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ }\n\ \n\ -__kernel void cumsum_U8toU8_axis1_2D(\n\ +__kernel void cumsum_array_U8toU8_axis1_2D(\n\ __read_only image2d_t input,\n\ __write_only image2d_t output,\n\ int axis,\n\ @@ -61136,82 +65187,107 @@ __kernel void cumsum_U8toU8_axis1_2D(\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ \n\ - uint4 sum = (uint4)(0);\n\ - uint4 dst = (uint4)(0);\n\ + uint sum = (uint)(0);\n\ + uint dst = (uint)(0);\n\ \n\ int tmp_zp = convert_int_rte(output_zp);\n\ - dst.x = convert_uint_sat(tmp_zp);\n\ + dst = convert_uint_sat(tmp_zp);\n\ \n\ float cnt = 0;\n\ -\n\ + Image img1 = create_image_from_image2d(input, 4);\n\ + Image img2 = create_image_from_image2d(output, 4);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global uint* in_ptr = (__global uint*)input_ptr;\n\ + __global uint* out_ptr = (__global uint*)output_ptr;\n\ if(exclusive && rev)\n\ {\n\ coord.w = height - 1;\n\ - write_imageui(output, coord.zw, dst);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = dst;\n\ for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr 
= (__global uint*)input_ptr;\n\ + uint data = in_ptr[0];\n\ cnt += 1.0f;\n\ coord.w--;\n\ sum += data;\n\ \n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ \n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive)\n\ {\n\ - write_imageui(output, coord.zw, dst);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = dst;\n\ for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global uint*)input_ptr;\n\ + uint data = in_ptr[0];\n\ cnt += 1.0f;\n\ coord.w++;\n\ sum += data;\n\ \n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ \n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(rev)\n\ {\n\ for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global uint*)input_ptr;\n\ + uint data = in_ptr[0];\n\ cnt += 1.0f;\n\ sum += data;\n\ \n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ \n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else\n\ {\n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global uint*)input_ptr;\n\ + uint data = in_ptr[0];\n\ cnt += 1.0f;\n\ sum += data;\n\ \n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ \n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ }\n\ \n\ -__kernel void cumsum_F32toU8_axis1_2D(\n\ +__kernel void cumsum_array_F32toU8_axis1_2D(\n\ __read_only image2d_t input,\n\ __write_only image2d_t output,\n\ int axis,\n\ @@ -61228,334 +65304,757 @@ __kernel void cumsum_F32toU8_axis1_2D(\n\ {\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ \n\ - float4 sum = (float4)(0);\n\ - uint4 dst = (uint4)(0);\n\ + float sum = (float)(0);\n\ + uint dst = (uint)(0);\n\ int tmp_zp = convert_int_rte(output_zp);\n\ - dst.x = convert_uint_sat(tmp_zp);\n\ + dst = convert_uint_sat(tmp_zp);\n\ \n\ float cnt = 0;\n\ -\n\ + Image img1 = create_image_from_image2d(input, 4);\n\ + 
Image img2 = create_image_from_image2d(output, 4);\n\ + uchar* input_ptr = get_image_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_image_ptr_from_coord(img2, coord);\n\ + __global float* in_ptr = (__global float*)input_ptr;\n\ + __global uint* out_ptr = (__global uint*)output_ptr;\n\ if(exclusive && rev)\n\ {\n\ coord.w = height - 1;\n\ - write_imageui(output, coord.zw, dst);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ cnt += 1.0f;\n\ coord.w--;\n\ sum += data;\n\ \n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ \n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(exclusive)\n\ {\n\ - write_imageui(output, coord.zw, dst);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ cnt += 1.0f;\n\ coord.w++;\n\ sum += data;\n\ \n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ \n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.zw);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else if(rev)\n\ {\n\ for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ cnt += 1.0f;\n\ sum += data;\n\ \n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ \n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + out_ptr[0] = dst;\n\ }\n\ }\n\ else\n\ {\n\ for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_image_ptr_from_coord(img1, coord.xy);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ cnt += 1.0f;\n\ sum += data;\n\ \n\ float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ + float tmpSum = sum * in_out_scale + tmpAlpha;\n\ \n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ + dst = (uint)convert_int_rte(tmpSum);\n\ + output_ptr = get_image_ptr_from_coord(img2, coord.xy);\n\ + out_ptr = (__global uint*)output_ptr;\n\ + 
out_ptr[0] = dst;\n\ }\n\ }\n\ }\n\ -\n\ -__kernel void cumsum_F32toF32_axis0_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ +"; /* end of cumsum_array_2d_axis1_cl*/ + +static const char cumsum_array_axis0_cl[] = "\n\ +__kernel void cumsum_array_F32toF32_axis0(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ int axis,\n\ int exclusive,\n\ int rev,\n\ int width,\n\ int height,\n\ - int chn,\n\ + int channel,\n\ int input_zp,\n\ float in_out_scale,\n\ float in_out_zp_scale,\n\ float output_zp\n\ )\n\ {\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - float4 sum = (float4)(0);\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ \n\ + float sum = (float)(0);\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 4);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 4);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global float* in_ptr = (__global float*)input_ptr;\n\ + __global float* out_ptr = (__global float*)output_ptr;\n\ if(exclusive && rev)\n\ {\n\ - coord.x = width - 1;\n\ - coord.z = coord.x;\n\ - write_imagef(output, coord.zw, sum);\n\ - for(; coord.x > 0; coord.x--)\n\ + coord_out.x = width - 1;\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ + for(coord.x = width - 1; coord.x > 0; coord.x--)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - coord.z--;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + coord_out.x--;\n\ sum += data;\n\ \n\ - write_imagef(output, coord.zw, sum);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(exclusive)\n\ {\n\ - coord.z = 0;\n\ - write_imagef(output, coord.zw, sum);\n\ + coord_out.x = 0;\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - coord.z++;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + coord_out.x++;\n\ sum += data;\n\ \n\ - write_imagef(output, coord.zw, sum);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(rev)\n\ {\n\ for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ \n\ - write_imagef(output, coord.xy, sum);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else\n\ {\n\ for(coord.x = 0; coord.x < width; coord.x++)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ \n\ - write_imagef(output, coord.xy, sum);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = 
(__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ }\n\ -\n\ -__kernel void cumsum_U8toU8_axis0_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ +\n\ +#define CUMSUM_ARRAY_toU8_AXIS0_SH(name, src_type) \\\n\ +__kernel void cumsum_array_##name##toU8_axis0( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int exclusive, \\\n\ + int rev, \\\n\ + int width, \\\n\ + int height, \\\n\ + int channel, \\\n\ + int input_zp, \\\n\ + float in_out_scale, \\\n\ + float in_out_zp_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type sum = (src_type)(0); \\\n\ + uint dst = (uint)(0); \\\n\ + int tmp_zp = convert_int_rte(output_zp); \\\n\ + dst = convert_uint_sat(tmp_zp); \\\n\ + \\\n\ + float cnt = 0; \\\n\ + \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 4); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 4); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global uint* out_ptr = (__global uint*)output_ptr; \\\n\ + if(exclusive && rev) \\\n\ + { \\\n\ + coord_out.x = width - 1; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + for(coord.x = width - 1; coord.x > 0; coord.x--) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + coord_out.x--; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive) \\\n\ + { \\\n\ + coord_out.x = 0; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + for(coord.x = 0; coord.x < width - 1; coord.x++) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + coord_out.x++; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ + else if(rev) \\\n\ + { \\\n\ + for(coord.x = width - 1; coord.x >= 0; coord.x--) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = 
dst; \\\n\ + } \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + for(coord.x = 0; coord.x < width; coord.x++) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_ARRAY_toU8_AXIS0_SH(U8,uint)\n\ +CUMSUM_ARRAY_toU8_AXIS0_SH(F32,float)\n\ +"; /* end of cumsum_array_axis0_cl*/ + +static const char cumsum_array_axis1_cl[] = "\n\ +__kernel void cumsum_array_F32toF32_axis1(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ int axis,\n\ int exclusive,\n\ int rev,\n\ int width,\n\ int height,\n\ - int chn,\n\ + int channel,\n\ int input_zp,\n\ float in_out_scale,\n\ float in_out_zp_scale,\n\ float output_zp\n\ )\n\ {\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - uint4 sum = (uint4)(0);\n\ - uint4 dst = (uint4)(0);\n\ -\n\ - int tmp_zp = convert_int_rte(output_zp);\n\ - dst.x = convert_uint_sat(tmp_zp);\n\ -\n\ - float cnt = 0.0f;\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ \n\ + float sum = (float)(0);\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 4);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 4);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global float* in_ptr = (__global float*)input_ptr;\n\ + __global float* out_ptr = (__global float*)output_ptr;\n\ if(exclusive && rev)\n\ {\n\ - coord.x = width - 1;\n\ - coord.z = coord.x;\n\ - write_imageui(output, coord.zw, dst);\n\ - for(; coord.x > 0; coord.x--)\n\ + coord_out.y = height - 1;\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--)\n\ {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - coord.z--;\n\ - cnt += 1.0;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + coord_out.y--;\n\ sum += data;\n\ \n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(exclusive)\n\ {\n\ - coord.z = 0;\n\ - write_imageui(output, coord.zw, dst);\n\ - for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + coord_out.y = 0;\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++)\n\ {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ - coord.z++;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + coord_out.y++;\n\ sum += data;\n\ \n\ - float tmpAlpha = cnt * in_out_zp_scale + 
output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(rev)\n\ {\n\ - for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--)\n\ {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ \n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else\n\ {\n\ - for(coord.x = 0; coord.x < width; coord.x++)\n\ + for(coord.y = 0; coord.y < height; coord.y++)\n\ {\n\ - uint4 data = read_imageui(input, coord.xy);\n\ - cnt += 1.0f;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ \n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ }\n\ \n\ -__kernel void cumsum_F32toU8_axis0_2D(\n\ - __read_only image2d_t input,\n\ - __write_only image2d_t output,\n\ +#define CUMSUM_ARRAY_toU8_AXIS1_SH(name, src_type) \\\n\ +__kernel void cumsum_array_##name##toU8_axis1( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int exclusive, \\\n\ + int rev, \\\n\ + int width, \\\n\ + int height, \\\n\ + int channel, \\\n\ + int input_zp, \\\n\ + float in_out_scale, \\\n\ + float in_out_zp_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type sum = (src_type)(0); \\\n\ + uint dst = (uint4)(0); \\\n\ + int tmp_zp = convert_int_rte(output_zp); \\\n\ + dst = convert_uint_sat(tmp_zp); \\\n\ + \\\n\ + float cnt = 0; \\\n\ + \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 4); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 4); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global uint* out_ptr = (__global uint*)output_ptr; \\\n\ + if(exclusive && rev) \\\n\ + { \\\n\ + coord_out.y = height - 1; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + \\\n\ + for(coord.y = height - 1; coord.y > 0; coord.y--) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + cnt += 1.0f; \\\n\ + coord_out.y--; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; 
\\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive) \\\n\ + { \\\n\ + coord_out.y = 0; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + for(coord.y = 0; coord.y < height - 1; coord.y++) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + cnt += 1.0f; \\\n\ + coord_out.y++; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ + else if(rev) \\\n\ + { \\\n\ + for(coord.y = height - 1; coord.y >= 0; coord.y--) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + for(coord.y = 0; coord.y < height; coord.y++) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_ARRAY_toU8_AXIS1_SH(U8,uint)\n\ +CUMSUM_ARRAY_toU8_AXIS1_SH(F32,float)\n\ +"; /* end of cumsum_array_axis1_cl*/ + +static const char cumsum_array_axis2_cl[] = "__kernel void cumsum_array_F32toF32_axis2(\n\ + __read_only image2d_array_t input,\n\ + __write_only image2d_array_t output,\n\ int axis,\n\ int exclusive,\n\ int rev,\n\ int width,\n\ int height,\n\ - int chn,\n\ + int channel,\n\ int input_zp,\n\ float in_out_scale,\n\ float in_out_zp_scale,\n\ float output_zp\n\ )\n\ {\n\ - int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ -\n\ - float4 sum = (float4)(0);\n\ - uint4 dst = (uint4)(0);\n\ - int tmp_zp = convert_int_rte(output_zp);\n\ - dst.x = convert_uint_sat(tmp_zp);\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0);\n\ + int4 coord_out = coord;\n\ \n\ - float cnt = 0.0f;\n\ + float sum = 0;\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 4);\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 4);\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + __global float* in_ptr = (__global float*)input_ptr;\n\ + __global float* out_ptr = (__global float*)output_ptr;\n\ if(exclusive && rev)\n\ {\n\ - 
coord.x = width - 1;\n\ - coord.z = coord.x;\n\ - write_imageui(output, coord.zw, dst);\n\ - for(; coord.x > 0; coord.x--)\n\ + coord_out.z = channel - 1;\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ +\n\ + for(coord.z = channel - 1; coord.z > 0; coord.z--)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - coord.z--;\n\ - cnt += 1.0;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + coord_out.z--;\n\ sum += data;\n\ -\n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(exclusive)\n\ {\n\ - coord.z = 0;\n\ - write_imageui(output, coord.zw, dst);\n\ - for(coord.x = 0; coord.x < width - 1; coord.x++)\n\ + coord_out.z = 0;\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ + for(coord.z = 0; coord.z < channel - 1; coord.z++)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ - coord.z++;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ + coord_out.z++;\n\ sum += data;\n\ \n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.zw, dst);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else if(rev)\n\ {\n\ - for(coord.x = width - 1; coord.x >= 0; coord.x--)\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ \n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ else\n\ {\n\ - for(coord.x = 0; coord.x < width; coord.x++)\n\ + for(coord.z = 0; coord.z < channel; coord.z++)\n\ {\n\ - float4 data = read_imagef(input, coord.xy);\n\ - cnt += 1.0f;\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord);\n\ + in_ptr = (__global float*)input_ptr;\n\ + float data = in_ptr[0];\n\ sum += data;\n\ \n\ - float tmpAlpha = cnt * in_out_zp_scale + output_zp;\n\ - float tmpSum = sum.x * in_out_scale + tmpAlpha;\n\ -\n\ - dst.x = (uint)convert_int_rte(tmpSum);\n\ - write_imageui(output, coord.xy, dst);\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord);\n\ + out_ptr = (__global float*)output_ptr;\n\ + out_ptr[0] = sum;\n\ }\n\ }\n\ }\n\ -"; /* end of cumsum_2d_cl*/ +\n\ +#define CUMSUM_ARRAY_toU8_AXIS2_SH(name, src_type) \\\n\ +__kernel void cumsum_array_##name##toU8_axis2( \\\n\ + __read_only image2d_array_t input, \\\n\ + __write_only image2d_array_t output, \\\n\ + int axis, \\\n\ + int exclusive, \\\n\ + int rev, \\\n\ + 
int width, \\\n\ + int height, \\\n\ + int channel, \\\n\ + int input_zp, \\\n\ + float in_out_scale, \\\n\ + float in_out_zp_scale, \\\n\ + float output_zp \\\n\ + ) \\\n\ +{ \\\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(2), 0); \\\n\ + int4 coord_out = coord; \\\n\ + \\\n\ + src_type sum = (src_type)(0); \\\n\ + uint dst = (uint)(0); \\\n\ + int tmp_zp = convert_int_rte(output_zp); \\\n\ + dst = convert_uint_sat(tmp_zp); \\\n\ + \\\n\ + float cnt = 0.0f; \\\n\ + Tensor img1 = create_tensor_from_image2d_array(input, 4); \\\n\ + Tensor img2 = create_tensor_from_image2d_array(output, 4); \\\n\ + uchar* input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + uchar* output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + __global src_type* in_ptr = (__global src_type*)input_ptr; \\\n\ + __global uint* out_ptr = (__global uint*)output_ptr; \\\n\ + \\\n\ + if(exclusive && rev) \\\n\ + { \\\n\ + coord_out.z = channel - 1; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + for(coord.z = channel - 1; coord.z > 0; coord.z--) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + coord_out.z--; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ + else if(exclusive) \\\n\ + { \\\n\ + coord_out.z = 0; \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + for(coord.z = 0; coord.z < channel - 1; coord.z++) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + coord_out.z++; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord_out); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ + else if(rev) \\\n\ + { \\\n\ + for(coord.z = channel - 1; coord.z >= 0; coord.z--) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst = (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ + else \\\n\ + { \\\n\ + for(coord.z = 0; coord.z < channel; coord.z++) \\\n\ + { \\\n\ + input_ptr = get_tensor_ptr_from_coord(img1, coord); \\\n\ + in_ptr = (__global src_type*)input_ptr; \\\n\ + src_type data = in_ptr[0]; \\\n\ + cnt += 1.0f; \\\n\ + sum += data; \\\n\ + \\\n\ + float tmpAlpha = cnt * in_out_zp_scale + output_zp; \\\n\ + float tmpSum = sum * in_out_scale + tmpAlpha; \\\n\ + \\\n\ + dst 
= (uint)convert_int_rte(tmpSum); \\\n\ + output_ptr = get_tensor_ptr_from_coord(img2, coord); \\\n\ + out_ptr = (__global uint*)output_ptr; \\\n\ + out_ptr[0] = dst; \\\n\ + } \\\n\ + } \\\n\ +}\n\ +CUMSUM_ARRAY_toU8_AXIS2_SH(U8,uint)\n\ +CUMSUM_ARRAY_toU8_AXIS2_SH(F32,float)\n\ +\n\ +"; /* end of cumsum_array_axis2_cl*/ static const char depth2space_crd_cl[] = "\n\ __kernel void depth2space_crd_F32toF32(\n\ @@ -80476,8 +84975,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ \\\n\ - __local float local_data[128]; \\\n\ - __local uint local_indices[128]; \\\n\ + __local float local_data[LOCAL_SIZE0 * 2]; \\\n\ + __local uint local_indices[LOCAL_SIZE0 * 2]; \\\n\ \\\n\ float left = read_imagef(input, coord.xy).x; \\\n\ coord.z += work_group_size; \\\n\ @@ -80509,7 +85008,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \\\n\ float right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -80536,13 +85035,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag write_imagei(indices, coord.xy, index.xxxx); \\\n\ write_imagei(indices, coord.zy, index.yyyy); \\\n\ }\n\ -TOPK_F32(1 << 0, 0)\n\ -TOPK_F32(1 << 1, 1)\n\ -TOPK_F32(1 << 2, 2)\n\ -TOPK_F32(1 << 3, 3)\n\ -TOPK_F32(1 << 4, 4)\n\ -TOPK_F32(1 << 5, 5)\n\ -TOPK_F32(1 << 6, 6)\n\ +TOPK_F32((1 << 0), 0)\n\ +TOPK_F32((1 << 1), 1)\n\ +TOPK_F32((1 << 2), 2)\n\ +TOPK_F32((1 << 3), 3)\n\ +TOPK_F32((1 << 4), 4)\n\ +TOPK_F32((1 << 5), 5)\n\ +TOPK_F32((1 << 6), 6)\n\ +TOPK_F32((1 << 9), 9)\n\ \n\ #define TOPK_U32(LOCAL_SIZE0, STAGES) \\\n\ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_U32toU32_I32 \\\n\ @@ -80564,8 +85064,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ \\\n\ - __local uint local_data[128]; \\\n\ - __local uint local_indices[128]; \\\n\ + __local uint local_data[LOCAL_SIZE0 * 2]; \\\n\ + __local uint local_indices[LOCAL_SIZE0 * 2]; \\\n\ \\\n\ uint left = read_imageui(input, coord.xy).x; \\\n\ coord.z += work_group_size; \\\n\ @@ -80597,7 +85097,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag uint left_elem = local_data[left_id]; \\\n\ uint right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -80624,13 +85124,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag write_imagei(indices, coord.xy, index.xxxx); \\\n\ write_imagei(indices, coord.zy, index.yyyy); \\\n\ }\n\ -TOPK_U32(1 << 0, 0)\n\ -TOPK_U32(1 << 1, 1)\n\ -TOPK_U32(1 << 2, 2)\n\ -TOPK_U32(1 << 3, 3)\n\ -TOPK_U32(1 << 4, 4)\n\ -TOPK_U32(1 << 5, 5)\n\ -TOPK_U32(1 << 6, 6)\n\ +TOPK_U32((1 << 0), 0)\n\ +TOPK_U32((1 << 1), 1)\n\ +TOPK_U32((1 << 2), 2)\n\ +TOPK_U32((1 << 3), 3)\n\ +TOPK_U32((1 << 4), 4)\n\ +TOPK_U32((1 << 5), 5)\n\ 
+TOPK_U32((1 << 6), 6)\n\ +TOPK_U32((1 << 9), 9)\n\ \n\ #define TOPK_I32(LOCAL_SIZE0, STAGES) \\\n\ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_I32toI32_I32 \\\n\ @@ -80652,8 +85153,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ \\\n\ - __local int local_data[128]; \\\n\ - __local int local_indices[128]; \\\n\ + __local int local_data[LOCAL_SIZE0 * 2]; \\\n\ + __local int local_indices[LOCAL_SIZE0 * 2]; \\\n\ \\\n\ int left = read_imagei(input, coord.xy).x; \\\n\ coord.z += work_group_size; \\\n\ @@ -80685,7 +85186,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag int left_elem = local_data[left_id]; \\\n\ int right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -80712,13 +85213,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag write_imagei(indices, coord.xy, index.xxxx); \\\n\ write_imagei(indices, coord.zy, index.yyyy); \\\n\ }\n\ -TOPK_I32(1 << 0, 0)\n\ -TOPK_I32(1 << 1, 1)\n\ -TOPK_I32(1 << 2, 2)\n\ -TOPK_I32(1 << 3, 3)\n\ -TOPK_I32(1 << 4, 4)\n\ -TOPK_I32(1 << 5, 5)\n\ -TOPK_I32(1 << 6, 6)\n\ +TOPK_I32((1 << 0), 0)\n\ +TOPK_I32((1 << 1), 1)\n\ +TOPK_I32((1 << 2), 2)\n\ +TOPK_I32((1 << 3), 3)\n\ +TOPK_I32((1 << 4), 4)\n\ +TOPK_I32((1 << 5), 5)\n\ +TOPK_I32((1 << 6), 6)\n\ +TOPK_I32((1 << 9), 9)\n\ \n\ #define TOPK_F32toU32(LOCAL_SIZE0, STAGES) \\\n\ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stage##STAGES##_F32toU32_I32 \\\n\ @@ -80740,8 +85242,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ \\\n\ - __local float local_data[128]; \\\n\ - __local uint local_indices[128]; \\\n\ + __local float local_data[LOCAL_SIZE0 * 2]; \\\n\ + __local uint local_indices[LOCAL_SIZE0 * 2]; \\\n\ \\\n\ float left = read_imagef(input, coord.xy).x; \\\n\ coord.z += work_group_size; \\\n\ @@ -80773,7 +85275,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \\\n\ float right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -80800,13 +85302,14 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag write_imagei(indices, coord.zy, index.yyyy); \\\n\ }\n\ \n\ -TOPK_F32toU32(1 << 0, 0)\n\ -TOPK_F32toU32(1 << 1, 1)\n\ -TOPK_F32toU32(1 << 2, 2)\n\ -TOPK_F32toU32(1 << 3, 3)\n\ -TOPK_F32toU32(1 << 4, 4)\n\ -TOPK_F32toU32(1 << 5, 5)\n\ -TOPK_F32toU32(1 << 6, 6)\n\ +TOPK_F32toU32((1 << 0), 0)\n\ +TOPK_F32toU32((1 << 1), 1)\n\ +TOPK_F32toU32((1 << 2), 2)\n\ +TOPK_F32toU32((1 << 3), 3)\n\ +TOPK_F32toU32((1 << 4), 4)\n\ +TOPK_F32toU32((1 << 5), 5)\n\ +TOPK_F32toU32((1 << 6), 6)\n\ +TOPK_F32toU32((1 << 9), 9)\n\ \n\ #define TOPK_F32toI32(LOCAL_SIZE0, STAGES) \\\n\ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void 
topk_stage##STAGES##_F32toI32_I32 \\\n\ @@ -80828,8 +85331,8 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag \\\n\ int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1)); \\\n\ \\\n\ - __local float local_data[128]; \\\n\ - __local uint local_indices[128]; \\\n\ + __local float local_data[LOCAL_SIZE0 * 2]; \\\n\ + __local uint local_indices[LOCAL_SIZE0 * 2]; \\\n\ \\\n\ float left = read_imagef(input, coord.xy).x; \\\n\ coord.z += work_group_size; \\\n\ @@ -80861,7 +85364,7 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag float left_elem = local_data[left_id]; \\\n\ float right_elem = local_data[right_id]; \\\n\ \\\n\ - if ((left_elem < right_elem) ^ signo) \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ { \\\n\ local_data[left_id] = right_elem; \\\n\ local_data[right_id] = left_elem; \\\n\ @@ -80888,13 +85391,384 @@ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE0, 1, 1))) void topk_stag write_imagei(indices, coord.zy, index.yyyy); \\\n\ }\n\ \n\ -TOPK_F32toI32(1 << 0, 0)\n\ -TOPK_F32toI32(1 << 1, 1)\n\ -TOPK_F32toI32(1 << 2, 2)\n\ -TOPK_F32toI32(1 << 3, 3)\n\ -TOPK_F32toI32(1 << 4, 4)\n\ -TOPK_F32toI32(1 << 5, 5)\n\ -TOPK_F32toI32(1 << 6, 6)"; /* end of topk_cl*/ +TOPK_F32toI32((1 << 0), 0)\n\ +TOPK_F32toI32((1 << 1), 1)\n\ +TOPK_F32toI32((1 << 2), 2)\n\ +TOPK_F32toI32((1 << 3), 3)\n\ +TOPK_F32toI32((1 << 4), 4)\n\ +TOPK_F32toI32((1 << 5), 5)\n\ +TOPK_F32toI32((1 << 6), 6)\n\ +TOPK_F32toI32((1 << 9), 9)"; /* end of topk_cl*/ + +static const char topk2_cl[] = "\n\ +#define BITONIC_STEP(dtype) \\\n\ +void bitonic_step_##dtype(uint num_stages, int lx, \\\n\ + __local dtype *local_data, __local int *local_indices) \\\n\ +{ \\\n\ + for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\ + { \\\n\ + uint signo = (lx >> stage) & 1; \\\n\ + \\\n\ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + uint postShift = (stage - passOfStage); \\\n\ + uint pairDistance = 1 << postShift; \\\n\ + \\\n\ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\ + uint right_id = left_id + pairDistance; \\\n\ + \\\n\ + int left_idx = local_indices[left_id]; \\\n\ + int right_idx = local_indices[right_id]; \\\n\ + \\\n\ + dtype left_elem = local_data[left_id]; \\\n\ + dtype right_elem = local_data[right_id]; \\\n\ + \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ + { \\\n\ + local_data[left_id] = right_elem; \\\n\ + local_data[right_id] = left_elem; \\\n\ + \\\n\ + local_indices[left_id] = right_idx; \\\n\ + local_indices[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +BITONIC_STEP(int)\n\ +BITONIC_STEP(uint)\n\ +\n\ +#define BITONIC_STEP_ASCEND(dtype) \\\n\ +void bitonic_step_ascend_##dtype(uint num_stages, int lx, \\\n\ + __local dtype *p_share_k, __local int *p_share_v) \\\n\ +{ \\\n\ + for (uint stage = 0; stage < num_stages + 1; ++stage) \\\n\ + { \\\n\ + uint signo = (lx >> stage) & 1; \\\n\ + \\\n\ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + uint postShift = (stage - passOfStage); \\\n\ + uint pairDistance = 1 << postShift; \\\n\ + \\\n\ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\ + uint right_id = left_id + pairDistance; \\\n\ + \\\n\ + int left_idx = 
p_share_v[left_id]; \\\n\ + int right_idx = p_share_v[right_id]; \\\n\ + \\\n\ + dtype left_elem = p_share_k[left_id]; \\\n\ + dtype right_elem = p_share_k[right_id]; \\\n\ + \\\n\ + if ((left_elem > right_elem || (left_elem == right_elem && left_idx > right_idx)) ^ signo) \\\n\ + { \\\n\ + p_share_k[left_id] = right_elem; \\\n\ + p_share_k[right_id] = left_elem; \\\n\ + \\\n\ + p_share_v[left_id] = right_idx; \\\n\ + p_share_v[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } \\\n\ + } \\\n\ +}\n\ +BITONIC_STEP_ASCEND(int)\n\ +BITONIC_STEP_ASCEND(uint)\n\ +\n\ +#define BITONIC_MERGE(dtype) \\\n\ +void bitonic_merge_##dtype(uint num_stages, int lx, \\\n\ + __local dtype *local_data, __local int *local_indices) \\\n\ +{ \\\n\ + uint stage = num_stages; \\\n\ + uint signo = (lx >> stage) & 1; \\\n\ + \\\n\ + for (uint passOfStage = 0; passOfStage < stage + 1; ++passOfStage) \\\n\ + { \\\n\ + uint postShift = (stage - passOfStage); \\\n\ + uint pairDistance = 1 << postShift; \\\n\ + \\\n\ + uint left_id = ( (lx >> postShift) << (postShift + 1)) + (lx & (pairDistance - 1)); \\\n\ + uint right_id = left_id + pairDistance; \\\n\ + \\\n\ + int left_idx = local_indices[left_id]; \\\n\ + int right_idx = local_indices[right_id]; \\\n\ + \\\n\ + dtype left_elem = local_data[left_id]; \\\n\ + dtype right_elem = local_data[right_id]; \\\n\ + \\\n\ + if ((left_elem < right_elem || (left_elem == right_elem && left_idx < right_idx)) ^ signo) \\\n\ + { \\\n\ + local_data[left_id] = right_elem; \\\n\ + local_data[right_id] = left_elem; \\\n\ + \\\n\ + local_indices[left_id] = right_idx; \\\n\ + local_indices[right_id] = left_idx; \\\n\ + } \\\n\ + \\\n\ + barrier(CLK_LOCAL_MEM_FENCE); \\\n\ + } \\\n\ +}\n\ +BITONIC_MERGE(int)\n\ +BITONIC_MERGE(uint)\n\ +\n\ +#define BLOCK_SIZE (512)\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_I32toI32_I32\n\ +(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t indices,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_tail,\n\ + int _num_stages,\n\ + int width\n\ + )\n\ + {\n\ + uint lx = get_local_id(0);\n\ + const int init_k = -2147483647;\n\ + const int init_v = -2147483647;\n\ + const int num_stages = 9;\n\ + const int threads_per_block = BLOCK_SIZE;\n\ + const int index_minus_1 = threads_per_block * 2 - 1;\n\ + uint offset = 0;\n\ + uint lx1 = lx + threads_per_block;\n\ +\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + __local int local_data[1536];\n\ + __local int local_indices[1536];\n\ +\n\ + int left = read_imagei(input, coord.xy).x;\n\ + coord.z += threads_per_block;\n\ + int right = read_imagei(input, coord.zy).x;\n\ +\n\ + local_data[lx] = left;\n\ + local_indices[lx] = coord.x;\n\ + local_data[lx1] = right;\n\ + local_indices[lx1] = coord.z;\n\ +\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_int(num_stages, lx, local_data, local_indices);\n\ +\n\ + int min_data = local_data[511];\n\ +\n\ + int *p_share_k = local_data + threads_per_block;\n\ + int *p_share_v = local_indices + threads_per_block;\n\ +\n\ + int limit = (width >> 10) << 10;\n\ + p_share_k[lx] = init_k;\n\ + p_share_v[lx] = init_v;\n\ +\n\ + p_share_k[lx1] = init_k;\n\ + p_share_v[lx1] = init_v;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)\n\ + {\n\ + int2 
data;\n\ + coord.z = coord.x + threads_per_block;\n\ + data.x = read_imagei(input, coord.xy).x;\n\ + data.y = read_imagei(input, coord.zy).x;\n\ +\n\ + p_share_k[lx] = data.x;\n\ + p_share_v[lx] = coord.x;\n\ +\n\ + p_share_k[lx1] = data.y;\n\ + p_share_v[lx1] = coord.z;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);\n\ +\n\ + if (p_share_k[index_minus_1] < min_data)\n\ + {\n\ + continue;\n\ + }\n\ +\n\ + p_share_k[lx] = p_share_k[lx1];\n\ + p_share_v[lx] = p_share_v[lx1];\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_merge_int(num_stages, lx, local_data, local_indices);\n\ +\n\ + min_data = local_data[511];\n\ + p_share_k[lx] = init_k;\n\ + p_share_v[lx] = init_v;\n\ + p_share_k[lx1] = init_k;\n\ + p_share_v[lx1] = init_v;\n\ + }\n\ +\n\ + if (width > limit)\n\ + {\n\ + if (coord.x < width)\n\ + {\n\ + int2 data;\n\ + data.x = read_imagei(input, coord.xy).x;\n\ + coord.z = coord.x + threads_per_block;\n\ + data.y = read_imagei(input, coord.zy).x;\n\ +\n\ + p_share_k[lx] = data.x;\n\ + p_share_v[lx] = coord.x;\n\ +\n\ + p_share_k[lx1] = coord.z < width ? data.y : init_k;\n\ + p_share_v[lx1] = coord.z < width ? coord.z : init_v;\n\ + }\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_ascend_int(num_stages, lx, p_share_k, p_share_v);\n\ +\n\ + if (p_share_k[index_minus_1] >= min_data)\n\ + {\n\ + p_share_k[lx] = p_share_k[lx1];\n\ + p_share_v[lx] = p_share_v[lx1];\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + bitonic_merge_int(num_stages, lx, local_data, local_indices);\n\ + }\n\ + }\n\ +\n\ + int4 dst;\n\ + dst.x = local_data[lx];\n\ +\n\ + coord.x = lx;\n\ + write_imagei(output, coord.xy, dst.xxxx);\n\ +\n\ + int4 index;\n\ + index.x = local_indices[lx];\n\ +\n\ + write_imagei(indices, coord.xy, index.xxxx);\n\ +}\n\ +\n\ +__kernel __attribute__((reqd_work_group_size(BLOCK_SIZE, 1, 1))) void topk_stage_U32toU32_I32\n\ +(\n\ + __read_only image2d_t input,\n\ + __write_only image2d_t output,\n\ + __write_only image2d_t indices,\n\ + float input_scale,\n\ + float input_tail,\n\ + float output_scale,\n\ + float output_tail,\n\ + int _num_stages,\n\ + int width\n\ + )\n\ + {\n\ + uint lx = get_local_id(0);\n\ + const uint init_k = 0;\n\ + const int init_v = -2147483647;\n\ + const int num_stages = 9;\n\ + const int threads_per_block = BLOCK_SIZE;\n\ + const int index_minus_1 = threads_per_block * 2 - 1;\n\ + uint offset = 0;\n\ + uint lx1 = lx + threads_per_block;\n\ +\n\ + int4 coord = (int4)(get_global_id(0), get_global_id(1), get_global_id(0), get_global_id(1));\n\ +\n\ + __local uint local_data[1536];\n\ + __local int local_indices[1536];\n\ +\n\ + uint left = read_imageui(input, coord.xy).x;\n\ + coord.z += threads_per_block;\n\ + uint right = read_imageui(input, coord.zy).x;\n\ +\n\ + local_data[lx] = left;\n\ + local_indices[lx] = coord.x;\n\ + local_data[lx1] = right;\n\ + local_indices[lx1] = coord.z;\n\ +\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_uint(num_stages, lx, local_data, local_indices);\n\ +\n\ + uint min_data = local_data[511];\n\ +\n\ + uint *p_share_k = local_data + threads_per_block;\n\ + int *p_share_v = local_indices + threads_per_block;\n\ +\n\ + int limit = (width >> 10) << 10;\n\ + p_share_k[lx] = init_k;\n\ + p_share_v[lx] = init_v;\n\ +\n\ + p_share_k[lx1] = init_k;\n\ + p_share_v[lx1] = init_v;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + for (coord.x = lx + threads_per_block * 2; coord.x < limit; coord.x = coord.x + threads_per_block * 2)\n\ + {\n\ + uint2 data;\n\ + coord.z = 
coord.x + threads_per_block;\n\ + data.x = read_imageui(input, coord.xy).x;\n\ + data.y = read_imageui(input, coord.zy).x;\n\ +\n\ + p_share_k[lx] = data.x;\n\ + p_share_v[lx] = coord.x;\n\ +\n\ + p_share_k[lx1] = data.y;\n\ + p_share_v[lx1] = coord.z;\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);\n\ +\n\ + if (p_share_k[index_minus_1] < min_data)\n\ + {\n\ + continue;\n\ + }\n\ +\n\ + p_share_k[lx] = p_share_k[lx1];\n\ + p_share_v[lx] = p_share_v[lx1];\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_merge_uint(num_stages, lx, local_data, local_indices);\n\ +\n\ + min_data = local_data[511];\n\ + p_share_k[lx] = init_k;\n\ + p_share_v[lx] = init_v;\n\ + p_share_k[lx1] = init_k;\n\ + p_share_v[lx1] = init_v;\n\ + }\n\ +\n\ + if (width > limit)\n\ + {\n\ + if (coord.x < width)\n\ + {\n\ + uint2 data;\n\ + data.x = read_imageui(input, coord.xy).x;\n\ + coord.z = coord.x + threads_per_block;\n\ + data.y = read_imageui(input, coord.zy).x;\n\ +\n\ + p_share_k[lx] = data.x;\n\ + p_share_v[lx] = coord.x;\n\ +\n\ + p_share_k[lx1] = coord.z < width ? data.y : init_k;\n\ + p_share_v[lx1] = coord.z < width ? coord.z : init_v;\n\ + }\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ +\n\ + bitonic_step_ascend_uint(num_stages, lx, p_share_k, p_share_v);\n\ +\n\ + if (p_share_k[index_minus_1] >= min_data)\n\ + {\n\ + p_share_k[lx] = p_share_k[lx1];\n\ + p_share_v[lx] = p_share_v[lx1];\n\ + barrier(CLK_LOCAL_MEM_FENCE);\n\ + bitonic_merge_uint(num_stages, lx, local_data, local_indices);\n\ + }\n\ + }\n\ +\n\ + uint4 dst;\n\ + dst.x = local_data[lx];\n\ +\n\ + coord.x = lx;\n\ + write_imageui(output, coord.xy, dst.xxxx);\n\ +\n\ + int4 index;\n\ + index.x = local_indices[lx];\n\ +\n\ + write_imagei(indices, coord.xy, index.xxxx);\n\ +}\n\ +"; /* end of topk2_cl*/ static const char topk_odd_even_sort_cl[] = "#define LOCAL_SIZE_X (32)\n\ __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X, 1, 1))) void topk_odd_even_sort_F32toF32_I32\n\ @@ -81702,6 +86576,14 @@ static const source_map_t evis_resource[] = {"crop_and_resize_nearest_neighbor_vx", crop_and_resize_nearest_neighbor_vx}, {"cumsum_vx", cumsum_vx}, {"cumsum_2d_vx", cumsum_2d_vx}, + {"cumsum_array_vx", cumsum_array_vx}, + {"cumsum_array_2d_vx", cumsum_array_2d_vx}, + {"cumsum_array_bf16_vx", cumsum_array_bf16_vx}, + {"cumsum_array_ex_rev_axis0_vx", cumsum_array_ex_rev_axis0_vx}, + {"cumsum_array_ex_rev_axis1_vx", cumsum_array_ex_rev_axis1_vx}, + {"cumsum_array_ex_rev_axis2_vx", cumsum_array_ex_rev_axis2_vx}, + {"cumsum_array_f16_u8_vx", cumsum_array_f16_u8_vx}, + {"cumsum_array_f16_u8_2d_vx", cumsum_array_f16_u8_2d_vx}, {"cumsum_bf16_vx", cumsum_bf16_vx}, {"cumsum_ex_rev_axis0_vx", cumsum_ex_rev_axis0_vx}, {"cumsum_ex_rev_axis1_vx", cumsum_ex_rev_axis1_vx}, @@ -81986,10 +86868,16 @@ static const source_map_t cl_resource[] = {"clip_F32_cl", clip_F32_cl}, {"clip_I32_cl", clip_I32_cl}, {"clip_U8_cl", clip_U8_cl}, + {"col2im_cl", col2im_cl}, {"crop_and_resize_bilinear_cl", crop_and_resize_bilinear_cl}, {"crop_and_resize_nearest_neighbor_cl", crop_and_resize_nearest_neighbor_cl}, {"cumsum_cl", cumsum_cl}, {"cumsum_2d_cl", cumsum_2d_cl}, + {"cumsum_array_2d_axis0_cl", cumsum_array_2d_axis0_cl}, + {"cumsum_array_2d_axis1_cl", cumsum_array_2d_axis1_cl}, + {"cumsum_array_axis0_cl", cumsum_array_axis0_cl}, + {"cumsum_array_axis1_cl", cumsum_array_axis1_cl}, + {"cumsum_array_axis2_cl", cumsum_array_axis2_cl}, {"depth2space_crd_cl", depth2space_crd_cl}, {"eltwise_ops_helper_cl", eltwise_ops_helper_cl}, 
{"eltwise_unary_0_cl", eltwise_unary_0_cl}, @@ -82114,6 +87002,7 @@ static const source_map_t cl_resource[] = {"swish_cl", swish_cl}, {"tile_cl", tile_cl}, {"topk_cl", topk_cl}, + {"topk2_cl", topk2_cl}, {"topk_odd_even_sort_cl", topk_odd_even_sort_cl}, {"topk_odd_even_sort2_cl", topk_odd_even_sort2_cl}, {"upsample_cl", upsample_cl}, diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c index dec079cb..6415ac0c 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_argmaxmin.c @@ -62,11 +62,20 @@ static vsi_status _argmaxmin_op_compute } status = VSI_FAILURE; - param =vsi_nn_kernel_param_create(); + param = vsi_nn_kernel_param_create(); if (strcmp(kernel_name, "argmax") == 0) { vsi_nn_argmax_param * p = &(self->nn_param.argmax); axis = p->axis; +#if (VX_ARGMAX_VX_SUPPORT) + vsi_nn_kernel_param_add_int32(param, "axis", axis); + self->n = (vx_node)vsi_nn_kernel_selector(self->graph, + kernel_name, + inputs, 1, + outputs, 1, param); + goto final; +#endif + } else { @@ -101,6 +110,10 @@ static vsi_status _argmaxmin_op_compute vsi_nn_ReleaseTensor( &reshape_tensors[0] ); vsi_nn_ReleaseTensor( &reshape_tensors[1] ); } + +#if (VX_ARGMAX_VX_SUPPORT) +final: +#endif if( self->n ) { status = VSI_SUCCESS; diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_bitcast.c b/src/tim/vx/internal/src/ops/vsi_nn_op_bitcast.c new file mode 100644 index 00000000..c47bd279 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_bitcast.c @@ -0,0 +1,153 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "kernel/vsi_nn_kernel_gpu_shape_optimize.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_error.h" + +typedef struct _bitcast_local_data_t { + int32_t placeholder; +} bitcast_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_node_t n = NULL; + + n = vsi_nn_kernel_selector( self->graph, "bitcast", inputs, 1, outputs, 1, NULL ); + if (n != NULL) + { + status = VSI_SUCCESS; + } + self->n = (vx_node)n; + + return status; +} /* op_compute() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + int32_t i = 0; + + VSI_UNREFERENCED(self); + + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) + { + uint32_t input_byte = 0; + uint32_t output_byte = 0; + uint32_t in_dim = inputs[0]->attr.dim_num; + input_byte = vsi_nn_TypeGetBytesExt(inputs[0]->attr.dtype.vx_type); + output_byte = vsi_nn_TypeGetBytesExt(outputs[0]->attr.dtype.vx_type); + + if (input_byte == output_byte) + { + outputs[0]->attr.dim_num = in_dim; + for (i = 0; i < (int32_t)(in_dim); i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + else if (input_byte > output_byte) + { + outputs[0]->attr.dim_num = in_dim + 1; + outputs[0]->attr.size[0] = input_byte / output_byte; + for (i = 1;i < (int32_t)(outputs[0]->attr.dim_num); i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i - 1]; + } + } + else + { + if ((uint32_t)(inputs[0]->attr.size[in_dim - 1]) != output_byte / input_byte) + { + VSILOGE("If input datatype is smaller than output datatype, bitcast op requires that \ + the rightmost dimension be equal to sizeof(output datatype) / sizeof(input datatype)"); + return FALSE; + } + outputs[0]->attr.dim_num = in_dim - 1; + if (outputs[0]->attr.dim_num == 0) + { + outputs[0]->attr.size[0] = 1; + vsi_nn_SetTensorIsScalar(outputs[0], TRUE); + } + else + { + for (i = 0; i < (int32_t)(outputs[0]->attr.dim_num); i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i + 1]; + } + } + } + } + + return TRUE; +} /* op_setup() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ BITCAST, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ NULL, + /* check */ NULL, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_col2im.c b/src/tim/vx/internal/src/ops/vsi_nn_op_col2im.c new file mode 100644 index 00000000..d82f349c --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_col2im.c @@ -0,0 +1,258 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" + +typedef struct _col2im_local_data_t { + int32_t placeholder; +} col2im_local_data_t; + +/* + Declare number of input and output. + */ +#define _INPUT_NUM (1) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t* param = NULL; + param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32( param, "stride_w", self->nn_param.col2im.strides[0] ); + vsi_nn_kernel_param_add_int32( param, "stride_h", self->nn_param.col2im.strides[1] ); + vsi_nn_kernel_param_add_int32( param, "stride_d", self->nn_param.col2im.strides[2] ); + vsi_nn_kernel_param_add_int32( param, "pad_w_front", self->nn_param.col2im.pads[0] ); + vsi_nn_kernel_param_add_int32( param, "pad_w_end", self->nn_param.col2im.pads[1] ); + vsi_nn_kernel_param_add_int32( param, "pad_h_front", self->nn_param.col2im.pads[2] ); + vsi_nn_kernel_param_add_int32( param, "pad_h_end", self->nn_param.col2im.pads[3] ); + vsi_nn_kernel_param_add_int32( param, "pad_d_front", self->nn_param.col2im.pads[4] ); + vsi_nn_kernel_param_add_int32( param, "pad_d_end", self->nn_param.col2im.pads[5] ); + vsi_nn_kernel_param_add_int32( param, "dilation_w", self->nn_param.col2im.dilations[0] ); + vsi_nn_kernel_param_add_int32( param, "dilation_h", self->nn_param.col2im.dilations[1] ); + vsi_nn_kernel_param_add_int32( param, "dilation_d", self->nn_param.col2im.dilations[2] ); + vsi_nn_kernel_param_add_buffer( param, "block_shape", (void*)self->nn_param.col2im.block_shape, \ + self->nn_param.col2im.dim_num ); + + self->n = (vx_node)vsi_nn_kernel_selector( self->graph, "col2im", + inputs, 1, outputs, 1, param ); + + if (self->n) + { + status = VSI_SUCCESS; + } + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + BEGIN_IO_TYPE_DECL(COL2IM, 1, 1) + IO_TYPE(D_F32, D_F32) + IO_TYPE(D_F32, D_I32) + IO_TYPE(D_F32, D_U32) + IO_TYPE(D_F32, D_F16) + IO_TYPE(D_I32, D_F32) + IO_TYPE(D_I32, D_I32) + IO_TYPE(D_I32, D_U32) + IO_TYPE(D_I32, D_F16) + IO_TYPE(D_U32, D_F32) + IO_TYPE(D_U32, D_I32) + IO_TYPE(D_U32, D_U32) + IO_TYPE(D_F16, D_I16|Q_DFP) + IO_TYPE(D_F16, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_I16|Q_SYM) + IO_TYPE(D_F16, D_I8|Q_DFP) + IO_TYPE(D_F16, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_I8|Q_SYM) + IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F16) + IO_TYPE(D_I16|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_ASYM, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F16) + IO_TYPE(D_I16|Q_SYM, D_I8|Q_DFP) + IO_TYPE(D_I16|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I16, D_F16) + IO_TYPE(D_I16, D_I8|Q_DFP) + IO_TYPE(D_I16, D_U8|Q_ASYM) + IO_TYPE(D_I16, D_I32) + 
IO_TYPE(D_I16, D_U32) + IO_TYPE(D_I16, D_F32) + IO_TYPE(D_I8|Q_DFP, D_F16) + IO_TYPE(D_I8|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_DFP, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_ASYM, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F16) + IO_TYPE(D_I8|Q_SYM, D_I16|Q_DFP) + IO_TYPE(D_I8|Q_SYM, D_U8|Q_ASYM) + IO_TYPE(D_I8, D_F16) + IO_TYPE(D_I8, D_I16|Q_DFP) + IO_TYPE(D_I8, D_U8|Q_ASYM) + IO_TYPE(D_I8, D_I32) + IO_TYPE(D_I8, D_U32) + IO_TYPE(D_I8, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_U8|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_U8, D_F16) + IO_TYPE(D_U8, D_I16|Q_DFP) + IO_TYPE(D_U8, D_I8|Q_DFP) + IO_TYPE(D_U8, D_I32) + IO_TYPE(D_U8, D_U32) + IO_TYPE(D_U8, D_F32) + IO_TYPE(D_F32, D_I16|Q_DFP) + IO_TYPE(D_F32, D_I16|Q_ASYM) + IO_TYPE(D_F32, D_I16|Q_SYM) + IO_TYPE(D_F32, D_I8|Q_DFP) + IO_TYPE(D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F32, D_I8|Q_SYM) + IO_TYPE(D_F32, D_U8|Q_ASYM) + IO_TYPE(D_I32, D_I16|Q_DFP) + IO_TYPE(D_I32, D_I16|Q_ASYM) + IO_TYPE(D_I32, D_I16|Q_SYM) + IO_TYPE(D_I32, D_I8|Q_DFP) + IO_TYPE(D_I32, D_I8|Q_ASYM) + IO_TYPE(D_I32, D_I8|Q_SYM) + IO_TYPE(D_I32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32) + IO_TYPE(D_F16, D_I32) + IO_TYPE(D_F16, D_I16) + IO_TYPE(D_F16, D_U8) + IO_TYPE(D_F16, D_I8) + IO_TYPE(D_F16, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_U8|Q_ASYM) + IO_TYPE(D_I8|Q_DFP, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_I8|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_I16|Q_SYM) + IO_TYPE(D_U8|Q_ASYM, D_F32) + IO_TYPE(D_U8|Q_ASYM, D_I32) + IO_TYPE(D_BF16, D_BF16) + END_IO_TYPE_DECL(COL2IM) + if (!VALIDATE_OP_IO_TYPES(COL2IM, self, inputs, self->input.num, outputs, self->output.num)) { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_nn_col2im_param *p = NULL; + p = (vsi_nn_col2im_param* )&(self->nn_param.col2im); + int32_t i = 0; + vsi_size_t block_size = 1; + vsi_size_t channel = 1; + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) + { + outputs[0]->attr.dim_num = p->dim_num + 2; + for (i = 0; i < p->dim_num; i++) + { + outputs[0]->attr.size[i] = (vsi_size_t)p->image_shape[i]; + block_size = block_size * (vsi_size_t)p->block_shape[i]; + } + channel = inputs[0]->attr.size[1] / block_size; + outputs[0]->attr.size[i + 1] = channel; + outputs[0]->attr.size[i + 2] = inputs[0]->attr.size[0]; + + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + self->nn_param.col2im.pads[0] = 0; + self->nn_param.col2im.pads[1] = 0; + self->nn_param.col2im.pads[2] = 0; + self->nn_param.col2im.pads[3] = 0; + self->nn_param.col2im.pads[4] = 0; + self->nn_param.col2im.pads[5] = 0; + self->nn_param.col2im.strides[0] = 1; + self->nn_param.col2im.strides[1] = 1; + self->nn_param.col2im.strides[2] = 1; + self->nn_param.col2im.dilations[0] = 1; + self->nn_param.col2im.dilations[1] = 1; + self->nn_param.col2im.dilations[2] = 1; + + return VSI_SUCCESS; +} + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ COL2IM, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ vsi_nn_op_common_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* 
input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c index ecb16406..26d25664 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_concat.c @@ -28,6 +28,7 @@ #include "vsi_nn_prv.h" #include "vsi_nn_log.h" #include "vsi_nn_graph.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_node.h" #include "vsi_nn_ops.h" #include "vsi_nn_tensor.h" @@ -278,7 +279,7 @@ static vsi_status op_compute if(_is_tensorview_support(self, outputs) && _is_same_quant(self, inputs, outputs) && (_has_norm_input(self, inputs) == FALSE) - && self->graph->ctx->options.enable_concat_optimize) + && ((vsi_nn_graph_prv_t*)(self->graph))->options->enable_concat_optimize) { iter = self->nn_param.concat.lcl_data; while( NULL != iter ) @@ -443,7 +444,7 @@ static vsi_status op_optimize if (_is_tensorview_support(self, outputs) == FALSE || _is_same_quant(self, inputs, outputs) == FALSE || _has_norm_input(self, inputs) == TRUE || - self->graph->ctx->options.enable_concat_optimize == 0) + ((vsi_nn_graph_prv_t*)(self->graph))->options->enable_concat_optimize == 0) { return status; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c index bfeeab29..11f0268a 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_dataconvert.c @@ -23,6 +23,7 @@ *****************************************************************************/ #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_platform.h" #include "vsi_nn_prv.h" #include "vsi_nn_graph.h" @@ -95,7 +96,7 @@ static vsi_status op_optimize status = VSI_SUCCESS; - if( !self->graph->ctx->options.enable_dataconvert_optimize ) + if( !((vsi_nn_graph_prv_t*)(self->graph))->options->enable_dataconvert_optimize ) { return status; } @@ -266,14 +267,14 @@ static vsi_bool op_check IO_TYPE(D_BF16, D_BF16) IO_TYPE(D_BF16, D_F16) IO_TYPE(D_BF16, D_F32) - IO_TYPE(D_I32, D_I32) - IO_TYPE(D_I32, D_F32) - IO_TYPE(D_I32, D_F16) - IO_TYPE(D_I32, D_I16|Q_DFP) - IO_TYPE(D_I32, D_I8|Q_DFP) - IO_TYPE(D_I32, D_U32) - IO_TYPE(D_I32, D_U16) - IO_TYPE(D_I32, D_U8|Q_ASYM) + IO_TYPE(D_I32|Q_ASYM, D_I32|Q_ASYM) + IO_TYPE(D_I32|Q_ASYM, D_F32) + IO_TYPE(D_I32|Q_ASYM, D_F16) + IO_TYPE(D_I32|Q_ASYM, D_I16|Q_DFP) + IO_TYPE(D_I32|Q_ASYM, D_I8|Q_DFP) + IO_TYPE(D_I32|Q_ASYM, D_U32|Q_ASYM) + IO_TYPE(D_I32|Q_ASYM, D_U16|Q_ASYM) + IO_TYPE(D_I32|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U32, D_U32) IO_TYPE(D_U32, D_I16|Q_DFP) IO_TYPE(D_U32, D_I8|Q_DFP) @@ -281,7 +282,7 @@ static vsi_bool op_check IO_TYPE(D_U32, D_U8|Q_ASYM) IO_TYPE(D_U32, D_U8) IO_TYPE(D_BF16, D_I32) - IO_TYPE(D_I32, D_BF16) + IO_TYPE(D_I32|Q_ASYM, D_BF16) IO_TYPE(D_U4|Q_ASYM, D_U8|Q_ASYM) IO_TYPE(D_U4|Q_SYM, D_U8|Q_ASYM) IO_TYPE(D_U8|Q_ASYM, D_U4|Q_ASYM) diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c index 44e051e9..a768b467 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_eltwise.c @@ -183,10 +183,16 @@ vsi_bool vsi_nn_op_eltwise_setup shape[i] = sz0; } - if( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) { outputs[0]->attr.dim_num = out_rank; memcpy( outputs[0]->attr.size, shape, out_rank * sizeof(vsi_size_t) ); + if (out_rank == 1 && + vsi_nn_GetTensorIsScalar(inputs[0]) && 
+ vsi_nn_GetTensorIsScalar(inputs[1])) + { + vsi_nn_SetTensorIsScalar(outputs[0], TRUE); + } } else { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c index d035ddae..404588b9 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grid_sample.c @@ -54,10 +54,12 @@ static vsi_status op_compute vsi_nn_kernel_param_t* param = NULL; int32_t align_corners = self->nn_param.gridsample.align_corners; int32_t pad_mode = (int32_t)self->nn_param.gridsample.padding_mode; + int32_t mode = (int32_t)self->nn_param.gridsample.mode; vsi_nn_kernel_node_t n; char kernel_name[128]; param = vsi_nn_kernel_param_create(); + vsi_nn_kernel_param_add_int32(param, "mode", mode); vsi_nn_kernel_param_add_int32(param, "align_corners", align_corners); vsi_nn_kernel_param_add_int32(param, "padding_mode", pad_mode); diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv3d.c b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv3d.c new file mode 100644 index 00000000..8ac872c4 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_grouped_conv3d.c @@ -0,0 +1,412 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_platform.h" +#include "vsi_nn_log.h" +#include "vsi_nn_graph.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "utils/vsi_nn_math.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "vsi_nn_tensor_util.h" +#include "utils/vsi_nn_util.h" +#include "utils/vsi_nn_dtype_util.h" +#include "utils/vsi_nn_constraint_check.h" + + +/* + Declare number of input and output. 
+ */ +#define _ARG_NUM (1) +#define _INPUT_NUM (3) +#define _OUTPUT_NUM (1) +#define _IO_NUM (_INPUT_NUM + _OUTPUT_NUM) +#define _PARAM_NUM (_ARG_NUM + _IO_NUM) + +#define LOCAL() ((vsi_nn_grouped_conv3d_param_local_data *)nn_param->local) + +typedef struct _vsi_nn_grouped_conv3d_param_local_data { + vsi_nn_tensor_t ** input_tensor_group; + vsi_nn_tensor_t ** weight_tensor_group; + vsi_nn_tensor_t ** bias_tensor_group; + vsi_nn_tensor_t ** output_tensor_group; +} vsi_nn_grouped_conv3d_param_local_data; + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ +#if VX_CONV_3D_API_SUPPORT +#define _TENSOR_LEN 64 + vsi_bool res; + uint32_t i; + char tensor_name[_TENSOR_LEN]; + vsi_nn_grouped_conv3d_param *nn_param = &self->nn_param.grouped_conv3d; + nn_param->local = (vsi_nn_grouped_conv3d_param_local_data*)malloc( + sizeof(vsi_nn_grouped_conv3d_param_local_data)); + if (NULL == nn_param->local) + { + VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + memset(nn_param->local, 0, sizeof(vsi_nn_grouped_conv3d_param_local_data)); + LOCAL()->input_tensor_group = (vsi_nn_tensor_t **)malloc( + nn_param->group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->input_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + memset(LOCAL()->input_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *)); + res = vsi_nn_CreateTensorGroup(self->graph, inputs[0], 3, + LOCAL()->input_tensor_group, nn_param->group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + + LOCAL()->weight_tensor_group = (vsi_nn_tensor_t **)malloc( + nn_param->group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->weight_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + memset(LOCAL()->weight_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *)); + res = vsi_nn_CreateTensorGroup(self->graph, inputs[1], 4, + LOCAL()->weight_tensor_group, nn_param->group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + + LOCAL()->bias_tensor_group = (vsi_nn_tensor_t **)malloc( + nn_param->group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->bias_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + memset(LOCAL()->bias_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *)); + if (inputs[2] != NULL) + { + res = vsi_nn_CreateTensorGroup(self->graph, inputs[2], 0, + LOCAL()->bias_tensor_group, nn_param->group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_CONV2D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + } + + LOCAL()->output_tensor_group = (vsi_nn_tensor_t **)malloc( + nn_param->group * sizeof(vsi_nn_tensor_t *)); + if (NULL == LOCAL()->output_tensor_group) + { + VSILOGE("Malloc fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + memset(LOCAL()->output_tensor_group, 0, nn_param->group * sizeof(vsi_nn_tensor_t *)); + res = vsi_nn_CreateTensorGroup(self->graph, outputs[0], 3, + LOCAL()->output_tensor_group, nn_param->group); + if (res == FALSE) + { + VSILOGE("CreateTensorGroup fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, 
__LINE__); + return VSI_FAILURE; + } + + for (i = 0; i < nn_param->group; i++) + { + vx_tensor bias; + vx_nn_convolution_3d_params_t *param = NULL; + vx_nn_convolution_3d_params_t param_; + memset( ¶m_, 0, sizeof( vx_nn_convolution_3d_params_t ) ); + param = ¶m_; + param->padding_w_left = self->nn_param.grouped_conv3d.pad[0]; + param->padding_w_right = self->nn_param.grouped_conv3d.pad[1]; + param->padding_h_top = self->nn_param.grouped_conv3d.pad[2]; + param->padding_h_bottom = self->nn_param.grouped_conv3d.pad[3]; + param->padding_d_front = self->nn_param.grouped_conv3d.pad[4]; + param->padding_d_rear = self->nn_param.grouped_conv3d.pad[5]; + + param->stride_w = self->nn_param.grouped_conv3d.stride[0]; + param->stride_h = self->nn_param.grouped_conv3d.stride[1]; + param->stride_d = self->nn_param.grouped_conv3d.stride[2]; + + if (self->nn_param.grouped_conv3d.dilation[0] * + self->nn_param.grouped_conv3d.dilation[1] * + self->nn_param.grouped_conv3d.dilation[2] > 1) + { + VSILOGE("conv3d could not support dilation > 1\n"); + return VSI_FAILURE; + } + if ( self->nn_param.grouped_conv3d.dilation[0] > 0 ) + { + param->dilation_w = self->nn_param.grouped_conv3d.dilation[0] - 1; + } + if ( self->nn_param.grouped_conv3d.dilation[1] > 0 ) + { + param->dilation_h = self->nn_param.grouped_conv3d.dilation[1] - 1; + } + if ( self->nn_param.grouped_conv3d.dilation[2] > 0 ) + { + param->dilation_d = self->nn_param.grouped_conv3d.dilation[2] - 1; + } + param->pad_mode = vsi_nn_get_vx_pad_mode(nn_param->pad_mode); + param->depth_multiplier = self->nn_param.grouped_conv3d.multiplier; + param->overflow_policy = self->vx_param.overflow_policy; + param->rounding_policy = self->vx_param.rounding_policy; + param->down_scale_size_rounding = self->vx_param.down_scale_size_rounding; + + if ( inputs[2] == NULL ) + { + bias = NULL; + } + else + { + bias = LOCAL()->bias_tensor_group[i]->t; + } + + self->n = vxConv3dLayer( + self->graph->g, + LOCAL()->input_tensor_group[i]->t, + LOCAL()->weight_tensor_group[i]->t, + bias, + (vx_nn_convolution_3d_params_t* )param, + sizeof( vx_nn_convolution_3d_params_t), + LOCAL()->output_tensor_group[i]->t + ); + + memset(tensor_name, 0, sizeof(tensor_name)); + snprintf(tensor_name, sizeof(tensor_name), "uid_%u_sub_uid_%u_out_0", self->uid, i); + if (vxSetReferenceName((vx_reference)LOCAL()->output_tensor_group[i]->t, tensor_name) == VSI_FAILURE) + { + VSILOGW("Set uid %u copy node output name fail", self->uid); + return VSI_FAILURE; + } + if ( NULL == self->n ) + { + VSILOGE("Add vxConvolutionLayer fail, (GROUPED_CONV3D) at [%s : %d]\n", __FILE__, __LINE__); + return VSI_FAILURE; + } + else + { + // no need to maintain self->n + vxReleaseNode( &self->n ); + self->n = NULL; + } + } +#else + VSI_UNREFERENCED(self); + VSI_UNREFERENCED(inputs); + VSI_UNREFERENCED(outputs); +#endif + return VSI_SUCCESS; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = FALSE; + + ret = vsi_nn_OpCheck(VSI_NN_OP_CONV3D, self, inputs, outputs); + + return ret; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + /* TODO: Add code to comput outputs' shape. 
*/ + vsi_nn_grouped_conv3d_param *nn_param; + vsi_size_t perm[] = { 3, 2, 0, 1 }; + +#ifdef VX_CONVERT_POLICY_WRAP_ENABLE + if ( vsi_nn_compareVersion(self->graph, 1, 1, 21) == -1 ) + { + self->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + } +#endif + + if ( VSI_NN_DIM_FMT_NHWC == inputs[1]->attr.dtype.fmt && + VSI_NN_TYPE_VDATA != inputs[1]->attr.dtype.vx_type ) + { + vsi_nn_TransposeTensor( self->graph, inputs[1], perm, 4, NULL ); + inputs[1]->attr.dtype.fmt = VSI_NN_DIM_FMT_NCHW; + } + + nn_param = &self->nn_param.grouped_conv3d; + { + vsi_size_t i, pad[_cnt_of_array(nn_param->pad)] = {0}; + for (i = 0; i < _cnt_of_array(nn_param->pad); i++) + { + pad[i] = self->nn_param.grouped_conv3d.pad[i]; + } + vsi_nn_compute_padding_3d( + inputs[0]->attr.size, + inputs[1]->attr.size, + nn_param->stride, + nn_param->dilation, + nn_param->pad_type, + pad + ); + for (i = 0; i < _cnt_of_array(nn_param->pad); i++) + { + self->nn_param.grouped_conv3d.pad[i] = (uint32_t)pad[i]; + } + } + + if ( VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num ) + { + outputs[0]->attr.size[0] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[0], + inputs[1]->attr.size[0], + &nn_param->pad[0], + nn_param->stride[0], + nn_param->dilation[0], + VSI_NN_ROUND_FLOOR + ); + outputs[0]->attr.size[1] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[1], + inputs[1]->attr.size[1], + &nn_param->pad[2], + nn_param->stride[1], + nn_param->dilation[1], + VSI_NN_ROUND_FLOOR + ); + outputs[0]->attr.size[2] = vsi_nn_ComputeFilterSize + ( + inputs[0]->attr.size[2], + inputs[1]->attr.size[2], + &nn_param->pad[4], + nn_param->stride[2], + nn_param->dilation[2], + VSI_NN_ROUND_FLOOR + ); + if (self->nn_param.grouped_conv3d.weights > 0) + { + outputs[0]->attr.size[3] = self->nn_param.grouped_conv3d.weights; + } + else if (self->nn_param.grouped_conv3d.multiplier > 0) + { + outputs[0]->attr.size[3] = inputs[0]->attr.size[3] * self->nn_param.grouped_conv3d.multiplier; + } + else + { + outputs[0]->attr.size[3] = inputs[1]->attr.size[4]; + } + outputs[0]->attr.size[4] = inputs[0]->attr.size[4]; + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + } + return TRUE; +} /* op_setup() */ + + +static vsi_status op_deinit + ( + vsi_nn_node_t* self + ) +{ + vsi_nn_grouped_conv3d_param *nn_param = &(self->nn_param.grouped_conv3d); + uint32_t i; + if (LOCAL()) + { + if (LOCAL()->input_tensor_group) + { + for (i = 0; i < nn_param->group; i++) + { + vsi_nn_ReleaseTensor(&(LOCAL()->input_tensor_group[i])); + } + free(LOCAL()->input_tensor_group); + } + if (LOCAL()->weight_tensor_group) + { + for (i = 0; i < nn_param->group; i++) + { + vsi_nn_ReleaseTensor(&(LOCAL()->weight_tensor_group[i])); + } + free(LOCAL()->weight_tensor_group); + } + if (LOCAL()->bias_tensor_group != NULL) + { + for (i = 0; i < nn_param->group; i++) + { + vsi_nn_ReleaseTensor(&(LOCAL()->bias_tensor_group[i])); + } + free(LOCAL()->bias_tensor_group); + } + if (LOCAL()->output_tensor_group != NULL) + { + for (i = 0; i < nn_param->group; i++) + { + vsi_nn_ReleaseTensor(&(LOCAL()->output_tensor_group[i])); + } + free(LOCAL()->output_tensor_group); + } + + free(LOCAL()); + } + vsi_nn_op_common_deinit(self); + return VSI_SUCCESS; +} /* op_deinit() */ + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ GROUPED_CONV3D, + /* init */ NULL, + /* compute */ op_compute, + /* deinit */ op_deinit, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git 
a/src/tim/vx/internal/src/ops/vsi_nn_op_l1_layer_norm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_l1_layer_norm.c new file mode 100644 index 00000000..5dbe4a40 --- /dev/null +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_l1_layer_norm.c @@ -0,0 +1,206 @@ +/**************************************************************************** +* +* Copyright (c) 2020 Vivante Corporation +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +* DEALINGS IN THE SOFTWARE. +* +*****************************************************************************/ + + +#include +#include + +#include "vsi_nn_types.h" +#include "vsi_nn_log.h" +#include "vsi_nn_node.h" +#include "vsi_nn_prv.h" +#include "vsi_nn_ops.h" +#include "vsi_nn_tensor.h" +#include "utils/vsi_nn_util.h" +#include "kernel/vsi_nn_kernel.h" +#include "utils/vsi_nn_constraint_check.h" +#include "vsi_nn_tensor_util_prv.h" + +typedef struct _l1_layer_norm_local_data_t { + int32_t placeholder; +} l1_layer_norm_local_data_t; + +/* + Declare number of input and output. 
+ */ +#define _INPUT_NUM (4) +#define _OUTPUT_NUM (1) + +static vsi_status op_compute + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_status status = VSI_FAILURE; + vsi_nn_kernel_param_t * param = NULL; + vsi_nn_kernel_node_t n = NULL; + float eps = self->nn_param.l1_layer_norm.eps; + int32_t axis = self->nn_param.l1_layer_norm.axis; + + param = vsi_nn_kernel_param_create(); + + vsi_nn_kernel_param_add_float32( param, "eps", eps ); + vsi_nn_kernel_param_add_int32( param, "axis", axis ); + n = vsi_nn_kernel_selector( self->graph, "l1_layer_norm", + inputs, _INPUT_NUM, outputs, _OUTPUT_NUM, param ); + + if ( n != NULL ) + { + self->n = (vx_node)n; + status = VSI_SUCCESS; + } + if (param != NULL) + { + vsi_nn_kernel_param_release( ¶m ); + } + + return status; +} /* op_compute() */ + +static vsi_bool op_check + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + vsi_bool ret = vsi_nn_is_stream_process_supported_types(self->graph, inputs, self->input.num); + + if (!ret) + { + BEGIN_IO_TYPE_DECL(L1_LAYER_NORM, 4, 1) + IO_TYPE(D_F32, D_F32, D_F32, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I8|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_F16, D_F32, D_F16, D_F32, D_I16|Q_SYM) + IO_TYPE(D_F16, D_F32, D_F32, D_F32, D_I16|Q_SYM) + IO_TYPE(D_BF16, D_F32, D_F32, D_F32, D_BF16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F16, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F16, D_F32, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F16, D_F32, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F16, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F16, D_F32, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F16, D_F32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F16, D_F32, D_F16) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_U8|Q_ASYM) + IO_TYPE(D_U8|Q_ASYM, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_I16|Q_DFP) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F32, D_I16|Q_ASYM) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F32, D_I16|Q_SYM) + IO_TYPE(D_I16|Q_DFP, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_ASYM, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I16|Q_SYM, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_I8|Q_DFP) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_I8|Q_ASYM) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F32, D_I8|Q_SYM) + IO_TYPE(D_I8|Q_DFP, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_ASYM, D_F32, D_F32, D_F32, D_F16) + IO_TYPE(D_I8|Q_SYM, D_F32, D_F32, D_F32, D_F16) + END_IO_TYPE_DECL(L1_LAYER_NORM) + 
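
Note on the table just closed: each IO_TYPE row is one permitted dtype signature for L1_LAYER_NORM's four inputs and single output, and the VALIDATE_OP_IO_TYPES call that follows checks the node's actual tensors against that allow-list (the macros appear to come from utils/vsi_nn_constraint_check.h, which the file includes). The sketch below only illustrates the allow-list idea; the ex_* names and dtype codes are invented for the example and are not the ovxlib macro expansion.

/* Illustrative sketch only -- not the real BEGIN_IO_TYPE_DECL/IO_TYPE/
 * END_IO_TYPE_DECL expansion. It shows the concept: a static table of
 * allowed dtype signatures and a linear-scan validation over it. */
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

enum ex_dtype { EX_F32, EX_F16, EX_BF16, EX_U8_ASYM, EX_I8_SYM };

typedef struct { enum ex_dtype io[5]; } ex_io_row; /* in0, in1, in2, in3, out0 */

static const ex_io_row ex_l1_layer_norm_table[] = {
    { { EX_F32,     EX_F32, EX_F32, EX_F32, EX_F32     } },
    { { EX_F16,     EX_F32, EX_F16, EX_F32, EX_F16     } },
    { { EX_U8_ASYM, EX_F32, EX_F16, EX_F32, EX_U8_ASYM } },
};

static bool ex_validate_io(const enum ex_dtype actual[5])
{
    size_t i;
    for (i = 0; i < sizeof(ex_l1_layer_norm_table) / sizeof(ex_l1_layer_norm_table[0]); i++)
    {
        /* A node passes op_check only if its exact dtype signature is listed. */
        if (memcmp(ex_l1_layer_norm_table[i].io, actual, sizeof(ex_l1_layer_norm_table[i].io)) == 0)
        {
            return true;
        }
    }
    return false; /* op_check would log the unsupported combination and fail */
}
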
if (!VALIDATE_OP_IO_TYPES(L1_LAYER_NORM, self, inputs, self->input.num, outputs, self->output.num)) + { + char* desc = generate_op_io_types_desc(inputs, + self->input.num, outputs, self->output.num); + VSILOGE("Inputs/Outputs data type not support: %s", desc); + destroy_op_io_types_desc(desc); + return FALSE; + } + } + + return TRUE; +} /* op_check() */ + +static vsi_bool op_setup + ( + vsi_nn_node_t * self, + vsi_nn_tensor_t ** inputs, + vsi_nn_tensor_t ** outputs + ) +{ + int32_t i = 0; + VSI_UNREFERENCED(self); + + if (VSI_NN_DIM_AUTO == outputs[0]->attr.dim_num) + { + outputs[0]->attr.dim_num = inputs[0]->attr.dim_num; + for (i = 0; i < (int32_t)inputs[0]->attr.dim_num; i++) + { + outputs[0]->attr.size[i] = inputs[0]->attr.size[i]; + } + } + return TRUE; +} /* op_setup() */ + +static vsi_status op_init + ( + vsi_nn_node_t* self + ) +{ + vsi_status status = VSI_SUCCESS; + + self->nn_param.l1_layer_norm.axis = 0; + + return status; +} /* op_init() */ + + +__BEGIN_DECLS + +/* Registrar */ +DEF_OP_REG + ( + /* op_name */ L1_LAYER_NORM, + /* init */ op_init, + /* compute */ op_compute, + /* deinit */ NULL, + /* check */ op_check, + /* setup */ op_setup, + /* optimize */ NULL, + /* input_num */ _INPUT_NUM, + /* output_num */ _OUTPUT_NUM + ); + +__END_DECLS + diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c index 682628c2..a9739381 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_pre_process.c @@ -25,6 +25,7 @@ #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_platform.h" #include "vsi_nn_log.h" #include "vsi_nn_graph.h" @@ -161,7 +162,7 @@ static vsi_bool op_setup if (p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR || p->type == VSI_NN_SOURCE_FORMAT_IMAGE_RGB888_PLANAR_SEP) { - enable_rgb88_planar_nhwc = self->graph->ctx->options.enable_rgb88_planar_nhwc; + enable_rgb88_planar_nhwc = ((vsi_nn_graph_prv_t*)(self->graph))->options->enable_rgb88_planar_nhwc; } } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c index 2051c453..4c314b85 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_reduce.c @@ -183,7 +183,8 @@ static vsi_bool _check_is_sp_supported_type return FALSE; } - if ( (axes_num == 1 && (axes[0] == 0 || axes[0] == 2)) || + if ( (axes_num == 1 && (axes[0] == 0 || axes[0] == 2 || + (axes[0] == 1 && (input->attr.size[0] == 1 || input->attr.size[2] == 1)))) || (axes_num == 2 && ((axes[0] < 2 && axes[1] < 2) || (axes[0] == 1 && axes[1] == 2))) ) { return TRUE; @@ -1167,6 +1168,7 @@ static vsi_bool op_setup { outputs[0]->attr.dim_num = 1; outputs[0]->attr.size[0] = 1; + vsi_nn_SetTensorIsScalar(outputs[0], TRUE); } else { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_rmsnorm.c b/src/tim/vx/internal/src/ops/vsi_nn_op_rmsnorm.c index 84387d7f..8d1610f5 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_rmsnorm.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_rmsnorm.c @@ -93,52 +93,32 @@ static vsi_bool op_check if (!ret) { BEGIN_IO_TYPE_DECL(RMS_NORM, 2, 1) - IO_TYPE(D_F32, D_F32, D_F32) - IO_TYPE(D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_F32, D_F16) - IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM) - IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM) - IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP) - IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP) - IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM) - IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM) - IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM) - 
IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM) - IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP) - IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP) - IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM) - IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM) - IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM) - IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM) - IO_TYPE(D_BF16, D_F32, D_BF16) - IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16) - IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM) - IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP) - IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM) - IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM) - IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16) - IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16) - IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16) - IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP) - IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM) - IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM) - IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16) - IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16) - IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16) - IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM) - IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16) - IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP) - IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM) - IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM) - IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16) - IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16) - IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16) - IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP) - IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM) - IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM) - IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16) - IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16) - IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16) + IO_TYPE(D_F32, D_F32, D_F32) + IO_TYPE(D_F32, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F16) + IO_TYPE(D_F16, D_F32, D_F32) + IO_TYPE(D_F16, D_F32, D_U8 | Q_ASYM) + IO_TYPE(D_F16, D_F32, D_I8 | Q_DFP) + IO_TYPE(D_F16, D_F32, D_I8 | Q_ASYM) + IO_TYPE(D_F16, D_F32, D_I8 | Q_SYM) + IO_TYPE(D_F16, D_F32, D_I16 | Q_DFP) + IO_TYPE(D_F16, D_F32, D_I16 | Q_ASYM) + IO_TYPE(D_F16, D_F32, D_I16 | Q_SYM) + IO_TYPE(D_BF16, D_F32, D_BF16) + IO_TYPE(D_U8 | Q_ASYM, D_F32, D_F16) + IO_TYPE(D_U8 | Q_ASYM, D_F32, D_U8 | Q_ASYM) + IO_TYPE(D_I16 | Q_DFP, D_F32, D_I16 | Q_DFP) + IO_TYPE(D_I16 | Q_ASYM, D_F32, D_I16 | Q_ASYM) + IO_TYPE(D_I16 | Q_SYM, D_F32, D_I16 | Q_SYM) + IO_TYPE(D_I16 | Q_DFP, D_F32, D_F16) + IO_TYPE(D_I16 | Q_ASYM, D_F32, D_F16) + IO_TYPE(D_I16 | Q_SYM, D_F32, D_F16) + IO_TYPE(D_I8 | Q_DFP, D_F32, D_I8 | Q_DFP) + IO_TYPE(D_I8 | Q_ASYM, D_F32, D_I8 | Q_ASYM) + IO_TYPE(D_I8 | Q_SYM, D_F32, D_I8 | Q_SYM) + IO_TYPE(D_I8 | Q_DFP, D_F32, D_F16) + IO_TYPE(D_I8 | Q_ASYM, D_F32, D_F16) + IO_TYPE(D_I8 | Q_SYM, D_F32, D_F16) END_IO_TYPE_DECL(RMS_NORM) if (!VALIDATE_OP_IO_TYPES(RMS_NORM, self, inputs, self->input.num, outputs, self->output.num)) { diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c index 95dc76ab..5d5768fc 100644 --- a/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_strided_slice.c @@ -25,6 +25,7 @@ #include #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_platform.h" #include "vsi_nn_prv.h" #include "vsi_nn_log.h" @@ -776,7 +777,7 @@ static vsi_status op_optimize /* Only forward run stride_slice's optimize */ if ( direction == VSI_NN_OPTIMIZE_BACKWARD || - !self->graph->ctx->options.enable_slice_optimize ) + !((vsi_nn_graph_prv_t*)(self->graph))->options->enable_slice_optimize ) { return status; } diff --git a/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c b/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c index 6291e5c0..e93fe454 100644 --- 
a/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c +++ b/src/tim/vx/internal/src/ops/vsi_nn_op_tile.c @@ -78,9 +78,10 @@ static vsi_status _tile_op_compute vsi_size_t new_rank = 0; vsi_bool ret = FALSE; uint32_t i = 0; - vsi_size_t* multiples = (vsi_size_t*)self->nn_param.tile.multiples; + int32_t* multiples_ = (int32_t*)self->nn_param.tile.multiples; vsi_nn_tensor_t* temp_tensors[3] = { NULL }; vsi_nn_tensor_t* reshape_tensors[3] = { NULL }; + vsi_size_t multiples[VSI_NN_MAX_DIM_NUM] = {1}; int32_t multiples_value[VSI_NN_MAX_DIM_NUM] = {0}; vsi_nn_tensor_attr_t attr; @@ -101,6 +102,11 @@ static vsi_status _tile_op_compute temp_tensors[2] = outputs[0]; } + for (i = 0; i < inputs[0]->attr.dim_num; i ++) + { + multiples[i] = (vsi_size_t)multiples_[i]; + } + ret = vsi_nn_kernel_optimize_tile_shape( inputs[0]->attr.size, inputs[0]->attr.dim_num, multiples, inputs[0]->attr.dim_num, @@ -111,6 +117,7 @@ static vsi_status _tile_op_compute { if (_is_supported_axis(shapes[1], new_rank) == FALSE) { + uint32_t _multiples = (uint32_t)(new_rank > 4 && shapes[1][4] > 1 ? 3 : 2); reshape_tensors[0] = vsi_nn_reshape_tensor( self->graph, inputs[0],\ shapes[0], (vsi_size_t)new_rank ); reshape_tensors[2] = vsi_nn_reshape_tensor( self->graph, temp_tensors[2],\ @@ -125,8 +132,11 @@ static vsi_status _tile_op_compute memcpy( &attr, &reshape_tensors[0]->attr, sizeof(attr)); attr.is_const = FALSE; attr.vtl = TRUE; - attr.size[0] = reshape_tensors[2]->attr.size[0]; - attr.size[1] = reshape_tensors[2]->attr.size[1]; + + for (i = 0; i < _multiples; i++) + { + attr.size[i] = reshape_tensors[2]->attr.size[i]; + } temp_tensors[0] = vsi_nn_CreateTensor( self->graph, &attr ); memset( &attr, 0 , sizeof(vsi_nn_tensor_attr_t) ); @@ -136,9 +146,11 @@ static vsi_status _tile_op_compute attr.size[0] = new_rank; attr.dim_num = 1; - multiples_value[0] = (int32_t)shapes[1][0]; - multiples_value[1] = (int32_t)shapes[1][1]; - for (i = 0; i < new_rank; i++) + for (i = 0; i < _multiples; i++) + { + multiples_value[i] = (int32_t)shapes[1][i]; + } + for (i = _multiples; i < new_rank; i++) { multiples_value[i] = 1; } @@ -150,9 +162,11 @@ static vsi_status _tile_op_compute goto final; } - multiples_value[0] = 1; - multiples_value[1] = 1; - for (i = 0; i < new_rank; i++) + for (i = 0; i < _multiples; i++) + { + multiples_value[i] = 1; + } + for (i = _multiples; i < new_rank; i++) { multiples_value[i] = (int32_t)shapes[1][i]; } @@ -257,6 +271,7 @@ static vsi_bool op_check IO_TYPE(D_F32, D_F32) IO_TYPE(D_F32, D_U8|Q_ASYM) IO_TYPE(D_F16, D_U8|Q_ASYM) + IO_TYPE(D_BOOL8, D_BOOL8) END_IO_TYPE_DECL(TILE) if (!VALIDATE_OP_IO_TYPES(TILE, self, inputs, self->input.num, outputs, self->output.num)) { char* desc = generate_op_io_types_desc(inputs, diff --git a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c index 4b2aa7ae..feaa0fcf 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_code_generator.c @@ -471,6 +471,10 @@ static _op_param_gen_t s_op_gen[] = /* TAN */ NULL, /* RMSNORM */ NULL, /* SHAPE */ NULL, + /* BITCAST */ NULL, + /* GROUPED_CONV3D */ NULL, + /* COL2IM */ NULL, + /* L1_LAYER_NORM */ NULL, }; _compiler_assert( _cnt_of_array(s_op_gen) == VSI_NN_OP_NUM, vsi_nn_code_generator_c ); diff --git a/src/tim/vx/internal/src/utils/vsi_nn_util.c b/src/tim/vx/internal/src/utils/vsi_nn_util.c index e1d9b819..3a40e106 100644 --- a/src/tim/vx/internal/src/utils/vsi_nn_util.c +++ b/src/tim/vx/internal/src/utils/vsi_nn_util.c @@ -772,6 
+772,7 @@ vsi_bool vsi_nn_CreateTensorGroup end[1] = in_tensor->attr.size[1]; end[2] = in_tensor->attr.size[2]; end[3] = in_tensor->attr.size[3]; + end[4] = in_tensor->attr.size[4]; end[axis] = 0; for( i = 0; i < group_number; i ++ ) { @@ -1259,6 +1260,32 @@ vsi_bool vsi_nn_is_same_quant_type( } break; } +#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC: { + const float diff = (float)1e-5; + int32_t i = 0; + int32_t scale_cnt0 = src_dtype->group_count; + int32_t scale_cnt1 = dst_dtype->group_count; + int32_t group_size0 = src_dtype->group_size; + int32_t group_size1 = dst_dtype->group_size; + if (scale_cnt0 == scale_cnt1 && group_size0 == group_size1) + { + const float* src_scale_ptr = src_dtype->group_scales; + const float* dst_scale_ptr = dst_dtype->group_scales; + for (i = 0; i < scale_cnt0; i++) + { + if (vsi_nn_float_compare( + src_scale_ptr[i], dst_scale_ptr[i], diff) == FALSE) + { + return FALSE; + } + } + } else { + return FALSE; + } + break; + } +#endif default: break; } diff --git a/src/tim/vx/internal/src/vsi_nn_context.c b/src/tim/vx/internal/src/vsi_nn_context.c index 7c7ed61d..4fd9be74 100644 --- a/src/tim/vx/internal/src/vsi_nn_context.c +++ b/src/tim/vx/internal/src/vsi_nn_context.c @@ -22,10 +22,10 @@ * *****************************************************************************/ #include -#include "vsi_nn_types.h" #include "vsi_nn_test.h" #include "vsi_nn_context.h" #include "vsi_nn_platform.h" +#include "vsi_nn_types.h" static vsi_status query_hardware_caps ( @@ -103,6 +103,9 @@ static const char* ENV_ENABLE_STREAM_PROCESSOR = "vendor.VSI_VX_ENABLE_STREAM_PR static const char* ENV_FORCE_RGB888_OUT_NHWC = "vendor.VSI_NN_FORCE_RGB888_OUT_NHWC"; static const char* ENV_ENABLE_SLICE_OPTIMIZE = "vendor.VSI_NN_ENABLE_SLICE_OPTIMIZE"; static const char* ENV_ENABLE_BATCH_OPT = "vendor.VSI_VX_ENABLE_BATCH_OPT"; +static const char* ENV_SAVE_FILE_TYPE = "vendor.VSI_SAVE_FILE_TYPE"; +static const char* VSI_USE_IMAGE_PROCESS = "vendor.VSI_USE_IMAGE_PROCESS"; +static const char* VSI_USE_FROM_HANDLE = "vendor.VSI_USE_FROM_HANDLE"; #else static const char* ENV_ENABLE_SHADER = "VIV_VX_ENABLE_SHADER"; static const char* ENV_ENABLE_OPCHECK = "VSI_NN_ENABLE_OPCHECK"; @@ -113,8 +116,11 @@ static const char* ENV_ENABLE_STREAM_PROCESSOR = "VSI_VX_ENABLE_STREAM_PROCESSOR static const char* ENV_FORCE_RGB888_OUT_NHWC = "VSI_NN_FORCE_RGB888_OUT_NHWC"; static const char* ENV_ENABLE_SLICE_OPTIMIZE = "VSI_NN_ENABLE_SLICE_OPTIMIZE"; static const char* ENV_ENABLE_BATCH_OPT = "VSI_VX_ENABLE_BATCH_OPT"; +static const char* ENV_SAVE_FILE_TYPE = "VSI_SAVE_FILE_TYPE"; +static const char* VSI_USE_IMAGE_PROCESS = "VSI_USE_IMAGE_PROCESS"; +static const char* VSI_USE_FROM_HANDLE = "VSI_USE_FROM_HANDLE"; #endif -static vsi_status vsi_nn_initOptions +vsi_status vsi_nn_initOptions ( vsi_nn_runtime_option_t *options ) @@ -129,7 +135,7 @@ static vsi_status vsi_nn_initOptions default_value = 1; #endif options->enable_concat_optimize = vsi_nn_getenv_asint(ENV_ENABLE_CONCAT_OPTIMIZE, default_value); - options->enable_asymi8_to_u8 = vsi_nn_getenv_asint(ENV_ENABLE_I8TOU8, 1); + options->enable_i8_to_u8 = vsi_nn_getenv_asint(ENV_ENABLE_I8TOU8, 1); options->enable_dataconvert_optimize = vsi_nn_getenv_asint(ENV_ENABLE_DATACONVERT_OPTIMIZE, 1); options->enable_stream_processor = vsi_nn_getenv_asint(ENV_ENABLE_STREAM_PROCESSOR, 1); options->enable_rgb88_planar_nhwc = vsi_nn_getenv_asint(ENV_FORCE_RGB888_OUT_NHWC, 0); @@ -140,6 +146,9 @@ static vsi_status vsi_nn_initOptions 
#endif options->enable_slice_optimize = vsi_nn_getenv_asint(ENV_ENABLE_SLICE_OPTIMIZE, default_value); options->enable_batch_opt = vsi_nn_getenv_asint(ENV_ENABLE_BATCH_OPT, 0); + options->enable_save_file_type = vsi_nn_getenv_asint(ENV_SAVE_FILE_TYPE, 0); + options->enable_use_image_process = vsi_nn_getenv_asint(VSI_USE_IMAGE_PROCESS, -1); + options->enable_use_from_handle = vsi_nn_getenv_asint(VSI_USE_FROM_HANDLE, -1); return VSI_SUCCESS; } diff --git a/src/tim/vx/internal/src/vsi_nn_graph.c b/src/tim/vx/internal/src/vsi_nn_graph.c index 3242621b..85cad885 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph.c +++ b/src/tim/vx/internal/src/vsi_nn_graph.c @@ -1354,20 +1354,26 @@ vsi_nn_graph_t * vsi_nn_CreateGraph graph->node_num = 0; graph->ctx = ctx; graph->rnn_wksp = NULL; + ((vsi_nn_graph_prv_t*) graph)->options = + (vsi_nn_runtime_option_t *)malloc( sizeof( vsi_nn_runtime_option_t )); + CHECK_PTR_FAIL_GOTO(((vsi_nn_graph_prv_t*) graph)->options, "Create graph options fail.", error); graph->node_table = (vsi_nn_map_t *)malloc( sizeof( vsi_nn_map_t ) ); graph->tensor_table = (vsi_nn_map_t *)malloc( sizeof( vsi_nn_map_t ) ); graph->isAllowFastMode = TRUE; vsi_nn_MapInit( graph->node_table ); vsi_nn_MapInit( graph->tensor_table ); + vsi_nn_initOptions( ((vsi_nn_graph_prv_t*) graph)->options ); } else { VSILOGE( "Create vx graph fail." ); - free( graph ); + free(graph); graph = NULL; } } + return graph; +error: return graph; } /* vsi_nn_CreateGraph() */ @@ -1429,6 +1435,10 @@ void vsi_nn_ReleaseGraph free( tmp ); } } + if (NULL != ((vsi_nn_graph_prv_t*)ptr)->options) + { + free(((vsi_nn_graph_prv_t*)ptr)->options); + } free( ptr ); *graph = NULL; } @@ -1500,7 +1510,7 @@ vsi_status vsi_nn_SetupGraph } #if VX_GRAPH_BATCH_OPT_SUPPORT - if (graph->ctx->options.enable_batch_opt) + if (((vsi_nn_graph_prv_t*)graph)->options->enable_batch_opt) { /*processing batch splitting*/ status = batchInference_graph(graph, nodes_list); @@ -2064,7 +2074,7 @@ vsi_nn_node_t * vsi_nn_AddExternalNode const char * kernel_name ) { - vsi_nn_node_t * node; + vsi_nn_node_prv_t* node; vsi_nn_node_id_t id; vsi_nn_op_proc_t * node_proc; @@ -2076,16 +2086,17 @@ vsi_nn_node_t * vsi_nn_AddExternalNode { return NULL; } - node = (vsi_nn_node_t *)malloc( sizeof( vsi_nn_node_t ) ); + node = (vsi_nn_node_prv_t*)malloc(sizeof(vsi_nn_node_prv_t)); if( NULL != node ) { - memset( node, 0, sizeof( vsi_nn_node_t ) ); - node->graph = graph; - node->op = op; - node->vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; - node->vx_param.rounding_policy = VX_ROUND_POLICY_TO_ZERO; - node->vx_param.down_scale_size_rounding = VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR; + memset(node, 0, sizeof(vsi_nn_node_prv_t)); + node->pon.graph = graph; + node->pon.op = op; + node->pon.vx_param.overflow_policy = VX_CONVERT_POLICY_SATURATE; + node->pon.vx_param.rounding_policy = VX_ROUND_POLICY_TO_ZERO; + node->pon.vx_param.down_scale_size_rounding = + VX_CONVOLUTIONAL_NETWORK_DS_SIZE_ROUNDING_FLOOR; /* init op */ if(node_proc->init != NULL){ @@ -2093,31 +2104,31 @@ vsi_nn_node_t * vsi_nn_AddExternalNode } /* init output struct */ - node->output.num = node_proc->output_num; - node->output.tensors = (vsi_nn_tensor_id_t *) malloc( + node->pon.output.num = node_proc->output_num; + node->pon.output.tensors = (vsi_nn_tensor_id_t*)malloc( node_proc->output_num * sizeof( vsi_nn_tensor_id_t ) ); - if ( NULL == node->output.tensors ) + if (NULL == node->pon.output.tensors) { VSILOGE("Create output tensor id %s. 
fail", vsi_nn_OpGetName(op)); vsi_nn_safe_free(node); return NULL; } - vsi_nn_InitTensorsId( node->output.tensors, node_proc->output_num ); + vsi_nn_InitTensorsId(node->pon.output.tensors, node_proc->output_num); /* init input struct */ - node->input.num = node_proc->input_num; - node->input.tensors = (vsi_nn_tensor_id_t *) malloc( + node->pon.input.num = node_proc->input_num; + node->pon.input.tensors = (vsi_nn_tensor_id_t*)malloc( node_proc->input_num * sizeof( vsi_nn_tensor_id_t ) ); - if ( NULL == node->input.tensors ) + if (NULL == node->pon.input.tensors) { VSILOGE("Create input tensor id %s. fail", vsi_nn_OpGetName(op)); - vsi_nn_safe_free(node->output.tensors); + vsi_nn_safe_free(node->pon.output.tensors); vsi_nn_safe_free(node); return NULL; } - vsi_nn_InitTensorsId( node->input.tensors, node_proc->input_num ); - node->attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE; - node->attr.enable_op_constraint_check = TRUE; + vsi_nn_InitTensorsId(node->pon.input.tensors, node_proc->input_num); + node->pon.attr.const_tensor_preload_type = VSI_NN_NODE_PRELOAD_NONE; + node->pon.attr.enable_op_constraint_check = TRUE; } id = graph->cur_nid; if(NULL != node){ @@ -2126,7 +2137,7 @@ vsi_nn_node_t * vsi_nn_AddExternalNode graph->cur_nid ++; } vsi_nn_OpRegisterExternalOvxInit(op, kernel_name, node_proc); - return node; + return (vsi_nn_node_t*)node; } /* vsi_nn_AddExternalNode() */ void vsi_nn_RemoveNode @@ -3354,24 +3365,245 @@ vsi_status vsi_nn_ExecuteGraphLoop return status; } /* vsi_nn_ExecuteGraphLoop() */ +typedef enum { + VSI_NN_ENABLE_I8TOU8 = 0, + VSI_NN_ENABLE_OPCHECK, + VSI_SAVE_FILE_TYPE, + VSI_USE_IMAGE_PROCESS, + VSI_NN_LOG_LEVEL, + VSI_NN_ENABLE_CONCAT_OPTIMIZE, + VSI_NN_ENABLE_DATACONVERT_OPTIMIZE, + VSI_VX_ENABLE_STREAM_PROCESSOR, + VSI_NN_FORCE_RGB888_OUT_NHWC, + VSI_NN_ENABLE_SLICE_OPTIMIZE, + VSI_VX_ENABLE_BATCH_OPT, + VIV_VX_ENABLE_SHADER, + VSI_USE_FROM_HANDLE, + VIV_VX_ENABLE_GRAPH_TRANSFORM +} VSI_PUBLIC_TYPE vsi_nn_runtime_variable; + +typedef struct { + const char* key; + int32_t value; +} VSI_PUBLIC_TYPE keyValuePair; + +char* vsi_nn_GetRunTimeVariable + ( + const vsi_nn_graph_t* graph, + const char* key + ) +{ + int32_t isVaid = 1; + int32_t value = -1; +#define varSize 256 + char* value_str = (char*)malloc(sizeof(char) * varSize); + CHECK_PTR_FAIL_GOTO(value_str, "Create value_str fail.", final); + memset(value_str, 0, varSize); + char tmp_value[varSize] = {0}; + VSI_UNREFERENCED(tmp_value); + vsi_nn_runtime_option_t* options = ((vsi_nn_graph_prv_t*)graph)->options; + switch (vsi_nn_GetVariable(key)) + { + case VIV_VX_ENABLE_SHADER: + value =options->enable_shader; + break; + case VSI_NN_ENABLE_OPCHECK: + value = options->enable_opcheck; + break; + case VSI_NN_ENABLE_I8TOU8: + value = options->enable_i8_to_u8; + break; + case VSI_VX_ENABLE_STREAM_PROCESSOR: + value = options->enable_stream_processor; + break; + case VSI_VX_ENABLE_BATCH_OPT: + value = options->enable_batch_opt; + break; + case VSI_NN_FORCE_RGB888_OUT_NHWC: + value = options->enable_rgb88_planar_nhwc; + break; + case VSI_SAVE_FILE_TYPE: + value = options->enable_save_file_type; + break; + case VSI_NN_ENABLE_CONCAT_OPTIMIZE: + value = options->enable_concat_optimize; + break; + case VSI_NN_ENABLE_SLICE_OPTIMIZE: + value = options->enable_slice_optimize; + break; + case VSI_USE_IMAGE_PROCESS: + if (options->enable_use_image_process != -1) + { + value = options->enable_use_image_process; + } + else + { + isVaid = 0; + } + break; + case VSI_USE_FROM_HANDLE: + if (options->enable_use_from_handle != 
-1) + { + value = options->enable_use_from_handle; + } + else + { + isVaid = 0; + } + break; + default: + isVaid = 0; + VSILOGE("Not support this key: %s.", key); + } + if (isVaid == 1) + { + snprintf(tmp_value, varSize, "%d", value); + memcpy(value_str, tmp_value, varSize); + } else + { + goto final; + } +#undef varSize + return value_str; +final: +#undef varSize + vsi_nn_safe_free(value_str); + return value_str; +} -vsi_status vsi_nn_SetGraphTransformOption +vsi_status vsi_nn_SetRunTimeVariable ( vsi_nn_graph_t* graph, - const char* ctrl_str, - size_t size - ) + const char* key, + const char* value + ) { - vsi_status status = VSI_FAILURE; - VSI_UNREFERENCED(graph); - VSI_UNREFERENCED(ctrl_str); + vsi_status status = VSI_SUCCESS; + size_t size = 1; // placeholder, not used in vxSetGraphAttribute. + if (graph == NULL) + { + status = VSI_FAILURE; + return status; + } + vsi_nn_runtime_option_t* options = ((vsi_nn_graph_prv_t*)graph)->options; VSI_UNREFERENCED(size); + if (vsi_nn_getenv(key) == NULL) + { + switch (vsi_nn_GetVariable(key) ) + { + case VIV_VX_ENABLE_SHADER: + options->enable_shader = atoi(value); + break; + case VSI_NN_ENABLE_OPCHECK: + options->enable_opcheck = atoi(value); + break; + case VSI_NN_ENABLE_I8TOU8: + options->enable_i8_to_u8 = atoi(value); + break; + case VSI_VX_ENABLE_STREAM_PROCESSOR: + options->enable_stream_processor = atoi(value); + break; + case VSI_VX_ENABLE_BATCH_OPT: + options->enable_batch_opt = atoi(value); + break; + case VSI_NN_FORCE_RGB888_OUT_NHWC: + options->enable_rgb88_planar_nhwc = atoi(value); + break; + case VSI_NN_ENABLE_CONCAT_OPTIMIZE: + options->enable_concat_optimize = atoi(value); + break; + case VSI_NN_ENABLE_DATACONVERT_OPTIMIZE: + options->enable_dataconvert_optimize = atoi(value); + break; + case VSI_NN_ENABLE_SLICE_OPTIMIZE: + options->enable_slice_optimize = atoi(value); + break; + case VSI_SAVE_FILE_TYPE: + options->enable_save_file_type = atoi(value); + break; + case VSI_USE_IMAGE_PROCESS: + options->enable_use_image_process = atoi(value); + break; + case VSI_USE_FROM_HANDLE: + options->enable_use_from_handle = atoi(value); + break; + case VIV_VX_ENABLE_GRAPH_TRANSFORM: #ifdef VX_GRAPH_TRANSFORM_OPTION_SUPPORT + if (graph && graph->g) { + status = vxSetGraphAttribute( + graph->g, VX_GRAPH_VSI_TRANSFORM_OPTIONS, value, size); + } +#else + status = VSI_FAILURE; + VSILOGE("VX_GRAPH_TRANSFORM_OPTION_SUPPORT is not defined, please check driver version."); +#endif + break; + default: +#ifdef VX_GRAPH_ENV_SUPPORT + status = vxSetGraphEnv(graph->g, key, value); +#else + status = VSI_FAILURE; + VSILOGE("VX_GRAPH_ENV_SUPPORT is not defined, please check driver version."); +#endif + break; + } + } + return status; +} - if(graph && graph->g) +int32_t vsi_nn_GetVariable(const char* variableKey) { + keyValuePair dict[] = { + {"VSI_NN_ENABLE_I8TOU8", VSI_NN_ENABLE_I8TOU8}, + {"VSI_NN_ENABLE_OPCHECK", VSI_NN_ENABLE_OPCHECK}, + {"VSI_SAVE_FILE_TYPE", VSI_SAVE_FILE_TYPE}, + {"VSI_USE_IMAGE_PROCESS", VSI_USE_IMAGE_PROCESS}, + {"VSI_NN_ENABLE_CONCAT_OPTIMIZE", VSI_NN_ENABLE_CONCAT_OPTIMIZE}, + {"VSI_NN_ENABLE_DATACONVERT_OPTIMIZE", VSI_NN_ENABLE_DATACONVERT_OPTIMIZE}, + {"VSI_VX_ENABLE_STREAM_PROCESSOR", VSI_VX_ENABLE_STREAM_PROCESSOR}, + {"VSI_NN_FORCE_RGB888_OUT_NHWC", VSI_NN_FORCE_RGB888_OUT_NHWC}, + {"VSI_NN_ENABLE_SLICE_OPTIMIZE", VSI_NN_ENABLE_SLICE_OPTIMIZE}, + {"VSI_VX_ENABLE_BATCH_OPT", VSI_VX_ENABLE_BATCH_OPT}, + {"VIV_VX_ENABLE_SHADER", VIV_VX_ENABLE_SHADER}, + {"VSI_USE_FROM_HANDLE", VSI_USE_FROM_HANDLE}, + 
{"VIV_VX_ENABLE_GRAPH_TRANSFORM", VIV_VX_ENABLE_GRAPH_TRANSFORM}, + {NULL, -1} + }; + for (int32_t i = 0; dict[i].key != NULL; i++) { + if (strcmp(dict[i].key, variableKey) == 0) { + return dict[i].value; + } + } + return -1; +} + +OVXLIB_API char* vsi_nn_GenerateGraphJson + ( + vsi_nn_graph_t* graph + ) +{ + char* json = NULL; + VSI_UNREFERENCED(graph); +#ifdef VX_GENERATE_GRAPH_JSON_API_SUPPORT + if (graph && graph->g) { - status = vxSetGraphAttribute(graph->g, VX_GRAPH_VSI_TRANSFORM_OPTIONS, ctrl_str, size); + json = vxGenerateGraphJson(graph->g); } #endif + return json; +} + +OVXLIB_API vsi_status vsi_nn_ReleaseGraphJson + ( + char* json + ) +{ + vsi_status status = VSI_FAILURE; + VSI_UNREFERENCED(json); +#ifdef VX_GENERATE_GRAPH_JSON_API_SUPPORT + if (json) { + status = vxReleaseGraphJson(json); + } +#endif + return status; } \ No newline at end of file diff --git a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c index aafc8903..c017ea50 100644 --- a/src/tim/vx/internal/src/vsi_nn_graph_optimization.c +++ b/src/tim/vx/internal/src/vsi_nn_graph_optimization.c @@ -26,6 +26,7 @@ #include "vsi_nn_graph_optimization.h" #include "vsi_nn_tensor_util.h" #include "vsi_nn_graph.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_log.h" #include "vsi_nn_error.h" @@ -37,14 +38,50 @@ static vsi_bool _is_asymm_int8_norm_tensor { vsi_bool ret = FALSE; - ret = ( tensor != NULL - && tensor->attr.vtl == FALSE && tensor->attr.is_const == FALSE - && tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 - && tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC); + ret = ( tensor != NULL && + tensor->attr.vtl == FALSE && + tensor->attr.is_const == FALSE && + tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && + tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC + ); return ret; }/* _is_asymm_int8_norm_tensor() */ +static vsi_bool _is_symm_int8_norm_tensor +( + vsi_nn_tensor_t* tensor +) +{ + vsi_bool ret = FALSE; + + ret = (tensor != NULL && + tensor->attr.vtl == FALSE && + tensor->attr.is_const == FALSE && + tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && + tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC + ); + + return ret; +}/* _is_symm_int8_norm_tensor() */ + +static vsi_bool _is_int8_norm_tensor +( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* tensor +) +{ + vsi_bool ret = FALSE; + vsi_bool support_symi8 = + ((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8 == 2; + + + ret = _is_asymm_int8_norm_tensor(tensor); + ret = ret || (support_symi8 && _is_symm_int8_norm_tensor(tensor)); + + return ret; +}/* _is_int8_norm_tensor() */ + static vsi_bool _is_asymm_int8_const_tensor ( vsi_nn_tensor_t * tensor @@ -52,14 +89,47 @@ static vsi_bool _is_asymm_int8_const_tensor { vsi_bool ret = FALSE; - ret = ( tensor != NULL - && tensor->attr.is_const == TRUE - && tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 - && tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC); + ret = ( tensor != NULL && + tensor->attr.is_const == TRUE && + tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && + tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC + ); return ret; }/* _is_asymm_int8_const_tensor() */ +static vsi_bool _is_symm_int8_const_tensor +( + vsi_nn_tensor_t* tensor +) +{ + vsi_bool ret = FALSE; + + ret = (tensor != NULL && + tensor->attr.is_const == TRUE && + tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && + tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC + ); + + return ret; +}/* 
_is_symm_int8_const_tensor() */ + +static vsi_bool _is_int8_const_tensor +( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* tensor +) +{ + vsi_bool ret = FALSE; + vsi_bool support_symi8 = + ((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8 == 2; + + ret = _is_asymm_int8_const_tensor(tensor); + ret = ret || (support_symi8 && _is_symm_int8_const_tensor(tensor)); + + return ret; +}/* _is_int8_const_tensor() */ + static vsi_bool _is_asymm_int8_virtual_tensor ( vsi_nn_tensor_t * tensor @@ -67,14 +137,47 @@ static vsi_bool _is_asymm_int8_virtual_tensor { vsi_bool ret = FALSE; - ret = ( tensor != NULL - && tensor->attr.vtl == TRUE - && tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 - && tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC); + ret = ( tensor != NULL && + tensor->attr.vtl == TRUE && + tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && + tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC + ); return ret; }/* _is_asymm_int8_virtual_tensor() */ +static vsi_bool _is_symm_int8_virtual_tensor +( + vsi_nn_tensor_t* tensor +) +{ + vsi_bool ret = FALSE; + + ret = (tensor != NULL && + tensor->attr.vtl == TRUE && + tensor->attr.dtype.vx_type == VSI_NN_TYPE_INT8 && + tensor->attr.dtype.qnt_type == VSI_NN_QNT_TYPE_AFFINE_SYMMETRIC + ); + + return ret; +}/* _is_symm_int8_virtual_tensor() */ + +static vsi_bool _is_int8_virtual_tensor +( + vsi_nn_graph_t* graph, + vsi_nn_tensor_t* tensor +) +{ + vsi_bool ret = FALSE; + vsi_bool support_symi8 = + ((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8 == 2; + + ret = _is_asymm_int8_virtual_tensor(tensor); + ret = ret || (support_symi8 && _is_symm_int8_virtual_tensor(tensor)); + + return ret; +}/* _is_int8_virtual_tensor() */ + static vsi_status _add_forward_node ( vsi_nn_graph_t* graph, @@ -199,7 +302,7 @@ static void _get_graph_input_asymm_int8_norm_tensor vsi_nn_tensor_id_t id = node->input.tensors[j]; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - if (_is_asymm_int8_norm_tensor(tensor)) + if (_is_int8_norm_tensor(graph, tensor)) { if(tensor_ids != NULL) { @@ -251,7 +354,7 @@ static void _get_graph_output_asymm_int8_norm_tensor vsi_nn_tensor_id_t id = node->output.tensors[j]; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - if (_is_asymm_int8_norm_tensor(tensor)) + if (_is_int8_norm_tensor(graph, tensor)) { if(tensor_ids != NULL) { @@ -360,6 +463,7 @@ static vsi_status _add_graph_dataconvert_for_int8 { memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t)); attr.dtype.vx_type = VSI_NN_TYPE_UINT8; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; attr.dtype.zero_point += 128; attr.vtl = TRUE; output = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL ); @@ -383,6 +487,7 @@ static vsi_status _add_graph_dataconvert_for_int8 { memcpy(&attr, &tensor->attr, sizeof(vsi_nn_tensor_attr_t)); attr.dtype.vx_type = VSI_NN_TYPE_UINT8; + attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; attr.dtype.zero_point += 128; attr.vtl = TRUE; input = vsi_nn_AddTensor( graph, VSI_NN_TENSOR_ID_AUTO, &attr, NULL ); @@ -788,6 +893,7 @@ static void _convert_const_I8toU8 } attr->dtype.vx_type = VSI_NN_TYPE_UINT8; + attr->dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; attr->dtype.zero_point += 128; if ( tensor->t ) vxReleaseTensor(&tensor->t); @@ -818,7 +924,7 @@ static vsi_status _convert_graph_const_tensor vsi_nn_tensor_id_t id = node->input.tensors[j]; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - if (_is_asymm_int8_const_tensor(tensor)) + if (_is_int8_const_tensor(graph, tensor)) { 
_convert_const_I8toU8(graph, id); } @@ -835,11 +941,9 @@ static vsi_status _convert_virtual_tensor_attr vsi_nn_tensor_t * tensor ) { - if (_is_asymm_int8_virtual_tensor(tensor)) - { - tensor->attr.dtype.vx_type = VSI_NN_TYPE_UINT8; - tensor->attr.dtype.zero_point += 128; - } + tensor->attr.dtype.vx_type = VSI_NN_TYPE_UINT8; + tensor->attr.dtype.qnt_type = VSI_NN_QNT_TYPE_AFFINE_ASYMMETRIC; + tensor->attr.dtype.zero_point += 128; return VSI_SUCCESS; }/* _convert_virtual_tensor_attr() */ @@ -849,7 +953,7 @@ static vsi_status _convert_graph_virtual_tensor vsi_nn_graph_t* graph ) { - vsi_status status = VSI_FAILURE; + vsi_status status = VSI_SUCCESS; uint32_t node_num = graph->node_num; vsi_nn_node_t* node = NULL; uint32_t i = 0; @@ -865,7 +969,10 @@ static vsi_status _convert_graph_virtual_tensor vsi_nn_tensor_id_t id = node->input.tensors[j]; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - status = _convert_virtual_tensor_attr(tensor); + if (_is_int8_virtual_tensor(graph, tensor)) + { + status = _convert_virtual_tensor_attr(tensor); + } } for(j = 0; j < node->output.num; j++) @@ -873,7 +980,10 @@ static vsi_status _convert_graph_virtual_tensor vsi_nn_tensor_id_t id = node->output.tensors[j]; vsi_nn_tensor_t * tensor = vsi_nn_GetTensor(graph, id); - status = _convert_virtual_tensor_attr(tensor); + if (_is_int8_virtual_tensor(graph, tensor)) + { + status = _convert_virtual_tensor_attr(tensor); + } } } @@ -925,7 +1035,7 @@ vsi_status vsi_nn_OptimizeGraph status = VSI_SUCCESS; - if (!nbg_flag && graph->ctx->options.enable_asymi8_to_u8) + if (!nbg_flag &&((vsi_nn_graph_prv_t*)graph)->options->enable_i8_to_u8) { status = _graph_optimization_convert_int8_to_uint8(graph, dirty); CHECK_STATUS_FAIL_GOTO(status, final); diff --git a/src/tim/vx/internal/src/vsi_nn_internal_node.c b/src/tim/vx/internal/src/vsi_nn_internal_node.c index c240d3be..b8f43111 100644 --- a/src/tim/vx/internal/src/vsi_nn_internal_node.c +++ b/src/tim/vx/internal/src/vsi_nn_internal_node.c @@ -452,7 +452,8 @@ void vsi_nn_internal_init_tensor_attr if( dtype->qnt_type == VSI_NN_QNT_TYPE_NONE && ( dtype->vx_type != VSI_NN_TYPE_FLOAT16 && dtype->vx_type != VSI_NN_TYPE_FLOAT32 && - dtype->vx_type != VSI_NN_TYPE_BFLOAT16 ) ) + dtype->vx_type != VSI_NN_TYPE_BFLOAT16 && + dtype->vx_type != VSI_NN_TYPE_INT32) ) { attr->dtype.qnt_type = VSI_NN_QNT_TYPE_NONE; attr->dtype.vx_type = VSI_NN_TYPE_FLOAT16; diff --git a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c index 4a9caeaf..eb51f99e 100644 --- a/src/tim/vx/internal/src/vsi_nn_node_attr_template.c +++ b/src/tim/vx/internal/src/vsi_nn_node_attr_template.c @@ -208,6 +208,10 @@ static _node_template s_template[] = /* RESIZE_3D */ NULL, /* REDUCEL2 */ NULL, /* CROP_AND_RESIZE */ NULL, + /* BITCAST */ NULL, + /* GROUPED_CONV3D */ NULL, + /* CO2IM */ NULL, + /* L1_LAYER_NORM */ NULL, }; //_compiler_assert( _cnt_of_array(s_template) == VSI_NN_OP_NUM, vsi_nn_node_attr_template_c ); diff --git a/src/tim/vx/internal/src/vsi_nn_ops.c b/src/tim/vx/internal/src/vsi_nn_ops.c index b706240c..950f9570 100644 --- a/src/tim/vx/internal/src/vsi_nn_ops.c +++ b/src/tim/vx/internal/src/vsi_nn_ops.c @@ -26,6 +26,7 @@ #include "vsi_nn_client_op.h" #include "vsi_nn_node.h" #include "vsi_nn_types.h" +#include "vsi_nn_types_prv.h" #include "vsi_nn_graph.h" #include "vsi_nn_log.h" @@ -281,7 +282,7 @@ vsi_bool vsi_nn_OpCheck if ( NULL != proc ) { ret = TRUE; - if ( proc->check && node->graph->ctx->options.enable_opcheck) + if ( proc->check && 
((vsi_nn_graph_prv_t*)(node->graph))->options->enable_opcheck) { ret = proc->check( node, inputs, outputs ); } diff --git a/src/tim/vx/internal/src/vsi_nn_tensor.c b/src/tim/vx/internal/src/vsi_nn_tensor.c index 4d102225..179755f9 100644 --- a/src/tim/vx/internal/src/vsi_nn_tensor.c +++ b/src/tim/vx/internal/src/vsi_nn_tensor.c @@ -144,6 +144,17 @@ static void print_tensor tensor->attr.dtype.scale_dim); ext_attr[count] = 0; break; +#endif +#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC: + count = snprintf(&ext_attr[0], + _EXT_ATTR_BUF_SZ, + "SYM GPTQ axis=%d, count=%d, group_size=%d", + tensor->attr.dtype.group_channel_dim, + tensor->attr.dtype.group_count, + tensor->attr.dtype.group_size); + ext_attr[count] = 0; + break; #endif default: vsi_nn_strncpy(ext_attr, "NONE", _EXT_ATTR_BUF_SZ); @@ -430,6 +441,25 @@ static vsi_bool _init_tensor VSILOGE( "can't support qnt_type " "VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_ASYMMETRIC."); +#endif + case VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC: +#ifdef VSI_PER_GROUP_QUANTIZATION_SUPPORT + params.quant_format = (vsi_enum)VX_QUANT_AFFINE_SCALE_PER_GROUP; + // This is a hack that driver doesn't support const scales + scales = (float*)malloc(sizeof(float) * tensor->attr.dtype.group_count); + CHECK_PTR_FAIL_GOTO( scales, "Create buffer fail.", final ); + memcpy(scales, tensor->attr.dtype.group_scales, tensor->attr.dtype.group_count * sizeof(float)); + params.quant_data.affinePerGroup.channel_dim = tensor->attr.dtype.group_channel_dim; + params.quant_data.affinePerGroup.group_size = tensor->attr.dtype.group_size; + params.quant_data.affinePerGroup.scale_group_count = tensor->attr.dtype.group_count; + params.quant_data.affinePerGroup.scales = scales; + params.quant_data.affinePerGroup.zero_points = NULL; + params.quant_data.affinePerGroup.zero_point_group_count = 0; + break; +#else + VSILOGE( + "can't support qnt_type " + "VSI_NN_QNT_TYPE_AFFINE_PERCHANNEL_GROUP_SYMMETRIC."); #endif default: break; diff --git a/src/tim/vx/internal/src/vsi_nn_types_prv.h b/src/tim/vx/internal/src/vsi_nn_types_prv.h index 00b55fd2..4f9fd0bf 100644 --- a/src/tim/vx/internal/src/vsi_nn_types_prv.h +++ b/src/tim/vx/internal/src/vsi_nn_types_prv.h @@ -58,6 +58,7 @@ typedef struct _vsi_nn_graph_prv // Add graph internal attribute here... vsi_nn_swap_handle_cache_t swap_handle_cache; + vsi_nn_runtime_option_t* options; } vsi_nn_graph_prv_t; /** Internal Node structure, internal use only. */
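
The hunks above replace the old vsi_nn_SetGraphTransformOption entry point with a generic per-graph key/value interface (vsi_nn_SetRunTimeVariable / vsi_nn_GetRunTimeVariable), backed by the vsi_nn_runtime_option_t block now hung off vsi_nn_graph_prv_t, plus graph-JSON helpers guarded by VX_GENERATE_GRAPH_JSON_API_SUPPORT. A minimal usage sketch follows; the demo_runtime_options wrapper, the vsi_nn_pub.h umbrella include, and the assumption that the caller frees the getter's malloc'd string are mine, not part of the patch.

#include <stdio.h>
#include <stdlib.h>

#include "vsi_nn_pub.h" /* assumed umbrella header for the ovxlib public API */

void demo_runtime_options(vsi_nn_graph_t* graph)
{
    char* value = NULL;
    char* json = NULL;

    /* Disable the concat tensor-view optimization for this graph only.
     * Per the patch, the setter overrides the option unless the matching
     * environment variable is already set. */
    if (vsi_nn_SetRunTimeVariable(graph, "VSI_NN_ENABLE_CONCAT_OPTIMIZE", "0") != VSI_SUCCESS)
    {
        printf("failed to set runtime variable\n");
        return;
    }

    /* Read the option back; the getter formats it as a decimal string. */
    value = vsi_nn_GetRunTimeVariable(graph, "VSI_NN_ENABLE_CONCAT_OPTIMIZE");
    if (value != NULL)
    {
        printf("enable_concat_optimize = %s\n", value);
        free(value); /* assumption: the caller owns the malloc'd buffer */
    }

    /* Optionally dump the graph as JSON when the driver exposes the API. */
    json = vsi_nn_GenerateGraphJson(graph);
    if (json != NULL)
    {
        printf("%s\n", json);
        vsi_nn_ReleaseGraphJson(json);
    }
}
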